-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgenerateResourceEmbeddingsBatch.ts
131 lines (111 loc) · 4.26 KB
/
generateResourceEmbeddingsBatch.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
/**
* Script to generate embeddings for Resources in batches
* Addresses ENG-721: Simplify Resources embedding process
*/
import { getPayload } from 'payload'
import { openai } from '@ai-sdk/openai'
import { embedMany } from 'ai'
import config from '../payload.config'
// Embedding model id handed to the OpenAI provider for both the single-resource
// and the batched embedding calls in this file.
const MODEL = 'text-embedding-3-large'
// NOTE(review): not referenced anywhere in the visible code, so it currently has
// no effect on the generated vectors — confirm whether it should be passed to the model.
const DIMENSIONS = 256
// Default number of resources fetched per run when no batch size is given.
const BATCH_SIZE = 50
// Id of a resource that is pulled out of the batch and embedded on its own.
// The reason it needs special handling is not recorded in this file.
const PROBLEMATIC_RESOURCE_ID = '67dd4e7ec37e99e7ed48ffa2'
/**
 * Builds the text that is embedded for a resource.
 *
 * Joins the resource's `name`, `content`, `yaml` and `data` fields (in that
 * order) with single spaces, skipping any field that is empty. Object-valued
 * `data` is serialised with JSON.stringify; any other truthy `data` value is
 * used as-is.
 *
 * @param resource Resource document to extract text from (untyped; only the
 *   four fields above are read)
 * @returns The combined, trimmed text, or an empty string when no field has content
 */
function prepareResourceTextForEmbedding(resource: any): string {
  const { name, content, yaml, data } = resource

  // `typeof null` is 'object', so null must be excluded explicitly; otherwise
  // JSON.stringify(null) would add the literal word "null" to the text.
  const dataText =
    data !== null && typeof data === 'object' ? JSON.stringify(data) : data || ''

  const parts = [name || '', content || '', yaml || '', dataText]
  return parts.filter(Boolean).join(' ').trim()
}
/**
 * Generates embeddings for one batch of resources that do not have one yet.
 *
 * Fetches up to `batchSize` documents from the `resources` collection whose
 * `embedding` field does not exist, embeds their text and writes each vector
 * back to `embedding.vectors`. The resource identified by
 * PROBLEMATIC_RESOURCE_ID is removed from the batch and embedded on its own,
 * separately from the batch call; a failure there is logged and does not stop
 * the rest of the batch. A failed write for a single resource is likewise
 * logged without aborting the other writes.
 *
 * @param batchSize Maximum number of resources to fetch and process in this run
 * @returns A promise that resolves once every update in the batch has settled
 * @throws Rethrows, after logging, any error from initialising Payload,
 *   querying the resources or the batch embedding call, so the caller can
 *   report a failed run
 */
async function generateResourceEmbeddingsBatch(batchSize = BATCH_SIZE): Promise<void> {
  try {
    console.log('Starting batch embedding generation for Resources...')
    const payload = await getPayload({ config })

    const response = await payload.find({
      collection: 'resources',
      where: {
        embedding: { exists: false },
      },
      limit: batchSize,
    })
    const resources = response.docs

    if (resources.length === 0) {
      console.log('No resources found without embeddings')
      return
    }
    console.log(`Found ${resources.length} Resources without embeddings`)

    const problematicResourceIndex = resources.findIndex(r => r.id === PROBLEMATIC_RESOURCE_ID)
    if (problematicResourceIndex !== -1) {
      console.log(`Found problematic resource ${PROBLEMATIC_RESOURCE_ID}, will process separately`)
      // splice removes it from `resources`, so the batch call below never sees it.
      const problematicResource = resources.splice(problematicResourceIndex, 1)[0]
      try {
        const textToEmbed = prepareResourceTextForEmbedding(problematicResource)
        console.log(`Processing problematic resource with text length: ${textToEmbed.length}`)
        const embeddingModel = openai.textEmbeddingModel(MODEL)
        const result = await embeddingModel.doEmbed({ values: [textToEmbed] })
        const embedding = result.embeddings[0]
        await payload.update({
          collection: 'resources',
          id: problematicResource.id,
          data: {
            embedding: { vectors: embedding },
          },
          depth: 0,
        })
        console.log(`✓ Successfully processed problematic resource ${problematicResource.id}`)
      } catch (error) {
        // Best effort: this one resource must not prevent the rest of the batch.
        console.error(`Error processing problematic resource ${problematicResource.id}:`, error)
      }
    }

    if (resources.length === 0) {
      console.log('No remaining resources to process in batch')
      return
    }

    const textsToEmbed = resources.map(resource => prepareResourceTextForEmbedding(resource))

    console.log(`Generating embeddings for ${resources.length} resources in batch...`)
    // NOTE(review): the file-level DIMENSIONS constant is not passed to the model
    // here (nor in the single-resource path above) — confirm whether the vectors
    // are meant to be requested at that size.
    const { embeddings } = await embedMany({
      model: openai.embedding(MODEL),
      values: textsToEmbed,
    })

    console.log('Updating resources with generated embeddings...')
    // embeddings[i] corresponds to resources[i]; each write handles its own
    // failure so one bad update does not reject the whole Promise.all.
    const updatePromises = resources.map(async (resource, i) => {
      try {
        await payload.update({
          collection: 'resources',
          id: resource.id,
          data: {
            embedding: { vectors: embeddings[i] },
          },
          depth: 0,
        })
        console.log(`✓ Successfully updated embedding for Resource ${resource.id}`)
      } catch (error) {
        console.error(`Error updating embedding for Resource ${resource.id}:`, error)
      }
    })
    await Promise.all(updatePromises)

    console.log(`Processed batch of ${resources.length} resources`)
  } catch (error) {
    console.error('Error in batch embedding generation:', error)
    // Propagate so the caller sees the failure instead of a resolved promise;
    // swallowing it here made a failed run indistinguishable from a successful one.
    throw error
  }
}
// Script entry point: the optional first CLI argument overrides the default
// batch size. parseInt returns NaN for non-numeric input, and zero or negative
// values make no sense as a batch size, so anything that is not a positive
// integer falls back to BATCH_SIZE with a warning instead of being passed on
// as the query limit.
const rawBatchSizeArg = process.argv[2]
const parsedBatchSize = rawBatchSizeArg ? parseInt(rawBatchSizeArg, 10) : BATCH_SIZE
const isValidBatchSize = Number.isInteger(parsedBatchSize) && parsedBatchSize > 0
if (!isValidBatchSize) {
  console.warn(
    `Invalid batch size "${rawBatchSizeArg}", falling back to default of ${BATCH_SIZE}`
  )
}
const customBatchSize = isValidBatchSize ? parsedBatchSize : BATCH_SIZE

// Exit explicitly in both outcomes so the process ends with a status code
// that reflects whether the run succeeded.
generateResourceEmbeddingsBatch(customBatchSize)
  .then(() => {
    console.log('Batch embedding generation completed')
    process.exit(0)
  })
  .catch(error => {
    console.error('Fatal error in batch embedding generation:', error)
    process.exit(1)
  })