db-vector
Client adapters for vector databases with utilities
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.processDocument = processDocument;
const text_splitter_1 = require("langchain/text_splitter");
const createExtractor_1 = require("./createExtractor");
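// Defaults merged with the caller-supplied config: batches of 10 chunks,
// the mixedbread-ai/mxbai-embed-large-v1 embedding model, and a no-op
// progress callback.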
const defaultConfig = {
    batchSize: 10,
    modelName: 'mixedbread-ai/mxbai-embed-large-v1',
    onProgress: () => { }
};
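/**
 * Splits a document into chunks, embeds the chunks in batches, and upserts the
 * resulting vectors into the given index and namespace. Resolves with the final
 * state: { totalDocumentChunks, totalDocumentChunksUpserted }.
 *
 * A minimal usage sketch (illustrative only): it assumes a Pinecone-style client
 * and a LangChain-style document exposing pageContent and metadata.source; the
 * name `pineconeClient` and the literal values below are placeholders, not part
 * of this module.
 *
 * @example
 *   const { processDocument } = require('./processDocument');
 *   const state = await processDocument(pineconeClient, 'my-index', 'my-namespace', {
 *       pageContent: 'Full text of the document...',
 *       metadata: { source: '/path/to/report.pdf' }
 *   }, {
 *       batchSize: 5,
 *       onProgress: (file, total, upserted, done) =>
 *           console.log(`${file}: ${upserted}/${total}${done ? ' (done)' : ''}`)
 *   });
 */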
async function processDocument(client, indexname, namespace, doc, config = {}) {
    const finalConfig = { ...defaultConfig, ...config };
    const extractor = await (0, createExtractor_1.createExtractor)(finalConfig.modelName);
    const splitter = new text_splitter_1.RecursiveCharacterTextSplitter();
    const documentChunks = await splitter.splitText(doc.pageContent);
    const state = {
        totalDocumentChunks: documentChunks.length,
        totalDocumentChunksUpserted: 0
    };
    const filename = getFilename(doc.metadata.source);
    let chunkBatchIndex = 0;
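    // Consume documentChunks in place: each iteration splices off up to
    // batchSize chunks, embeds them, and upserts them as one batch.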
    while (documentChunks.length > 0) {
        chunkBatchIndex++;
        const chunkBatch = documentChunks.splice(0, finalConfig.batchSize);
        await processOneBatch(client, indexname, namespace, extractor, chunkBatch, chunkBatchIndex, filename, state, finalConfig.onProgress);
    }
    // Final progress callback; the last argument (true) signals completion.
    finalConfig.onProgress(filename, state.totalDocumentChunks, state.totalDocumentChunksUpserted, true);
    return state;
}
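// Strips the directory and extension from a path-like source string, e.g.
// "docs/guide/intro.md" -> "intro"; if there is no extension the bare name is
// returned unchanged.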
function getFilename(filename) {
    const docname = filename.substring(filename.lastIndexOf("/") + 1);
    return docname.substring(0, docname.lastIndexOf(".")) || docname;
}
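/**
 * Embeds one batch of chunks and upserts the resulting vectors. The extractor
 * is assumed to be a transformers.js-style feature-extraction pipeline (it is
 * called with a pooling option and its output exposes .tolist()); createExtractor
 * presumably provides one, but that is an inference from this file rather than
 * something documented here.
 */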
async function processOneBatch(client, indexname, namespace, extractor, chunkBatch, chunkBatchIndex, filename, state, onProgress) {
    // Newlines are flattened to spaces before embedding; CLS pooling yields one
    // vector per chunk.
    const output = await extractor(chunkBatch.map(str => str.replace(/\n/g, ' ')), {
        pooling: 'cls'
    });
    // Convert the output tensor into a plain nested array of embeddings.
    const embeddingsBatch = output.tolist();
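    // Build one vector per chunk: the id encodes the source filename, the batch
    // number and the position within the batch, and the raw chunk text is kept
    // as metadata.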
    const vectorBatch = [];
    for (let i = 0; i < chunkBatch.length; i++) {
        const chunk = chunkBatch[i];
        const embedding = embeddingsBatch[i];
        vectorBatch.push({
            id: `${filename}-${chunkBatchIndex}-${i}`,
            values: embedding,
            metadata: { chunk }
        });
    }
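    // Upsert the batch into the target index/namespace and report progress; the
    // client is used through a Pinecone-style Index(...).namespace(...).upsert(...) API.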
    const index = client.Index(indexname).namespace(namespace);
    await index.upsert(vectorBatch);
    state.totalDocumentChunksUpserted += vectorBatch.length;
    onProgress(filename, state.totalDocumentChunks, state.totalDocumentChunksUpserted, false);
}
//# sourceMappingURL=processDocument.js.map