UNPKG

db-vector

Version:

Client adapters for vector databases with utilities

54 lines 2.45 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.processDocument = processDocument;
const text_splitter_1 = require("langchain/text_splitter");
const createExtractor_1 = require("./createExtractor");

// Defaults merged under caller-supplied config; onProgress is a no-op unless overridden.
const defaultConfig = {
    batchSize: 10,
    modelName: 'mixedbread-ai/mxbai-embed-large-v1',
    onProgress: () => { },
};

/**
 * Splits a document into text chunks, embeds each chunk, and upserts the
 * resulting vectors into the given index/namespace in batches.
 *
 * @param {object} client - Vector DB client exposing `Index(name).namespace(ns)` (Pinecone-style — TODO confirm against createExtractor's peer docs).
 * @param {string} indexname - Target index name.
 * @param {string} namespace - Target namespace within the index.
 * @param {{pageContent: string, metadata: {source: string}}} doc - Document to ingest; `metadata.source` is treated as a file path.
 * @param {{batchSize?: number, modelName?: string, onProgress?: Function}} [config] - Partial overrides for `defaultConfig`.
 * @returns {Promise<{totalDocumentChunks: number, totalDocumentChunksUpserted: number}>} Final upsert counts.
 */
async function processDocument(client, indexname, namespace, doc, config = {}) {
    const finalConfig = { ...defaultConfig, ...config };
    const extractor = await (0, createExtractor_1.createExtractor)(finalConfig.modelName);
    const splitter = new text_splitter_1.RecursiveCharacterTextSplitter();
    const documentChunks = await splitter.splitText(doc.pageContent);
    const state = {
        totalDocumentChunks: documentChunks.length,
        totalDocumentChunksUpserted: 0,
    };
    const filename = getFilename(doc.metadata.source);
    let chunkBatchIndex = 0;
    // splice() drains documentChunks batch-by-batch until the array is empty.
    while (documentChunks.length > 0) {
        chunkBatchIndex++;
        const chunkBatch = documentChunks.splice(0, finalConfig.batchSize);
        await processOneBatch(client, indexname, namespace, extractor, chunkBatch, chunkBatchIndex, filename, state, finalConfig.onProgress);
    }
    // Final progress report with the "complete" flag set.
    finalConfig.onProgress(filename, state.totalDocumentChunks, state.totalDocumentChunksUpserted, true);
    return state;
}

/**
 * Extracts the base name (no directory, no extension) from a path.
 * Falls back to the full basename when there is no "." in it.
 * @param {string} filename - Source path of the document.
 * @returns {string} Base name used as the vector-id prefix.
 */
function getFilename(filename) {
    const docname = filename.substring(filename.lastIndexOf("/") + 1);
    return docname.substring(0, docname.lastIndexOf(".")) || docname;
}

/**
 * Embeds one batch of chunks and upserts the vectors.
 * Mutates `state.totalDocumentChunksUpserted` and reports progress via `onProgress`.
 */
async function processOneBatch(client, indexname, namespace, extractor, chunkBatch, chunkBatchIndex, filename, state, onProgress) {
    // Flatten newlines so each chunk is embedded as a single line of text.
    const output = await extractor(chunkBatch.map(str => str.replace(/\n/g, ' ')), { pooling: 'cls' });
    const embeddingsBatch = output.tolist();
    const vectorBatch = [];
    for (let i = 0; i < chunkBatch.length; i++) {
        const chunk = chunkBatch[i];
        const embedding = embeddingsBatch[i];
        // BUG FIX: the id was previously the literal string "$(unknown)-..." instead of
        // interpolating `filename`, so chunks from different documents shared ids and
        // each new document's upsert overwrote the previous one's vectors.
        vectorBatch.push({
            id: `${filename}-${chunkBatchIndex}-${i}`,
            values: embedding,
            metadata: { chunk },
        });
    }
    const index = client.Index(indexname).namespace(namespace);
    await index.upsert(vectorBatch);
    state.totalDocumentChunksUpserted += vectorBatch.length;
    onProgress(filename, state.totalDocumentChunks, state.totalDocumentChunksUpserted, false);
}
//# sourceMappingURL=processDocument.js.map