UNPKG

@j-o-r/vdb

Version:

In-memory vector database that uses embeddings to query text documents efficiently

319 lines (292 loc) 10.5 kB
import readline from 'node:readline';
import fs from 'node:fs';
import Cache from '@j-o-r/cache';
import { FlagEmbedding, EmbeddingModel } from 'fastembed';

/**
 * @typedef VResult
 * @property {number} idx - The database line
 * @property {number} similarity - Cosine similarity with the query; the higher the better
 * @property {string} text - The retrieved text line from the file
 */

/**
 * @typedef VResultRaw
 * @property {number} idx - The database line
 * @property {number} similarity - Cosine similarity with the query; the higher the better
 */

/**
 * @typedef VSelector
 * @property {number} [results] - The number of results to search (search() falls back to 5)
 * @property {number} [preRead] - The number of lines to return before the found index (search() falls back to 0)
 * @property {number} [postRead] - The number of lines to return after the found index (search() falls back to 0)
 * @property {number} [treshhold] - Float between (0 - 1). Only pass results equal to or higher than treshhold (search() falls back to 0)
 */

/** db types */
const TYPES = {
  VEC: 'vec',
  SRC: 'src'
};

/** Storage-key suffix of the entry holding the source lines of a database. */
const SRC_SUFFIX = `_${TYPES.SRC}`;

/** Storage-key suffix of the entry holding the embeddings of a database. */
const VEC_SUFFIX = `_${TYPES.VEC}`;

/**
 * Find base names from an array of storage keys.
 *
 * A base name is reported only when both its `<base>_src` and `<base>_vec`
 * keys are present. The result keeps the order in which the `_src` keys
 * were encountered.
 *
 * @param {string[]} arr - Array of storage keys.
 * @returns {string[]} Array of base names.
 */
const findBaseNames = (arr) => {
  const srcNames = new Set();
  const vecNames = new Set();
  for (const item of arr) {
    // Strip the suffix by length: String.replace would remove the FIRST
    // occurrence, corrupting names such as `my_src_notes_src`.
    if (item.endsWith(SRC_SUFFIX)) {
      srcNames.add(item.slice(0, -SRC_SUFFIX.length));
    } else if (item.endsWith(VEC_SUFFIX)) {
      vecNames.add(item.slice(0, -VEC_SUFFIX.length));
    }
  }
  return [...srcNames].filter((base) => vecNames.has(base));
};

/**
 * Initialize the embedding model.
 * @returns {Promise<FlagEmbedding>} The initialized embedding model.
 */
const initializeModel = async () => {
  return await FlagEmbedding.init({
    model: EmbeddingModel.BGEBaseEN
  });
};

/**
 * Read a document from a file path.
 * @param {string} filePath - The file path.
 * @returns {Promise<string[]>} Array of lines from the document.
 */
const readDocument = async (filePath) => {
  const rl = readline.createInterface({
    input: fs.createReadStream(filePath),
    // Treat \r\n as a single line break regardless of chunk timing.
    crlfDelay: Infinity
  });
  const lines = [];
  for await (const line of rl) {
    lines.push(line);
  }
  return lines;
};

/**
 * Generate embeddings for an array of lines.
 *
 * A one-line progress counter (one tick per batch) is drawn only when stdout
 * is an interactive terminal; `clearLine`/`cursorTo` do not exist on piped or
 * redirected streams and calling them there would throw.
 *
 * @param {FlagEmbedding} embeddingModel - The embedding model.
 * @param {string[]} lines - Array of lines.
 * @param {number} [batchSize=256] - Batch size for processing.
 * @returns {Promise<Float32Array[]>} Array of embeddings, one per input line.
 */
const generateEmbeddings = async (embeddingModel, lines, batchSize = 256) => {
  console.log('Generating embeddings, patience');
  const interactive = Boolean(process.stdout.isTTY);
  const allEmbeddings = [];
  let counter = 1;
  for await (const batch of embeddingModel.embed(lines, batchSize)) {
    if (interactive) {
      process.stdout.clearLine(0);
      process.stdout.cursorTo(0);
      process.stdout.write(`Processing: ${counter}`);
    }
    // Plain loop instead of push(...batch): spreading a very large batch
    // can exceed the engine's argument limit.
    for (const embedding of batch) {
      allEmbeddings.push(embedding);
    }
    counter++;
  }
  if (interactive) {
    // Terminate the progress line.
    console.log('');
  }
  // @ts-ignore
  return allEmbeddings;
};

/**
 * Calculate cosine similarity between two vectors.
 * @param {Float32Array} vecA - First vector.
 * @param {Float32Array} vecB - Second vector.
 * @returns {number} Cosine similarity in the range -1..1; 0 when either
 *   vector has zero magnitude (instead of NaN from a division by zero).
 */
const cosineSimilarity = (vecA, vecB) => {
  const dotProduct = vecA.reduce((sum, a, idx) => sum + a * vecB[idx], 0);
  const magnitudeA = Math.sqrt(vecA.reduce((sum, a) => sum + a * a, 0));
  const magnitudeB = Math.sqrt(vecB.reduce((sum, b) => sum + b * b, 0));
  const denominator = magnitudeA * magnitudeB;
  if (denominator === 0) {
    // A NaN here would make the ranking comparator inconsistent.
    return 0;
  }
  return dotProduct / denominator;
};

/**
 * Query embeddings for a given query.
 * @param {FlagEmbedding} embeddingModel - The embedding model.
 * @param {Float32Array[]} allEmbeddings - Array of embeddings.
 * @param {string} query - The query string.
 * @param {number} [results=5] - Number of results to return.
 * @returns {Promise<VResult[]>} The best matches, most similar first. `text` is
 *   left empty; callers fill it in when they need the line content.
 */
const queryEmbeddings = async (embeddingModel, allEmbeddings, query, results = 5) => {
  const queryEmbedding = await embeddingModel.queryEmbed(query);
  const similarities = allEmbeddings.map((embedding, idx) => ({
    idx,
    // @ts-ignore
    similarity: cosineSimilarity(embedding, queryEmbedding),
    text: ''
  }));
  similarities.sort((a, b) => b.similarity - a.similarity);
  return similarities.slice(0, results);
};

/**
 * Create an embeddings database from a text document.
 *
 * Lines for which the filter yields `undefined`, `null` or an empty string
 * are dropped, so database indexes refer to the FILTERED line list, not to
 * line numbers in the original file.
 *
 * @param {Cache} storage - Storage instance.
 * @param {string} file - Path to the text document.
 * @param {string} dbName - Name of the database.
 * @param {function(string, number):string} [filter] - Optional per-line transform,
 *   called with the line and its index in the original file.
 * @param {number} [batchSize=256] - Batch size for processing.
 */
const create = async (storage, file, dbName, filter, batchSize = 256) => {
  const lines = await readDocument(file);
  const filteredLines = lines
    .map((line, index) => (filter ? filter(line, index) : line))
    .filter((line) => line !== undefined && line !== null && line !== '');
  const embeddingModel = await initializeModel();
  const allEmbeddings = await generateEmbeddings(embeddingModel, filteredLines, batchSize);
  storage.write(`${dbName}${VEC_SUFFIX}`, allEmbeddings);
  storage.write(`${dbName}${SRC_SUFFIX}`, filteredLines);
};

/**
 * Get surrounding lines from a document based on the found indexes.
 *
 * Overlapping windows are merged by line INDEX, so a line is never emitted
 * twice, while distinct lines that happen to share the same text are all kept.
 * Output follows the order of `indexes` (window by window).
 *
 * @param {string[]} lines - Document lines.
 * @param {number[]} indexes - Search hit indexes.
 * @param {number} [preRead=4] - Number of lines before the index.
 * @param {number} [postRead=4] - Number of lines after the index.
 * @returns {string[]} Array of surrounding lines.
 */
const getSurroundingLines = (lines, indexes, preRead = 4, postRead = 4) => {
  const picked = new Set();
  for (const index of indexes) {
    const start = Math.max(0, index - preRead);
    const end = Math.min(lines.length, index + postRead + 1);
    for (let i = start; i < end; i++) {
      picked.add(i);
    }
  }
  return Array.from(picked, (i) => lines[i]);
};

/**
 * Search the database and return raw results.
 * @param {Cache} storage - Storage instance.
 * @param {string} dbName - Name of the database.
 * @param {string} query - Search query.
 * @param {number} [results=5] - Number of results to return.
 * @returns {Promise<VResultRaw[]>} Array of results.
 * @throws {Error} When no database named `dbName` exists in storage.
 */
const executeSearch = async (storage, dbName, query, results = 5) => {
  // Validate first: there is no point loading the model for a missing database.
  if (!findBaseNames(storage.list()).includes(dbName)) {
    throw new Error(`Database not found: ${dbName}`);
  }
  const embeddingModel = await initializeModel();
  const allEmbeddings = storage.read(`${dbName}${VEC_SUFFIX}`);
  return queryEmbeddings(embeddingModel, allEmbeddings, query, results);
};

/**
 * Search the database and return results including the line from the document.
 * @param {Cache} storage - Storage instance.
 * @param {string} dbName - Name of the database.
 * @param {string} query - Search query.
 * @param {number} [results=5] - Number of results to return.
 * @returns {Promise<VResult[]>} Array of results.
 * @throws {Error} When no database named `dbName` exists in storage.
 */
const searchRaw = async (storage, dbName, query, results = 5) => {
  // executeSearch verifies the database exists, so read the lines afterwards.
  const list = await executeSearch(storage, dbName, query, results);
  const lines = storage.read(`${dbName}${SRC_SUFFIX}`);
  return list.map((record) => ({ ...record, text: lines[record.idx] }));
};

/**
 * Search the database and return matched lines.
 * @param {Cache} storage - Storage instance.
 * @param {string} dbName - Name of the database.
 * @param {string} query - Search query.
 * @param {VSelector} [select] - Selector options.
 * @returns {Promise<string>} Matched lines (plus requested context) joined by newlines.
 * @throws {Error} When no database named `dbName` exists in storage.
 */
const search = async (storage, dbName, query, select) => {
  // `||` rather than `??` on purpose: falsy option values (0, null, NaN)
  // have always fallen back to the defaults.
  const results = select?.results || 5;
  const preRead = select?.preRead || 0;
  const postRead = select?.postRead || 0;
  const treshhold = select?.treshhold || 0;

  const list = await executeSearch(storage, dbName, query, results);
  const lines = storage.read(`${dbName}${SRC_SUFFIX}`);
  const indexes = list
    // Inclusive comparison, as documented on VSelector.treshhold.
    .filter((hit) => hit.similarity >= treshhold)
    .map((hit) => hit.idx);
  return getSurroundingLines(lines, indexes, preRead, postRead).join('\n');
};

/**
 * Simple vector database class.
 */
class Vdb {
  #storage;

  /**
   * Constructor for Vdb.
   * @param {string} storagePath - Path to storage folder.
   */
  constructor(storagePath) {
    this.#storage = new Cache(storagePath);
  }

  /**
   * Get a list of available databases.
   * @returns {string[]} List of database names.
   */
  list() {
    return findBaseNames(this.#storage.list());
  }

  /**
   * Delete a database. Logs the outcome; an unknown name is reported, not thrown.
   * @param {string} dbName - Name of the database to delete.
   */
  delete(dbName) {
    if (!this.list().includes(dbName)) {
      console.log(`Database '${dbName}' not found.`);
      return;
    }
    this.#storage.delete(`${dbName}${SRC_SUFFIX}`);
    this.#storage.delete(`${dbName}${VEC_SUFFIX}`);
    console.log(`Database '${dbName}' deleted.`);
  }

  /**
   * Create or overwrite an embeddings database from a text document.
   * @param {string} file - Path to the text document.
   * @param {string} dbName - Name of the database.
   * @param {function(string, number):string} [filter] - Optional per-line transform;
   *   lines mapped to `undefined`, `null` or `''` are skipped.
   * @param {number} [batchSize=256] - Batch size for processing.
   */
  async create(file, dbName, filter, batchSize = 256) {
    return create(this.#storage, file, dbName, filter, batchSize);
  }

  /**
   * Search the database.
   * @param {string} dbName - Name of the database.
   * @param {string} query - Search query.
   * @param {VSelector} [selector] - Selector options.
   * @returns {Promise<string>} Formatted search results.
   */
  async search(dbName, query, selector) {
    return search(this.#storage, dbName, query, selector);
  }

  /**
   * Get raw search results from the database.
   * @param {string} dbName - Name of the database.
   * @param {string} query - Search query.
   * @param {number} [results=5] - Number of results to return.
   * @returns {Promise<VResult[]>} Array of results.
   */
  async getResult(dbName, query, results) {
    return searchRaw(this.#storage, dbName, query, results);
  }
}

export { Vdb as default };