@j-o-r/vdb
Version:
In-memory vector database that uses text embeddings to query text documents efficiently
319 lines (292 loc) • 10.5 kB
JavaScript
import readline from 'node:readline';
import fs from 'node:fs';
import Cache from '@j-o-r/cache';
import { FlagEmbedding, EmbeddingModel } from 'fastembed';
/**
* @typedef VResult
* @property {number} idx - The database line index
* @property {number} similarity - The higher the better (typically a value between 0 and 1)
* @property {string} text - The retrieved text line from the file
*/
/**
* @typedef VResultRaw
* @property {number} idx - The database line index
* @property {number} similarity - The higher the better (typically a value between 0 and 1)
*/
/**
* @typedef VSelector
* @property {number} results - The number of results to search for
* @property {number} preRead - The number of lines to return before the found index
* @property {number} postRead - The number of lines to return after the found index
* @property {number} treshhold - Float between 0 and 1. Only pass results equal to or higher than the threshold
*/
/**
 * Storage key suffixes for the two entries that make up one database:
 * `VEC` holds the embeddings, `SRC` holds the source text lines.
 * Frozen so the shared constant cannot be modified at runtime.
 */
const TYPES = Object.freeze({
  VEC: 'vec',
  SRC: 'src',
});
/**
 * Find the database base names present in a list of storage keys.
 *
 * A base name is reported only when both its `<base>_src` and `<base>_vec`
 * keys are in the list. Keys with neither suffix are ignored.
 *
 * @param {string[]} arr - Storage keys.
 * @returns {string[]} Base names, in the order their `_src` key first appears.
 */
const findBaseNames = (arr) => {
  const srcSuffix = `_${TYPES.SRC}`;
  const vecSuffix = `_${TYPES.VEC}`;
  const srcBases = new Set();
  const vecBases = new Set();
  for (const item of arr) {
    // Strip the suffix by length: String.prototype.replace would remove the
    // FIRST occurrence, corrupting names that contain the suffix text earlier.
    if (item.endsWith(srcSuffix)) {
      srcBases.add(item.slice(0, -srcSuffix.length));
    } else if (item.endsWith(vecSuffix)) {
      vecBases.add(item.slice(0, -vecSuffix.length));
    }
  }
  return [...srcBases].filter((base) => vecBases.has(base));
};
/** In-flight or resolved model promise, shared by all callers in this module. */
let cachedModelPromise = null;
/**
 * Initialize the embedding model (BGEBaseEN) once and reuse it.
 *
 * The first call starts `FlagEmbedding.init`; later calls return the same
 * promise instead of initializing the model again. If initialization fails,
 * the cache is cleared so the next call can retry.
 *
 * @returns {Promise<FlagEmbedding>} The initialized embedding model.
 */
const initializeModel = () => {
  if (cachedModelPromise === null) {
    // The async wrapper turns a synchronous throw from init() into a rejection.
    const loading = (async () => FlagEmbedding.init({ model: EmbeddingModel.BGEBaseEN }))();
    cachedModelPromise = loading;
    loading.catch(() => {
      // Do not keep a rejected promise around; allow a retry.
      if (cachedModelPromise === loading) {
        cachedModelPromise = null;
      }
    });
  }
  return cachedModelPromise;
};
/**
 * Load a text file and return its content line by line.
 *
 * The file is streamed through a readline interface; `crlfDelay: Infinity`
 * makes a `\r\n` pair count as one line break.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {Promise<string[]>} The lines of the file, in order.
 */
const readDocument = async (filePath) => {
  const reader = readline.createInterface({
    input: fs.createReadStream(filePath),
    crlfDelay: Infinity,
  });
  const collected = [];
  for await (const textLine of reader) {
    collected.push(textLine);
  }
  return collected;
};
/**
 * Generate embeddings for an array of lines.
 *
 * Batches yielded by `embeddingModel.embed(lines, batchSize)` are flattened
 * into one array. A single-line progress counter is shown only when stdout is
 * a TTY, because `clearLine`/`cursorTo` do not exist on piped or redirected
 * streams and calling them there throws.
 *
 * @param {FlagEmbedding} embeddingModel - The embedding model.
 * @param {string[]} lines - Array of lines.
 * @param {number} [batchSize=256] - Batch size for processing.
 * @returns {Promise<Float32Array[]>} Array of embeddings, one per input line.
 */
const generateEmbeddings = async (embeddingModel, lines, batchSize = 256) => {
  console.log('Generating embeddings, patience');
  const showProgress = Boolean(process.stdout.isTTY);
  const allEmbeddings = [];
  let counter = 1;
  for await (const batch of embeddingModel.embed(lines, batchSize)) {
    if (showProgress) {
      // Rewrite the same terminal line with the current batch number.
      process.stdout.clearLine(0);
      process.stdout.cursorTo(0);
      process.stdout.write(`Processing: ${counter}`);
    }
    allEmbeddings.push(...batch);
    counter++;
  }
  if (showProgress) {
    // Terminate the progress line.
    console.log('');
  }
  return allEmbeddings;
};
/**
 * Calculate the cosine similarity between two vectors.
 *
 * Both vectors are expected to have the same length. When either vector has
 * zero magnitude the similarity is undefined; 0 is returned instead of NaN so
 * callers can sort and compare the result safely.
 *
 * @param {Float32Array} vecA - First vector.
 * @param {Float32Array} vecB - Second vector.
 * @returns {number} Cosine similarity in the range -1 to 1 (higher is more similar).
 */
const cosineSimilarity = (vecA, vecB) => {
  let dotProduct = 0;
  let squaredA = 0;
  for (let i = 0; i < vecA.length; i++) {
    dotProduct += vecA[i] * vecB[i];
    squaredA += vecA[i] * vecA[i];
  }
  let squaredB = 0;
  for (let i = 0; i < vecB.length; i++) {
    squaredB += vecB[i] * vecB[i];
  }
  const denominator = Math.sqrt(squaredA) * Math.sqrt(squaredB);
  if (denominator === 0) {
    return 0;
  }
  return dotProduct / denominator;
};
/**
 * Rank stored embeddings against a query string.
 *
 * The query is embedded with `embeddingModel.queryEmbed`, every stored
 * embedding is scored with `cosineSimilarity`, and the best-scoring entries
 * are returned. The `text` field of each entry is left as an empty string.
 *
 * @param {FlagEmbedding} embeddingModel - The embedding model.
 * @param {Float32Array[]} allEmbeddings - Array of embeddings.
 * @param {string} query - The query string.
 * @param {number} [results=5] - Number of results to return.
 * @returns {Promise<VResult[]>} Entries sorted by descending similarity.
 */
const queryEmbeddings = async (embeddingModel, allEmbeddings, query, results = 5) => {
  const queryVector = await embeddingModel.queryEmbed(query);
  const toScoredEntry = (embedding, idx) => {
    // @ts-ignore
    const similarity = cosineSimilarity(embedding, queryVector);
    return { idx, similarity, text: '' };
  };
  const byDescendingSimilarity = (left, right) => right.similarity - left.similarity;
  const ranked = allEmbeddings.map(toScoredEntry);
  ranked.sort(byDescendingSimilarity);
  return ranked.slice(0, results);
};
/**
 * Build an embeddings database from a text document and store it.
 *
 * Each line of the file is optionally passed through `filter` together with
 * its index. Lines that end up `undefined`, `null` or `''` are dropped. The
 * remaining lines are embedded, and both the embeddings (`<dbName>_vec`) and
 * the lines (`<dbName>_src`) are written to storage.
 *
 * @param {Cache} storage - Storage instance.
 * @param {string} file - Path to the text document.
 * @param {string} dbName - Name of the database.
 * @param {function(string, number):string} [filter] - Optional per-line transform.
 * @param {number} [batchSize=256] - Batch size for processing.
 */
const create = async (storage, file, dbName, filter, batchSize = 256) => {
  const rawLines = await readDocument(file);
  const transform = filter ? filter : (line) => line;
  const keptLines = [];
  for (const [index, rawLine] of rawLines.entries()) {
    const candidate = transform(rawLine, index);
    const isUsable = candidate !== undefined && candidate !== null && candidate !== '';
    if (isUsable) {
      keptLines.push(candidate);
    }
  }
  const model = await initializeModel();
  const vectors = await generateEmbeddings(model, keptLines, batchSize);
  storage.write(`${dbName}_${TYPES.VEC}`, vectors);
  storage.write(`${dbName}_${TYPES.SRC}`, keptLines);
};
/**
 * Collect the lines around each search hit.
 *
 * For every hit index, the window `[index - preRead, index + postRead]`
 * (clamped to the document bounds) is selected. Overlapping windows are
 * de-duplicated by line INDEX, so distinct lines that happen to share the
 * same text are all kept. Output follows hit order, then document order
 * within each window.
 *
 * @param {string[]} lines - Document lines.
 * @param {number[]} indexes - Search hit indexes.
 * @param {number} [preRead=4] - Number of lines before the index.
 * @param {number} [postRead=4] - Number of lines after the index.
 * @returns {string[]} Array of surrounding lines.
 */
const getSurroundingLines = (lines, indexes, preRead = 4, postRead = 4) => {
  const selectedIndexes = new Set();
  for (const index of indexes) {
    const start = Math.max(0, index - preRead);
    const end = Math.min(lines.length, index + postRead + 1);
    for (let i = start; i < end; i++) {
      selectedIndexes.add(i);
    }
  }
  return Array.from(selectedIndexes, (i) => lines[i]);
};
/**
 * Search the database and return ranked results without source text.
 *
 * The database is validated BEFORE the embedding model is initialized, so a
 * missing database fails fast without paying the model start-up cost.
 *
 * @param {Cache} storage - Storage instance.
 * @param {string} dbName - Name of the database.
 * @param {string} query - Search query.
 * @param {number} [results=5] - Number of results to return.
 * @returns {Promise<VResultRaw[]>} Array of results, as produced by `queryEmbeddings`.
 * @throws {Error} When `dbName` is not among the databases found in storage.
 */
const executeSearch = async (storage, dbName, query, results = 5) => {
  if (!findBaseNames(storage.list()).includes(dbName)) {
    throw new Error(`Database not found: ${dbName}`);
  }
  const allEmbeddings = storage.read(`${dbName}_${TYPES.VEC}`);
  const embeddingModel = await initializeModel();
  return queryEmbeddings(embeddingModel, allEmbeddings, query, results);
};
/**
 * Search the database and return results including the matching source line.
 *
 * `executeSearch` runs first so that a missing database is reported as
 * "Database not found" before any source lines are read from storage
 * (the same order `search` uses).
 *
 * @param {Cache} storage - Storage instance.
 * @param {string} dbName - Name of the database.
 * @param {string} query - Search query.
 * @param {number} [results=5] - Number of results to return.
 * @returns {Promise<VResult[]>} Array of results with `text` filled in.
 */
const searchRaw = async (storage, dbName, query, results = 5) => {
  const list = await executeSearch(storage, dbName, query, results);
  const lines = storage.read(`${dbName}_${TYPES.SRC}`);
  return list.map((record) => ({ ...record, text: lines[record.idx] }));
};
/**
 * Search the database and return the matched lines as one string.
 *
 * Selector fields that are missing or falsy fall back to their defaults:
 * `results` 5, `preRead` 0, `postRead` 0, `treshhold` 0. Hits whose similarity
 * is equal to or higher than `treshhold` are kept, matching the documented
 * `VSelector` contract. The surrounding lines of the kept hits are joined
 * with newlines.
 *
 * @param {Cache} storage - Storage instance.
 * @param {string} dbName - Name of the database.
 * @param {string} query - Search query.
 * @param {VSelector} [select] - Selector options.
 * @returns {Promise<string>} Formatted search results.
 */
const search = async (storage, dbName, query, select) => {
  const results = select?.results || 5;
  const preRead = select?.preRead || 0;
  const postRead = select?.postRead || 0;
  const treshhold = select?.treshhold || 0;
  const list = await executeSearch(storage, dbName, query, results);
  const lines = storage.read(`${dbName}_${TYPES.SRC}`);
  const indexes = list
    .filter((hit) => hit.similarity >= treshhold) // inclusive, as documented
    .map((hit) => hit.idx);
  return getSurroundingLines(lines, indexes, preRead, postRead).join('\n');
};
/**
 * Simple vector database backed by a `Cache` storage folder.
 *
 * Each database consists of two storage entries, `<name>_src` and
 * `<name>_vec`; the methods delegate to the module-level helpers.
 */
class Vdb {
  /** Storage backend holding all database entries. */
  #store;

  /**
   * Open (or create) the storage used by this instance.
   * @param {string} storagePath - Path to storage folder.
   */
  constructor(storagePath) {
    this.#store = new Cache(storagePath);
  }

  /**
   * List the databases available in storage.
   * @returns {string[]} List of database names.
   */
  list() {
    const storedKeys = this.#store.list();
    return findBaseNames(storedKeys);
  }

  /**
   * Remove a database (both its source and vector entries).
   * Logs the outcome; an unknown name only logs a "not found" message.
   * @param {string} dbName - Name of the database to delete.
   */
  delete(dbName) {
    const isKnown = this.list().includes(dbName);
    if (isKnown) {
      for (const type of [TYPES.SRC, TYPES.VEC]) {
        this.#store.delete(`${dbName}_${type}`);
      }
      console.log(`Database '${dbName}' deleted.`);
    } else {
      console.log(`Database '${dbName}' not found.`);
    }
  }

  /**
   * Create or overwrite an embeddings database from a text document.
   * @param {string} file - Path to the text document.
   * @param {string} dbName - Name of the database.
   * @param {function(string, number):string} filter - Optional per-line transform (line, index).
   * @param {number} [batchSize=256] - Batch size for processing.
   */
  async create(file, dbName, filter, batchSize = 256) {
    const store = this.#store;
    return create(store, file, dbName, filter, batchSize);
  }

  /**
   * Search a database and get the matched lines as one string.
   * @param {string} dbName - Name of the database.
   * @param {string} query - Search query.
   * @param {VSelector} [selector] - Selector options.
   * @returns {Promise<string>} Formatted search results.
   */
  async search(dbName, query, selector) {
    const store = this.#store;
    return search(store, dbName, query, selector);
  }

  /**
   * Search a database and get the individual result records.
   * @param {string} dbName - Name of the database.
   * @param {string} query - Search query.
   * @param {number} [results=5] - Number of results to return.
   * @returns {Promise<VResult[]>} Array of results.
   */
  async getResult(dbName, query, results) {
    const store = this.#store;
    return searchRaw(store, dbName, query, results);
  }
}
export { Vdb as default };