@n2flowjs/nbase
Neural Vector Database for efficient similarity search
"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); const fs_1 = require("fs"); const config_1 = __importDefault(require("../config")); const log_1 = require("../utils/log"); /** * Locality-Sensitive Hashing (LSH) implementation * Used for approximate nearest neighbor search by hashing similar vectors * to the same buckets with high probability. * Supports vectors of different dimensions. */ /** * Locality-Sensitive Hashing (LSH) implementation for approximate nearest neighbor search. * * LSH accelerates vector similarity search by hashing similar vectors into the same buckets. * This implementation uses random hyperplanes to partition the vector space, supporting: * - Multi-dimensional vectors (vectors of different sizes) * - Multi-probing to increase recall * - Automatic index building * - Serialization for persistence * * @example * ```typescript * // Create a new LSH index * const lsh = new LSH(vectorDB, { * dimensions: 1536, * numberOfHashes: 8, * numberOfBuckets: 150 * }); * * // Build the index * await lsh.buildIndex({ * progressCallback: (progress) => console.log(`Indexing: ${progress * 100}%`) * }); * * // Query for nearest neighbors * const results = lsh.findNearest(queryVector, 10); * ``` * * @remarks * The implementation uses random hyperplane hashing, where vectors are assigned to buckets * based on which side of random hyperplanes they fall. Vectors that are close to each other * in the original space have a higher probability of being assigned to the same bucket. * * For improved recall, consider using multi-probing which checks neighboring buckets. * For higher precision, increase the number of hash functions (numberOfHashes). * For better performance but potentially lower recall, increase numberOfBuckets. 
    /**
     * Generate random hyperplanes for LSH
     * @param dimension - The vector dimension to generate hash functions for
     */
    _generateHashFunctions(dimension) {
        // Skip if hash functions already exist for this dimension
        if (this.hashFunctions.has(dimension)) {
            return;
        }
        const hyperplanes = [];
        for (let i = 0; i < this.numberOfHashes; i++) {
            const hashFunctions = [];
            for (let j = 0; j < this.numberOfBuckets; j++) {
                // Generate a random hyperplane (normal vector)
                const hyperplane = new Float32Array(dimension);
                for (let d = 0; d < dimension; d++) {
                    // Use normal distribution for better results
                    hyperplane[d] = this._randomNormal();
                }
                hashFunctions.push(hyperplane);
            }
            hyperplanes.push(hashFunctions);
        }
        this.hashFunctions.set(dimension, hyperplanes);
        // Initialize buckets for this dimension
        const dimensionBuckets = Array(this.numberOfHashes)
            .fill(null)
            .map(() => new Map());
        this.buckets.set(dimension, dimensionBuckets);
    }
    /**
     * Standard normal distribution using Box-Muller transform
     */
    _randomNormal() {
        let u = 0, v = 0;
        while (u === 0) u = Math.random();
        while (v === 0) v = Math.random();
        return Math.sqrt(-2.0 * Math.log(u)) * Math.cos(2.0 * Math.PI * v);
    }
    /**
     * Compute hash for a vector
     * @param vector - Vector to hash
     * @returns Array of hash values
     */
    _hashVector(vector) {
        const dimension = vector.length;
        // Generate hash functions for this dimension if they don't exist
        if (!this.hashFunctions.has(dimension)) {
            this._generateHashFunctions(dimension);
        }
        const hashFunctions = this.hashFunctions.get(dimension);
        const hashes = [];
        for (let i = 0; i < this.numberOfHashes; i++) {
            const hyperplanes = hashFunctions[i];
            let hash = 0;
            // Compute hash by checking which side of each hyperplane the vector falls on
            for (let j = 0; j < hyperplanes.length && j < 31; j++) {
                const hyperplane = hyperplanes[j];
                // Compute dot product
                let dotProduct = 0;
                for (let d = 0; d < dimension; d++) {
                    dotProduct += vector[d] * hyperplane[d];
                }
                // Set the corresponding bit based on sign of dot product
                if (dotProduct >= 0) {
                    hash |= 1 << j;
                }
            }
            hashes.push(hash % this.numberOfBuckets);
        }
        return { dimension, hashes };
    }
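    /*
     * Worked sketch of the sign-bit hash above (illustrative values only): with a
     * single 2-d hyperplane h = [0.5, -1.0] and vector v = [1, 0.2], the dot product
     * is 1 * 0.5 + 0.2 * (-1.0) = 0.3 >= 0, so bit 0 is set. Up to 31 hyperplanes per
     * table contribute one bit each; the result modulo numberOfBuckets picks the bucket:
     *
     * ```typescript
     * // Standalone re-implementation of one table's hash, for illustration only
     * function hashOnce(vector: number[], planes: Float32Array[], numberOfBuckets: number): number {
     *     let hash = 0;
     *     for (let j = 0; j < planes.length && j < 31; j++) {
     *         let dot = 0;
     *         for (let d = 0; d < vector.length; d++) dot += vector[d] * planes[j][d];
     *         if (dot >= 0) hash |= 1 << j; // vector lies on the positive side of plane j
     *     }
     *     return hash % numberOfBuckets;
     * }
     * ```
     */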
    /**
     * Index a vector
     * @param id - Vector identifier
     * @param vector - Vector to index
     * @returns Vector ID
     */
    indexVector(id, vector) {
        const { dimension, hashes } = this._hashVector(vector);
        // Store dimension information
        this.vectorDimensions.set(id, dimension);
        // Add to dimension group
        if (!this.dimensionGroups.has(dimension)) {
            this.dimensionGroups.set(dimension, new Set());
        }
        this.dimensionGroups.get(dimension).add(id);
        // Get buckets for this dimension
        const dimensionBuckets = this.buckets.get(dimension);
        if (!dimensionBuckets) {
            console.warn(`No buckets for dimension ${dimension}`);
            return id;
        }
        // Add to each hash table
        for (let i = 0; i < hashes.length; i++) {
            const hash = hashes[i];
            const bucket = dimensionBuckets[i];
            if (!bucket.has(hash)) {
                bucket.set(hash, []);
            }
            const ids = bucket.get(hash);
            ids.push(id);
        }
        return id;
    }
    /**
     * Build index for all vectors in database
     * @param options - Build options
     */
    async buildIndex(options = {}) {
        const progressCallback = options.progressCallback || (() => { });
        const useDimensionGroups = options.dimensionGroups !== false;
        // Reset all data structures
        this.hashFunctions.clear();
        this.buckets.clear();
        this.vectorDimensions.clear();
        this.dimensionGroups.clear();
        // Get all vectors
        const ids = Array.from(this.db.memoryStorage.keys());
        const totalVectors = ids.length;
        if (totalVectors === 0) {
            console.log("No vectors to index");
            return;
        }
        // Phase 1: Collect dimensions
        progressCallback(0);
        let processedCount = 0;
        for (let i = 0; i < ids.length; i++) {
            const id = ids[i];
            let dimension;
            // Try to get dimension from db.getVectorDimension if available (faster)
            if (this.db.getVectorDimension) {
                dimension = this.db.getVectorDimension(id) || 0;
            }
            else {
                // Fallback to getting dimension from vector
                const vector = this.db.getVector(id);
                dimension = vector ? vector.length : 0;
            }
            if (dimension > 0) {
                // Store dimension info
                this.vectorDimensions.set(id, dimension);
                // Group by dimension
                if (!this.dimensionGroups.has(dimension)) {
                    this.dimensionGroups.set(dimension, new Set());
                }
                this.dimensionGroups.get(dimension).add(id);
            }
            // Report progress for phase 1 (0-10%)
            if (i % 1000 === 0) {
                progressCallback((i / totalVectors) * 0.1);
            }
        }
        // Initialize hash functions for each dimension
        for (const dimension of this.dimensionGroups.keys()) {
            this._generateHashFunctions(dimension);
        }
        // Phase 2: Index vectors
        processedCount = 0;
        // If using dimension groups, process each group separately
        if (useDimensionGroups) {
            for (const [dimension, idSet] of this.dimensionGroups.entries()) {
                const idsInDimension = Array.from(idSet);
                (0, log_1.log)('info', `dimension: ${dimension} in ids`);
                for (let i = 0; i < idsInDimension.length; i++) {
                    const id = idsInDimension[i];
                    const vector = this.db.getVector(id);
                    if (vector) {
                        this.indexVector(id, vector);
                    }
                    processedCount++;
                    // Report progress for phase 2 (10-100%)
                    if (processedCount % 100 === 0) {
                        progressCallback(0.1 + (processedCount / totalVectors) * 0.9);
                    }
                }
            }
        }
        else {
            // Process all vectors regardless of dimension
            for (let i = 0; i < ids.length; i++) {
                const id = ids[i];
                const vector = this.db.getVector(id);
                if (vector) {
                    this.indexVector(id, vector);
                }
                // Report progress for phase 2 (10-100%)
                if (i % 100 === 0) {
                    progressCallback(0.1 + (i / totalVectors) * 0.9);
                }
            }
        }
        this.initialized = true;
        progressCallback(1.0);
    }
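    /*
     * Usage sketch for the two-phase build above: the 0-10% progress band covers
     * the dimension scan, the 10-100% band the actual bucketing. Both options shown
     * exist in buildIndex; vectorDB is assumed to already hold vectors:
     *
     * ```typescript
     * await lsh.buildIndex({
     *     dimensionGroups: true, // index each vector dimension separately (the default)
     *     progressCallback: (p) => process.stdout.write(`\rindexed ${(p * 100).toFixed(1)}%`),
     * });
     * ```
     */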
    /**
     * Query for approximate nearest neighbors
     * @param vector - Query vector
     * @param multiProbe - Number of neighboring buckets to check (0 for exact bucket only)
     * @param options - Query options
     * @returns Array of candidate IDs
     */
    query(vector, multiProbe = 0, options = {}) {
        const { dimension, hashes } = this._hashVector(vector);
        const exactDimensions = options.exactDimensions || false;
        const candidateIds = new Set();
        // If exact dimensions is true, only query the matching dimension
        if (exactDimensions) {
            // If no buckets for this dimension, return empty results
            if (!this.buckets.has(dimension)) {
                return [];
            }
            const dimensionBuckets = this.buckets.get(dimension);
            for (let i = 0; i < hashes.length; i++) {
                const hash = hashes[i];
                const bucket = dimensionBuckets[i];
                // Get exact bucket
                if (bucket.has(hash)) {
                    for (const id of bucket.get(hash)) {
                        candidateIds.add(id);
                    }
                }
                // Multi-probe LSH: check neighboring buckets
                if (multiProbe > 0) {
                    for (let j = 1; j <= multiProbe; j++) {
                        const probeBucket1 = (hash + j) % this.numberOfBuckets;
                        const probeBucket2 = (hash - j + this.numberOfBuckets) % this.numberOfBuckets;
                        if (bucket.has(probeBucket1)) {
                            for (const id of bucket.get(probeBucket1)) {
                                candidateIds.add(id);
                            }
                        }
                        if (bucket.has(probeBucket2)) {
                            for (const id of bucket.get(probeBucket2)) {
                                candidateIds.add(id);
                            }
                        }
                    }
                }
            }
        }
        else {
            // Query all dimensions or matching dimension based on allowMismatchedDimensions
            const dimensionsToQuery = this.allowMismatchedDimensions
                ? Array.from(this.buckets.keys())
                : [dimension];
            for (const dim of dimensionsToQuery) {
                // If no buckets for this dimension, skip
                if (!this.buckets.has(dim)) continue;
                const dimensionBuckets = this.buckets.get(dim);
                // Use the hash of the query vector for the default dimension
                // This is a simplification - ideally we'd recompute hashes for each dimension
                for (let i = 0; i < Math.min(hashes.length, dimensionBuckets.length); i++) {
                    // Adapt hash to the current dimension's bucket count
                    const hash = hashes[i] % this.numberOfBuckets;
                    const bucket = dimensionBuckets[i];
                    // Get exact bucket
                    if (bucket.has(hash)) {
                        for (const id of bucket.get(hash)) {
                            candidateIds.add(id);
                        }
                    }
                    // Multi-probe LSH: check neighboring buckets
                    if (multiProbe > 0) {
                        for (let j = 1; j <= multiProbe; j++) {
                            const probeBucket1 = (hash + j) % this.numberOfBuckets;
                            const probeBucket2 = (hash - j + this.numberOfBuckets) % this.numberOfBuckets;
                            if (bucket.has(probeBucket1)) {
                                for (const id of bucket.get(probeBucket1)) {
                                    candidateIds.add(id);
                                }
                            }
                            if (bucket.has(probeBucket2)) {
                                for (const id of bucket.get(probeBucket2)) {
                                    candidateIds.add(id);
                                }
                            }
                        }
                    }
                }
            }
        }
        return Array.from(candidateIds);
    }
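    /*
     * Multi-probe sketch: with multiProbe = 2 and a query hashing to bucket 7, each
     * hash table is probed at buckets 7, 8, 6, 9, and 5 (wrapping modulo
     * numberOfBuckets), trading extra candidates for higher recall:
     *
     * ```typescript
     * const candidates = lsh.query(queryVector, 2, { exactDimensions: true });
     * console.log(`LSH returned ${candidates.length} candidate IDs`);
     * ```
     */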
    /**
     * Find approximate nearest neighbors by first filtering with LSH
     * then refining with exact distance
     * @param query - Query vector
     * @param k - Number of nearest neighbors to return
     * @param options - Search options
     * @returns Array of search results
     */
    findNearest(query, k = config_1.default.defaults.k, options = {}) {
        const typedQuery = query instanceof Float32Array ? query : new Float32Array(query);
        const filter = options.filter || (() => true);
        const exactDimensions = options.exactDimensions || false;
        // Fall back to linear search if not initialized
        if (!this.initialized) {
            return this._linearSearch(typedQuery, k, options);
        }
        // Get candidate IDs using LSH
        const candidateIds = this.query(typedQuery, 2, { exactDimensions });
        if (candidateIds.length === 0) {
            return [];
        }
        // Compute exact distances for candidates
        const distances = [];
        for (const id of candidateIds) {
            // Apply filter
            if (!filter(id)) continue;
            const vector = this.db.getVector(id);
            if (!vector) continue;
            // Skip vectors with different dimensions in exactDimensions mode
            if (exactDimensions && vector.length !== typedQuery.length) continue;
            const dist = this._distance(typedQuery, vector);
            distances.push({ id, dist });
        }
        // Sort by distance and return top k
        return distances.sort((a, b) => a.dist - b.dist).slice(0, k);
    }
    /**
     * Compute Euclidean distance between vectors
     * @private
     */
    _distance(a, b) {
        const len = Math.min(a.length, b.length);
        let sum = 0;
        // Process 4 elements at a time for better performance
        for (let i = 0; i < len - 3; i += 4) {
            const d1 = a[i] - b[i];
            const d2 = a[i + 1] - b[i + 1];
            const d3 = a[i + 2] - b[i + 2];
            const d4 = a[i + 3] - b[i + 3];
            sum += d1 * d1 + d2 * d2 + d3 * d3 + d4 * d4;
        }
        // Handle remaining elements
        for (let i = len - (len % 4); i < len; i++) {
            const diff = a[i] - b[i];
            sum += diff * diff;
        }
        // Add small penalty for dimension mismatch
        const dimDiff = Math.abs(a.length - b.length);
        if (dimDiff > 0) {
            sum += dimDiff * 0.01;
        }
        return Math.sqrt(sum);
    }
    /**
     * Perform linear search when no index is available
     * @private
     */
    _linearSearch(query, k, options) {
        const results = [];
        const filter = options.filter || (() => true);
        const exactDimensions = options.exactDimensions || false;
        const queryDim = query.length;
        for (const [id, vector] of this.db.memoryStorage.entries()) {
            // Skip if filter excludes this ID
            if (!filter(id)) continue;
            // Skip vectors with different dimensions if exactDimensions is true
            if (exactDimensions && vector.length !== queryDim) continue;
            const dist = this._distance(query, vector);
            results.push({ id, dist });
        }
        // Sort by distance and limit to k results
        return results.sort((a, b) => a.dist - b.dist).slice(0, k);
    }
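    /*
     * End-to-end usage sketch for findNearest (allowedIds is a hypothetical
     * Set of permitted IDs; note the filter runs on candidate IDs before any
     * distance is computed):
     *
     * ```typescript
     * const top5 = lsh.findNearest(queryVector, 5, {
     *     filter: (id) => allowedIds.has(id), // assumed Set<string>
     *     exactDimensions: true,              // ignore vectors of other sizes
     * });
     * for (const { id, dist } of top5) console.log(id, dist.toFixed(4));
     * ```
     */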
    /**
     * Get index statistics
     * @returns Statistics about the LSH index
     */
    getStats() {
        const stats = {
            numberOfHashes: this.numberOfHashes,
            numberOfBuckets: this.numberOfBuckets,
            defaultDimensions: this.defaultDimensions,
            totalItems: 0,
            bucketsUsed: 0,
            avgBucketSize: 0,
            maxBucketSize: 0,
            vectorsPerDimension: {},
            bucketsPerDimension: {},
            initialized: this.initialized,
            allowMismatchedDimensions: this.allowMismatchedDimensions,
        };
        // Count vectors by dimension
        for (const [dimension, vectors] of this.dimensionGroups.entries()) {
            stats.vectorsPerDimension[dimension] = vectors.size;
        }
        // Count buckets and calculate statistics per dimension
        for (const [dimension, dimensionBuckets] of this.buckets.entries()) {
            let dimBucketsUsed = 0;
            let dimTotalItems = 0;
            let dimMaxBucketSize = 0;
            for (const bucket of dimensionBuckets) {
                dimBucketsUsed += bucket.size;
                for (const items of bucket.values()) {
                    if (Array.isArray(items)) {
                        dimTotalItems += items.length;
                        dimMaxBucketSize = Math.max(dimMaxBucketSize, items.length);
                    }
                }
            }
            // Add to overall stats
            stats.bucketsUsed += dimBucketsUsed;
            stats.totalItems += dimTotalItems;
            stats.maxBucketSize = Math.max(stats.maxBucketSize, dimMaxBucketSize);
            // Store dimension-specific stats
            stats.bucketsPerDimension[dimension] = dimBucketsUsed;
        }
        // Calculate average bucket size
        stats.avgBucketSize = stats.bucketsUsed > 0 ? stats.totalItems / stats.bucketsUsed : 0;
        return stats;
    }
    /**
     * Serialize the LSH index to JSON
     * @returns Serialized index data
     */
    serialize() {
        // Convert Maps to serializable objects
        const hashFunctionsData = {};
        const bucketsData = {};
        const vectorDimensionsData = Array.from(this.vectorDimensions.entries());
        // Convert hash functions
        for (const [dimension, functions] of this.hashFunctions.entries()) {
            hashFunctionsData[dimension] = functions.map((table) => table.map((hyperplane) => Array.from(hyperplane)));
        }
        // Convert buckets
        for (const [dimension, tables] of this.buckets.entries()) {
            bucketsData[dimension] = {};
            tables.forEach((table, tableIndex) => {
                bucketsData[dimension][tableIndex] = {};
                for (const [hash, ids] of table.entries()) {
                    bucketsData[dimension][tableIndex][hash] = Array.isArray(ids) ? ids : [ids];
                }
            });
        }
        const data = {
            defaultDimensions: this.defaultDimensions,
            numberOfHashes: this.numberOfHashes,
            numberOfBuckets: this.numberOfBuckets,
            allowMismatchedDimensions: this.allowMismatchedDimensions,
            hashFunctions: hashFunctionsData,
            buckets: bucketsData,
            vectorDimensions: vectorDimensionsData,
            version: 1, // For future compatibility
        };
        return JSON.stringify(data);
    }
    /**
     * Save index to file
     * @param filePath - Path to save the index
     */
    async saveIndex(filePath) {
        const data = this.serialize();
        await fs_1.promises.writeFile(filePath, data, "utf8");
    }
    /**
     * Load serialized LSH index
     * @param json - Serialized LSH index
     * @param db - Vector database
     * @returns LSH instance
     */
    static deserialize(json, db) {
        const data = JSON.parse(json);
        // Create LSH instance with basic parameters
        const lsh = new LSH(db, {
            dimensions: data.defaultDimensions,
            numberOfHashes: data.numberOfHashes,
            numberOfBuckets: data.numberOfBuckets,
            allowMismatchedDimensions: data.allowMismatchedDimensions,
        });
        // Restore hash functions
        for (const [dimensionStr, functions] of Object.entries(data.hashFunctions)) {
            const dimension = parseInt(dimensionStr, 10);
            const typedFunctions = [];
            // Convert arrays to Float32Arrays
            for (const table of functions) {
                const typedTable = [];
                for (const hyperplane of table) {
                    typedTable.push(new Float32Array(hyperplane));
                }
                typedFunctions.push(typedTable);
            }
            lsh.hashFunctions.set(dimension, typedFunctions);
        }
        // Restore buckets
        for (const [dimensionStr, tables] of Object.entries(data.buckets)) {
            const dimension = parseInt(dimensionStr, 10);
            const dimensionBuckets = [];
            for (const [tableIndexStr, hashTable] of Object.entries(tables)) {
                const tableIndex = parseInt(tableIndexStr, 10);
                const bucketMap = new Map();
                // Ensure we have enough tables
                while (dimensionBuckets.length <= tableIndex) {
                    dimensionBuckets.push(new Map());
                }
                // Restore each hash bucket
                for (const [hashStr, ids] of Object.entries(hashTable)) {
                    const hash = parseInt(hashStr, 10);
                    bucketMap.set(hash, ids);
                }
                dimensionBuckets[tableIndex] = bucketMap;
            }
            lsh.buckets.set(dimension, dimensionBuckets);
        }
        // Restore vector dimensions
        for (const [id, dimension] of data.vectorDimensions) {
            lsh.vectorDimensions.set(id, dimension);
            // Rebuild dimension groups
            if (!lsh.dimensionGroups.has(dimension)) {
                lsh.dimensionGroups.set(dimension, new Set());
            }
            lsh.dimensionGroups.get(dimension).add(id);
        }
        lsh.initialized = true;
        return lsh;
    }
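    /*
     * Persistence round-trip sketch (the file path is illustrative; deserialize
     * restores hyperplanes and buckets exactly and marks the index initialized,
     * so no rebuild is needed after loading):
     *
     * ```typescript
     * await lsh.saveIndex("/tmp/lsh-index.json");
     * // ...later, possibly in another process:
     * const restored = await LSH.loadIndex("/tmp/lsh-index.json", vectorDB);
     * const results = restored.findNearest(queryVector, 10);
     * ```
     */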
    /**
     * Load index from file
     * @param filePath - Path to load the index from
     * @param db - Vector database
     * @param options - Load options
     * @returns LSH instance
     */
    static async loadIndex(filePath, db, options = {}) {
        const data = await fs_1.promises.readFile(filePath, "utf8");
        const lsh = LSH.deserialize(data, db);
        // Apply options
        if (options.allowMismatchedDimensions !== undefined) {
            lsh.allowMismatchedDimensions = options.allowMismatchedDimensions;
        }
        return lsh;
    }
}
exports.default = LSH;
//# sourceMappingURL=lsh.js.map