
hnswsqlite


Vector search with HNSWlib and SQLite in TypeScript.

import { HierarchicalNSW } from 'hnswlib-node';
import Database from 'better-sqlite3';

export class VectorStore {
  private db: Database.Database;
  private index!: HierarchicalNSW;
  private dim: number;
  private idMap: Set<number>; // Using Set for O(1) lookups
  private preparedStatements!: {
    getDocument: Database.Statement;
    searchDocuments: Database.Statement;
    deleteDocument: Database.Statement;
    getAllEmbeddings: Database.Statement;
  };

  constructor(dbPath: string, dim: number = 1536) {
    this.db = new Database(dbPath, { verbose: console.log });
    this.db.pragma('journal_mode = WAL');   // Better concurrency
    this.db.pragma('synchronous = NORMAL'); // Better write performance
    this.db.pragma('cache_size = -2000');   // ~2 MB page cache
    this.dim = dim;
    this.idMap = new Set();
    this._initTables();
    this._initPreparedStatements();
    this._initIndex();
    this._loadEmbeddings();
  }

  private _initTables() {
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS documents (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        text TEXT,
        embedding BLOB
      );
      CREATE INDEX IF NOT EXISTS idx_documents_id ON documents(id);
    `);
  }

  private _initPreparedStatements() {
    this.preparedStatements = {
      getDocument: this.db.prepare('SELECT id, text FROM documents WHERE id = ?'),
      searchDocuments: this.db.prepare('SELECT id, text FROM documents WHERE id = ?'),
      deleteDocument: this.db.prepare('DELETE FROM documents WHERE id = ?'),
      getAllEmbeddings: this.db.prepare('SELECT id, embedding FROM documents')
    };
  }

  private _initIndex() {
    this.index = new HierarchicalNSW('l2', this.dim);
    this.index.initIndex(10000); // Fixed initial capacity; raise this to store more than 10,000 vectors
  }

  private _loadEmbeddings() {
    // The HNSW index lives only in memory, so rebuild it from the embeddings stored in SQLite.
    const rows = this.preparedStatements.getAllEmbeddings.all() as Array<{id: number, embedding: Buffer}>;
    for (const row of rows) {
      try {
        // Reinterpret the BLOB as float32 values, honouring the Buffer's offset into its backing pool.
        const floats = new Float32Array(
          row.embedding.buffer,
          row.embedding.byteOffset,
          row.embedding.byteLength / Float32Array.BYTES_PER_ELEMENT
        );
        this.index.addPoint(Array.from(floats), row.id);
        this.idMap.add(row.id);
      } catch (error) {
        console.error(`Error loading embedding for document ${row.id}:`, error);
      }
    }
  }

  addDocument(text: string, embedding: number[]): number {
    const embBuf = Buffer.from(new Float32Array(embedding).buffer);
    const result = this.db
      .prepare('INSERT INTO documents (text, embedding) VALUES (?, ?)')
      .run(text, embBuf);
    const docId = Number(result.lastInsertRowid);
    this.index.addPoint(embedding, docId);
    this.idMap.add(docId);
    return docId;
  }

  /**
   * Search for similar documents using vector similarity.
   * @param embedding - The query embedding vector
   * @param k - Number of nearest neighbors to return
   * @returns Array of matching documents with id and text
   */
  search(embedding: number[], k: number = 5): Array<{id: number, text: string}> {
    const neighbors = this.index.searchKnn(embedding, k);
    // Fetch all neighbor rows inside a single transaction.
    const transaction = this.db.transaction((ids: number[]) =>
      ids.map(id => this.preparedStatements.searchDocuments.get(id))
    );
    const rows = transaction(neighbors.neighbors) as Array<{id: number, text: string}>;
    // Drop neighbors whose rows have since been deleted from the database.
    return rows.filter(Boolean);
  }

  /**
   * Delete a document by ID.
   * @param id - The ID of the document to delete
   * @returns boolean indicating success
   */
  deleteDocument(id: number): boolean {
    try {
      // A single DELETE is atomic on its own, so no explicit transaction is needed here
      // (and starting one would break the batched deleteDocuments() transaction below).
      const result = this.preparedStatements.deleteDocument.run(id);
      if (this.idMap.has(id)) {
        // HNSWlib offers no cheap removal, so the point stays in the in-memory index;
        // search() filters it out because its row is gone. Rebuild the index
        // periodically if many documents are deleted.
        this.idMap.delete(id);
      }
      return result.changes > 0;
    } catch (error) {
      console.error('Error deleting document:', error);
      return false;
    }
  }

  /**
   * Batch delete multiple documents.
   * @param ids - Array of document IDs to delete
   * @returns number of successfully deleted documents
   */
  deleteDocuments(ids: number[]): number {
    if (!ids.length) return 0;
    try {
      // Wrap the individual deletes in one transaction.
      return this.db.transaction((batch: number[]) =>
        batch.reduce((count, id) => count + (this.deleteDocument(id) ? 1 : 0), 0)
      )(ids);
    } catch (error) {
      console.error('Error in batch delete:', error);
      return 0;
    }
  }

  close() {
    this.db.close();
  }
}
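
A minimal usage sketch, not part of the file above: it assumes the package's main entry re-exports VectorStore under the name hnswsqlite, and it uses toy 3-dimensional vectors so the numbers stay readable (the default dim of 1536 matches typical embedding model output).

import { VectorStore } from 'hnswsqlite'; // assumed entry point; adjust to the actual export path

// Open (or create) the SQLite file and rebuild the in-memory HNSW index from it.
const store = new VectorStore('vectors.db', 3);

// Insert a few documents together with their embeddings.
const appleId = store.addDocument('red apple', [0.9, 0.1, 0.0]);
store.addDocument('green pear', [0.8, 0.3, 0.1]);
store.addDocument('blue whale', [0.0, 0.2, 0.9]);

// k-nearest-neighbour search over the HNSW index, hydrated with rows from SQLite.
const hits = store.search([0.85, 0.15, 0.05], 2);
console.log(hits); // roughly: [{ id: 1, text: 'red apple' }, { id: 2, text: 'green pear' }]

// Deleting removes the SQLite row; the stale HNSW point is filtered out of later searches.
store.deleteDocument(appleId);

store.close();

Because the constructor reloads every stored embedding into the HNSW index, the SQLite file remains the single source of truth across restarts.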