hnswsqlite
Version:
Vector search with HNSWlib and SQLite in TypeScript.
172 lines (148 loc) • 5.54 kB
text/typescript
import { HierarchicalNSW } from 'hnswlib-node';
import Database from 'better-sqlite3';
/**
 * Document store that keeps texts and their embeddings in SQLite and answers
 * nearest-neighbour queries from an in-memory HNSW index (L2 distance).
 *
 * SQLite is the single source of truth: the HNSW index is never written to
 * disk and is rebuilt from the `documents` table whenever a store is created.
 */
export class VectorStore {
  /** Minimum number of slots the HNSW index is created with. */
  private static readonly INITIAL_CAPACITY = 10000;

  private readonly db: Database.Database;
  private readonly index: HierarchicalNSW;
  private readonly dim: number;
  /** Labels currently live (added and not deleted) in the HNSW index. */
  private readonly idMap: Set<number>;
  private readonly preparedStatements: {
    insertDocument: Database.Statement;
    getDocument: Database.Statement;
    deleteDocument: Database.Statement;
    countDocuments: Database.Statement;
    getAllEmbeddings: Database.Statement;
  };

  /**
   * Opens (or creates) the database and rebuilds the vector index from it.
   *
   * @param dbPath - Path handed to better-sqlite3.
   * @param dim - Dimensionality every stored and queried embedding must have.
   * @throws RangeError if `dim` is not a positive integer.
   */
  constructor(dbPath: string, dim: number = 1536) {
    if (!Number.isInteger(dim) || dim <= 0) {
      throw new RangeError(`dim must be a positive integer, got ${dim}`);
    }
    this.dim = dim;
    this.idMap = new Set();
    // Created before the database is opened so a failure here cannot leak a handle.
    this.index = new HierarchicalNSW('l2', dim);
    this.db = new Database(dbPath, { verbose: console.log });
    try {
      this.db.pragma('journal_mode = WAL'); // Better concurrency
      this.db.pragma('synchronous = NORMAL'); // Better write performance
      this.db.pragma('cache_size = -2000'); // Negative value = size in KiB
      this._initTables();
      this.preparedStatements = this._prepareStatements();
      this._loadEmbeddings();
    } catch (error) {
      // Do not leave the file locked/open when initialisation fails half-way.
      this.db.close();
      throw error;
    }
  }

  /**
   * Decodes a BLOB of raw float32 values into a plain number array.
   *
   * The bytes are copied into a fresh, exactly-sized buffer first: a Node
   * `Buffer` may be a window into a larger (pooled) `ArrayBuffer` and may start
   * at an offset that is not 4-byte aligned, so viewing `blob.buffer` directly
   * would read unrelated bytes or throw.
   *
   * @param blob - Bytes produced by serialising a `Float32Array`.
   * @returns The decoded values, one per 4 bytes.
   * @throws RangeError if the byte length is not a multiple of 4.
   */
  private static decodeEmbedding(blob: Uint8Array): number[] {
    if (blob.byteLength % Float32Array.BYTES_PER_ELEMENT !== 0) {
      throw new RangeError(
        `Embedding blob length ${blob.byteLength} is not a multiple of ${Float32Array.BYTES_PER_ELEMENT}`
      );
    }
    const bytes = new Uint8Array(blob.byteLength);
    bytes.set(blob);
    return Array.from(new Float32Array(bytes.buffer));
  }

  /** Creates the `documents` table when it does not exist yet. */
  private _initTables(): void {
    // `id INTEGER PRIMARY KEY` is an alias for the rowid, which SQLite already
    // looks up directly, so no secondary index on `id` is created.
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS documents (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        text TEXT,
        embedding BLOB
      );
    `);
  }

  /** Compiles every SQL statement once so calls do not re-prepare them. */
  private _prepareStatements(): VectorStore['preparedStatements'] {
    return {
      insertDocument: this.db.prepare('INSERT INTO documents (text, embedding) VALUES (?, ?)'),
      getDocument: this.db.prepare('SELECT id, text FROM documents WHERE id = ?'),
      deleteDocument: this.db.prepare('DELETE FROM documents WHERE id = ?'),
      countDocuments: this.db.prepare('SELECT COUNT(*) AS total FROM documents'),
      getAllEmbeddings: this.db.prepare('SELECT id, embedding FROM documents'),
    };
  }

  /**
   * Initialises the HNSW index and fills it with every embedding stored in
   * SQLite. Rows whose embedding is missing, malformed or of the wrong
   * dimension are reported with `console.error` and skipped.
   */
  private _loadEmbeddings(): void {
    const { total } = this.preparedStatements.countDocuments.get() as { total: number };
    // Size the index for what is already stored so loading never overflows it.
    this.index.initIndex(Math.max(VectorStore.INITIAL_CAPACITY, total));

    // Stream rows instead of materialising every BLOB in memory at once.
    const rows = this.preparedStatements.getAllEmbeddings.iterate() as IterableIterator<{
      id: number;
      embedding: Buffer | null;
    }>;
    for (const row of rows) {
      try {
        if (row.embedding === null) {
          throw new TypeError('embedding is NULL');
        }
        const vector = VectorStore.decodeEmbedding(row.embedding);
        if (vector.length !== this.dim) {
          throw new RangeError(`expected ${this.dim} dimensions, got ${vector.length}`);
        }
        this.index.addPoint(vector, row.id);
        this.idMap.add(row.id);
      } catch (error) {
        console.error(`Error loading embedding for document ${row.id}:`, error);
      }
    }
  }

  /**
   * Rejects vectors whose length differs from the store's dimension.
   *
   * @throws RangeError on a non-array or wrong-length embedding.
   */
  private _assertDimension(embedding: readonly number[]): void {
    if (!Array.isArray(embedding) || embedding.length !== this.dim) {
      const got = Array.isArray(embedding) ? String(embedding.length) : typeof embedding;
      throw new RangeError(`Expected an embedding of length ${this.dim}, got ${got}`);
    }
  }

  /** Doubles the index capacity when every slot is in use. */
  private _ensureCapacity(): void {
    const capacity = this.index.getMaxElements();
    // Points marked as deleted still occupy a slot, so compare the raw count.
    if (this.index.getCurrentCount() >= capacity) {
      this.index.resizeIndex(Math.max(capacity * 2, VectorStore.INITIAL_CAPACITY));
    }
  }

  /** Hides `id` from future searches if it is currently live in the index. */
  private _forgetPoint(id: number): void {
    // `delete` returns true only for ids that were live, which guards
    // `markDelete` against labels the index never received.
    if (this.idMap.delete(id)) {
      this.index.markDelete(id);
    }
  }

  /**
   * Stores a document and makes it searchable.
   *
   * The row insert and the index insert happen inside one transaction, so a
   * failure while indexing rolls the row back instead of leaving an orphan.
   *
   * @param text - Document text.
   * @param embedding - Vector of exactly `dim` numbers.
   * @returns The id assigned to the new document.
   * @throws RangeError if the embedding has the wrong length.
   */
  addDocument(text: string, embedding: number[]): number {
    this._assertDimension(embedding);
    const blob = Buffer.from(new Float32Array(embedding).buffer);

    const insert = this.db.transaction((): number => {
      const { lastInsertRowid } = this.preparedStatements.insertDocument.run(text, blob);
      const newId = Number(lastInsertRowid);
      this._ensureCapacity();
      this.index.addPoint(embedding, newId);
      return newId;
    });

    const docId = insert();
    this.idMap.add(docId);
    return docId;
  }

  /**
   * Search for similar documents using vector similarity
   * @param embedding - The query embedding vector (exactly `dim` numbers)
   * @param k - Maximum number of nearest neighbors to return
   * @returns Matching documents with id and text, in the order the index
   *   returned them; empty when the store holds no documents
   * @throws RangeError if the embedding has the wrong length or `k` is not a
   *   positive integer
   */
  search(embedding: number[], k: number = 5): Array<{id: number, text: string}> {
    this._assertDimension(embedding);
    if (!Number.isInteger(k) || k <= 0) {
      throw new RangeError(`k must be a positive integer, got ${k}`);
    }
    // Never ask the index for more neighbours than there are live documents.
    const limit = Math.min(k, this.idMap.size);
    if (limit === 0) {
      return [];
    }

    const { neighbors } = this.index.searchKnn(embedding, limit);

    // One read transaction so every lookup sees the same database snapshot.
    const fetchRows = this.db.transaction((ids: number[]) =>
      ids.map(
        id =>
          this.preparedStatements.getDocument.get(id) as
            | { id: number; text: string }
            | undefined
      )
    );
    const found: Array<{ id: number; text: string }> = [];
    for (const row of fetchRows(neighbors)) {
      // An id can be missing if another connection removed the row.
      if (row !== undefined) {
        found.push(row);
      }
    }
    return found;
  }

  /**
   * Delete a document by ID
   *
   * A single DELETE is already atomic, so no explicit transaction is opened;
   * this also keeps the method safe to call while a transaction is active.
   *
   * @param id - The ID of the document to delete
   * @returns true if a row was removed, false if none matched or an error
   *   occurred (the error is logged)
   */
  deleteDocument(id: number): boolean {
    try {
      const removed = this.preparedStatements.deleteDocument.run(id).changes > 0;
      this._forgetPoint(id);
      return removed;
    } catch (error) {
      console.error('Error deleting document:', error);
      return false;
    }
  }

  /**
   * Batch delete multiple documents
   *
   * All rows are removed in one transaction: either every delete is committed
   * or, on error, none is.
   *
   * @param ids - Array of document IDs to delete
   * @returns number of rows actually deleted; 0 if the batch failed (the
   *   error is logged)
   */
  deleteDocuments(ids: number[]): number {
    if (!ids.length) return 0;
    try {
      const deleteAll = this.db.transaction((batch: number[]): number =>
        batch.reduce(
          (count, id) => count + this.preparedStatements.deleteDocument.run(id).changes,
          0
        )
      );
      const removed = deleteAll(ids);
      // Touch the index only after the commit, so a rolled-back batch cannot
      // leave documents in the database that are hidden from search.
      for (const id of ids) {
        this._forgetPoint(id);
      }
      return removed;
    } catch (error) {
      console.error('Error in batch delete:', error);
      return 0;
    }
  }

  /** Closes the underlying database connection. */
  close(): void {
    this.db.close();
  }
}