UNPKG

mcard-js

Version:

MCard - Content-addressable storage with cryptographic hashing, handle resolution, and vector search for Node.js and browsers

557 lines 20.5 kB
/**
 * MCard Vector Store for JavaScript
 *
 * SQLite-based vector storage for semantic search using sqlite-vec extension.
 * Compatible with the Python MCardVectorStore implementation.
 */
import Database from 'better-sqlite3';
import { createRequire } from 'module';
import { MCardSchema, getVec0Schema, VECTOR_METADATA_SCHEMA, VECTOR_METADATA_INDEX, VECTOR_EMBEDDINGS_SCHEMA, VECTOR_FTS_SCHEMA } from './schema';

// ES modules have no ambient `require`; build one so sqlite-vec can be loaded
// lazily (and only in Node) from inside the adapter.
const require = createRequire(import.meta.url);

/**
 * Adapter for better-sqlite3 in Node.js.
 *
 * Wraps a better-sqlite3 database behind the small `exec` / `prepare`
 * surface that MCardVectorStore uses.
 */
export class BetterSqliteAdapter {
    db;

    /**
     * @param {object} db - An open better-sqlite3 database instance.
     */
    constructor(db) {
        this.db = db;
    }

    /**
     * Execute one or more SQL statements without returning rows.
     *
     * @param {string} sql - SQL text to execute.
     */
    exec(sql) {
        this.db.exec(sql);
    }

    /**
     * Prepare a statement and expose it through the adapter statement shape.
     *
     * @param {string} sql - SQL text to prepare.
     * @returns {{run: Function, all: Function, get: Function, loadVecExtension: Function}}
     *   Thin wrappers around the prepared statement, plus a hook that loads
     *   sqlite-vec into the underlying database.
     */
    prepare(sql) {
        const stmt = this.db.prepare(sql);
        return {
            run: (...params) => stmt.run(...params),
            all: (...params) => stmt.all(...params),
            get: (...params) => stmt.get(...params),
            loadVecExtension: (dimensions) => {
                try {
                    // Dynamic import of sqlite-vec so browser bundles are unaffected.
                    // This is Node-only.
                    // eslint-disable-next-line @typescript-eslint/no-var-requires
                    const sqliteVec = require('sqlite-vec');
                    sqliteVec.load(this.db);
                    // Create vec0 table using schema helper
                    // Note: This is dynamically generated because dimensions are runtime-determined
                    const schema = getVec0Schema(dimensions);
                    this.db.exec(schema);
                    return true;
                }
                catch (e) {
                    // Best effort: the store falls back to brute-force search.
                    console.warn(`[VectorStore] sqlite-vec not available in Node adapter: ${e}`);
                    return false;
                }
            },
        };
    }

    /**
     * Close the underlying database handle.
     *
     * MCardVectorStore.close() calls `close` on its adapter when one exists;
     * without this method a store constructed from a path never released its
     * better-sqlite3 handle.
     */
    close() {
        // `open` is false once better-sqlite3 has closed the connection; skip
        // the call then so that closing twice is harmless.
        if (this.db.open !== false) {
            this.db.close();
        }
    }
}

/**
 * Default configuration merged under any user-supplied config.
 */
export const DEFAULT_VECTOR_CONFIG = {
    embeddingModel: 'nomic-embed-text',
    dimensions: 768,
    ollamaBaseUrl: 'http://localhost:11434',
    chunkSize: 512,
    chunkOverlap: 50,
    enableHybridSearch: true,
};

// ─────────────────────────────────────────────────────────────────────────────
// Vector Utilities
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Serialize a float array to bytes (little-endian float32).
 *
 * @param {number[]} vector - Values to encode; each is written as a float32.
 * @returns {Buffer} A buffer of `vector.length * 4` bytes.
 */
export function serializeVector(vector) {
    const buffer = Buffer.alloc(vector.length * 4);
    for (let i = 0; i < vector.length; i++) {
        buffer.writeFloatLE(vector[i], i * 4);
    }
    return buffer;
}

/**
 * Deserialize bytes (little-endian float32) to a float array.
 *
 * Accepts a Node Buffer, any other typed-array view (e.g. a Uint8Array),
 * or a plain ArrayBuffer.
 *
 * @param {Buffer|ArrayBufferView|ArrayBuffer} buffer - Encoded vector bytes.
 * @returns {number[]} The decoded values.
 * @throws {RangeError} If the byte length is not a multiple of 4.
 */
export function deserializeVector(buffer) {
    // A DataView reads the same bytes regardless of which view type was handed
    // in; byteOffset matters because Node Buffers often share a pooled ArrayBuffer.
    const view = ArrayBuffer.isView(buffer)
        ? new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength)
        : new DataView(buffer);
    if (view.byteLength % 4 !== 0) {
        throw new RangeError(`Invalid vector buffer length: ${view.byteLength} bytes is not a multiple of 4`);
    }
    const count = view.byteLength / 4;
    const vector = new Array(count);
    for (let i = 0; i < count; i++) {
        vector[i] = view.getFloat32(i * 4, true);
    }
    return vector;
}

/**
 * Calculate cosine similarity between two vectors.
 *
 * @param {number[]} a - First vector.
 * @param {number[]} b - Second vector; must have the same length as `a`.
 * @returns {number} Cosine similarity, or 0 when either vector has zero norm.
 * @throws {Error} If the vectors differ in length.
 */
export function cosineSimilarity(a, b) {
    if (a.length !== b.length) {
        throw new Error(`Vector length mismatch: ${a.length} vs ${b.length}`);
    }
    let dotProduct = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < a.length; i++) {
        dotProduct += a[i] * b[i];
        normA += a[i] * a[i];
        normB += b[i] * b[i];
    }
    normA = Math.sqrt(normA);
    normB = Math.sqrt(normB);
    // Avoid dividing by zero for all-zero vectors.
    if (normA === 0 || normB === 0) {
        return 0;
    }
    return dotProduct / (normA * normB);
}

/**
 * Coerce a requested result count to a non-negative integer that is safe to
 * splice into a SQL LIMIT clause.
 *
 * @param {*} k - Requested number of results.
 * @returns {number} The truncated integer limit.
 * @throws {RangeError} If `k` is not a finite, non-negative number.
 */
function normalizeLimit(k) {
    const limit = Math.trunc(Number(k));
    if (!Number.isFinite(limit) || limit < 0) {
        throw new RangeError(`Invalid result limit: ${k}`);
    }
    return limit;
}

// ─────────────────────────────────────────────────────────────────────────────
// Vector Store
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Vector store that keeps chunk metadata, embeddings and (optionally) a
 * full-text index in SQLite, and embeds text through an Ollama HTTP endpoint.
 */
export class MCardVectorStore {
    db;
    config;
    hasVecExtension = false;

    /**
     * Construct a vector store.
     *
     * - Node.js: pass a string path (uses better-sqlite3 under the hood)
     * - Advanced: pass a VectorDb adapter (e.g., SqlJsVectorAdapter for WASM)
     *
     * @param {string|object} [dbOrPath=':memory:'] - Database path or adapter.
     * @param {object} [config] - Overrides merged over DEFAULT_VECTOR_CONFIG.
     */
    constructor(dbOrPath = ':memory:', config) {
        this.config = { ...DEFAULT_VECTOR_CONFIG, ...config };
        if (typeof dbOrPath === 'string') {
            const raw = new Database(dbOrPath);
            this.db = new BetterSqliteAdapter(raw);
        }
        else {
            this.db = dbOrPath;
        }
        this.initDatabase();
        this.tryLoadVecExtension();
    }

    /**
     * Initialize database tables.
     * Uses schema from the unified MCardSchema singleton (schema/mcard_schema.sql).
     * Any failure on that path switches to initDatabaseFallback().
     */
    initDatabase() {
        // Import from singleton schema loader
        // This ensures all SQL comes from schema/mcard_schema.sql
        try {
            // Try to use singleton if running in Node.js with file access
            const schema = MCardSchema.getInstance();
            // Vector metadata table
            const metadataSchema = schema.getTable('mcard_vector_metadata');
            if (metadataSchema) {
                this.db.exec(metadataSchema);
            }
            // Metadata index
            const metadataIndex = schema.getIndex('idx_vector_metadata_hash');
            if (metadataIndex) {
                this.db.exec(metadataIndex);
            }
            // Embeddings fallback table
            const embeddingsSchema = schema.getTable('mcard_embeddings');
            if (embeddingsSchema) {
                this.db.exec(embeddingsSchema);
            }
            // FTS table for hybrid search
            if (this.config.enableHybridSearch) {
                const ftsSchema = schema.getTable('mcard_fts');
                if (ftsSchema) {
                    this.db.exec(ftsSchema);
                }
            }
        }
        catch {
            // Fallback for browser/WASM environments where file access isn't available
            // In this case, we use the exported constants from schema.ts
            console.warn('[VectorStore] Could not load MCardSchema directly, using fallback');
            this.initDatabaseFallback();
        }
    }

    /**
     * Fallback initialization for browser/WASM environments.
     * Uses exported constants from schema.ts which are loaded from the SQL file.
     *
     * @throws Re-throws any error raised while executing the schema statements.
     */
    initDatabaseFallback() {
        try {
            if (VECTOR_METADATA_SCHEMA) {
                this.db.exec(VECTOR_METADATA_SCHEMA);
            }
            if (VECTOR_METADATA_INDEX) {
                this.db.exec(VECTOR_METADATA_INDEX);
            }
            if (VECTOR_EMBEDDINGS_SCHEMA) {
                this.db.exec(VECTOR_EMBEDDINGS_SCHEMA);
            }
            if (this.config.enableHybridSearch && VECTOR_FTS_SCHEMA) {
                this.db.exec(VECTOR_FTS_SCHEMA);
            }
        }
        catch (e) {
            console.error('[VectorStore] Failed to initialize database from schema:', e);
            throw e;
        }
    }

    /**
     * Try to load sqlite-vec extension.
     *
     * @returns {boolean} True when the adapter's loadVecExtension hook reported
     *   success; false when there is no hook or loading failed.
     */
    tryLoadVecExtension() {
        try {
            const anyDb = this.db;
            if (typeof anyDb.prepare === 'function') {
                // If the adapter exposes a loadVecExtension hook (Node or WASM), use it.
                // The hook lives on prepared statements, so prepare a trivial one.
                const prepared = anyDb.prepare('SELECT 1');
                if (typeof prepared.loadVecExtension === 'function') {
                    const ok = Boolean(prepared.loadVecExtension(this.config.dimensions));
                    this.hasVecExtension = ok;
                    return ok;
                }
            }
            // No extension hook available; fallback mode only.
            this.hasVecExtension = false;
            return false;
        }
        catch (e) {
            console.warn(`[VectorStore] Failed to load sqlite-vec extension: ${e}. Using fallback.`);
            this.hasVecExtension = false;
            return false;
        }
    }

    /**
     * Generate embedding using Ollama.
     *
     * @param {string} text - Text to embed.
     * @returns {Promise<number[]>} The embedding from the response body.
     * @throws {Error} If the HTTP response is not OK or carries no embedding array.
     */
    async embed(text) {
        const response = await fetch(`${this.config.ollamaBaseUrl}/api/embeddings`, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
                model: this.config.embeddingModel,
                prompt: text,
            }),
        });
        if (!response.ok) {
            throw new Error(`Ollama embedding failed: ${response.statusText}`);
        }
        const data = await response.json();
        // Fail here with a clear message instead of later inside serializeVector.
        if (!Array.isArray(data?.embedding)) {
            throw new Error('Ollama embedding failed: response did not contain an embedding array');
        }
        return data.embedding;
    }

    /**
     * Index content with its embedding.
     *
     * Each chunk is embedded and stored independently; a failure on one chunk
     * is logged and does not stop the remaining chunks.
     *
     * @param {string} hash - Content hash the chunks belong to.
     * @param {string} content - Text to index.
     * @param {object} [metadata] - Accepted for interface compatibility; not
     *   read by this method.
     * @param {boolean} [chunk=true] - Split content with chunkText() first.
     * @returns {Promise<number>} Number of chunks indexed successfully.
     */
    async index(hash, content, metadata, chunk = true) {
        const chunks = chunk ? this.chunkText(content) : [content];
        let indexed = 0;
        for (let i = 0; i < chunks.length; i++) {
            const chunkContent = chunks[i];
            // Only the first 200 characters are stored as chunk_text.
            const preview = chunkContent.substring(0, 200);
            try {
                // Generate embedding
                const embedding = await this.embed(chunkContent);
                // Insert metadata (matches Python mcard_vector_metadata schema)
                const insertMeta = this.db.prepare(`
                    INSERT OR REPLACE INTO mcard_vector_metadata
                    (hash, model_name, dimensions, chunk_index, chunk_total, chunk_text, created_at)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                `);
                const result = insertMeta.run(hash, this.config.embeddingModel, this.config.dimensions, i, chunks.length, preview, new Date().toISOString());
                const lastId = result.lastInsertRowid;
                if (lastId === undefined) {
                    throw new Error("Vector metadata insert failed: no rowid returned");
                }
                const rowId = BigInt(lastId);
                // Store embedding
                if (this.hasVecExtension) {
                    // Use vec0 virtual table (matches Python mcard_vec)
                    const insertVec = this.db.prepare(`
                        INSERT INTO mcard_vec (metadata_id, embedding)
                        VALUES (?, ?)
                    `);
                    insertVec.run(rowId, serializeVector(embedding));
                }
                else {
                    // Use fallback embeddings table (matches Python mcard_embeddings)
                    const insertEmb = this.db.prepare(`
                        INSERT OR REPLACE INTO mcard_embeddings (metadata_id, embedding)
                        VALUES (?, ?)
                    `);
                    insertEmb.run(rowId, serializeVector(embedding));
                }
                // Update FTS (matches Python mcard_fts)
                if (this.config.enableHybridSearch) {
                    const insertFts = this.db.prepare(`
                        INSERT INTO mcard_fts (hash, content)
                        VALUES (?, ?)
                    `);
                    insertFts.run(hash, chunkContent);
                }
                indexed++;
            }
            catch (e) {
                console.error(`[VectorStore] Failed to index chunk ${i}: ${e}`);
            }
        }
        return indexed;
    }

    /**
     * Search for similar content.
     *
     * @param {string} query - Query text; embedded via embed().
     * @param {number} [k=5] - Maximum number of results.
     * @returns {Promise<object[]>} Results from the sqlite-vec path when the
     *   extension is loaded, otherwise from the brute-force fallback.
     */
    async search(query, k = 5) {
        const queryEmbedding = await this.embed(query);
        if (this.hasVecExtension) {
            return this.searchWithVec(queryEmbedding, k);
        }
        return this.searchFallback(queryEmbedding, k);
    }

    /**
     * Search using sqlite-vec KNN.
     *
     * @param {number[]} queryEmbedding - Embedded query vector.
     * @param {number} k - Maximum number of results.
     * @returns {object[]} Rows as `{ hash, score, content, chunkIndex }`,
     *   ordered by ascending distance.
     * @throws {RangeError} If `k` is not a finite, non-negative number.
     */
    searchWithVec(queryEmbedding, k) {
        // The limit is spliced into the SQL text, so force it to a plain
        // non-negative integer first.
        const limit = normalizeLimit(k);
        const stmt = this.db.prepare(`
            SELECT
                m.hash,
                m.chunk_index,
                m.chunk_text,
                v.distance
            FROM (
                SELECT metadata_id, distance
                FROM mcard_vec
                WHERE embedding MATCH ?
                ORDER BY distance
                LIMIT ${limit}
            ) v
            JOIN mcard_vector_metadata m ON v.metadata_id = m.id
            ORDER BY v.distance
        `);
        // Serialize vector and pass as BLOB
        const rows = stmt.all(serializeVector(queryEmbedding));
        return rows.map((row) => ({
            hash: row.hash,
            // Convert distance to similarity.
            // NOTE(review): meaningful only if the vec0 table uses a distance
            // bounded like cosine distance - confirm against getVec0Schema.
            score: 1 - row.distance,
            content: row.chunk_text || undefined,
            chunkIndex: row.chunk_index,
        }));
    }

    /**
     * Fallback brute-force similarity search.
     *
     * Loads every stored embedding and ranks them by cosine similarity.
     *
     * @param {number[]} queryEmbedding - Embedded query vector.
     * @param {number} k - Maximum number of results.
     * @returns {object[]} Top results as `{ hash, chunkIndex, content, score }`,
     *   highest score first.
     */
    searchFallback(queryEmbedding, k) {
        const stmt = this.db.prepare(`
            SELECT
                m.id,
                m.hash,
                m.chunk_index,
                m.chunk_text,
                e.embedding
            FROM mcard_vector_metadata m
            JOIN mcard_embeddings e ON m.id = e.metadata_id
        `);
        const rows = stmt.all();
        // Calculate similarities
        const scored = rows.map((row) => ({
            hash: row.hash,
            chunkIndex: row.chunk_index,
            content: row.chunk_text || undefined,
            score: cosineSimilarity(queryEmbedding, deserializeVector(row.embedding)),
        }));
        // Sort by score descending
        scored.sort((a, b) => b.score - a.score);
        return scored.slice(0, k);
    }

    /**
     * Hybrid search combining vector and FTS.
     *
     * Each hash receives `vectorWeight` times its best vector score plus
     * `1 - vectorWeight` times its best full-text score normalized to [0, 1].
     * When hybrid search is disabled this is the same as search().
     *
     * @param {string} query - Query text, used for both embedding and FTS MATCH.
     * @param {number} [k=5] - Maximum number of results.
     * @param {number} [vectorWeight=0.7] - Weight of the vector score.
     * @returns {Promise<object[]>} Merged results, highest score first.
     */
    async hybridSearch(query, k = 5, vectorWeight = 0.7) {
        if (!this.config.enableHybridSearch) {
            return this.search(query, k);
        }
        // Get vector results (over-fetch so merging has candidates to reorder)
        const vectorResults = await this.search(query, k * 2);
        // Get FTS results. The raw query is handed to MATCH, so text that is not
        // valid FTS query syntax makes the statement throw; treat that as
        // "no full-text matches" instead of failing the whole search.
        let ftsRows = [];
        try {
            const ftsStmt = this.db.prepare(`
                SELECT hash, rank
                FROM mcard_fts
                WHERE mcard_fts MATCH ?
                ORDER BY rank
                LIMIT ?
            `);
            ftsRows = ftsStmt.all(query, k * 2);
        }
        catch (e) {
            console.warn(`[VectorStore] FTS query failed, using vector results only: ${e}`);
        }
        // Merge results with weighted scoring
        const merged = new Map();
        for (const result of vectorResults) {
            // Vector results arrive best-first; keep the best chunk per hash
            // rather than letting a weaker chunk overwrite it.
            if (!merged.has(result.hash)) {
                merged.set(result.hash, {
                    ...result,
                    score: result.score * vectorWeight,
                });
            }
        }
        const ftsWeight = 1 - vectorWeight;
        // FTS5 rank is lower (more negative) for better matches, so a larger
        // magnitude means a stronger match. Keep the strongest row per hash.
        const bestMagnitudeByHash = new Map();
        let maxMagnitude = 0;
        for (const row of ftsRows) {
            const magnitude = Math.abs(row.rank);
            if (!Number.isFinite(magnitude)) {
                continue;
            }
            maxMagnitude = Math.max(maxMagnitude, magnitude);
            if (magnitude > (bestMagnitudeByHash.get(row.hash) ?? -1)) {
                bestMagnitudeByHash.set(row.hash, magnitude);
            }
        }
        for (const [hash, magnitude] of bestMagnitudeByHash) {
            // If every rank is 0 the matches are indistinguishable: give each
            // the full full-text weight.
            const normalized = maxMagnitude > 0 ? magnitude / maxMagnitude : 1;
            const ftsScore = normalized * ftsWeight;
            const existing = merged.get(hash);
            if (existing) {
                existing.score += ftsScore;
            }
            else {
                merged.set(hash, { hash, score: ftsScore });
            }
        }
        // Sort and return top k
        const results = Array.from(merged.values());
        results.sort((a, b) => b.score - a.score);
        return results.slice(0, k);
    }

    /**
     * Split text into overlapping chunks.
     *
     * Consecutive chunks start `chunkSize - chunkOverlap` characters apart.
     *
     * @param {string} text - Text to split.
     * @returns {string[]} The chunks; `[text]` when the text fits in one chunk
     *   or `chunkSize` is not a positive number.
     */
    chunkText(text) {
        const { chunkSize, chunkOverlap } = this.config;
        // A non-positive (or NaN) chunk size cannot produce chunks; return the
        // text whole instead of looping forever.
        if (!(chunkSize > 0) || text.length <= chunkSize) {
            return [text];
        }
        // An overlap >= chunkSize would give a zero or negative stride and the
        // loop below would never terminate; always advance by at least 1.
        const stride = Math.max(1, chunkSize - chunkOverlap);
        const chunks = [];
        for (let start = 0; start < text.length; start += stride) {
            const end = Math.min(start + chunkSize, text.length);
            chunks.push(text.slice(start, end));
        }
        return chunks;
    }

    /**
     * Get count of indexed vectors.
     *
     * @returns {number} Row count of mcard_vector_metadata, or 0 when the
     *   adapter returns no row.
     */
    count() {
        const stmt = this.db.prepare('SELECT COUNT(*) as count FROM mcard_vector_metadata');
        const row = stmt.get?.();
        return row?.count ?? 0;
    }

    /**
     * Get list of all indexed hashes (distinct)
     *
     * @returns {Promise<string[]>} Distinct hash values from mcard_vector_metadata.
     */
    async getIndexedHashes() {
        const stmt = this.db.prepare('SELECT DISTINCT hash FROM mcard_vector_metadata');
        const rows = stmt.all();
        return rows.map((r) => r.hash);
    }

    /**
     * Count unique indexed cards
     *
     * @returns {number} Number of distinct hashes, or 0 when the adapter
     *   returns no row.
     */
    countUnique() {
        const stmt = this.db.prepare('SELECT COUNT(DISTINCT hash) as count FROM mcard_vector_metadata');
        // Same tolerant call as count(), for adapters whose statements lack get().
        const row = stmt.get?.();
        return row?.count ?? 0;
    }

    /**
     * Delete vectors for a hash.
     *
     * Removes the embeddings, the metadata rows and (when hybrid search is
     * enabled) the FTS rows for the hash.
     *
     * @param {string} hash - Hash whose entries are removed.
     * @returns {number} `changes` reported by the metadata delete, or 0.
     */
    delete(hash) {
        const getIds = this.db.prepare('SELECT id FROM mcard_vector_metadata WHERE hash = ?');
        const ids = getIds.all(hash);
        // Embeddings are keyed by metadata id, so remove them before the
        // metadata rows that identify them.
        const embeddingTable = this.hasVecExtension ? 'mcard_vec' : 'mcard_embeddings';
        for (const { id } of ids) {
            this.db.prepare(`DELETE FROM ${embeddingTable} WHERE metadata_id = ?`).run(id);
        }
        const deleteMeta = this.db.prepare('DELETE FROM mcard_vector_metadata WHERE hash = ?');
        const result = deleteMeta.run(hash);
        if (this.config.enableHybridSearch) {
            this.db.prepare('DELETE FROM mcard_fts WHERE hash = ?').run(hash);
        }
        return result.changes ?? 0;
    }

    /**
     * Clear all vectors.
     *
     * Empties the active embedding table, the metadata table and (when hybrid
     * search is enabled) the FTS table.
     */
    clear() {
        if (this.hasVecExtension) {
            this.db.exec('DELETE FROM mcard_vec');
        }
        else {
            this.db.exec('DELETE FROM mcard_embeddings');
        }
        this.db.exec('DELETE FROM mcard_vector_metadata');
        if (this.config.enableHybridSearch) {
            this.db.exec('DELETE FROM mcard_fts');
        }
    }

    /**
     * Check if sqlite-vec extension is available.
     *
     * @returns {boolean} The result recorded by tryLoadVecExtension().
     */
    hasVectorExtension() {
        return this.hasVecExtension;
    }

    /**
     * Get information about the vector store.
     *
     * @returns {{vectorCount: number, embeddingModel: string, dimensions: number,
     *   hasVecExtension: boolean, enableHybridSearch: boolean}}
     */
    getInfo() {
        const count = this.count();
        return {
            vectorCount: count,
            embeddingModel: this.config.embeddingModel,
            dimensions: this.config.dimensions,
            hasVecExtension: this.hasVecExtension,
            enableHybridSearch: this.config.enableHybridSearch,
        };
    }

    /**
     * Close the database connection.
     *
     * Calls `close()` on the adapter when it provides one; adapters without a
     * `close` method are left untouched.
     */
    close() {
        // The VectorDb interface does not require close, so probe for it.
        const anyDb = this.db;
        if (typeof anyDb.close === 'function') {
            anyDb.close();
        }
    }
}

/**
 * Adapter for sql.js / SQLite WASM so VectorStore can run in the browser.
 *
 * NOTE: Parameter binding and stepping are left as a TODO because the exact
 * sql.js API usage (bind/step/getAsObject) depends on how you want to map
 * VectorDb semantics to sql.js. For now, this adapter is scaffolding that
 * shows where to integrate sql.js: `exec` is forwarded to the database, but
 * statements returned by `prepare` do nothing and return empty results.
 */
export class SqlJsVectorAdapter {
    db;

    /**
     * @param {object} db - Database object exposing `run(sql)`.
     */
    constructor(db) {
        this.db = db;
    }

    /**
     * Execute SQL by forwarding it to the database's `run` method.
     *
     * @param {string} sql - SQL text to execute.
     */
    exec(sql) {
        this.db.run(sql);
    }

    /**
     * Return a placeholder statement; the SQL text is currently ignored.
     *
     * @param {string} _sql - SQL text (unused by this stub).
     * @returns {{run: Function, all: Function, get: Function, loadVecExtension: Function}}
     *   `run` returns `{}`, `all` returns `[]`, `get` returns `undefined`,
     *   and `loadVecExtension` returns `false`.
     */
    prepare(_sql) {
        // TODO: Implement proper sql.js Statement handling here, including:
        // - stmt = this.db.prepare(sql)
        // - stmt.bind(params)
        // - stmt.step()/getAsObject() loops for all()/get()
        // - stmt.free() when done
        // For now, this is a stub so the TypeScript wiring compiles; you
        // should replace this with a real implementation once you decide on
        // the exact sql.js usage pattern.
        return {
            run: (..._params) => ({}),
            all: (..._params) => [],
            get: (..._params) => undefined,
            loadVecExtension: (_dimensions) => {
                // TODO: Load sqlite-vec WASM extension for sql.js here.
                // This will depend on which build of sqlite-vec WASM you use
                // (Alex Garcia's official builds, CR-SQLite integration, etc.).
                // For now, return false so the VectorStore stays in fallback mode.
                return false;
            },
        };
    }
}

/**
 * Factory to create a VectorStore backed by SqliteWasmEngine/sql.js.
 *
 * Example usage (browser):
 *
 *   const engine = new SqliteWasmEngine();
 *   await engine.init(wasmUrl, existingData);
 *   const vecStore = createWasmVectorStore(engine.getRawDb());
 *
 * @param {object} wasmDb - Raw database object to wrap in a SqlJsVectorAdapter.
 * @param {object} [config] - Overrides merged over DEFAULT_VECTOR_CONFIG.
 * @returns {MCardVectorStore} A store using the wrapped database.
 */
export function createWasmVectorStore(wasmDb, config) {
    const adapter = new SqlJsVectorAdapter(wasmDb);
    return new MCardVectorStore(adapter, config);
}
//# sourceMappingURL=VectorStore.js.map