UNPKG

mcard-js

Version:

MCard - Content-addressable storage with cryptographic hashing, handle resolution, and vector search for Node.js and browsers

527 lines 23.2 kB
/**
 * Handle-Aware Vector Store for Semantic Versioning
 *
 * Extends VectorStore to enable semantic similarity detection
 * across MCard versions linked to handles.
 *
 * Key Features:
 * - Link MCards to handles during indexing
 * - Compare versions within a handle by semantic similarity
 * - Filter searches by handle
 * - Sort version history by distance to current or query
 * - Automatic upgrade type classification based on semantic delta
 *
 * @module mcard-js/rag/HandleVectorStore
 */
import { MCardVectorStore, cosineSimilarity, deserializeVector } from '../storage/VectorStore';
import { initSemanticSchemas, HANDLE_VERSION_VECTORS_SCHEMA, VERSION_SIMILARITY_CACHE_SCHEMA } from '../storage/schema';

// ─────────────────────────────────────────────────────────────────────────────
// Internal helpers
// ─────────────────────────────────────────────────────────────────────────────

/** Columns read whenever a handle_version_vectors row is turned into a version object. */
const HANDLE_VERSION_COLUMNS = `hash, version_order, is_current, embedding_id, created_at,
                   parent_hash, semantic_delta_from_parent, upgrade_type`;

/**
 * Convert a handle_version_vectors row into the camelCase version object
 * returned by the public API. SQL NULLs become `undefined`.
 *
 * @param {object} row Row selected with HANDLE_VERSION_COLUMNS
 * @returns {object} Version object (hash, versionOrder, isCurrent, ...)
 */
function rowToHandleVersion(row) {
    return {
        hash: row.hash,
        versionOrder: row.version_order,
        isCurrent: Boolean(row.is_current),
        embeddingId: row.embedding_id ?? undefined,
        createdAt: row.created_at,
        parentHash: row.parent_hash ?? undefined,
        semanticDelta: row.semantic_delta_from_parent ?? undefined,
        upgradeType: row.upgrade_type ?? undefined,
    };
}

/**
 * Produce the text that gets embedded for a piece of MCard content.
 *
 * - strings are used as-is
 * - null/undefined become the empty string
 * - raw bytes (typed arrays, Buffer, DataView, ArrayBuffer) are decoded as
 *   UTF-8; JSON-stringifying them would index an index->byte map such as
 *   `{"0":72,"1":101}` instead of the text
 * - other objects are JSON-stringified
 * - remaining primitives go through String()
 *
 * @param {*} content MCard content in any supported representation
 * @returns {string} Text to index (possibly empty)
 */
function contentToIndexableText(content) {
    if (typeof content === 'string') {
        return content;
    }
    if (content === null || content === undefined) {
        return '';
    }
    if (ArrayBuffer.isView(content) || content instanceof ArrayBuffer) {
        return new TextDecoder().decode(content);
    }
    if (typeof content === 'object') {
        return JSON.stringify(content);
    }
    return String(content);
}

// ─────────────────────────────────────────────────────────────────────────────
// Upgrade Type Classification
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Classify upgrade type based on semantic similarity to parent.
 *
 * Thresholds (half-open intervals, so every value maps to exactly one type):
 * - [0.95, +inf): trivial  (formatting, typo fixes)
 * - [0.85, 0.95): minor    (small edits, clarifications)
 * - [0.70, 0.85): major    (significant content changes)
 * - below 0.70:   breaking (major semantic drift)
 *
 * A NaN input fails every comparison and is therefore reported as 'breaking'.
 *
 * @param {number} semanticDelta Cosine similarity to parent version
 * @returns {'trivial'|'minor'|'major'|'breaking'} Upgrade type string
 */
export function classifyUpgradeType(semanticDelta) {
    if (semanticDelta >= 0.95) {
        return 'trivial';
    }
    if (semanticDelta >= 0.85) {
        return 'minor';
    }
    if (semanticDelta >= 0.70) {
        return 'major';
    }
    return 'breaking';
}

/**
 * Calculate Euclidean distance between two vectors.
 *
 * @param {ArrayLike<number>} a First vector
 * @param {ArrayLike<number>} b Second vector, same length as `a`
 * @returns {number} Euclidean (L2) distance; 0 for two empty vectors
 * @throws {Error} If the vectors differ in length
 */
export function euclideanDistance(a, b) {
    if (a.length !== b.length) {
        throw new Error('Vectors must have same length');
    }
    let sum = 0;
    for (let i = 0; i < a.length; i++) {
        const diff = a[i] - b[i];
        sum += diff * diff;
    }
    return Math.sqrt(sum);
}

// ─────────────────────────────────────────────────────────────────────────────
// Handle-Aware Vector Store
// ─────────────────────────────────────────────────────────────────────────────

/**
 * Extended vector store with handle-aware version similarity.
 *
 * This class implements the Handle-Hash Duality pattern with semantic
 * awareness, enabling:
 *
 * - Version tracking via handles
 * - Semantic similarity detection across versions
 * - Intelligent upgrade classification
 * - Cross-version semantic search
 *
 * Architecture:
 * - Handles remain stable identifiers (Proxy Pattern)
 * - Hashes reference immutable content (Content-Addressing)
 * - Embeddings enable semantic understanding (Vector Search)
 *
 * @example
 * ```typescript
 * const store = new HandleVectorStore(':memory:');
 *
 * // Index with handle association
 * await store.indexWithHandle(mcard, 'my_document');
 *
 * // Get versions sorted by similarity to current
 * const versions = await store.getVersionsBySimilarity('my_document');
 *
 * // Find versions similar to a query
 * const results = await store.searchHandleVersions('my_document', 'machine learning');
 * ```
 */
export class HandleVectorStore extends MCardVectorStore {
    /**
     * @param {string} dbPath Database location passed to the base store (default ':memory:')
     * @param {object} config Base-store configuration, passed through unchanged
     */
    constructor(dbPath = ':memory:', config = {}) {
        super(dbPath, config);
        this.initHandleTables();
    }

    /**
     * Create handle-vector bridge tables for semantic versioning.
     *
     * Executes the two schema strings directly when they are defined, then
     * calls initSemanticSchemas() as a best-effort fallback.
     */
    initHandleTables() {
        // Create handle-version-vector bridge table
        if (HANDLE_VERSION_VECTORS_SCHEMA) {
            this.db.exec(HANDLE_VERSION_VECTORS_SCHEMA);
        }
        // Create similarity cache table
        if (VERSION_SIMILARITY_CACHE_SCHEMA) {
            this.db.exec(VERSION_SIMILARITY_CACHE_SCHEMA);
        }
        // Use the schema initialization function if tables weren't created
        try {
            initSemanticSchemas(this.db);
        }
        catch (e) {
            // Tables might already exist. Stay best-effort, but leave a trace so
            // a genuine schema problem is not completely invisible.
            console.debug(`initSemanticSchemas skipped: ${e}`);
        }
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Indexing with Handle Association
    // ─────────────────────────────────────────────────────────────────────────

    /**
     * Index an MCard and associate it with a handle.
     *
     * This method:
     * 1. Indexes the MCard content with embeddings (via base class)
     * 2. Links the MCard hash to the handle with version tracking
     * 3. Computes semantic delta from parent version (if exists)
     * 4. Classifies the upgrade type based on similarity
     *
     * Re-indexing the hash that is already the handle's current version keeps
     * that version's recorded parent/delta/upgrade type and does not renumber
     * the other versions.
     *
     * NOTE(review): the version shift and the insert run as separate
     * statements; consider wrapping them in a transaction if the db handle
     * supports it.
     *
     * @param mcard MCard to index (reads `hash` and `content`)
     * @param {string} handle Handle name to associate with
     * @param {boolean} isCurrent Whether this is the current version
     * @param {boolean} chunk Whether to chunk long content
     * @returns {Promise<number>} Number of vectors indexed (0 if content couldn't be indexed)
     */
    async indexWithHandle(mcard, handle, isCurrent = true, chunk = true) {
        // Extract content from MCard for indexing
        const content = contentToIndexableText(mcard.content);
        if (!content) {
            console.warn(`MCard ${mcard.hash.slice(0, 8)} has no content, skipping index`);
            return 0;
        }

        // Standard indexing (generates embeddings)
        const indexed = await this.index(mcard.hash, content, undefined, chunk);
        if (indexed === 0) {
            console.warn(`MCard ${mcard.hash.slice(0, 8)} could not be indexed, skipping handle link`);
            return 0;
        }

        const now = new Date().toISOString();

        // Get embedding ID for this hash (chunk 0)
        const metaRow = this.db.prepare(`
            SELECT id FROM mcard_vector_metadata
            WHERE hash = ? AND chunk_index = 0
            LIMIT 1
        `).get(mcard.hash);
        const embeddingId = metaRow?.id ?? null;

        // Lineage of the new record
        let parentHash = null;
        let semanticDelta = null;
        let upgradeType = null;

        if (isCurrent) {
            // Get the current version before updating
            const currentRow = this.db.prepare(`
                SELECT hash, embedding_id, parent_hash, semantic_delta_from_parent, upgrade_type
                FROM handle_version_vectors
                WHERE handle = ? AND is_current = TRUE
                LIMIT 1
            `).get(handle);

            if (currentRow && currentRow.hash === mcard.hash) {
                // Same content is already current: this is a re-index, not a new
                // version. Keep its lineage instead of making it its own parent,
                // and leave the other versions' order untouched.
                // NOTE(review): relies on (handle, hash) being unique so that the
                // INSERT OR REPLACE below overwrites the existing row - confirm
                // against the schema.
                parentHash = currentRow.parent_hash ?? null;
                semanticDelta = currentRow.semantic_delta_from_parent ?? null;
                upgradeType = currentRow.upgrade_type ?? null;
            }
            else {
                if (currentRow) {
                    parentHash = currentRow.hash;
                    // Calculate semantic delta if both have embeddings
                    if (embeddingId != null && currentRow.embedding_id != null) {
                        const newEmbedding = this.getEmbedding(mcard.hash);
                        const parentEmbedding = this.getEmbedding(parentHash);
                        if (newEmbedding && parentEmbedding) {
                            semanticDelta = cosineSimilarity(newEmbedding, parentEmbedding);
                            upgradeType = classifyUpgradeType(semanticDelta);
                            console.debug(`Semantic delta for ${handle}: ${semanticDelta.toFixed(4)} (${upgradeType})`);
                        }
                    }
                }
                // Shift existing versions down
                this.db.prepare(`
                    UPDATE handle_version_vectors
                    SET version_order = version_order + 1, is_current = FALSE
                    WHERE handle = ?
                `).run(handle);
            }
        }

        // Determine version order: current versions sit at 0, others are appended
        let versionOrder;
        if (isCurrent) {
            versionOrder = 0;
        }
        else {
            const maxRow = this.db.prepare(`
                SELECT COALESCE(MAX(version_order), -1) + 1 as next_order
                FROM handle_version_vectors
                WHERE handle = ?
            `).get(handle);
            versionOrder = maxRow.next_order;
        }

        // Insert version record
        this.db.prepare(`
            INSERT OR REPLACE INTO handle_version_vectors
            (handle, hash, parent_hash, version_order, is_current, embedding_id,
             semantic_delta_from_parent, upgrade_type, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
        `).run(handle, mcard.hash, parentHash, versionOrder, isCurrent ? 1 : 0, embeddingId, semanticDelta, upgradeType, now);

        console.log(`Indexed MCard ${mcard.hash.slice(0, 8)} for handle '${handle}' (v${versionOrder}, current=${isCurrent}, type=${upgradeType})`);
        return indexed;
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Version Retrieval
    // ─────────────────────────────────────────────────────────────────────────

    /**
     * Get all versions for a handle.
     *
     * @param {string} handle Handle name to query
     * @returns {object[]} List of HandleVersion objects, ordered by version_order (0 = current)
     */
    getHandleVersions(handle) {
        const rows = this.db.prepare(`
            SELECT ${HANDLE_VERSION_COLUMNS}
            FROM handle_version_vectors
            WHERE handle = ?
            ORDER BY version_order
        `).all(handle);
        return rows.map(rowToHandleVersion);
    }

    /**
     * Get all version hashes for a handle.
     *
     * @param {string} handle Handle name to query
     * @returns {string[]} List of hashes, ordered by version_order
     */
    getHandleVersionHashes(handle) {
        return this.getHandleVersions(handle).map(v => v.hash);
    }

    /**
     * Get the current version for a handle.
     *
     * @param {string} handle Handle name to query
     * @returns {object|undefined} HandleVersion for current version, or undefined if handle not found
     */
    getCurrentVersion(handle) {
        const row = this.db.prepare(`
            SELECT ${HANDLE_VERSION_COLUMNS}
            FROM handle_version_vectors
            WHERE handle = ? AND is_current = TRUE
            LIMIT 1
        `).get(handle);
        return row ? rowToHandleVersion(row) : undefined;
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Semantic Similarity Operations
    // ─────────────────────────────────────────────────────────────────────────

    /**
     * Get versions for a handle, sorted by similarity to a reference.
     *
     * Versions without a stored embedding are skipped. The result fields
     * `similarityToCurrent` / `distanceToCurrent` are measured against the
     * reference, which is the current version only when `referenceHash` is
     * omitted.
     *
     * @param {string} handle Handle name
     * @param {string} [referenceHash] Hash to compare against (default: current
     *        version, or the first listed version if none is flagged current)
     * @param {string} metric 'cosine' sorts by similarity, descending; any other
     *        value sorts by Euclidean distance, ascending
     * @returns {object[]} List of VersionSimilarityResult, sorted by similarity;
     *          empty if the handle has no versions or the reference has no embedding
     */
    getVersionsBySimilarity(handle, referenceHash, metric = 'cosine') {
        const versions = this.getHandleVersions(handle);
        if (versions.length === 0) {
            return [];
        }

        // Determine reference hash without reassigning the parameter
        const refHash = referenceHash
            ? referenceHash
            : (versions.find(v => v.isCurrent)?.hash ?? versions[0].hash);

        // Get reference embedding
        const refEmbedding = this.getEmbedding(refHash);
        if (!refEmbedding) {
            console.warn(`No embedding found for reference hash ${refHash.slice(0, 8)}`);
            return [];
        }

        // Calculate similarities
        const results = [];
        for (const v of versions) {
            let similarity;
            let distance;
            if (v.hash === refHash) {
                // Self-similarity
                similarity = 1.0;
                distance = 0.0;
            }
            else {
                const vEmbedding = this.getEmbedding(v.hash);
                if (!vEmbedding) {
                    console.debug(`Skipping version ${v.hash.slice(0, 8)} - no embedding`);
                    continue;
                }
                similarity = cosineSimilarity(refEmbedding, vEmbedding);
                distance = euclideanDistance(refEmbedding, vEmbedding);
            }
            results.push({
                hash: v.hash,
                versionOrder: v.versionOrder,
                similarityToCurrent: similarity,
                distanceToCurrent: distance,
                parentHash: v.parentHash,
                upgradeType: v.upgradeType,
            });
        }

        // Sort by similarity (descending) or distance (ascending)
        if (metric === 'cosine') {
            results.sort((a, b) => b.similarityToCurrent - a.similarityToCurrent);
        }
        else {
            results.sort((a, b) => a.distanceToCurrent - b.distanceToCurrent);
        }
        return results;
    }

    /**
     * Search within a handle's versions by semantic query.
     *
     * Useful for finding which version of a document best matches
     * a specific concept or topic.
     *
     * The base-class search is not filtered by handle, so 2 x k matches are
     * fetched from the whole store and filtered afterwards; versions ranked
     * below that window are not returned.
     *
     * @param {string} handle Handle name to filter by
     * @param {string} query Search query text
     * @param {number} [k] Number of results (default 10)
     * @returns {Promise<object[]>} List of VersionSimilarityResult with query similarity scores
     */
    async searchHandleVersions(handle, query, k) {
        // Default to 10 results if not specified
        const limit = k ?? 10;

        // One query serves both the membership filter and the enrichment below
        const versionsByHash = new Map(this.getHandleVersions(handle).map(v => [v.hash, v]));
        if (versionsByHash.size === 0) {
            return [];
        }

        // Search for similar content (no hash filter in base class)
        const allResults = await this.search(query, limit * 2);

        // Keep only this handle's versions and enrich them with version info
        return allResults
            .filter(r => versionsByHash.has(r.hash))
            .slice(0, limit)
            .map(r => {
                const v = versionsByHash.get(r.hash);
                return {
                    hash: r.hash,
                    versionOrder: v.versionOrder,
                    similarityToCurrent: 0.0, // Not computed in this method
                    distanceToCurrent: 0.0, // Not computed in this method
                    similarityToQuery: r.score,
                    parentHash: v.parentHash,
                    upgradeType: v.upgradeType,
                };
            });
    }

    /**
     * Compute pairwise cosine similarities between all versions of a handle.
     *
     * Despite the method name, the returned values are cosine similarities.
     * Optionally caches results (similarity and Euclidean distance) in the
     * version_similarity_cache table; cache failures are logged, not thrown.
     *
     * @param {string} handle Handle name
     * @param {boolean} cache Whether to cache computed similarities
     * @returns {Map<string, number>} Map keyed by `${hashA}:${hashB}` (both
     *          directions, plus each hash with itself) to cosine similarity
     */
    computeVersionDistances(handle, cache = true) {
        // Collect embeddings; versions without one are left out
        const embeddings = new Map();
        for (const v of this.getHandleVersions(handle)) {
            const emb = this.getEmbedding(v.hash);
            if (emb) {
                embeddings.set(v.hash, emb);
            }
        }

        const distances = new Map();
        const hashes = Array.from(embeddings.keys());
        const now = new Date().toISOString();
        // Prepared on first use and reused for every pair
        let cacheStmt = null;

        for (let i = 0; i < hashes.length; i++) {
            for (let j = i; j < hashes.length; j++) {
                const h1 = hashes[i];
                const h2 = hashes[j];
                let sim;
                if (h1 === h2) {
                    sim = 1.0;
                }
                else {
                    sim = cosineSimilarity(embeddings.get(h1), embeddings.get(h2));
                    // Cache the result
                    if (cache) {
                        const dist = euclideanDistance(embeddings.get(h1), embeddings.get(h2));
                        try {
                            if (cacheStmt === null) {
                                cacheStmt = this.db.prepare(`
                                    INSERT OR REPLACE INTO version_similarity_cache
                                    (handle, hash_a, hash_b, similarity_score, distance_euclidean, computed_at)
                                    VALUES (?, ?, ?, ?, ?, ?)
                                `);
                            }
                            cacheStmt.run(handle, h1, h2, sim, dist, now);
                        }
                        catch (e) {
                            console.warn(`Failed to cache similarity for ${h1.slice(0, 8)}<->${h2.slice(0, 8)}: ${e}`);
                        }
                    }
                }
                // Store both directions
                distances.set(`${h1}:${h2}`, sim);
                distances.set(`${h2}:${h1}`, sim);
            }
        }
        return distances;
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Embedding Retrieval (Internal)
    // ─────────────────────────────────────────────────────────────────────────

    /**
     * Get embedding vector for a hash.
     *
     * @param {string} hash MCard hash
     * @param {number} chunkIndex Chunk index (default 0 for first/whole chunk)
     * @returns Embedding vector as produced by deserializeVector, or undefined if not found
     */
    getEmbedding(hash, chunkIndex = 0) {
        // Get metadata ID
        const metaRow = this.db.prepare(`
            SELECT id FROM mcard_vector_metadata
            WHERE hash = ? AND chunk_index = ?
        `).get(hash, chunkIndex);
        if (!metaRow) {
            return undefined;
        }

        // Get embedding stored for that metadata row
        const embeddingRow = this.db.prepare(`
            SELECT embedding FROM mcard_embeddings WHERE metadata_id = ?
        `).get(metaRow.id);
        if (embeddingRow) {
            return deserializeVector(embeddingRow.embedding);
        }
        return undefined;
    }

    // ─────────────────────────────────────────────────────────────────────────
    // Utility Methods
    // ─────────────────────────────────────────────────────────────────────────

    /**
     * List all handles with indexed versions.
     *
     * @returns {string[]} List of unique handle names, sorted by name
     */
    listHandles() {
        const rows = this.db.prepare('SELECT DISTINCT handle FROM handle_version_vectors ORDER BY handle').all();
        return rows.map(row => row.handle);
    }

    /**
     * Count versions for a handle.
     *
     * @param {string} handle Handle name
     * @returns {number} Number of versions
     */
    countVersions(handle) {
        const row = this.db.prepare('SELECT COUNT(*) as count FROM handle_version_vectors WHERE handle = ?').get(handle);
        return row.count;
    }

    /**
     * Delete all version records for a handle.
     *
     * Note: This only removes the handle-version associations (and the
     * handle's cached similarities), not the underlying MCard embeddings.
     *
     * @param {string} handle Handle name
     * @returns {number} Number of version records deleted
     */
    deleteHandle(handle) {
        // Delete similarity cache
        this.db.prepare('DELETE FROM version_similarity_cache WHERE handle = ?').run(handle);
        // Delete version records
        const result = this.db.prepare('DELETE FROM handle_version_vectors WHERE handle = ?').run(handle);
        console.log(`Deleted ${result.changes} version records for handle '${handle}'`);
        return result.changes ?? 0;
    }

    /**
     * Get extended vector store information including handle stats.
     *
     * @returns {object} The base store's info object with `handleCount`,
     *          `versionCount` and `cachedSimilarities` added
     */
    getInfo() {
        const info = super.getInfo();
        // Add handle-specific stats
        const handleCount = this.db.prepare('SELECT COUNT(DISTINCT handle) as count FROM handle_version_vectors').get();
        info.handleCount = handleCount.count;
        const versionCount = this.db.prepare('SELECT COUNT(*) as count FROM handle_version_vectors').get();
        info.versionCount = versionCount.count;
        const cachedSimilarities = this.db.prepare('SELECT COUNT(*) as count FROM version_similarity_cache').get();
        info.cachedSimilarities = cachedSimilarities.count;
        return info;
    }
}
//# sourceMappingURL=HandleVectorStore.js.map