UNPKG

semantic-ds-toolkit

Version:

Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference

496 lines 20.1 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.globalAnchorHNSW = exports.globalHNSW = exports.ColumnAnchorHNSW = exports.HierarchicalNavigableSmallWorld = void 0; const performance_profiler_1 = require("./performance-profiler"); class HierarchicalNavigableSmallWorld { config; nodes = new Map(); entryPoint = null; levelCounts = []; constructor(config = {}) { this.config = { maxConnections: 16, levelMultiplier: 1 / Math.log(2.0), efConstruction: 200, efSearch: 50, maxLevels: 16, distanceFunction: this.euclideanDistance, ...config }; } euclideanDistance(a, b) { let sum = 0; for (let i = 0; i < a.length; i++) { const diff = a[i] - b[i]; sum += diff * diff; } return Math.sqrt(sum); } cosineDistance(a, b) { let dotProduct = 0; let normA = 0; let normB = 0; for (let i = 0; i < a.length; i++) { dotProduct += a[i] * b[i]; normA += a[i] * a[i]; normB += b[i] * b[i]; } normA = Math.sqrt(normA); normB = Math.sqrt(normB); if (normA === 0 || normB === 0) { return 1; // Maximum distance for zero vectors } return 1 - (dotProduct / (normA * normB)); } generateLevel() { // Generate random level using exponential decay let level = 0; while (Math.random() < 0.5 && level < this.config.maxLevels - 1) { level++; } return level; } addNode(id, vector, metadata) { if (this.nodes.has(id)) { throw new Error(`Node ${id} already exists`); } const level = this.generateLevel(); const node = { id, vector: new Float32Array(vector), // Copy to prevent external modifications level, connections: new Map(), metadata }; // Initialize connection sets for each level for (let l = 0; l <= level; l++) { node.connections.set(l, new Set()); } this.nodes.set(id, node); this.updateLevelCounts(level); if (this.entryPoint === null || level > this.nodes.get(this.entryPoint).level) { this.entryPoint = id; } // Connect the node to the graph this.connectNode(node); } updateLevelCounts(level) { while (this.levelCounts.length <= level) { this.levelCounts.push(0); } this.levelCounts[level]++; } connectNode(newNode) { if (this.nodes.size === 1) { return; // First node, no connections needed } const profilerKey = `hnsw_connect_${newNode.id}`; performance_profiler_1.globalProfiler.startOperation(profilerKey); try { // Find closest nodes at each level using greedy search let currentClosest = this.entryPoint; const entryLevel = this.nodes.get(this.entryPoint).level; // Search from top level down to the node's level + 1 for (let level = entryLevel; level > newNode.level; level--) { currentClosest = this.greedySearchLayer(newNode.vector, currentClosest, 1, level)[0].id; } // Search and connect at each level from node's level down to 0 for (let level = Math.min(newNode.level, entryLevel); level >= 0; level--) { const candidates = this.searchLayer(newNode.vector, [currentClosest], this.config.efConstruction, level); const connections = this.selectConnections(candidates, level === 0 ? this.config.maxConnections * 2 : this.config.maxConnections); // Bidirectional connections for (const candidate of connections) { this.addConnection(newNode.id, candidate.id, level); this.addConnection(candidate.id, newNode.id, level); // Prune connections if needed this.pruneConnections(candidate.id, level); } currentClosest = connections[0].id; } performance_profiler_1.globalProfiler.endOperation(profilerKey, 1); } catch (error) { performance_profiler_1.globalProfiler.endOperation(profilerKey, 0); throw error; } } addConnection(nodeId1, nodeId2, level) { const node1 = this.nodes.get(nodeId1); const node2 = this.nodes.get(nodeId2); if (node1 && node2 && node1.level >= level && node2.level >= level) { node1.connections.get(level)?.add(nodeId2); } } pruneConnections(nodeId, level) { const node = this.nodes.get(nodeId); if (!node) return; const maxConnections = level === 0 ? this.config.maxConnections * 2 : this.config.maxConnections; const connections = node.connections.get(level); if (connections && connections.size > maxConnections) { // Convert connections to candidates and select best ones const candidates = Array.from(connections).map(id => ({ id, distance: this.config.distanceFunction(node.vector, this.nodes.get(id).vector) })); const selected = this.selectConnections(candidates, maxConnections); const newConnections = new Set(selected.map(c => c.id)); // Remove bidirectional connections for pruned nodes for (const removedId of connections) { if (!newConnections.has(removedId)) { this.nodes.get(removedId)?.connections.get(level)?.delete(nodeId); } } connections.clear(); selected.forEach(c => connections.add(c.id)); } } selectConnections(candidates, maxConnections) { // Sort by distance and take best candidates const sorted = candidates .sort((a, b) => a.distance - b.distance) .slice(0, maxConnections); return sorted; } search(query, k = 10, ef) { if (this.nodes.size === 0 || !this.entryPoint) { return []; } const searchEf = ef || Math.max(this.config.efSearch, k); const profilerKey = `hnsw_search_k${k}`; performance_profiler_1.globalProfiler.startOperation(profilerKey, { k, ef: searchEf }); try { let currentClosest = this.entryPoint; const entryLevel = this.nodes.get(this.entryPoint).level; // Search from top level down to level 1 for (let level = entryLevel; level > 0; level--) { const results = this.greedySearchLayer(query, currentClosest, 1, level); currentClosest = results[0].id; } // Search at level 0 with larger candidate list const finalCandidates = this.searchLayer(query, [currentClosest], searchEf, 0); const results = finalCandidates.slice(0, k); performance_profiler_1.globalProfiler.endOperation(profilerKey, k); return results; } catch (error) { performance_profiler_1.globalProfiler.endOperation(profilerKey, 0); throw error; } } greedySearchLayer(query, entryPoint, numClosest, level) { const visited = new Set(); const candidates = []; const dynamic = []; // Initialize with entry point const entryDistance = this.config.distanceFunction(query, this.nodes.get(entryPoint).vector); candidates.push({ id: entryPoint, distance: entryDistance }); dynamic.push({ id: entryPoint, distance: entryDistance }); visited.add(entryPoint); while (dynamic.length > 0) { // Get closest unvisited candidate dynamic.sort((a, b) => a.distance - b.distance); const current = dynamic.shift(); // If current is farther than the furthest in candidates, stop if (candidates.length >= numClosest) { candidates.sort((a, b) => a.distance - b.distance); if (current.distance > candidates[candidates.length - 1].distance) { break; } } // Explore neighbors const connections = this.nodes.get(current.id)?.connections.get(level); if (connections) { for (const neighborId of connections) { if (!visited.has(neighborId)) { visited.add(neighborId); const neighborDistance = this.config.distanceFunction(query, this.nodes.get(neighborId).vector); const neighbor = { id: neighborId, distance: neighborDistance }; candidates.push(neighbor); dynamic.push(neighbor); } } } } // Return best candidates candidates.sort((a, b) => a.distance - b.distance); return candidates.slice(0, numClosest); } searchLayer(query, entryPoints, ef, level) { const visited = new Set(); const candidates = []; const dynamic = []; // Initialize with entry points for (const entryPoint of entryPoints) { if (!visited.has(entryPoint)) { const distance = this.config.distanceFunction(query, this.nodes.get(entryPoint).vector); const result = { id: entryPoint, distance }; candidates.push(result); dynamic.push(result); visited.add(entryPoint); } } while (dynamic.length > 0) { // Get closest candidate dynamic.sort((a, b) => a.distance - b.distance); const current = dynamic.shift(); // If we have ef candidates and current is farther than the furthest, stop if (candidates.length >= ef) { candidates.sort((a, b) => a.distance - b.distance); if (current.distance > candidates[ef - 1].distance) { break; } } // Explore neighbors const connections = this.nodes.get(current.id)?.connections.get(level); if (connections) { for (const neighborId of connections) { if (!visited.has(neighborId)) { visited.add(neighborId); const neighborDistance = this.config.distanceFunction(query, this.nodes.get(neighborId).vector); const neighbor = { id: neighborId, distance: neighborDistance }; // Add to candidates if we need more or it's better than the worst if (candidates.length < ef || neighborDistance < candidates[ef - 1].distance) { candidates.push(neighbor); dynamic.push(neighbor); // Keep candidates sorted and trim to ef if (candidates.length > ef) { candidates.sort((a, b) => a.distance - b.distance); candidates.splice(ef); } } } } } } candidates.sort((a, b) => a.distance - b.distance); return candidates; } // Batch insertion for better performance addNodesBatch(nodes) { const profilerKey = `hnsw_batch_insert`; performance_profiler_1.globalProfiler.startOperation(profilerKey, { batchSize: nodes.length }); try { // Sort by level to insert higher-level nodes first const sortedNodes = nodes.map(({ id, vector, metadata }) => ({ id, vector, metadata, level: this.generateLevel() })).sort((a, b) => b.level - a.level); for (const nodeData of sortedNodes) { this.addNode(nodeData.id, nodeData.vector, nodeData.metadata); } performance_profiler_1.globalProfiler.endOperation(profilerKey, nodes.length); } catch (error) { performance_profiler_1.globalProfiler.endOperation(profilerKey, 0); throw error; } } // Remove node from index removeNode(id) { const node = this.nodes.get(id); if (!node) { return false; } // Remove all connections to this node for (let level = 0; level <= node.level; level++) { const connections = node.connections.get(level); if (connections) { for (const connectedId of connections) { this.nodes.get(connectedId)?.connections.get(level)?.delete(id); } } } // Update level counts this.levelCounts[node.level]--; this.nodes.delete(id); // Update entry point if needed if (this.entryPoint === id) { this.findNewEntryPoint(); } return true; } findNewEntryPoint() { let maxLevel = -1; let newEntryPoint = null; for (const [id, node] of this.nodes) { if (node.level > maxLevel) { maxLevel = node.level; newEntryPoint = id; } } this.entryPoint = newEntryPoint; } // Range search searchRange(query, radius, maxResults = 100) { if (this.nodes.size === 0) { return []; } // Start with regular search to find candidates const candidates = this.search(query, Math.min(maxResults * 2, this.config.efSearch * 2)); // Filter by radius return candidates .filter(result => result.distance <= radius) .slice(0, maxResults); } getStats() { let totalConnections = 0; let totalComparisons = 0; for (const node of this.nodes.values()) { for (const connections of node.connections.values()) { totalConnections += connections.size; } } return { nodeCount: this.nodes.size, levelDistribution: [...this.levelCounts], averageConnections: this.nodes.size > 0 ? totalConnections / this.nodes.size : 0, searchPerformance: { averageDistance: 0, // Would be calculated from search history averageComparisons: totalComparisons / Math.max(1, this.nodes.size) } }; } // Optimize index after bulk operations optimize() { const profilerKey = 'hnsw_optimize'; performance_profiler_1.globalProfiler.startOperation(profilerKey); try { // Rebuild connections for better connectivity const nodes = Array.from(this.nodes.values()); const nodeConnections = new Map(); // Save current connections for (const node of nodes) { nodeConnections.set(node.id, new Map(node.connections)); } // Clear all connections for (const node of nodes) { node.connections.clear(); for (let level = 0; level <= node.level; level++) { node.connections.set(level, new Set()); } } // Reconnect nodes in level order (highest first) nodes.sort((a, b) => b.level - a.level); for (let i = 1; i < nodes.length; i++) { this.connectNode(nodes[i]); } performance_profiler_1.globalProfiler.endOperation(profilerKey, nodes.length); } catch (error) { performance_profiler_1.globalProfiler.endOperation(profilerKey, 0); throw error; } } // Serialize index for persistence serialize() { const serializable = { config: this.config, entryPoint: this.entryPoint, levelCounts: this.levelCounts, nodes: Array.from(this.nodes.entries()).map(([id, node]) => ({ id, vector: Array.from(node.vector), level: node.level, connections: Object.fromEntries(Array.from(node.connections.entries()).map(([level, connections]) => [ level, Array.from(connections) ])), metadata: node.metadata })) }; return JSON.stringify(serializable); } // Deserialize index from persistence deserialize(serialized) { const data = JSON.parse(serialized); this.config = { ...this.config, ...data.config }; this.entryPoint = data.entryPoint; this.levelCounts = data.levelCounts || []; this.nodes.clear(); for (const nodeData of data.nodes) { const node = { id: nodeData.id, vector: new Float32Array(nodeData.vector), level: nodeData.level, connections: new Map(), metadata: nodeData.metadata }; // Rebuild connections map for (const [level, connections] of Object.entries(nodeData.connections)) { node.connections.set(parseInt(level), new Set(connections)); } this.nodes.set(nodeData.id, node); } } clear() { this.nodes.clear(); this.entryPoint = null; this.levelCounts = []; } } exports.HierarchicalNavigableSmallWorld = HierarchicalNavigableSmallWorld; // Specialized HNSW for column anchor similarity search class ColumnAnchorHNSW { hnsw; anchorVectors = new Map(); constructor() { this.hnsw = new HierarchicalNavigableSmallWorld({ maxConnections: 32, efConstruction: 400, efSearch: 100, distanceFunction: this.columnSimilarityDistance }); } columnSimilarityDistance(a, b) { // Weighted distance function for column similarity // Emphasizes statistical and semantic features let statisticalDiff = 0; let semanticDiff = 0; const halfLen = a.length / 2; // First half: statistical features (nulls, cardinality, etc.) for (let i = 0; i < halfLen; i++) { const diff = a[i] - b[i]; statisticalDiff += diff * diff; } // Second half: semantic features (patterns, types, etc.) for (let i = halfLen; i < a.length; i++) { const diff = a[i] - b[i]; semanticDiff += diff * diff; } // Weight semantic features more heavily return Math.sqrt(0.3 * statisticalDiff + 0.7 * semanticDiff); } addAnchor(anchorId, features, metadata) { this.anchorVectors.set(anchorId, new Float32Array(features)); this.hnsw.addNode(anchorId, features, metadata); } findSimilarAnchors(queryFeatures, k = 10, threshold = 0.8) { const results = this.hnsw.search(queryFeatures, k * 2); // Search more to filter return results .map(result => ({ anchorId: result.id, similarity: 1 / (1 + result.distance), // Convert distance to similarity metadata: result.metadata })) .filter(result => result.similarity >= threshold) .slice(0, k); } optimize() { this.hnsw.optimize(); } getStats() { return this.hnsw.getStats(); } } exports.ColumnAnchorHNSW = ColumnAnchorHNSW; // Global instances exports.globalHNSW = new HierarchicalNavigableSmallWorld({ maxConnections: 16, efConstruction: 200, efSearch: 50 }); exports.globalAnchorHNSW = new ColumnAnchorHNSW(); //# sourceMappingURL=hnsw-index.js.map