/*
 * semantic-ds-toolkit
 * Version: (not captured in this listing)
 * Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference
 * 496 lines • 20.1 kB
 * JavaScript
 */
"use strict";
// Mark this CommonJS module as transpiled from ES-module syntax.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare every named export; the real values are assigned further down the file.
exports.globalAnchorHNSW = exports.globalHNSW = exports.ColumnAnchorHNSW = exports.HierarchicalNavigableSmallWorld = void 0;
// Provides `globalProfiler`, whose startOperation/endOperation calls wrap the index operations below.
const performance_profiler_1 = require("./performance-profiler");
/**
 * In-memory Hierarchical Navigable Small World (HNSW) index for approximate
 * nearest-neighbour search.
 *
 * Every node stores a Float32Array copy of its vector, the top level it lives
 * on, and one neighbour set per level (0..level). Searches descend from the
 * entry point (a node on the highest populated level) to level 0.
 *
 * Configuration keys (all optional):
 *  - maxConnections   neighbour cap per level (doubled on level 0)
 *  - levelMultiplier  scale of the exponential level distribution
 *  - efConstruction   candidate list size while inserting
 *  - efSearch         default candidate list size while searching
 *  - maxLevels        number of levels (level ids are 0..maxLevels-1)
 *  - distanceFunction (a, b) => number, smaller means closer
 */
class HierarchicalNavigableSmallWorld {
    config;
    nodes = new Map();
    entryPoint = null;
    levelCounts = [];

    /**
     * @param {object} [config] Partial configuration merged over the defaults.
     */
    constructor(config = {}) {
        this.config = {
            maxConnections: 16,
            levelMultiplier: 1 / Math.log(2.0),
            efConstruction: 200,
            efSearch: 50,
            maxLevels: 16,
            // Safe to pass unbound: the distance helpers never read `this`.
            distanceFunction: this.euclideanDistance,
            ...config
        };
    }

    /**
     * Euclidean (L2) distance between two equally sized vectors.
     * @param {ArrayLike<number>} a
     * @param {ArrayLike<number>} b
     * @returns {number}
     */
    euclideanDistance(a, b) {
        let sum = 0;
        for (let i = 0; i < a.length; i++) {
            const diff = a[i] - b[i];
            sum += diff * diff;
        }
        return Math.sqrt(sum);
    }

    /**
     * Cosine distance (1 - cosine similarity). Returns 1 when either vector
     * has zero length, because the angle is undefined there.
     * @param {ArrayLike<number>} a
     * @param {ArrayLike<number>} b
     * @returns {number}
     */
    cosineDistance(a, b) {
        let dotProduct = 0;
        let normA = 0;
        let normB = 0;
        for (let i = 0; i < a.length; i++) {
            dotProduct += a[i] * b[i];
            normA += a[i] * a[i];
            normB += b[i] * b[i];
        }
        normA = Math.sqrt(normA);
        normB = Math.sqrt(normB);
        if (normA === 0 || normB === 0) {
            return 1; // Maximum distance for zero vectors
        }
        return 1 - (dotProduct / (normA * normB));
    }

    /**
     * Draws a random top level from an exponential distribution scaled by
     * `config.levelMultiplier`, capped at `maxLevels - 1`. With the default
     * multiplier (1 / ln 2) each level is half as likely as the one below.
     * @returns {number} Integer level >= 0.
     */
    generateLevel() {
        const topLevel = Math.max(0, this.config.maxLevels - 1);
        // Math.random() may return exactly 0; keep the logarithm finite.
        const u = Math.random() || Number.MIN_VALUE;
        const level = Math.floor(-Math.log(u) * this.config.levelMultiplier);
        if (!Number.isFinite(level) || level <= 0) {
            return 0;
        }
        return Math.min(level, topLevel);
    }

    /**
     * Inserts a vector under a new id at a randomly drawn level.
     * @param {*} id Unique node id.
     * @param {ArrayLike<number>} vector Copied on insert.
     * @param {*} [metadata] Opaque payload returned with search results.
     * @throws {Error} If `id` is already present.
     */
    addNode(id, vector, metadata) {
        this.insertNodeAtLevel(id, vector, metadata, this.generateLevel());
    }

    /**
     * Internal: inserts a node at an explicit level and links it into the graph.
     * @param {*} id
     * @param {ArrayLike<number>} vector
     * @param {*} metadata
     * @param {number} level
     * @throws {Error} If `id` is already present.
     */
    insertNodeAtLevel(id, vector, metadata, level) {
        if (this.nodes.has(id)) {
            throw new Error(`Node ${id} already exists`);
        }
        const node = {
            id,
            vector: new Float32Array(vector), // Copy to prevent external modifications
            level,
            connections: new Map(),
            metadata
        };
        for (let l = 0; l <= level; l++) {
            node.connections.set(l, new Set());
        }
        const previousEntry = this.entryPoint;
        this.nodes.set(id, node);
        this.updateLevelCounts(level);
        if (previousEntry === null || !this.nodes.has(previousEntry)) {
            this.entryPoint = id;
            return;
        }
        // Link first, promote afterwards: the insertion search has to start
        // from the existing graph, not from the still-isolated new node.
        this.connectNode(node);
        if (level > this.nodes.get(previousEntry).level) {
            this.entryPoint = id;
        }
    }

    /**
     * Increments the node counter for `level`, growing the array as needed.
     * @param {number} level
     */
    updateLevelCounts(level) {
        while (this.levelCounts.length <= level) {
            this.levelCounts.push(0);
        }
        this.levelCounts[level]++;
    }

    /**
     * Neighbour cap for a level (level 0 allows twice as many links).
     * @param {number} level
     * @returns {number}
     */
    maxConnectionsAt(level) {
        return level === 0 ? this.config.maxConnections * 2 : this.config.maxConnections;
    }

    /**
     * Links `newNode` to its nearest neighbours on every level it shares with
     * the current entry point. Does nothing when there is no other node to
     * start from.
     * @param {object} newNode Node record already stored in `this.nodes`.
     */
    connectNode(newNode) {
        const entryId = this.entryPoint;
        if (this.nodes.size <= 1 || !this.nodes.has(entryId) || entryId === newNode.id) {
            return; // Nothing to connect to
        }
        const profilerKey = `hnsw_connect_${newNode.id}`;
        performance_profiler_1.globalProfiler.startOperation(profilerKey);
        try {
            const entryLevel = this.nodes.get(entryId).level;
            let currentClosest = entryId;
            // Descend greedily through the levels above the new node.
            for (let level = entryLevel; level > newNode.level; level--) {
                currentClosest = this.greedySearchLayer(newNode.vector, currentClosest, 1, level)[0].id;
            }
            // Search and link on each shared level, top-down.
            for (let level = Math.min(newNode.level, entryLevel); level >= 0; level--) {
                const candidates = this
                    .searchLayer(newNode.vector, [currentClosest], this.config.efConstruction, level)
                    .filter((candidate) => candidate.id !== newNode.id); // never self-link
                const connections = this.selectConnections(candidates, this.maxConnectionsAt(level));
                for (const candidate of connections) {
                    this.addConnection(newNode.id, candidate.id, level);
                    this.addConnection(candidate.id, newNode.id, level);
                    this.pruneConnections(candidate.id, level);
                }
                if (connections.length > 0) {
                    currentClosest = connections[0].id;
                }
            }
            performance_profiler_1.globalProfiler.endOperation(profilerKey, 1);
        }
        catch (error) {
            performance_profiler_1.globalProfiler.endOperation(profilerKey, 0);
            throw error;
        }
    }

    /**
     * Adds the directed edge nodeId1 -> nodeId2 on `level` when both nodes
     * exist on that level. Self-edges are ignored.
     */
    addConnection(nodeId1, nodeId2, level) {
        if (nodeId1 === nodeId2) {
            return;
        }
        const node1 = this.nodes.get(nodeId1);
        const node2 = this.nodes.get(nodeId2);
        if (node1 && node2 && node1.level >= level && node2.level >= level) {
            node1.connections.get(level)?.add(nodeId2);
        }
    }

    /**
     * Trims a node's neighbour set on `level` to the closest allowed number
     * and removes the reverse edge from every neighbour that was dropped.
     */
    pruneConnections(nodeId, level) {
        const node = this.nodes.get(nodeId);
        if (!node) {
            return;
        }
        const limit = this.maxConnectionsAt(level);
        const connections = node.connections.get(level);
        if (!connections || connections.size <= limit) {
            return;
        }
        const candidates = [];
        for (const id of connections) {
            const neighbor = this.nodes.get(id);
            if (neighbor) { // ids without a node record are simply dropped
                candidates.push({
                    id,
                    distance: this.config.distanceFunction(node.vector, neighbor.vector)
                });
            }
        }
        const kept = new Set(this.selectConnections(candidates, limit).map((c) => c.id));
        for (const neighborId of connections) {
            if (!kept.has(neighborId)) {
                this.nodes.get(neighborId)?.connections.get(level)?.delete(nodeId);
            }
        }
        connections.clear();
        for (const id of kept) {
            connections.add(id);
        }
    }

    /**
     * Returns the `maxConnections` closest candidates, nearest first.
     * The input array is left untouched.
     */
    selectConnections(candidates, maxConnections) {
        return [...candidates]
            .sort((a, b) => a.distance - b.distance)
            .slice(0, maxConnections);
    }

    /**
     * Approximate k-nearest-neighbour query.
     * @param {ArrayLike<number>} query
     * @param {number} [k=10] Number of results wanted.
     * @param {number} [ef] Candidate list size; defaults to `config.efSearch`
     *   and is never allowed below `k`.
     * @returns {Array<{id: *, distance: number, metadata: *}>} Nearest first.
     */
    search(query, k = 10, ef) {
        // Compare with null explicitly so falsy ids such as 0 or '' work.
        if (this.nodes.size === 0 || this.entryPoint === null || !this.nodes.has(this.entryPoint)) {
            return [];
        }
        if (!(k > 0)) {
            return [];
        }
        const requestedEf = ef > 0 ? ef : this.config.efSearch;
        const searchEf = Math.max(requestedEf, k);
        const profilerKey = `hnsw_search_k${k}`;
        performance_profiler_1.globalProfiler.startOperation(profilerKey, { k, ef: searchEf });
        try {
            let currentClosest = this.entryPoint;
            const entryLevel = this.nodes.get(this.entryPoint).level;
            // Greedy descent through the upper levels
            for (let level = entryLevel; level > 0; level--) {
                currentClosest = this.greedySearchLayer(query, currentClosest, 1, level)[0].id;
            }
            // Wider search on the base level
            const results = this
                .searchLayer(query, [currentClosest], searchEf, 0)
                .slice(0, k)
                .map(({ id, distance }) => ({
                    id,
                    distance,
                    metadata: this.nodes.get(id)?.metadata
                }));
            performance_profiler_1.globalProfiler.endOperation(profilerKey, k);
            return results;
        }
        catch (error) {
            performance_profiler_1.globalProfiler.endOperation(profilerKey, 0);
            throw error;
        }
    }

    /**
     * Best-first walk on one level that keeps only the `numClosest` best
     * nodes and stops once the frontier cannot improve on them.
     * @returns {Array<{id: *, distance: number}>} Nearest first.
     */
    greedySearchLayer(query, entryPoint, numClosest, level) {
        return this.searchLayer(query, [entryPoint], numClosest, level).slice(0, numClosest);
    }

    /**
     * Internal: inserts `item` into `list` (ascending by distance), after any
     * entries with the same distance.
     */
    insertByDistance(list, item) {
        let lo = 0;
        let hi = list.length;
        while (lo < hi) {
            const mid = (lo + hi) >>> 1;
            if (list[mid].distance <= item.distance) {
                lo = mid + 1;
            }
            else {
                hi = mid;
            }
        }
        list.splice(lo, 0, item);
    }

    /**
     * Best-first search on a single level.
     * @param {ArrayLike<number>} query
     * @param {Array<*>} entryPoints Ids to start from.
     * @param {number} ef Maximum number of results kept (treated as >= 1).
     * @param {number} level
     * @returns {Array<{id: *, distance: number}>} Up to `ef` nodes, nearest first.
     */
    searchLayer(query, entryPoints, ef, level) {
        const capacity = ef >= 1 ? Math.floor(ef) : 1;
        const visited = new Set();
        const best = []; // ascending, at most `capacity` entries
        const frontier = []; // ascending, nodes still to expand
        for (const entryPoint of entryPoints) {
            if (visited.has(entryPoint)) {
                continue;
            }
            visited.add(entryPoint);
            const distance = this.config.distanceFunction(query, this.nodes.get(entryPoint).vector);
            const result = { id: entryPoint, distance };
            this.insertByDistance(best, result);
            this.insertByDistance(frontier, result);
        }
        if (best.length > capacity) {
            best.length = capacity;
        }
        while (frontier.length > 0) {
            const current = frontier.shift();
            // The closest open node is already worse than our worst keeper: done.
            if (best.length >= capacity && current.distance > best[capacity - 1].distance) {
                break;
            }
            const connections = this.nodes.get(current.id)?.connections.get(level);
            if (!connections) {
                continue;
            }
            for (const neighborId of connections) {
                if (visited.has(neighborId)) {
                    continue;
                }
                visited.add(neighborId);
                const neighborNode = this.nodes.get(neighborId);
                if (!neighborNode) {
                    continue; // edge to an id with no node record
                }
                const neighborDistance = this.config.distanceFunction(query, neighborNode.vector);
                if (best.length < capacity || neighborDistance < best[capacity - 1].distance) {
                    const neighbor = { id: neighborId, distance: neighborDistance };
                    this.insertByDistance(best, neighbor);
                    this.insertByDistance(frontier, neighbor);
                    if (best.length > capacity) {
                        best.pop();
                    }
                }
            }
        }
        return best;
    }

    /**
     * Inserts many nodes, highest level first. All ids are checked before
     * anything is inserted, so a duplicate leaves the index unchanged.
     * @param {Array<{id: *, vector: ArrayLike<number>, metadata?: *}>} nodes
     * @throws {Error} If an id already exists or repeats within the batch.
     */
    addNodesBatch(nodes) {
        const profilerKey = `hnsw_batch_insert`;
        performance_profiler_1.globalProfiler.startOperation(profilerKey, { batchSize: nodes.length });
        try {
            const batchIds = new Set();
            for (const { id } of nodes) {
                if (this.nodes.has(id) || batchIds.has(id)) {
                    throw new Error(`Node ${id} already exists`);
                }
                batchIds.add(id);
            }
            // Draw each level once and insert with that same level, so the
            // ordering below actually reflects where the nodes end up.
            const leveled = nodes
                .map(({ id, vector, metadata }) => ({
                    id,
                    vector,
                    metadata,
                    level: this.generateLevel()
                }))
                .sort((a, b) => b.level - a.level);
            for (const entry of leveled) {
                this.insertNodeAtLevel(entry.id, entry.vector, entry.metadata, entry.level);
            }
            performance_profiler_1.globalProfiler.endOperation(profilerKey, nodes.length);
        }
        catch (error) {
            performance_profiler_1.globalProfiler.endOperation(profilerKey, 0);
            throw error;
        }
    }

    /**
     * Removes a node and the edges its neighbours hold to it. Neighbours are
     * not re-linked to each other.
     * @param {*} id
     * @returns {boolean} false when the id was not present.
     */
    removeNode(id) {
        const node = this.nodes.get(id);
        if (!node) {
            return false;
        }
        for (let level = 0; level <= node.level; level++) {
            const connections = node.connections.get(level);
            if (connections) {
                for (const connectedId of connections) {
                    this.nodes.get(connectedId)?.connections.get(level)?.delete(id);
                }
            }
        }
        this.levelCounts[node.level]--;
        this.nodes.delete(id);
        if (this.entryPoint === id) {
            this.findNewEntryPoint();
        }
        return true;
    }

    /**
     * Picks the first node with the highest level as the entry point
     * (null when the index is empty).
     */
    findNewEntryPoint() {
        let maxLevel = -1;
        let newEntryPoint = null;
        for (const [id, node] of this.nodes) {
            if (node.level > maxLevel) {
                maxLevel = node.level;
                newEntryPoint = id;
            }
        }
        this.entryPoint = newEntryPoint;
    }

    /**
     * Returns up to `maxResults` hits whose distance is <= `radius`. Built on
     * `search`, so only the nearest min(2 * maxResults, 2 * efSearch)
     * candidates are examined.
     */
    searchRange(query, radius, maxResults = 100) {
        if (this.nodes.size === 0) {
            return [];
        }
        const candidates = this.search(query, Math.min(maxResults * 2, this.config.efSearch * 2));
        return candidates
            .filter((result) => result.distance <= radius)
            .slice(0, maxResults);
    }

    /**
     * Summary statistics. `searchPerformance` is a placeholder: no search
     * history is recorded, so both of its fields are always 0.
     */
    getStats() {
        let totalConnections = 0;
        for (const node of this.nodes.values()) {
            for (const connections of node.connections.values()) {
                totalConnections += connections.size;
            }
        }
        return {
            nodeCount: this.nodes.size,
            levelDistribution: [...this.levelCounts],
            averageConnections: this.nodes.size > 0 ? totalConnections / this.nodes.size : 0,
            searchPerformance: {
                averageDistance: 0,
                averageComparisons: 0
            }
        };
    }

    /**
     * Rebuilds every edge by re-linking the nodes in descending level order.
     * If re-linking throws, the previous edges and entry point are restored.
     */
    optimize() {
        const profilerKey = 'hnsw_optimize';
        performance_profiler_1.globalProfiler.startOperation(profilerKey);
        const nodes = Array.from(this.nodes.values());
        const previousEntryPoint = this.entryPoint;
        const savedConnections = new Map();
        for (const node of nodes) {
            savedConnections.set(node.id, node.connections);
        }
        try {
            // Swap in empty maps instead of clearing, so the snapshot stays intact.
            for (const node of nodes) {
                const fresh = new Map();
                for (let level = 0; level <= node.level; level++) {
                    fresh.set(level, new Set());
                }
                node.connections = fresh;
            }
            nodes.sort((a, b) => b.level - a.level);
            if (nodes.length > 0) {
                // connectNode starts from the entry point, so it must be the
                // node that is deliberately left unconnected (index 0).
                this.entryPoint = nodes[0].id;
            }
            for (let i = 1; i < nodes.length; i++) {
                this.connectNode(nodes[i]);
            }
            performance_profiler_1.globalProfiler.endOperation(profilerKey, nodes.length);
        }
        catch (error) {
            for (const node of nodes) {
                node.connections = savedConnections.get(node.id);
            }
            this.entryPoint = previousEntryPoint;
            performance_profiler_1.globalProfiler.endOperation(profilerKey, 0);
            throw error;
        }
    }

    /**
     * Serializes the index to a JSON string. Function-valued config entries
     * (the distance function) are dropped by JSON.stringify.
     * @returns {string}
     */
    serialize() {
        const serializable = {
            config: this.config,
            entryPoint: this.entryPoint,
            levelCounts: this.levelCounts,
            nodes: Array.from(this.nodes.entries()).map(([id, node]) => ({
                id,
                vector: Array.from(node.vector),
                level: node.level,
                connections: Object.fromEntries(Array.from(node.connections.entries()).map(([level, connections]) => [
                    level,
                    Array.from(connections)
                ])),
                metadata: node.metadata
            }))
        };
        return JSON.stringify(serializable);
    }

    /**
     * Replaces the index contents with a payload produced by `serialize`.
     * The payload is rebuilt completely before any state is replaced, so a
     * malformed payload leaves the index as it was. The current distance
     * function is kept.
     * @param {string} serialized
     * @throws {SyntaxError} If the string is not valid JSON.
     * @throws {TypeError} If the payload has no `nodes` array.
     */
    deserialize(serialized) {
        const data = JSON.parse(serialized);
        if (data === null || typeof data !== 'object' || !Array.isArray(data.nodes)) {
            throw new TypeError('Serialized HNSW index must contain a "nodes" array');
        }
        const restored = new Map();
        for (const nodeData of data.nodes) {
            const connections = new Map();
            for (const [level, ids] of Object.entries(nodeData.connections ?? {})) {
                connections.set(Number.parseInt(level, 10), new Set(ids));
            }
            restored.set(nodeData.id, {
                id: nodeData.id,
                vector: new Float32Array(nodeData.vector),
                level: nodeData.level,
                connections,
                metadata: nodeData.metadata
            });
        }
        // A JSON payload cannot carry a function; never let it replace ours.
        const { distanceFunction: ignoredDistanceFunction, ...persistedConfig } = data.config ?? {};
        this.config = { ...this.config, ...persistedConfig };
        this.levelCounts = Array.isArray(data.levelCounts) ? [...data.levelCounts] : [];
        this.nodes = restored;
        this.entryPoint = data.entryPoint ?? null;
        if (this.entryPoint === null || !this.nodes.has(this.entryPoint)) {
            this.findNewEntryPoint();
        }
    }

    /** Removes every node and resets the entry point and level counters. */
    clear() {
        this.nodes.clear();
        this.entryPoint = null;
        this.levelCounts = [];
    }
}
// Publish the index class as a named CommonJS export.
exports.HierarchicalNavigableSmallWorld = HierarchicalNavigableSmallWorld;
// Specialized HNSW for column anchor similarity search
/**
 * Wraps a HierarchicalNavigableSmallWorld index configured with a weighted
 * distance for column feature vectors, and remembers each anchor's vector
 * and metadata by id.
 */
class ColumnAnchorHNSW {
    hnsw;
    anchorVectors = new Map();
    anchorMetadata = new Map();

    constructor() {
        this.hnsw = new HierarchicalNavigableSmallWorld({
            maxConnections: 32,
            efConstruction: 400,
            efSearch: 100,
            // Safe to pass unbound: the function never reads `this`.
            distanceFunction: this.columnSimilarityDistance
        });
    }

    /**
     * Weighted Euclidean distance. The first half of the vector is treated
     * as statistical features (weight 0.3) and the second half as semantic
     * features (weight 0.7). For odd lengths the middle element counts as
     * statistical.
     * @param {ArrayLike<number>} a
     * @param {ArrayLike<number>} b
     * @returns {number}
     */
    columnSimilarityDistance(a, b) {
        // Must be an integer: a fractional split would index a[2.5] and yield NaN.
        const split = Math.ceil(a.length / 2);
        let statisticalDiff = 0;
        let semanticDiff = 0;
        for (let i = 0; i < split; i++) {
            const diff = a[i] - b[i];
            statisticalDiff += diff * diff;
        }
        for (let i = split; i < a.length; i++) {
            const diff = a[i] - b[i];
            semanticDiff += diff * diff;
        }
        return Math.sqrt(0.3 * statisticalDiff + 0.7 * semanticDiff);
    }

    /**
     * Registers an anchor's feature vector.
     * @param {*} anchorId
     * @param {ArrayLike<number>} features
     * @param {*} [metadata]
     * @throws Whatever the underlying index throws (e.g. for a duplicate id);
     *   in that case the locally stored vector and metadata are untouched.
     */
    addAnchor(anchorId, features, metadata) {
        // Insert first so a rejected id cannot overwrite an existing anchor's record.
        this.hnsw.addNode(anchorId, features, metadata);
        this.anchorVectors.set(anchorId, new Float32Array(features));
        this.anchorMetadata.set(anchorId, metadata);
    }

    /**
     * Finds anchors similar to the query features.
     * @param {ArrayLike<number>} queryFeatures
     * @param {number} [k=10] Maximum number of results.
     * @param {number} [threshold=0.8] Minimum similarity, where
     *   similarity = 1 / (1 + distance).
     * @returns {Array<{anchorId: *, similarity: number, metadata: *}>}
     */
    findSimilarAnchors(queryFeatures, k = 10, threshold = 0.8) {
        const results = this.hnsw.search(queryFeatures, k * 2); // Search more to filter
        return results
            .map((result) => ({
                anchorId: result.id,
                similarity: 1 / (1 + result.distance), // Convert distance to similarity
                // Use the locally stored metadata when the search result has none.
                metadata: result.metadata ?? this.anchorMetadata.get(result.id)
            }))
            .filter((result) => result.similarity >= threshold)
            .slice(0, k);
    }

    /** Rebuilds the underlying index. */
    optimize() {
        this.hnsw.optimize();
    }

    /** Returns the underlying index statistics. */
    getStats() {
        return this.hnsw.getStats();
    }
}
// Publish the column-anchor wrapper as a named CommonJS export.
exports.ColumnAnchorHNSW = ColumnAnchorHNSW;
// Global instances
// Created once when this module is first loaded; every importer gets the same objects.
exports.globalHNSW = new HierarchicalNavigableSmallWorld({
maxConnections: 16,
efConstruction: 200,
efSearch: 50
});
// Uses the settings hard-coded in the ColumnAnchorHNSW constructor.
exports.globalAnchorHNSW = new ColumnAnchorHNSW();
//# sourceMappingURL=hnsw-index.js.map