UNPKG

semantic-ds-toolkit

Version:

Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference

415 lines 17.5 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.globalFingerprintCompressor = exports.globalProductQuantizer = exports.ColumnFingerprintCompressor = exports.ProductQuantizer = void 0; const performance_profiler_1 = require("./performance-profiler"); class ProductQuantizer { config; codebooks = []; isTrained = false; constructor(config = {}) { this.config = { subspaceCount: 8, bitsPerCode: 8, // 256 centroids per subspace maxIterations: 100, convergenceThreshold: 1e-4, initializationMethod: 'kmeans++', ...config }; } async train(vectors) { if (vectors.length === 0) { throw new Error('Cannot train on empty vector set'); } const profilerKey = 'pq_training'; performance_profiler_1.globalProfiler.startOperation(profilerKey, { vectorCount: vectors.length, vectorDimension: vectors[0].length, subspaceCount: this.config.subspaceCount }); try { const dimension = vectors[0].length; const subspaceDimension = Math.floor(dimension / this.config.subspaceCount); const numCentroids = Math.pow(2, this.config.bitsPerCode); // Split vectors into subspaces const subspaces = this.splitVectorsIntoSubspaces(vectors, subspaceDimension); // Train codebook for each subspace this.codebooks = await Promise.all(subspaces.map((subspace, index) => this.trainSubspaceCodebook(subspace, subspaceDimension, numCentroids, index))); this.isTrained = true; performance_profiler_1.globalProfiler.endOperation(profilerKey, vectors.length); } catch (error) { performance_profiler_1.globalProfiler.endOperation(profilerKey, 0); throw error; } } splitVectorsIntoSubspaces(vectors, subspaceDimension) { const subspaces = []; for (let s = 0; s < this.config.subspaceCount; s++) { const subspace = []; const startDim = s * subspaceDimension; const endDim = Math.min(startDim + subspaceDimension, vectors[0].length); for (const vector of vectors) { const subvector = new Float32Array(endDim - startDim); for (let d = startDim; d < endDim; d++) { subvector[d - startDim] = vector[d]; } subspace.push(subvector); } subspaces.push(subspace); } return subspaces; } async trainSubspaceCodebook(subspace, dimension, numCentroids, subspaceIndex) { // Initialize centroids using k-means++ let centroids = this.initializeCentroids(subspace, numCentroids, dimension); let assignments = new Uint8Array(subspace.length); let prevCost = Infinity; let iteration = 0; while (iteration < this.config.maxIterations) { // Assignment step - assign each vector to nearest centroid let totalCost = 0; for (let i = 0; i < subspace.length; i++) { let minDistance = Infinity; let bestCentroid = 0; for (let c = 0; c < numCentroids; c++) { const distance = this.euclideanDistance(subspace[i], centroids[c]); if (distance < minDistance) { minDistance = distance; bestCentroid = c; } } assignments[i] = bestCentroid; totalCost += minDistance; } // Check convergence const costImprovement = (prevCost - totalCost) / prevCost; if (costImprovement < this.config.convergenceThreshold) { break; } prevCost = totalCost; // Update step - recompute centroids const newCentroids = []; for (let c = 0; c < numCentroids; c++) { const assignedVectors = subspace.filter((_, i) => assignments[i] === c); if (assignedVectors.length > 0) { newCentroids.push(this.computeMean(assignedVectors)); } else { // Reinitialize empty centroid newCentroids.push(this.randomVector(dimension)); } } centroids = newCentroids; iteration++; } return { centroids, subspaceDimension: dimension, numCentroids, subspaceCount: this.config.subspaceCount }; } initializeCentroids(vectors, numCentroids, dimension) { if (this.config.initializationMethod === 'kmeans++') { return this.kMeansPlusPlusInit(vectors, numCentroids); } else { // Random initialization const centroids = []; for (let i = 0; i < numCentroids; i++) { centroids.push(this.randomVector(dimension)); } return centroids; } } kMeansPlusPlusInit(vectors, numCentroids) { const centroids = []; // Choose first centroid randomly const firstIndex = Math.floor(Math.random() * vectors.length); centroids.push(new Float32Array(vectors[firstIndex])); // Choose remaining centroids using k-means++ strategy for (let c = 1; c < numCentroids; c++) { const distances = new Float32Array(vectors.length); let totalDistance = 0; // Compute distance to nearest centroid for each vector for (let i = 0; i < vectors.length; i++) { let minDistance = Infinity; for (const centroid of centroids) { const distance = this.euclideanDistance(vectors[i], centroid); minDistance = Math.min(minDistance, distance); } distances[i] = minDistance * minDistance; // Square for weighting totalDistance += distances[i]; } // Choose next centroid with probability proportional to squared distance const threshold = Math.random() * totalDistance; let cumulativeDistance = 0; for (let i = 0; i < vectors.length; i++) { cumulativeDistance += distances[i]; if (cumulativeDistance >= threshold) { centroids.push(new Float32Array(vectors[i])); break; } } } return centroids; } computeMean(vectors) { const dimension = vectors[0].length; const mean = new Float32Array(dimension); for (const vector of vectors) { for (let d = 0; d < dimension; d++) { mean[d] += vector[d]; } } for (let d = 0; d < dimension; d++) { mean[d] /= vectors.length; } return mean; } randomVector(dimension) { const vector = new Float32Array(dimension); for (let d = 0; d < dimension; d++) { vector[d] = (Math.random() - 0.5) * 2; // Range [-1, 1] } return vector; } euclideanDistance(a, b) { let sum = 0; for (let i = 0; i < a.length; i++) { const diff = a[i] - b[i]; sum += diff * diff; } return Math.sqrt(sum); } compress(vector) { if (!this.isTrained) { throw new Error('ProductQuantizer must be trained before compression'); } const codes = new Uint8Array(this.config.subspaceCount); const subspaceDimension = Math.floor(vector.length / this.config.subspaceCount); for (let s = 0; s < this.config.subspaceCount; s++) { const startDim = s * subspaceDimension; const endDim = Math.min(startDim + subspaceDimension, vector.length); // Extract subvector const subvector = new Float32Array(endDim - startDim); for (let d = startDim; d < endDim; d++) { subvector[d - startDim] = vector[d]; } // Find nearest centroid in this subspace const codebook = this.codebooks[s]; let minDistance = Infinity; let bestCode = 0; for (let c = 0; c < codebook.centroids.length; c++) { const distance = this.euclideanDistance(subvector, codebook.centroids[c]); if (distance < minDistance) { minDistance = distance; bestCode = c; } } codes[s] = bestCode; } const originalSize = vector.length * 4; // 4 bytes per float32 const compressedSize = codes.length; // 1 byte per code const compressionRatio = originalSize / compressedSize; return { codes, metadata: { originalDimension: vector.length, subspaceCount: this.config.subspaceCount, compressionRatio } }; } decompress(compressed) { if (!this.isTrained) { throw new Error('ProductQuantizer must be trained before decompression'); } const { codes, metadata } = compressed; const reconstructed = new Float32Array(metadata.originalDimension); const subspaceDimension = Math.floor(metadata.originalDimension / metadata.subspaceCount); for (let s = 0; s < metadata.subspaceCount; s++) { const code = codes[s]; const codebook = this.codebooks[s]; const centroid = codebook.centroids[code]; const startDim = s * subspaceDimension; const endDim = Math.min(startDim + subspaceDimension, metadata.originalDimension); for (let d = 0; d < centroid.length && startDim + d < endDim; d++) { reconstructed[startDim + d] = centroid[d]; } } return reconstructed; } // Batch compression for better performance compressBatch(vectors) { const profilerKey = 'pq_batch_compression'; performance_profiler_1.globalProfiler.startOperation(profilerKey, { batchSize: vectors.length }); try { const compressed = vectors.map(vector => this.compress(vector)); performance_profiler_1.globalProfiler.endOperation(profilerKey, vectors.length); return compressed; } catch (error) { performance_profiler_1.globalProfiler.endOperation(profilerKey, 0); throw error; } } // Approximate distance computation using compressed vectors approximateDistance(compressed1, compressed2) { if (compressed1.codes.length !== compressed2.codes.length) { throw new Error('Compressed vectors must have same dimensionality'); } let totalDistance = 0; for (let s = 0; s < compressed1.codes.length; s++) { const code1 = compressed1.codes[s]; const code2 = compressed2.codes[s]; if (code1 === code2) { // Same centroid, distance is 0 continue; } const codebook = this.codebooks[s]; const centroid1 = codebook.centroids[code1]; const centroid2 = codebook.centroids[code2]; const distance = this.euclideanDistance(centroid1, centroid2); totalDistance += distance * distance; } return Math.sqrt(totalDistance); } // Pre-compute distance lookup tables for faster similarity search buildDistanceTables(query) { if (!this.isTrained) { throw new Error('ProductQuantizer must be trained before building distance tables'); } const distanceTables = []; const subspaceDimension = Math.floor(query.length / this.config.subspaceCount); for (let s = 0; s < this.config.subspaceCount; s++) { const startDim = s * subspaceDimension; const endDim = Math.min(startDim + subspaceDimension, query.length); // Extract query subvector const querySubvector = new Float32Array(endDim - startDim); for (let d = startDim; d < endDim; d++) { querySubvector[d - startDim] = query[d]; } // Compute distances to all centroids in this subspace const codebook = this.codebooks[s]; const distances = new Float32Array(codebook.centroids.length); for (let c = 0; c < codebook.centroids.length; c++) { distances[c] = this.euclideanDistance(querySubvector, codebook.centroids[c]); } distanceTables.push(distances); } return distanceTables; } // Fast approximate similarity search using precomputed distance tables searchApproximate(distanceTables, compressed, k = 10) { const distances = compressed.map((comp, index) => { let totalDistance = 0; for (let s = 0; s < comp.codes.length; s++) { const code = comp.codes[s]; totalDistance += distanceTables[s][code] * distanceTables[s][code]; } return { index, distance: Math.sqrt(totalDistance) }; }); return distances .sort((a, b) => a.distance - b.distance) .slice(0, k); } getCompressionStats() { const totalCentroids = this.codebooks.reduce((sum, codebook) => sum + codebook.numCentroids, 0); // Original: 4 bytes per dimension // Compressed: 1 byte per subspace const memoryReduction = this.config.subspaceCount > 0 ? (4 * this.config.subspaceCount) / this.config.subspaceCount : 1; return { subspaceCount: this.config.subspaceCount, bitsPerCode: this.config.bitsPerCode, totalCentroids, memoryReduction, isTrained: this.isTrained }; } // Serialize codebooks for persistence serialize() { if (!this.isTrained) { throw new Error('Cannot serialize untrained quantizer'); } const serializable = { config: this.config, codebooks: this.codebooks.map(codebook => ({ centroids: codebook.centroids.map(centroid => Array.from(centroid)), subspaceDimension: codebook.subspaceDimension, numCentroids: codebook.numCentroids, subspaceCount: codebook.subspaceCount })) }; return JSON.stringify(serializable); } // Deserialize codebooks from persistence deserialize(serialized) { const data = JSON.parse(serialized); this.config = data.config; this.codebooks = data.codebooks.map((codebook) => ({ centroids: codebook.centroids.map((centroid) => new Float32Array(centroid)), subspaceDimension: codebook.subspaceDimension, numCentroids: codebook.numCentroids, subspaceCount: codebook.subspaceCount })); this.isTrained = true; } } exports.ProductQuantizer = ProductQuantizer; // Optimized vector quantization for column fingerprints class ColumnFingerprintCompressor { pq; compressedFingerprints = new Map(); constructor(subspaceCount = 16) { this.pq = new ProductQuantizer({ subspaceCount, bitsPerCode: 8, maxIterations: 50, convergenceThreshold: 1e-3 }); } async trainOnFingerprints(fingerprints) { const vectors = Array.from(fingerprints.values()); await this.pq.train(vectors); // Pre-compress all training fingerprints for (const [id, vector] of fingerprints.entries()) { this.compressedFingerprints.set(id, this.pq.compress(vector)); } } compressFingerprint(id, fingerprint) { const compressed = this.pq.compress(fingerprint); this.compressedFingerprints.set(id, compressed); return compressed; } findSimilarFingerprints(queryFingerprint, threshold = 0.8, maxResults = 100) { const distanceTables = this.pq.buildDistanceTables(queryFingerprint); const compressedArray = Array.from(this.compressedFingerprints.entries()); const results = this.pq.searchApproximate(distanceTables, compressedArray.map(([_, compressed]) => compressed), maxResults); return results .map(({ index, distance }) => ({ id: compressedArray[index][0], similarity: 1 / (1 + distance) // Convert distance to similarity })) .filter(result => result.similarity >= threshold); } getCompressionRatio() { if (this.compressedFingerprints.size === 0) return 1; const sample = this.compressedFingerprints.values().next().value; return sample ? sample.metadata.compressionRatio : 1; } clear() { this.compressedFingerprints.clear(); } } exports.ColumnFingerprintCompressor = ColumnFingerprintCompressor; // Global instances for reuse exports.globalProductQuantizer = new ProductQuantizer({ subspaceCount: 8, bitsPerCode: 8, maxIterations: 100 }); exports.globalFingerprintCompressor = new ColumnFingerprintCompressor(16); //# sourceMappingURL=vector-compression.js.map