UNPKG

similarity-sensitive-hash

Version:

Lightning-fast cosine similarity search using locality-sensitive hashing. Compress high-dimensional vectors into tiny fingerprints for 10× faster similarity search with 99% less memory usage.

238 lines (208 loc) 8.22 kB
/**
 * @fileoverview Implementation of Count-Sketch Sign Random Projections (CS-SRP) and
 * Higher-Order Count-Sketch SRP (HCS-SRP) for cosine similarity estimation.
 * Based on the paper "Improving SRP with Count-Sketch and Higher-Order Count-Sketch".
 * Provides functions to generate SRP signatures and estimate cosine similarity.
 */

/**
 * Splitmix32 random number generator for reproducible random number generation.
 * @private
 * @param {number} a The seed value.
 * @returns {function(): number} A function that returns numbers in [0, 1).
 */
function splitmix32(a) {
  // Keep the generator state in a local instead of mutating the parameter.
  let state = a | 0;
  return function next() {
    state = (state + 0x9e3779b9) | 0;
    let t = Math.imul(state ^ (state >>> 16), 0x21f0aaad) | 0;
    t = t ^ (t >>> 15);
    t = Math.imul(t, 0x735a2d97) | 0;
    // Reinterpret as unsigned 32-bit and scale by 2^32.
    return ((t ^ (t >>> 15)) >>> 0) / 4294967296;
  };
}

/**
 * Validates the arguments shared by the public encoders.
 * @private
 * @param {number[]} vec The input vector.
 * @param {number} m The requested signature size.
 * @param {number} seed The RNG seed.
 * @throws {Error} If `vec` is not a non-empty array, `m` is not a positive
 *   integer, or `seed` is not an integer.
 */
function assertEncodeArguments(vec, m, seed) {
  if (!Array.isArray(vec) || vec.length === 0) {
    throw new Error("Input vector must be a non-empty array.");
  }
  if (typeof m !== 'number' || m <= 0 || !Number.isInteger(m)) {
    throw new Error("Signature size must be a positive integer.");
  }
  if (typeof seed !== 'number' || !Number.isInteger(seed)) {
    throw new Error("Seed must be an integer.");
  }
}

/**
 * Count-Sketch function (Definition 4 from paper).
 * Computes a count-sketch of a vector using hash functions and sign functions.
 * @private
 * @param {number[]} vec The input vector.
 * @param {number} m The size of the count-sketch (number of bins).
 * @param {number} [seed=42] The seed for reproducible random number generation.
 * @returns {Float32Array} The count-sketch array.
 */
function countSketch(vec, m, seed = 42) {
  const d = vec.length;
  const rng = splitmix32(seed);

  // Generate hash functions h: [d] -> [m] and sign functions s: [d] -> {-1, 1}.
  // The draw order (hash, then sign, per index) determines the signature for a
  // given seed, so it must not be reordered.
  const h = new Int32Array(d);
  const s = new Int32Array(d);
  for (let j = 0; j < d; j++) {
    h[j] = Math.floor(rng() * m); // h(j) ∈ [m]
    s[j] = rng() < 0.5 ? -1 : 1; // s(j) ∈ {-1, 1}
  }

  // Compute count-sketch: CS(u)_l = Σ_{h(j)=l} s(j) * u_j
  const sketch = new Float32Array(m);
  for (let j = 0; j < d; j++) {
    sketch[h[j]] += s[j] * vec[j];
  }
  return sketch;
}

/**
 * CS-SRP encoding
 * Generates a binary signature using Count-Sketch Sign Random Projections.
 * ξ(u) = (sgn(CS(u)_1), ..., sgn(CS(u)_m))
 * where sgn(CS(u)_l) = 1 if CS(u)_l > 0, otherwise 0
 * @param {number[]} vec The input vector to encode.
 * @param {number} m The number of bits in the signature.
 * @param {number} [seed=42] The seed for reproducible random number generation.
 * @returns {Uint8Array} The binary signature array.
 * @throws {Error} If `vec` is not a non-empty array, `m` is not a positive
 *   integer, or `seed` is not an integer.
 */
export function encodeCSSRP(vec, m, seed = 42) {
  assertEncodeArguments(vec, m, seed);
  const sketch = countSketch(vec, m, seed);
  return Uint8Array.from(sketch, (v) => (v > 0 ? 1 : 0));
}

/**
 * Higher-Order Count-Sketch function
 * Computes a higher-order count-sketch using 2D tensor operations.
 * @private
 * @param {number[]} vec The input vector.
 * @param {number} m1 The first dimension size of the 2D tensor.
 * @param {number} m2 The second dimension size of the 2D tensor.
 * @param {number} [seed=42] The seed for reproducible random number generation.
 * @returns {Float32Array} The higher-order count-sketch array (length m1 * m2).
 */
function higherOrderCountSketch(vec, m1, m2, seed = 42) {
  const d = vec.length;
  // View the length-d vector as a d1 x d2 grid; d1 * d2 >= d, so the last
  // column may be partially filled.
  const d2 = Math.floor(Math.sqrt(d));
  const d1 = Math.ceil(d / d2);
  const rng = splitmix32(seed);

  // Generate hash functions h1, h2: [d1] -> [m1], [d2] -> [m2]
  // and sign functions s1, s2: [d1] -> {-1, 1}, [d2] -> {-1, 1}.
  // The draw order determines the signature for a given seed.
  const h1 = new Int32Array(d1);
  const h2 = new Int32Array(d2);
  const s1 = new Int32Array(d1);
  const s2 = new Int32Array(d2);
  for (let i1 = 0; i1 < d1; i1++) {
    h1[i1] = Math.floor(rng() * m1);
    s1[i1] = rng() < 0.5 ? -1 : 1;
  }
  for (let i2 = 0; i2 < d2; i2++) {
    h2[i2] = Math.floor(rng() * m2);
    s2[i2] = rng() < 0.5 ? -1 : 1;
  }

  // Compute higher-order count-sketch
  // HCS(u)_{l1,l2} = Σ_{h1(i1)=l1, h2(i2)=l2} s1(i1) * s2(i2) * u_j
  // where j = i2 * d1 + i1 (flattening 2D to 1D)
  const sketch = new Float32Array(m1 * m2);
  for (let i1 = 0; i1 < d1; i1++) {
    for (let i2 = 0; i2 < d2; i2++) {
      const j = i2 * d1 + i1;
      if (j < d) { // Only process valid indices (grid may be larger than vec)
        const idx = h1[i1] * m2 + h2[i2];
        sketch[idx] += s1[i1] * s2[i2] * vec[j];
      }
    }
  }
  return sketch;
}

/**
 * HCS-SRP encoding
 * Generates a binary signature using Higher-Order Count-Sketch Sign Random Projections.
 * ξ'(u) = (sgn(ũ_1), ..., sgn(ũ_m))
 * where ũ = vec(HCS(u)) and sgn(ũ_l) = 1 if ũ_l > 0, otherwise 0
 * @param {number[]} vec The input vector to encode.
 * @param {number} m The number of bits in the signature.
 * @param {number} [seed=42] The seed for reproducible random number generation.
 * @returns {Uint8Array} The binary signature array (exactly `m` entries).
 * @throws {Error} If `vec` is not a non-empty array, `m` is not a positive
 *   integer, or `seed` is not an integer.
 */
export function encodeHCSSRP(vec, m, seed = 42) {
  assertEncodeArguments(vec, m, seed);

  // Pick near-square tensor dimensions with m1 * m2 >= m.
  const m1 = Math.floor(Math.sqrt(m));
  const m2 = Math.ceil(m / m1);

  const sketch = higherOrderCountSketch(vec, m1, m2, seed);
  const bits = Uint8Array.from(sketch, (v) => (v > 0 ? 1 : 0));

  // m1 * m2 may exceed m; return exactly m bits.
  return bits.slice(0, m);
}

/**
 * Estimates cosine similarity between two SRP signatures.
 * Pr(collision) = 1 - θ/π
 * So θ = π(1-p), and cos(θ) = cos(π(1-p))
 * @param {Uint8Array} sigA The first signature.
 * @param {Uint8Array} sigB The second signature.
 * @returns {number} The estimated cosine similarity between -1 and 1
 *   (-1 when no bits agree, 1 when all bits agree).
 * @throws {Error} If a signature is missing, the signatures have different
 *   lengths, or the signatures are empty.
 */
export function estimateCosineSimilarity(sigA, sigB) {
  if (!sigA || !sigB) {
    throw new Error("Both signatures must be provided.");
  }
  if (sigA.length !== sigB.length) {
    throw new Error('Signature length mismatch.');
  }
  const len = sigA.length;
  if (len === 0) {
    // Without this guard the agreement ratio is 0/0 and NaN leaks to the caller.
    throw new Error("Signatures cannot be empty.");
  }

  let agree = 0;
  for (let i = 0; i < len; i++) {
    if (sigA[i] === sigB[i]) agree++;
  }
  const p = agree / len;

  // According to Theorem 29/35: Pr(collision) = 1 - θ/π
  // So θ = π(1-p), and cos(θ) = cos(π(1-p))
  return Math.cos(Math.PI * (1 - p));
}

/**
 * Calculates the true cosine similarity between two vectors.
 * This is the exact computation used for comparison with estimated similarity.
 * @param {number[]} vecA The first vector.
 * @param {number[]} vecB The second vector.
 * @returns {number} The exact cosine similarity between -1 and 1
 *   (0 if either vector has zero norm).
 * @throws {Error} If inputs are not arrays, have different lengths, or are empty.
 */
export function cosineSimilarity(vecA, vecB) {
  if (!Array.isArray(vecA) || !Array.isArray(vecB)) {
    throw new Error("Both inputs must be arrays.");
  }
  if (vecA.length !== vecB.length) {
    throw new Error("Vectors must have the same length.");
  }
  if (vecA.length === 0) {
    throw new Error("Vectors cannot be empty.");
  }

  let dotProduct = 0;
  let normA = 0;
  let normB = 0;
  const len = vecA.length;
  for (let i = 0; i < len; i++) {
    dotProduct += vecA[i] * vecB[i];
    normA += vecA[i] * vecA[i];
    normB += vecB[i] * vecB[i];
  }

  if (normA === 0 || normB === 0) {
    return 0; // Handle zero vectors
  }

  // normA * normB can overflow to Infinity or underflow to 0 even though each
  // squared norm is representable. Use the single-sqrt form whenever the
  // product is usable (keeps ordinary results unchanged) and fall back to the
  // product of the individual square roots otherwise.
  const normProduct = normA * normB;
  const denominator = Number.isFinite(normProduct) && normProduct > 0
    ? Math.sqrt(normProduct)
    : Math.sqrt(normA) * Math.sqrt(normB);
  return dotProduct / denominator;
}

// Convenience named aliases - HCS-SRP is the default encoder.
export { encodeHCSSRP as encodeSRP, estimateCosineSimilarity as estimateSimilarity };