similarity-sensitive-hash
Version:
Lightning-fast cosine similarity search using locality-sensitive hashing. Compress high-dimensional vectors into tiny fingerprints for 10× faster similarity search with 99% less memory usage.
238 lines (208 loc) • 8.22 kB
JavaScript
/**
* @fileoverview Implementation of Count-Sketch Sign Random Projections (CS-SRP) and
* Higher-Order Count-Sketch SRP (HCS-SRP) for cosine similarity estimation.
* Based on the paper "Improving SRP with Count-Sketch and Higher-Order Count-Sketch".
* Provides functions to generate SRP signatures and estimate cosine similarity.
*/
/**
 * Creates a seeded SplitMix32 pseudo-random generator so that hash and sign
 * tables can be reproduced from the same seed.
 * @private
 * @param {number} a The seed value (coerced to a 32-bit integer on first use).
 * @returns {function(): number} A generator; each call yields the next value in [0, 1).
 */
function splitmix32(a) {
  let state = a;
  const next = () => {
    // Advance the 32-bit state by the golden-ratio increment.
    state = ((state | 0) + 0x9e3779b9) | 0;
    // Scramble the state: xor-shift, multiply, xor-shift, multiply, xor-shift.
    let mixed = state ^ (state >>> 16);
    mixed = Math.imul(mixed, 0x21f0aaad);
    mixed ^= mixed >>> 15;
    mixed = Math.imul(mixed, 0x735a2d97);
    mixed ^= mixed >>> 15;
    // Reinterpret as unsigned 32-bit and scale by 2^32 into [0, 1).
    return (mixed >>> 0) / 4294967296;
  };
  return next;
}
/**
 * Count-Sketch of a vector (Definition 4 from the paper).
 * Every coordinate j is assigned a pseudo-random bin h(j) in [0, m) and a
 * pseudo-random sign s(j) in {-1, 1}; the sketch is
 * CS(u)_l = Σ_{h(j)=l} s(j) * u_j.
 * @private
 * @param {number[]} vec The input vector.
 * @param {number} m The size of the count-sketch (number of bins).
 * @param {number} [seed=42] The seed for reproducible random number generation.
 * @returns {Float32Array} The count-sketch array of length m.
 */
function countSketch(vec, m, seed = 42) {
  const nextRandom = splitmix32(seed);
  const bins = new Float32Array(m);
  const dimension = vec.length;
  // Single pass: for each coordinate draw its bin first, then its sign, so the
  // generator is consumed in the order h(0), s(0), h(1), s(1), ...
  for (let coord = 0; coord < dimension; coord++) {
    const bin = Math.floor(nextRandom() * m);
    const sign = nextRandom() < 0.5 ? -1 : 1;
    bins[bin] += sign * vec[coord];
  }
  return bins;
}
/**
 * CS-SRP encoding.
 * Produces a binary signature from the signs of a count-sketch:
 * ξ(u) = (sgn(CS(u)_1), ..., sgn(CS(u)_m)),
 * where a bit is 1 when the bin is strictly positive and 0 otherwise.
 * @param {number[]} vec The input vector to encode (non-empty array).
 * @param {number} m The number of bits in the signature (positive integer).
 * @param {number} [seed=42] The seed for reproducible random number generation.
 * @returns {Uint8Array} The binary signature, one 0/1 entry per bit.
 * @throws {Error} If vec is not a non-empty array, m is not a positive
 *   integer, or seed is not an integer.
 */
export function encodeCSSRP(vec, m, seed = 42) {
  const hasElements = Array.isArray(vec) && vec.length > 0;
  if (!hasElements) {
    throw new Error("Input vector must be a non-empty array.");
  }
  // Number.isInteger is false for every non-number, so it also covers the type check.
  const isPositiveInteger = Number.isInteger(m) && m > 0;
  if (!isPositiveInteger) {
    throw new Error("Signature size must be a positive integer.");
  }
  if (!Number.isInteger(seed)) {
    throw new Error("Seed must be an integer.");
  }

  const bins = countSketch(vec, m, seed);
  const signature = new Uint8Array(bins.length);
  for (let bit = 0; bit < bins.length; bit++) {
    if (bins[bit] > 0) {
      signature[bit] = 1;
    }
  }
  return signature;
}
/**
 * Higher-Order Count-Sketch.
 * Treats the input as a d1 x d2 grid (flattened index j = i2 * d1 + i1) and
 * hashes each axis independently:
 * HCS(u)_{l1,l2} = Σ_{h1(i1)=l1, h2(i2)=l2} s1(i1) * s2(i2) * u_j.
 * The m1 x m2 result is stored row-major (index l1 * m2 + l2).
 * @private
 * @param {number[]} vec The input vector.
 * @param {number} m1 The first dimension size of the 2D sketch.
 * @param {number} m2 The second dimension size of the 2D sketch.
 * @param {number} [seed=42] The seed for reproducible random number generation.
 * @returns {Float32Array} The flattened higher-order count-sketch of length m1 * m2.
 */
function higherOrderCountSketch(vec, m1, m2, seed = 42) {
  const d = vec.length;
  const d2 = Math.floor(Math.sqrt(d));
  const d1 = Math.ceil(d / d2);
  const nextRandom = splitmix32(seed);

  // Draws, per index, a bucket in [0, buckets) followed by a sign in {-1, 1}.
  const drawTables = (count, buckets) => {
    const bucketOf = new Int32Array(count);
    const signOf = new Int32Array(count);
    for (let i = 0; i < count; i++) {
      bucketOf[i] = Math.floor(nextRandom() * buckets);
      signOf[i] = nextRandom() < 0.5 ? -1 : 1;
    }
    return { bucketOf, signOf };
  };

  // The first-axis tables must be drawn before the second-axis tables.
  const axis1 = drawTables(d1, m1);
  const axis2 = drawTables(d2, m2);

  const sketch = new Float32Array(m1 * m2);
  for (let i1 = 0; i1 < d1; i1++) {
    const rowStart = axis1.bucketOf[i1] * m2;
    const rowSign = axis1.signOf[i1];
    for (let i2 = 0; i2 < d2; i2++) {
      const j = i2 * d1 + i1;
      if (j >= d) {
        // j grows with i2, so every later i2 is past the end of vec as well.
        break;
      }
      sketch[rowStart + axis2.bucketOf[i2]] += rowSign * axis2.signOf[i2] * vec[j];
    }
  }
  return sketch;
}
/**
 * HCS-SRP encoding.
 * Generates a binary signature using Higher-Order Count-Sketch Sign Random Projections:
 * ξ'(u) = (sgn(ũ_1), ..., sgn(ũ_m)),
 * where ũ = vec(HCS(u)) and a bit is 1 when ũ_l > 0, otherwise 0.
 * @param {number[]} vec The input vector to encode (non-empty array).
 * @param {number} m The number of bits in the signature (positive integer).
 * @param {number} [seed=42] The seed for reproducible random number generation.
 * @returns {Uint8Array} The binary signature with exactly m entries.
 * @throws {Error} If vec is not a non-empty array, m is not a positive
 *   integer, or seed is not an integer.
 */
export function encodeHCSSRP(vec, m, seed = 42) {
  if (!Array.isArray(vec) || vec.length === 0) {
    throw new Error("Input vector must be a non-empty array.");
  }
  if (typeof m !== 'number' || m <= 0 || !Number.isInteger(m)) {
    throw new Error("Signature size must be a positive integer.");
  }
  if (typeof seed !== 'number' || !Number.isInteger(seed)) {
    throw new Error("Seed must be an integer.");
  }

  // Factor the requested size into an m1 x m2 grid with m1 = floor(sqrt(m));
  // m2 is rounded up, so the grid holds at least m bins.
  const m1 = Math.floor(Math.sqrt(m));
  const m2 = Math.ceil(m / m1);
  const sketch = higherOrderCountSketch(vec, m1, m2, seed);

  // Threshold only the first m bins. subarray() is a view, so no full-size
  // intermediate signature is built and then copied.
  return Uint8Array.from(sketch.subarray(0, m), (v) => (v > 0 ? 1 : 0));
}
/**
 * Estimates cosine similarity between two SRP signatures.
 * With p the fraction of positions where the signatures agree,
 * Pr(collision) = 1 - θ/π, so θ = π(1 - p) and the estimate is cos(π(1 - p)).
 * Full agreement gives 1, full disagreement gives -1.
 * @param {Uint8Array} sigA The first signature.
 * @param {Uint8Array} sigB The second signature.
 * @returns {number} The estimated cosine similarity between -1 and 1.
 * @throws {Error} If a signature is missing, the lengths differ, or the
 *   signatures are empty.
 */
export function estimateCosineSimilarity(sigA, sigB) {
  if (!sigA || !sigB) {
    throw new Error("Both signatures must be provided.");
  }
  if (sigA.length !== sigB.length) {
    throw new Error('Signature length mismatch.');
  }
  const len = sigA.length;
  // Without this guard an empty signature would yield 0 / 0 = NaN.
  if (!(len > 0)) {
    throw new Error("Signatures must not be empty.");
  }

  let agree = 0;
  for (let i = 0; i < len; i++) {
    if (sigA[i] === sigB[i]) {
      agree++;
    }
  }
  const p = agree / len;
  // Pr(collision) = 1 - θ/π  =>  θ = π(1 - p)  =>  cos(θ) = cos(π(1 - p))
  return Math.cos(Math.PI * (1 - p));
}
/**
 * Calculates the true cosine similarity between two vectors.
 * This is the exact computation used for comparison with estimated similarity.
 * @param {number[]} vecA The first vector.
 * @param {number[]} vecB The second vector.
 * @returns {number} The exact cosine similarity between -1 and 1
 *   (0 when either vector has zero norm).
 * @throws {Error} If inputs are not arrays, have different lengths, or are empty.
 */
export function cosineSimilarity(vecA, vecB) {
  if (!Array.isArray(vecA) || !Array.isArray(vecB)) {
    throw new Error("Both inputs must be arrays.");
  }
  if (vecA.length !== vecB.length) {
    throw new Error("Vectors must have the same length.");
  }
  if (vecA.length === 0) {
    throw new Error("Vectors cannot be empty.");
  }

  let dotProduct = 0;
  let normA = 0;
  let normB = 0;
  const len = vecA.length;
  for (let i = 0; i < len; i++) {
    const a = vecA[i];
    const b = vecB[i];
    dotProduct += a * b;
    normA += a * a;
    normB += b * b;
  }
  if (normA === 0 || normB === 0) {
    return 0; // Handle zero vectors
  }

  // sqrt(normA * normB) is the most accurate form for ordinary magnitudes, but
  // the product can overflow to Infinity or underflow towards 0 even though
  // each norm is representable. In that case take the roots separately.
  const smallestNormal = 2.2250738585072014e-308;
  const normProduct = normA * normB;
  let denominator = Math.sqrt(normProduct);
  if (!Number.isFinite(denominator) || normProduct < smallestNormal) {
    denominator = Math.sqrt(normA) * Math.sqrt(normB);
  }

  // Rounding can push the ratio marginally outside [-1, 1]; clamp so callers
  // can safely pass the result to Math.acos. NaN passes through unchanged.
  return Math.max(-1, Math.min(1, dotProduct / denominator));
}
// Convenience named aliases (not a default export): encodeSRP is HCS-SRP, estimateSimilarity is estimateCosineSimilarity.
export { encodeHCSSRP as encodeSRP, estimateCosineSimilarity as estimateSimilarity };