/*
 * @raven-js/cortex
 * Version:
 * Zero-dependency machine learning, AI, and data processing library for modern JavaScript
 * 428 lines (359 loc) • 12.5 kB
 * JavaScript
 */
/**
* @author Anonyfox <max@anonyfox.com>
* @license MIT
* @see {@link https://github.com/Anonyfox/ravenjs}
* @see {@link https://ravenjs.dev}
* @see {@link https://anonyfox.com}
*/
/**
* @file Locality-Sensitive Hashing (LSH) for fast approximate similarity search.
*
* LSH enables sub-linear time approximate nearest neighbor search by grouping
* similar items into the same buckets with high probability. Works with MinHash
* signatures to provide efficient similarity search across large document collections.
*/
/**
* LSH bucket system for fast approximate similarity search.
* Groups similar MinHash signatures into buckets, enabling O(1) candidate
* retrieval for similarity queries instead of O(n) brute force comparison.
*/
export class LSHBuckets {
  /**
   * Creates a new LSH bucket system.
   *
   * A signature is split into `numBands` consecutive bands of
   * `floor(signatureLength / numBands)` rows each. When the length is not an
   * exact multiple of the band count, the trailing rows are not used for
   * bucketing (they still count in the similarity computed by `search`).
   *
   * @param {Object} [options] - Configuration options
   * @param {number} [options.numBands=16] - Number of bands to split signature into
   * @param {number} [options.signatureLength=128] - Expected length of MinHash signatures
   * @param {number} [options.threshold=0.5] - Default minimum similarity used by `search`
   * @throws {Error} If there would be fewer than one row per band, or if
   *   `numBands` / `signatureLength` are not positive integers
   */
  constructor(options = {}) {
    const { numBands = 16, signatureLength = 128, threshold = 0.5 } = options;

    const rowsPerBand = Math.floor(signatureLength / numBands);
    if (rowsPerBand < 1) {
      throw new Error(
        "Number of bands too large for signature length. Need at least 1 row per band.",
      );
    }
    // The row check alone lets through 0 bands (Infinity rows), NaN, two
    // negative values and fractional sizes, all of which yield a broken index.
    if (
      !Number.isInteger(numBands) ||
      numBands < 1 ||
      !Number.isInteger(signatureLength)
    ) {
      throw new Error("numBands and signatureLength must be positive integers");
    }

    this.numBands = numBands;
    this.signatureLength = signatureLength;
    this.threshold = threshold;
    this.rowsPerBand = rowsPerBand;
    // Number of leading signature entries that actually take part in banding
    this.actualSignatureLength = rowsPerBand * numBands;

    // Storage for buckets: Map<bandIndex, Map<bucketHash, Set<itemId>>>
    this.buckets = new Map();
    for (let i = 0; i < numBands; i++) {
      this.buckets.set(i, new Map());
    }
    // Storage for signatures: Map<itemId, signature>
    this.signatures = new Map();
    // Storage for original items: Map<itemId, item>
    this.items = new Map();
    // Counter for generating item IDs
    this.nextItemId = 0;
  }

  /**
   * Hash function for bucket assignment.
   *
   * Folds the band into a 32-bit integer (`hash * 31 + value`, truncated with
   * `| 0`) and returns its absolute value as a decimal string. Distinct bands
   * can share a hash, so bucket membership only nominates candidates.
   *
   * @param {number[]} bandSignature - Portion of signature for this band
   * @returns {string} Hash string for bucket assignment
   */
  hashBand(bandSignature) {
    let hash = 0;
    for (let i = 0; i < bandSignature.length; i++) {
      hash = ((hash << 5) - hash + bandSignature[i]) | 0;
    }
    return String(Math.abs(hash));
  }

  /**
   * Adds an item with its MinHash signature to the LSH index.
   *
   * @param {*} item - Item to index (can be any type)
   * @param {number[]} signature - MinHash signature for the item
   * @returns {number} Generated item ID
   * @throws {Error} If the signature is not an array of the configured length
   */
  add(item, signature) {
    if (!Array.isArray(signature)) {
      throw new Error("Signature must be an array");
    }
    if (signature.length !== this.signatureLength) {
      throw new Error(
        `Signature length ${signature.length} does not match expected length ${this.signatureLength}`,
      );
    }

    const itemId = this.nextItemId++;
    // Store a private copy: remove() re-derives the bucket keys from the stored
    // signature, so a caller mutating its own array afterwards must not be able
    // to desynchronise the index.
    const storedSignature = signature.slice();
    this.items.set(itemId, item);
    this.signatures.set(itemId, storedSignature);

    // Register the item in one bucket per band
    for (let bandIdx = 0; bandIdx < this.numBands; bandIdx++) {
      const startIdx = bandIdx * this.rowsPerBand;
      const bandSignature = storedSignature.slice(
        startIdx,
        startIdx + this.rowsPerBand,
      );
      const bucketHash = this.hashBand(bandSignature);
      const bandBuckets = this.buckets.get(bandIdx);
      let bucket = bandBuckets.get(bucketHash);
      if (bucket === undefined) {
        bucket = new Set();
        bandBuckets.set(bucketHash, bucket);
      }
      bucket.add(itemId);
    }
    return itemId;
  }

  /**
   * Adds multiple items with their signatures in batch.
   *
   * The batch is all-or-nothing: if any entry is rejected, the entries added
   * earlier in the same call are removed again and the original error is
   * rethrown, because their IDs would otherwise never reach the caller.
   *
   * @param {Array<{item: *, signature: number[]}>} itemsWithSignatures - Array of item-signature pairs
   * @returns {number[]} Array of generated item IDs
   * @throws {Error} If the input is not an array or an entry is invalid
   */
  addBatch(itemsWithSignatures) {
    if (!Array.isArray(itemsWithSignatures)) {
      throw new Error("Input must be an array");
    }
    const firstItemId = this.nextItemId;
    const itemIds = [];
    try {
      for (const { item, signature } of itemsWithSignatures) {
        itemIds.push(this.add(item, signature));
      }
    } catch (error) {
      // Undo the partial batch and release the IDs it consumed
      for (const addedId of itemIds) {
        this.remove(addedId);
      }
      this.nextItemId = firstItemId;
      throw error;
    }
    return itemIds;
  }

  /**
   * Finds candidate items similar to the query signature.
   * Returns items that share at least one bucket with the query.
   *
   * @param {number[]} querySignature - MinHash signature to search for
   * @returns {Set<number>} Set of candidate item IDs
   * @throws {Error} If the query is not an array of the configured length
   */
  getCandidates(querySignature) {
    if (!Array.isArray(querySignature)) {
      throw new Error("Query signature must be an array");
    }
    if (querySignature.length !== this.signatureLength) {
      throw new Error(
        `Query signature length ${querySignature.length} does not match expected length ${this.signatureLength}`,
      );
    }

    const candidates = new Set();
    // Collect the members of the query's bucket in every band
    for (let bandIdx = 0; bandIdx < this.numBands; bandIdx++) {
      const startIdx = bandIdx * this.rowsPerBand;
      const bandSignature = querySignature.slice(
        startIdx,
        startIdx + this.rowsPerBand,
      );
      const bucket = this.buckets.get(bandIdx).get(this.hashBand(bandSignature));
      if (bucket !== undefined) {
        for (const itemId of bucket) {
          candidates.add(itemId);
        }
      }
    }
    return candidates;
  }

  /**
   * Estimates MinHash similarity between two signatures as the fraction of
   * positions holding strictly equal values.
   *
   * @param {number[]} sig1 - First signature
   * @param {number[]} sig2 - Second signature
   * @returns {number} Estimated Jaccard similarity in [0, 1]; 0 when the
   *   lengths differ or both signatures are empty
   */
  estimateSimilarity(sig1, sig2) {
    // The empty case would otherwise evaluate 0 / 0 and return NaN
    if (sig1.length !== sig2.length || sig1.length === 0) {
      return 0;
    }
    let matches = 0;
    for (let i = 0; i < sig1.length; i++) {
      if (sig1[i] === sig2[i]) {
        matches++;
      }
    }
    return matches / sig1.length;
  }

  /**
   * Searches for similar items using LSH candidate generation followed by
   * signature-similarity filtering of those candidates.
   *
   * @param {number[]} querySignature - MinHash signature to search for
   * @param {Object} [options] - Search options
   * @param {number} [options.threshold] - Minimum similarity; defaults to the
   *   threshold given to the constructor
   * @param {number} [options.maxResults=10] - Maximum number of results
   * @returns {Array<{item: *, similarity: number, itemId: number}>} Similar
   *   items sorted by descending similarity
   * @throws {Error} If the query is not an array of the configured length
   */
  search(querySignature, options = {}) {
    const { threshold = this.threshold, maxResults = 10 } = options;

    const results = [];
    for (const itemId of this.getCandidates(querySignature)) {
      const similarity = this.estimateSimilarity(
        querySignature,
        this.signatures.get(itemId),
      );
      if (similarity >= threshold) {
        results.push({
          item: this.items.get(itemId),
          similarity,
          itemId,
        });
      }
    }

    results.sort((a, b) => b.similarity - a.similarity);
    // Clamp so a negative limit yields no results rather than slice()'s
    // "drop from the end" behaviour.
    return results.slice(0, Math.max(0, maxResults));
  }

  /**
   * Removes an item from the LSH index.
   *
   * @param {number} itemId - ID of item to remove
   * @returns {boolean} True if item was removed, false if not found
   */
  remove(itemId) {
    const signature = this.signatures.get(itemId);
    if (signature === undefined) {
      return false;
    }

    // Recompute the bucket key of every band from the stored signature
    for (let bandIdx = 0; bandIdx < this.numBands; bandIdx++) {
      const startIdx = bandIdx * this.rowsPerBand;
      const bandSignature = signature.slice(
        startIdx,
        startIdx + this.rowsPerBand,
      );
      const bucketHash = this.hashBand(bandSignature);
      const bandBuckets = this.buckets.get(bandIdx);
      const bucket = bandBuckets.get(bucketHash);
      if (bucket !== undefined) {
        bucket.delete(itemId);
        // Clean up empty buckets
        if (bucket.size === 0) {
          bandBuckets.delete(bucketHash);
        }
      }
    }

    this.signatures.delete(itemId);
    this.items.delete(itemId);
    return true;
  }

  /**
   * Clears all items from the LSH index and restarts ID generation at 0.
   */
  clear() {
    this.signatures.clear();
    this.items.clear();
    this.nextItemId = 0;
    for (const bandBuckets of this.buckets.values()) {
      bandBuckets.clear();
    }
  }

  /**
   * Gets statistics about the LSH index.
   *
   * `usedBuckets` counts non-empty buckets across all bands, and `loadFactor`
   * is that count divided by the number of bands.
   *
   * @returns {Object} Statistics including item count, bucket distribution, etc.
   */
  getStats() {
    const totalItems = this.signatures.size;
    let usedBuckets = 0;
    let totalBucketSize = 0;
    let maxBucketSize = 0;
    let minBucketSize = Number.POSITIVE_INFINITY;

    for (const bandBuckets of this.buckets.values()) {
      for (const bucket of bandBuckets.values()) {
        usedBuckets++;
        totalBucketSize += bucket.size;
        maxBucketSize = Math.max(maxBucketSize, bucket.size);
        minBucketSize = Math.min(minBucketSize, bucket.size);
      }
    }

    const hasBuckets = usedBuckets > 0;
    const avgBucketSize = hasBuckets ? totalBucketSize / usedBuckets : 0;

    return {
      totalItems,
      totalBands: this.numBands,
      usedBuckets,
      avgBucketSize,
      maxBucketSize: hasBuckets ? maxBucketSize : 0,
      minBucketSize: hasBuckets ? minBucketSize : 0,
      loadFactor: totalItems > 0 ? usedBuckets / this.numBands : 0,
      avgItemsPerBucket: totalItems > 0 ? avgBucketSize : 0,
      rowsPerBand: this.rowsPerBand,
      signatureLength: this.signatureLength,
      threshold: this.threshold,
    };
  }

  /**
   * Estimates the probability that two items with given Jaccard similarity
   * will be placed in the same bucket (collision probability).
   *
   * @param {number} jaccardSimilarity - Jaccard similarity between 0 and 1
   * @returns {number} Probability of collision
   * @throws {Error} If the similarity is NaN or outside [0, 1]
   */
  estimateCollisionProbability(jaccardSimilarity) {
    // Written as a negated range test so that NaN is rejected too
    if (!(jaccardSimilarity >= 0 && jaccardSimilarity <= 1)) {
      throw new Error("Jaccard similarity must be between 0 and 1");
    }
    // Probability that a band matches: s^r where s = similarity, r = rows per band
    const bandMatchProb = jaccardSimilarity ** this.rowsPerBand;
    // Probability that at least one band matches: 1 - (1 - s^r)^b
    return 1 - (1 - bandMatchProb) ** this.numBands;
  }

  /**
   * Finds the number of bands whose collision probability at the given
   * threshold is closest to 0.5, trying every band count from 1 to
   * `signatureLength` (the smallest band count wins ties).
   *
   * The returned `signatureLength` is `numBands * rowsPerBand`, i.e. the number
   * of entries that take part in banding; it can be smaller than the length
   * passed in.
   *
   * @param {number} threshold - Target similarity threshold, strictly between 0 and 1
   * @param {number} [signatureLength=128] - Length of MinHash signatures
   * @returns {{numBands: number, rowsPerBand: number, signatureLength: number, collisionProbability: number}} Recommended configuration
   * @throws {Error} If the threshold is NaN or not strictly between 0 and 1,
   *   or if `signatureLength` is not a positive integer
   */
  static findOptimalBands(threshold, signatureLength = 128) {
    // Negated range test so that NaN is rejected too
    if (!(threshold > 0 && threshold < 1)) {
      throw new Error("Threshold must be between 0 and 1");
    }
    if (!Number.isInteger(signatureLength) || signatureLength < 1) {
      throw new Error("Signature length must be a positive integer");
    }

    let bestBands = 1;
    let bestScore = Number.POSITIVE_INFINITY;
    let bestProbability = 0;
    for (let bands = 1; bands <= signatureLength; bands++) {
      // bands <= signatureLength guarantees at least one row per band
      const rows = Math.floor(signatureLength / bands);
      const collisionProb = 1 - (1 - threshold ** rows) ** bands;
      // Score based on how close we are to 0.5 probability at threshold
      // (good balance between false positives and negatives)
      const score = Math.abs(collisionProb - 0.5);
      if (score < bestScore) {
        bestScore = score;
        bestBands = bands;
        bestProbability = collisionProb;
      }
    }

    const bestRows = Math.floor(signatureLength / bestBands);
    return {
      numBands: bestBands,
      rowsPerBand: bestRows,
      signatureLength: bestRows * bestBands,
      collisionProbability: bestProbability,
    };
  }

  /**
   * Creates an LSH bucket system with the band count chosen by
   * `findOptimalBands` for the given threshold.
   *
   * The index is configured with the requested `signatureLength`, not the
   * truncated length reported by `findOptimalBands`, so it accepts signatures
   * of exactly the length asked for. The constructor derives the same rows per
   * band and leaves any remainder rows unbanded.
   *
   * @param {number} [threshold=0.5] - Target similarity threshold
   * @param {number} [signatureLength=128] - Expected MinHash signature length
   * @returns {LSHBuckets} Optimally configured LSH bucket system
   * @throws {Error} If the threshold or signature length is invalid
   */
  static createOptimal(threshold = 0.5, signatureLength = 128) {
    const optimal = LSHBuckets.findOptimalBands(threshold, signatureLength);
    return new LSHBuckets({
      numBands: optimal.numBands,
      signatureLength,
      threshold,
    });
  }
}