superminhash
Version:
TypeScript implementation of the SuperMinHash algorithm for Jaccard similarity estimation
189 lines (188 loc) • 8.19 kB
JavaScript
"use strict";
// Interop helper for default imports: a module flagged as an ES module is
// returned untouched; any other value is wrapped so it can be read via `.default`.
// An already-installed helper on `this` takes precedence when present.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.SuperMinHash = void 0;
const seedrandom_1 = __importDefault(require("seedrandom"));
/**
 * SuperMinHash-style sketch used to estimate the Jaccard similarity of two sets.
 *
 * Each instance owns a fixed-size signature of unsigned 32-bit values. Elements
 * are folded in with `add`; two sketches built with the same seed and signature
 * size are compared with `similarity`.
 */
class SuperMinHash {
    /**
     * @param {number} [signatureSize] Number of slots in the signature.
     * @param {number} [seed] Seed mixed into every element's random stream.
     * @throws {Error} If `signatureSize` is not a positive integer.
     */
    constructor(signatureSize = SuperMinHash.DEFAULT_SIGNATURE_SIZE, seed = SuperMinHash.DEFAULT_SEED) {
        this.signatureSize = signatureSize;
        this.seed = seed;
        this.empty = true;
        if (this.signatureSize <= 0 || !Number.isInteger(this.signatureSize)) {
            throw new Error('Signature size must be a positive integer');
        }
        // Every slot starts at the sentinel maximum so any real hash value replaces it.
        this.signature = new Uint32Array(this.signatureSize).fill(SuperMinHash.MAX_HASH_VALUE);
    }

    /**
     * Builds the string used to seed the per-element random generator.
     *
     * @param {*} element A string (used verbatim) or any JSON-serializable value.
     * @returns {string} `<seed>:<serialized element>`.
     * @throws {TypeError} If the element has no JSON representation
     *   (`undefined`, functions, symbols).
     * @throws {Error} If the serialized form exceeds `MAX_INPUT_LENGTH` characters.
     */
    generateSeedString(element) {
        const serialized = typeof element === 'string' ? element : JSON.stringify(element);
        // JSON.stringify yields undefined (not a string) for undefined/functions/symbols.
        if (typeof serialized !== 'string') {
            throw new TypeError('Element cannot be serialized to JSON');
        }
        if (serialized.length > SuperMinHash.MAX_INPUT_LENGTH) {
            throw new Error(`Input exceeds maximum length of ${SuperMinHash.MAX_INPUT_LENGTH} characters`);
        }
        return `${this.seed}:${serialized}`;
    }

    /**
     * Folds every element of `elements` into the signature.
     *
     * @param {Iterable<*>} elements Values to add; see `generateSeedString` for
     *   the accepted element types.
     */
    add(elements) {
        const emptyContext = this.initProcessingContext();
        for (const element of elements) {
            const elementSeedString = this.generateSeedString(element);
            const randomGenerator = (0, seedrandom_1.default)(elementSeedString);
            // Each element starts from a pristine copy of the permutation state.
            const processingContext = this.cloneProcessingContext(emptyContext);
            this.processElementWithContext(processingContext, randomGenerator);
            // Flip the flag only once the element has actually been folded in, so a
            // rejected element cannot leave an untouched sketch marked as non-empty.
            this.empty = false;
        }
    }

    /**
     * Creates the initial per-element working state.
     *
     * @returns {{positions: number[], processedElements: number[], bucketCounts: number[], maxBucketIndex: number}}
     *   Identity permutation, "not yet touched" markers (-1), a histogram with
     *   all `m` slots counted in the last bucket, and the index of that bucket.
     */
    initProcessingContext() {
        const m = this.signatureSize;
        return {
            positions: Array.from({ length: m }, (_, i) => i),
            processedElements: new Array(m).fill(-1),
            bucketCounts: [...new Array(m - 1).fill(0), m],
            maxBucketIndex: m - 1,
        };
    }

    /**
     * Returns a copy of `context` whose arrays can be mutated independently.
     *
     * @param {object} context State produced by `initProcessingContext`.
     * @returns {object} Shallow copy with freshly allocated arrays.
     */
    cloneProcessingContext(context) {
        return {
            positions: [...context.positions],
            processedElements: [...context.processedElements],
            bucketCounts: [...context.bucketCounts],
            maxBucketIndex: context.maxBucketIndex,
        };
    }

    /**
     * Runs the shuffle-and-update loop for one element.
     *
     * At each step a random value and a random swap partner are drawn (in that
     * order), the permutation is advanced by one swap, and the signature slot
     * selected by the permutation is lowered if the new value is smaller.
     *
     * @param {object} context Working state; its arrays are mutated in place.
     * @param {function(): number} randomGenerator Returns numbers used as
     *   fractions in [0, 1) — presumably seedrandom's generator; verify against caller.
     */
    processElementWithContext(context, randomGenerator) {
        const { positions, processedElements, bucketCounts } = context;
        let { maxBucketIndex } = context;
        let currentPosition = 0;
        // The loop ends early once the highest occupied bucket drops below the step index.
        while (currentPosition <= maxBucketIndex) {
            const randomValue = Math.floor(randomGenerator() * SuperMinHash.MAX_HASH_VALUE);
            const randomPosition = this.selectRandomPosition(currentPosition, this.signatureSize, randomGenerator);
            this.ensurePositionsInitialized(currentPosition, randomPosition, positions, processedElements);
            this.swapPositions(currentPosition, randomPosition, positions);
            const signaturePosition = positions[currentPosition];
            maxBucketIndex = this.updateSignatureIfNeeded(currentPosition, randomValue, signaturePosition, bucketCounts, maxBucketIndex);
            currentPosition++;
        }
    }

    /**
     * Picks a swap partner in `[currentPosition, size)`.
     *
     * @param {number} currentPosition Lower bound (inclusive).
     * @param {number} size Upper bound (exclusive).
     * @param {function(): number} randomGenerator Source of fractions.
     * @returns {number} The chosen index.
     */
    selectRandomPosition(currentPosition, size, randomGenerator) {
        return currentPosition + Math.floor(randomGenerator() * (size - currentPosition));
    }

    /**
     * Resets `positions[i]` to `i` for each of the two indices that has not been
     * marked as touched yet, and marks it.
     *
     * @param {number} pos1 First index.
     * @param {number} pos2 Second index.
     * @param {number[]} positions Permutation array (mutated).
     * @param {number[]} processedElements Touch markers, 0 meaning touched (mutated).
     */
    ensurePositionsInitialized(pos1, pos2, positions, processedElements) {
        if (processedElements[pos1] !== 0) {
            processedElements[pos1] = 0;
            positions[pos1] = pos1;
        }
        if (processedElements[pos2] !== 0) {
            processedElements[pos2] = 0;
            positions[pos2] = pos2;
        }
    }

    /**
     * Swaps two entries of `positions` in place.
     *
     * @param {number} pos1 First index.
     * @param {number} pos2 Second index.
     * @param {number[]} positions Array to mutate.
     */
    swapPositions(pos1, pos2, positions) {
        const temp = positions[pos1];
        positions[pos1] = positions[pos2];
        positions[pos2] = temp;
    }

    /**
     * Lowers one signature slot if the candidate value is smaller and keeps the
     * bucket histogram in step.
     *
     * @param {number} currentPosition Step index of the current element.
     * @param {number} randomValue Random value drawn for this step.
     * @param {number} signaturePosition Slot of `this.signature` to update.
     * @param {number[]} bucketCounts Histogram (mutated).
     * @param {number} maxBucketIndex Current highest occupied bucket.
     * @returns {number} The possibly lowered highest occupied bucket.
     */
    updateSignatureIfNeeded(currentPosition, randomValue, signaturePosition, bucketCounts, maxBucketIndex) {
        const newValue = (randomValue + currentPosition) % SuperMinHash.MAX_HASH_VALUE;
        const currentValue = this.signature[signaturePosition];
        if (newValue < currentValue) {
            // NOTE(review): the bucket is derived from the raw stored hash value, so it
            // is `signatureSize - 1` unless that value happens to be smaller — confirm
            // this matches the intended histogram bookkeeping.
            const previousBucket = Math.min(currentValue, this.signatureSize - 1);
            this.signature[signaturePosition] = newValue;
            if (currentPosition < previousBucket) {
                bucketCounts[previousBucket]--;
                bucketCounts[currentPosition]++;
                return this.adjustMaxBucketIndex(maxBucketIndex, bucketCounts);
            }
        }
        return maxBucketIndex;
    }

    /**
     * Walks down from `currentMax` to the first bucket with a non-zero count
     * (stopping at 0).
     *
     * @param {number} currentMax Starting bucket index.
     * @param {number[]} bucketCounts Histogram to inspect.
     * @returns {number} The new highest bucket index.
     */
    adjustMaxBucketIndex(currentMax, bucketCounts) {
        let newMax = currentMax;
        while (newMax > 0 && bucketCounts[newMax] === 0) {
            newMax--;
        }
        return newMax;
    }

    /**
     * Estimates the similarity to `other`.
     *
     * Two empty sketches are treated as identical (1.0); an empty sketch versus
     * a non-empty one yields 0.0. These shortcuts apply before any seed or size
     * validation.
     *
     * @param {SuperMinHash} other Sketch to compare against.
     * @returns {number} Value in [0, 1].
     * @throws {Error} If both are non-empty and seeds or sizes differ.
     */
    similarity(other) {
        if (this.empty || other.empty) {
            return this.empty && other.empty ? 1.0 : 0.0;
        }
        return this.getJaccardIndex(other);
    }

    /**
     * Fraction of signature slots on which both sketches agree.
     *
     * @param {SuperMinHash} other Sketch to compare against.
     * @returns {number} Matching slots divided by the signature size.
     * @throws {Error} If seeds or signature sizes differ.
     */
    getJaccardIndex(other) {
        if (this.seed !== other.seed) {
            throw new Error('Cannot compare signatures generated with different seeds');
        }
        if (this.signatureSize !== other.signatureSize) {
            throw new Error('Can only compare signatures of the same size');
        }
        return (this.signature.reduce((acc, value, index) => {
            return acc + (value === other.signature[index] ? 1 : 0);
        }, 0) / this.signatureSize);
    }

    /**
     * @returns {Uint32Array} A copy of the signature; mutating it does not affect the sketch.
     */
    getSignature() {
        return new Uint32Array(this.signature);
    }

    /**
     * @returns {boolean} `true` while no element has been added.
     */
    isEmpty() {
        return this.empty;
    }

    /**
     * Encodes the sketch as little-endian bytes:
     * 4 bytes signature size, 4 bytes seed, 1 byte flag (0 = empty, 1 = non-empty),
     * then 4 bytes per signature slot.
     *
     * The seed is stored as an unsigned 32-bit integer, so only seeds in that
     * range round-trip through `deserialize`.
     *
     * @returns {Uint8Array} The encoded sketch.
     */
    serialize() {
        const metadataSize = 9; // 4 bytes for size, 4 for seed, 1 for empty
        const bufferSize = metadataSize + this.signatureSize * 4;
        const buffer = new ArrayBuffer(bufferSize);
        const view = new DataView(buffer);
        view.setUint32(0, this.signatureSize, true);
        view.setUint32(4, this.seed, true);
        view.setUint8(8, this.empty ? 0 : 1);
        let offset = metadataSize;
        for (let position = 0; position < this.signatureSize; position++) {
            view.setUint32(offset, this.signature[position], true);
            offset += 4;
        }
        return new Uint8Array(buffer);
    }

    /**
     * Rebuilds a sketch from bytes produced by `serialize`.
     *
     * @param {Uint8Array} binary Encoded sketch; may be a view into a larger buffer.
     * @returns {SuperMinHash} The decoded sketch.
     * @throws {Error} If the data is too short, declares a zero size, or its
     *   length does not match the declared size.
     */
    static deserialize(binary) {
        const metadataSize = 9; // 4 bytes for size, 4 for seed, 1 for empty
        if (binary.length < metadataSize) {
            throw new Error('Invalid binary data: too short');
        }
        // Honour the view's own window: `binary` may start partway into its
        // backing ArrayBuffer (subarray, pooled Node Buffer).
        const view = new DataView(binary.buffer, binary.byteOffset, binary.byteLength);
        const signatureSize = view.getUint32(0, true);
        if (signatureSize <= 0) {
            throw new Error('Invalid binary data: signature size must be positive');
        }
        const expectedLength = metadataSize + signatureSize * 4;
        if (binary.length !== expectedLength) {
            throw new Error(`Invalid binary data: expected length ${expectedLength}, got ${binary.length}`);
        }
        const seed = view.getUint32(4, true);
        const empty = view.getUint8(8) === 0;
        const minhash = new SuperMinHash(signatureSize, seed);
        minhash.empty = empty;
        for (let position = 0; position < signatureSize; position++) {
            minhash.signature[position] = view.getUint32(metadataSize + position * 4, true);
        }
        return minhash;
    }

    /**
     * Decodes two serialized sketches and returns their similarity.
     *
     * @param {Uint8Array} firstSignature First encoded sketch.
     * @param {Uint8Array} secondSignature Second encoded sketch.
     * @returns {number} Result of `similarity` on the decoded sketches.
     */
    static compareSerialized(firstSignature, secondSignature) {
        const firstMinHash = SuperMinHash.deserialize(firstSignature);
        const secondMinHash = SuperMinHash.deserialize(secondSignature);
        return firstMinHash.similarity(secondMinHash);
    }

    /**
     * Wraps an existing signature in a sketch.
     *
     * @param {ArrayLike<number>} signature Slot values; its length becomes the signature size.
     * @param {number} seed Seed the signature was generated with.
     * @param {boolean} [empty=false] Whether the sketch should report itself as empty.
     * @returns {SuperMinHash} Sketch holding a copy of `signature`.
     */
    static fromRawSignature(signature, seed, empty = false) {
        const minhash = new SuperMinHash(signature.length, seed);
        minhash.signature.set(signature);
        minhash.empty = empty;
        return minhash;
    }

    /**
     * Convenience factory: creates a sketch and adds every element of `elements`.
     *
     * @param {Iterable<*>} elements Values to add.
     * @param {number} [signatureSize] Number of signature slots.
     * @param {number} [seed] Seed for the per-element random streams.
     * @returns {SuperMinHash} The populated sketch.
     */
    static fromIterable(elements, signatureSize = SuperMinHash.DEFAULT_SIGNATURE_SIZE, seed = SuperMinHash.DEFAULT_SEED) {
        const minhash = new SuperMinHash(signatureSize, seed);
        minhash.add(elements);
        return minhash;
    }
}
// Publish the class on the CommonJS exports object.
exports.SuperMinHash = SuperMinHash;
// Class-level defaults and limits read by the constructor, generateSeedString
// and the signature update logic.
Object.assign(SuperMinHash, {
    // Signature length used when the caller does not pass one.
    DEFAULT_SIGNATURE_SIZE: 256,
    // Seed used when the caller does not pass one.
    DEFAULT_SEED: 42,
    // Longest serialized element (in characters) accepted by generateSeedString.
    MAX_INPUT_LENGTH: 100000,
    // Largest unsigned 32-bit value; initial fill for every signature slot.
    MAX_HASH_VALUE: 0xffffffff,
});