UNPKG

superminhash

Version:

TypeScript implementation of the SuperMinHash algorithm for Jaccard similarity estimation

189 lines (188 loc) 8.19 kB
"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.SuperMinHash = void 0; const seedrandom_1 = __importDefault(require("seedrandom")); class SuperMinHash { constructor(signatureSize = SuperMinHash.DEFAULT_SIGNATURE_SIZE, seed = SuperMinHash.DEFAULT_SEED) { this.signatureSize = signatureSize; this.seed = seed; this.empty = true; if (this.signatureSize <= 0 || !Number.isInteger(this.signatureSize)) { throw new Error('Signature size must be a positive integer'); } this.signature = new Uint32Array(this.signatureSize).fill(SuperMinHash.MAX_HASH_VALUE); this.empty = true; } generateSeedString(element) { const serialized = typeof element === 'string' ? element : JSON.stringify(element); if (serialized.length > SuperMinHash.MAX_INPUT_LENGTH) { throw new Error(`Input exceeds maximum length of ${SuperMinHash.MAX_INPUT_LENGTH} characters`); } return `${this.seed}:${serialized}`; } add(elements) { const emptyContext = this.initProcessingContext(); for (const element of elements) { this.empty = false; const elementSeedString = this.generateSeedString(element); const randomGenerator = (0, seedrandom_1.default)(elementSeedString); const processingContext = this.cloneProcessingContext(emptyContext); this.processElementWithContext(processingContext, randomGenerator); } } initProcessingContext() { const m = this.signatureSize; return { positions: Array.from({ length: m }, (_, i) => i), processedElements: new Array(m).fill(-1), bucketCounts: [...new Array(m - 1).fill(0), m], maxBucketIndex: m - 1, }; } cloneProcessingContext(context) { return { positions: [...context.positions], processedElements: [...context.processedElements], bucketCounts: [...context.bucketCounts], maxBucketIndex: context.maxBucketIndex, }; } processElementWithContext(context, randomGenerator) { const { positions, processedElements, bucketCounts } = context; let { maxBucketIndex } = context; let currentPosition = 0; while (currentPosition <= maxBucketIndex) { const randomValue = Math.floor(randomGenerator() * SuperMinHash.MAX_HASH_VALUE); const randomPosition = this.selectRandomPosition(currentPosition, this.signatureSize, randomGenerator); this.ensurePositionsInitialized(currentPosition, randomPosition, positions, processedElements); this.swapPositions(currentPosition, randomPosition, positions); const signaturePosition = positions[currentPosition]; maxBucketIndex = this.updateSignatureIfNeeded(currentPosition, randomValue, signaturePosition, bucketCounts, maxBucketIndex); currentPosition++; } } selectRandomPosition(currentPosition, size, randomGenerator) { return currentPosition + Math.floor(randomGenerator() * (size - currentPosition)); } ensurePositionsInitialized(pos1, pos2, positions, processedElements) { if (processedElements[pos1] !== 0) { processedElements[pos1] = 0; positions[pos1] = pos1; } if (processedElements[pos2] !== 0) { processedElements[pos2] = 0; positions[pos2] = pos2; } } swapPositions(pos1, pos2, positions) { const temp = positions[pos1]; positions[pos1] = positions[pos2]; positions[pos2] = temp; } updateSignatureIfNeeded(currentPosition, randomValue, signaturePosition, bucketCounts, maxBucketIndex) { const newValue = (randomValue + currentPosition) % SuperMinHash.MAX_HASH_VALUE; const currentValue = this.signature[signaturePosition]; if (newValue < currentValue) { const previousBucket = Math.min(currentValue, this.signatureSize - 1); this.signature[signaturePosition] = newValue; if (currentPosition < previousBucket) { bucketCounts[previousBucket]--; bucketCounts[currentPosition]++; return this.adjustMaxBucketIndex(maxBucketIndex, bucketCounts); } } return maxBucketIndex; } adjustMaxBucketIndex(currentMax, bucketCounts) { let newMax = currentMax; while (newMax > 0 && bucketCounts[newMax] === 0) { newMax--; } return newMax; } similarity(other) { if (this.empty || other.empty) { return this.empty && other.empty ? 1.0 : 0.0; } return this.getJaccardIndex(other); } getJaccardIndex(other) { if (this.seed !== other.seed) { throw new Error('Cannot compare signatures generated with different seeds'); } if (this.signatureSize !== other.signatureSize) { throw new Error('Can only compare signatures of the same size'); } return (this.signature.reduce((acc, value, index) => { return acc + (value === other.signature[index] ? 1 : 0); }, 0) / this.signatureSize); } getSignature() { return new Uint32Array(this.signature); } isEmpty() { return this.empty; } serialize() { const metadataSize = 9; // 4 bytes for size, 4 for seed, 1 for empty const bufferSize = metadataSize + this.signatureSize * 4; const buffer = new ArrayBuffer(bufferSize); const view = new DataView(buffer); view.setUint32(0, this.signatureSize, true); view.setUint32(4, this.seed, true); view.setUint8(8, this.empty ? 0 : 1); let offset = metadataSize; for (let position = 0; position < this.signatureSize; position++) { view.setUint32(offset, this.signature[position], true); offset += 4; } return new Uint8Array(buffer); } static deserialize(binary) { if (binary.length < 9) { throw new Error('Invalid binary data: too short'); } const view = new DataView(binary.buffer); const signatureSize = view.getUint32(0, true); if (signatureSize <= 0) { throw new Error('Invalid binary data: signature size must be positive'); } const expectedLength = 9 + signatureSize * 4; if (binary.length !== expectedLength) { throw new Error(`Invalid binary data: expected length ${expectedLength}, got ${binary.length}`); } const seed = view.getUint32(4, true); const empty = view.getUint8(8) === 0; const minhash = new SuperMinHash(signatureSize, seed); minhash.empty = empty; const metadataSize = 9; for (let position = 0; position < signatureSize; position++) { minhash.signature[position] = view.getUint32(metadataSize + position * 4, true); } return minhash; } static compareSerialized(firstSignature, secondSignature) { const firstMinHash = SuperMinHash.deserialize(firstSignature); const secondMinHash = SuperMinHash.deserialize(secondSignature); return firstMinHash.similarity(secondMinHash); } static fromRawSignature(signature, seed, empty = false) { const minhash = new SuperMinHash(signature.length, seed); minhash.signature.set(signature); minhash.empty = empty; return minhash; } static fromIterable(elements, signatureSize = SuperMinHash.DEFAULT_SIGNATURE_SIZE, seed = SuperMinHash.DEFAULT_SEED) { const minhash = new SuperMinHash(signatureSize, seed); minhash.add(elements); return minhash; } } exports.SuperMinHash = SuperMinHash; SuperMinHash.DEFAULT_SIGNATURE_SIZE = 256; SuperMinHash.DEFAULT_SEED = 42; SuperMinHash.MAX_INPUT_LENGTH = 100000; SuperMinHash.MAX_HASH_VALUE = 0xffffffff;