UNPKG

node-simhash-mod

Version:

Command Line tool that compares two text files using simhash

218 lines (190 loc) 7.22 kB
void function () { 'use strict'; var crypto = require('crypto'); var natural = require('natural'); var NGrams = natural.NGrams; const bits = 64; module.exports = { compare: compare, summary: summary, hammingWeight: hammingWeight, shingles: shingles, jaccardIndex: jaccardIndex, createBinaryString: createBinaryString, shingleHashList: shingleHashList, simhash: simhash, similarity: similarity, getDistanceReport: getDistanceReport } function compare(file1, file2) { return similarity(simhash(file1), simhash(file2)); } function summary(file1, file2) { var hash1 = simhash(file1); var hash2 = simhash(file2); var simhashval = similarity(hash1, hash2); var hashList1 = shingleHashList(file1); var hashList2 = shingleHashList(file2); const common = hashList1.filter(value => hashList2.includes(value)); console.log(`common hashes: ${common.join(', ')} - which is ${common.length} / ${hashList1.length} or ${hashList2.length}`); //var jaccard = jaccardIndex(shingles(file1), shingles(file2)); console.log("File1 simhash:", hash1, createBinaryString(hash1)); console.log("File2 simhash:", hash2, createBinaryString(hash2)); console.log("Simhash similarity is " + simhashval + " (%d%% similar)", Math.round(simhashval * 100)); console.log(`similar bits: ${similarBits(hash1, hash2)}`); //console.log("Jaccard index is " + jaccard + " (%d%% similar)", Math.round(jaccard * 100)); } function hammingWeight(l) { var c; for (c = 0; l; c++) { l &= l - 1n; } return c; } function similarBits(simhash1, simhash2) { // console.log(`${createBinaryString(simhash1)} <= 1`); // console.log(`${createBinaryString(simhash2)} <= 2`); var bitCount = 0n; // count bits that are different var z = BigInt(simhash1 ^ simhash2); // set bits in z that are different in x and y to 1. // console.log(`${createBinaryString(z)} <= bits that are different`); while (z !== 0n) { // not all bits are 0 // console.log(`${createBinaryString(z)} <= current z`); bitCount += z & 1n; // add least significant bit z = z >> 1n; // iterate over z; set next bit as lsb } //console.log(z); return bitCount; } function similarity(simhash1, simhash2) { return hammingWeight((simhash1 & simhash2)) / hammingWeight((simhash1 | simhash2)); } function shingleHashList(str) { //console.log(crypto.getHashes()); var list = []; console.log('calculated hashes:'); for (var word of shingles(str, 4)) { const hash = crypto.createHash('sha256'); hash.update(word); const hexString = hash.digest('hex'); // fait 256 bits // console.log('hexString: ' + hexString); // quelle taille ? // console.log(parseInt(hexString, 16).toString(2)); const originalHashAsInt = BigInt('0x' + hexString); // console.log(originalHashAsInt.toString(2) + ' <= ORIGINAL'); // ok fait 256 bits const truncatedHashAsInt = BigInt(originalHashAsInt >> BigInt(256-bits)); // on garde que le début // console.log(truncatedHashAsInt.toString(2) + ' <= TRUNCATED'); // it remains a BigInt here, as js Number is not ok for 64 bits integers list.push(truncatedHashAsInt); /* const hash = crc32.str(word) & 0xffffffff; list.push(hash); // ça sert à quoi le & ? // and-ing it with 0xffffffff makes sure any bits over the 32 are zeroed out. */ console.log(`${createBinaryString(truncatedHashAsInt)} <= ${word}`); } // console.log(`shingleHashList: ${list.join(', ')}`); return list; } // hum function cleanHtml(input) { let output = input.replace(/<p>/g, '').replace(/div>/g, '').replace(/<\/p>/g, '').replace(/<\/div>/g, ''); return output; } function shingles(original, kshingles = 2) { var shingles = new Set(); const cleanedHtml = cleanHtml(original); const normalized = cleanedHtml.normalize("NFD").replace(/[\u0300-\u036f]/g, ""); for (var wordlist of NGrams.ngrams(normalized, kshingles, null, '[end]')) { shingles.add(wordlist.join(" ")); } console.log(`shingles: ${[...shingles].join(', ')}`); return shingles; } function simhash(str) { console.log('START SIMHASH'); var shingles = shingleHashList(str); // liste d'entiers xx bits signés var mask = 1n; var simhash = 0n; for (var i = 0; i < bits; i++) { // ce 64 a de l'importance ; mais pourquoi ce n'est pas un 32 ? 32 SUFFISENT! var sim = 0; // n'a pas besoin d'être BigInt lui for (var s of shingles) { sim += ((s & mask) == mask) ? 1 : -1; // comptage de ceux qui ont le bit à 1 } simhash |= (sim > 0 ? mask : 0n); // on complète le morceau du simhash // console.log(`${simhash.toString(2)} <= current simhash`); mask <<= 1n; } console.log(`${createBinaryString(simhash)} <= binary simhash`); console.log(`END SIMHASH: ${simhash} => ${simhash.toString(2)}`); return simhash; } function jaccardIndex(set1, set2) { var total = set1.size + set2.size; var intersection = 0; for (var shingle of set1) { if (set2.has(shingle)) { intersection++; } } var union = total - intersection; return intersection / union; } function createBinaryString(nMask) { // console.log(nMask.toString(2) + ' <= INPUT'); const asBinaryString = nMask.toString(2); const padded = asBinaryString.padStart(bits, '0'); // console.log(padded + ' <= PADDED'); return padded; /* for (var nFlag = 0, nShifted = nMask, sMask = ""; nFlag < bits; nFlag++ , sMask += String(nShifted >>> BigInt(bits-1) ), nShifted <<= 1n); return sMask; */ } function getDistanceReport(texts, maxSim, closestQty) { if (!texts || texts.length == 0) { const err = new Error(); err.message = `you must provide some texts`; throw err; } if (!maxSim) { const err = new Error(); err.message = `the maximum acceptable similarity is mandatory`; throw err; } if (!closestQty) { const err = new Error(); err.message = `the closest quantity limit is mandatory`; throw err; } const res = []; for (let i = 0; i < texts.length; i++) { // create result holder const distAnalysis = { for: texts[i], closestOnes: [], }; res.push(distAnalysis); for (let j = 0; j < texts.length; j++) { // console.log(texts[i].simhash); if (i != j) { const sim = similarity(texts[i].simhash, texts[j].simhash); // console.log(sim); // console.log(`sim: ${sim}, maxSim: ${maxSim}`); if (sim >= maxSim) { const closeOne = { similarity: Math.round(sim * 100) / 100, with: texts[j], }; distAnalysis.closestOnes.push(closeOne); if (distAnalysis.closestOnes.length == closestQty) { // console.log(`limit reached for ${j}, break`); break; } } } } } return res; } }.call(this);