node-simhash-mod
Version:
Command-line tool that compares two text files using simhash.
218 lines (190 loc) • 7.22 kB
JavaScript
void function () {
'use strict';
var crypto = require('crypto');
var natural = require('natural');
var NGrams = natural.NGrams;
const bits = 64;
module.exports = {
compare: compare,
summary: summary,
hammingWeight: hammingWeight,
shingles: shingles,
jaccardIndex: jaccardIndex,
createBinaryString: createBinaryString,
shingleHashList: shingleHashList,
simhash: simhash,
similarity: similarity,
getDistanceReport: getDistanceReport
}
/**
 * Computes the simhash of both inputs and returns how similar the two
 * fingerprints are.
 *
 * @param {string} file1 - First text (the value is handed to `simhash()` as is).
 * @param {string} file2 - Second text (the value is handed to `simhash()` as is).
 * @returns {number} The value returned by `similarity()` for the two simhashes.
 */
function compare(file1, file2) {
  const firstFingerprint = simhash(file1);
  const secondFingerprint = simhash(file2);
  return similarity(firstFingerprint, secondFingerprint);
}
/**
 * Prints a comparison report for two texts to the console: the shingle hashes
 * they have in common, each simhash (decimal and zero-padded binary), the
 * simhash similarity and the value returned by `similarBits()`.
 *
 * Nothing is returned; the function only logs.
 *
 * @param {string} file1 - First text.
 * @param {string} file2 - Second text.
 * @returns {void}
 */
function summary(file1, file2) {
  const firstSimhash = simhash(file1);
  const secondSimhash = simhash(file2);
  const score = similarity(firstSimhash, secondSimhash);

  const firstHashes = shingleHashList(file1);
  const secondHashes = shingleHashList(file2);

  // Keep every hash of the first list that also appears in the second list.
  const shared = [];
  for (const candidate of firstHashes) {
    if (secondHashes.includes(candidate)) {
      shared.push(candidate);
    }
  }

  console.log(`common hashes: ${shared.join(', ')} - which is ${shared.length} / ${firstHashes.length} or ${secondHashes.length}`);
  console.log("File1 simhash:", firstSimhash, createBinaryString(firstSimhash));
  console.log("File2 simhash:", secondSimhash, createBinaryString(secondSimhash));
  // The %d placeholder is filled by console.log with the rounded percentage.
  console.log(`Simhash similarity is ${score} (%d%% similar)`, Math.round(score * 100));
  console.log(`similar bits: ${similarBits(firstSimhash, secondSimhash)}`);
}
/**
 * Counts the bits set to 1 in a non-negative BigInt.
 *
 * Each loop iteration clears the lowest set bit (`x & (x - 1)`), so the loop
 * runs once per set bit.
 *
 * @param {bigint} l - Non-negative value whose set bits are counted.
 * @returns {number} Number of 1 bits in `l` (0 for `0n`).
 * @throws {RangeError} If `l` is negative.
 */
function hammingWeight(l) {
  // A negative BigInt acts as if it had infinitely many leading 1 bits:
  // clearing the lowest set bit only makes the value more negative
  // (-1n -> -2n -> -4n ...), so the loop below would never reach zero.
  if (l < 0n) {
    throw new RangeError('hammingWeight expects a non-negative value');
  }
  let count = 0;
  let remaining = l;
  while (remaining) {
    remaining &= remaining - 1n; // drop the lowest set bit
    count++;
  }
  return count;
}
/**
 * Counts the bit positions at which two hashes differ.
 *
 * NOTE: despite the name, the value returned is the number of *differing*
 * bits (the XOR of both inputs is scanned bit by bit), not of matching ones.
 *
 * @param {bigint} simhash1 - First hash.
 * @param {bigint} simhash2 - Second hash.
 * @returns {bigint} Number of set bits in `simhash1 ^ simhash2`.
 */
function similarBits(simhash1, simhash2) {
  let differing = 0n;
  // Bits set in the XOR are exactly the positions where the inputs disagree.
  for (let diff = BigInt(simhash1 ^ simhash2); diff !== 0n; diff >>= 1n) {
    differing += diff & 1n; // add the current least significant bit
  }
  return differing;
}
/**
 * Ratio between the number of bits set in both hashes and the number of bits
 * set in at least one of them.
 *
 * When neither hash has any bit set the division is 0 / 0 and the result is
 * NaN.
 *
 * @param {bigint} simhash1 - First hash.
 * @param {bigint} simhash2 - Second hash.
 * @returns {number} `hammingWeight(a & b) / hammingWeight(a | b)`.
 */
function similarity(simhash1, simhash2) {
  const sharedBits = hammingWeight(simhash1 & simhash2);
  const combinedBits = hammingWeight(simhash1 | simhash2);
  return sharedBits / combinedBits;
}
/**
 * Hashes every 4-word shingle of a text and returns the hashes as BigInts.
 *
 * Each shingle is hashed with SHA-256 and only the leading `bits` bits of the
 * 256-bit digest are kept. The values stay BigInts because a JS Number cannot
 * hold a 64-bit integer exactly. Every hash is also logged to the console in
 * zero-padded binary next to its shingle.
 *
 * @param {string} str - Text to split into shingles.
 * @returns {bigint[]} One truncated hash per shingle, in iteration order.
 */
function shingleHashList(str) {
  const hashes = [];
  console.log('calculated hashes:');
  // Shifting right by this amount keeps only the beginning of the digest.
  const discardedBits = BigInt(256 - bits);
  for (const shingle of shingles(str, 4)) {
    const digestHex = crypto.createHash('sha256').update(shingle).digest('hex');
    const leadingBits = BigInt(`0x${digestHex}`) >> discardedBits;
    hashes.push(leadingBits);
    console.log(`${createBinaryString(leadingBits)} <= ${shingle}`);
  }
  return hashes;
}
// hum
/**
 * Removes bare `<p>`, `</p>`, `<div>` and `</div>` tags from a string.
 *
 * Only these exact tags are removed; tags carrying attributes and any other
 * markup are left untouched.
 *
 * @param {string} input - Text that may contain paragraph/div tags.
 * @returns {string} The text with those four tags stripped.
 */
function cleanHtml(input) {
  return input
    .replace(/<p>/g, '')
    // The opening bracket matters: matching just "div>" would leave a stray
    // "<" behind and break the closing-tag removal below.
    .replace(/<div>/g, '')
    .replace(/<\/p>/g, '')
    .replace(/<\/div>/g, '');
}
/**
 * Builds the set of distinct word n-grams ("shingles") of a text.
 *
 * The text first goes through `cleanHtml()`, then combining diacritical marks
 * are removed, and the result is split into n-grams by `NGrams.ngrams`. Each
 * n-gram is joined with single spaces. The resulting set is also logged.
 *
 * @param {string} original - Source text.
 * @param {number} [kshingles=2] - Number of words per shingle.
 * @returns {Set<string>} Distinct shingles in first-seen order.
 */
function shingles(original, kshingles = 2) {
  const stripped = cleanHtml(original);
  // NFD separates accented letters into base letter + combining mark; the
  // regex then deletes the marks (U+0300 to U+036F).
  const unaccented = stripped.normalize("NFD").replace(/[\u0300-\u036f]/g, "");
  // NOTE(review): null / '[end]' are presumably the start and end padding
  // symbols of NGrams.ngrams - confirm against the `natural` documentation.
  const grams = NGrams.ngrams(unaccented, kshingles, null, '[end]');
  const unique = new Set(Array.from(grams, (tokens) => tokens.join(" ")));
  console.log(`shingles: ${Array.from(unique).join(', ')}`);
  return unique;
}
/**
 * Computes the simhash fingerprint of a text.
 *
 * For each of the `bits` bit positions, every shingle hash votes +1 when it
 * has that bit set and -1 otherwise; the bit is set in the fingerprint only
 * when the total is strictly positive. Progress and the result are logged.
 *
 * @param {string} str - Text to fingerprint.
 * @returns {bigint} The fingerprint (0n when no position gets a positive total).
 */
function simhash(str) {
  console.log('START SIMHASH');
  const hashes = shingleHashList(str);
  let fingerprint = 0n;
  for (let position = 0; position < bits; position++) {
    const bit = 1n << BigInt(position);
    // The tally is a plain Number; only the hashes need to be BigInts.
    let votes = 0;
    for (const hash of hashes) {
      if ((hash & bit) === bit) {
        votes += 1;
      } else {
        votes -= 1;
      }
    }
    if (votes > 0) {
      fingerprint |= bit;
    }
  }
  console.log(`${createBinaryString(fingerprint)} <= binary simhash`);
  console.log(`END SIMHASH: ${fingerprint} => ${fingerprint.toString(2)}`);
  return fingerprint;
}
/**
 * Jaccard index of two sets: size of the intersection divided by size of the
 * union.
 *
 * @param {Set<*>} set1 - First set.
 * @param {Set<*>} set2 - Second set.
 * @returns {number} A value between 0 (disjoint) and 1 (identical). Two empty
 *   sets are treated as identical and yield 1 instead of the NaN that a plain
 *   0 / 0 division would produce.
 */
function jaccardIndex(set1, set2) {
  let intersection = 0;
  for (const shingle of set1) {
    if (set2.has(shingle)) {
      intersection++;
    }
  }
  // |A ∪ B| = |A| + |B| - |A ∩ B|
  const union = set1.size + set2.size - intersection;
  if (union === 0) {
    return 1;
  }
  return intersection / union;
}
/**
 * Renders a value in base 2, left-padded with zeros to `bits` characters.
 *
 * Values whose binary form is already longer than `bits` are returned
 * unpadded and untruncated.
 *
 * @param {bigint} nMask - Value to render.
 * @returns {string} Binary representation of `nMask`.
 */
function createBinaryString(nMask) {
  return nMask.toString(2).padStart(bits, '0');
}
/**
 * For every text, lists other texts whose simhash similarity reaches a
 * threshold.
 *
 * Each entry of `texts` is expected to expose a `simhash` property, which is
 * passed to `similarity()`. Candidates are scanned in input order and the scan
 * for a given text stops as soon as `closestQty` matches have been collected,
 * so the matches are the first ones found, not necessarily the most similar.
 *
 * @param {Array<{simhash: bigint}>} texts - Items to compare with each other.
 * @param {number} maxSim - Similarity a pair must reach (`sim >= maxSim`) to
 *   be reported. Falsy values (including 0) are rejected.
 * @param {number} closestQty - Maximum number of matches kept per text. Falsy
 *   values (including 0) are rejected.
 * @returns {Array<{for: object, closestOnes: Array<{similarity: number, with: object}>}>}
 *   One entry per input text, in input order; `similarity` is rounded to two
 *   decimals.
 * @throws {Error} If `texts` is missing or empty, or if `maxSim` or
 *   `closestQty` is falsy.
 */
function getDistanceReport(texts, maxSim, closestQty) {
  // Pass the message to the constructor so it also appears in the stack trace.
  if (!texts || texts.length === 0) {
    throw new Error('you must provide some texts');
  }
  if (!maxSim) {
    throw new Error('the maximum acceptable similarity is mandatory');
  }
  if (!closestQty) {
    throw new Error('the closest quantity limit is mandatory');
  }

  const res = [];
  for (let i = 0; i < texts.length; i++) {
    const distAnalysis = {
      for: texts[i],
      closestOnes: [],
    };
    res.push(distAnalysis);

    for (let j = 0; j < texts.length; j++) {
      if (i === j) {
        continue; // never compare a text with itself
      }
      const sim = similarity(texts[i].simhash, texts[j].simhash);
      if (sim >= maxSim) {
        distAnalysis.closestOnes.push({
          similarity: Math.round(sim * 100) / 100,
          with: texts[j],
        });
        // ">=" rather than "==" so a non-integer limit still ends the scan.
        if (distAnalysis.closestOnes.length >= closestQty) {
          break;
        }
      }
    }
  }
  return res;
}
}.call(this);