@thi.ng/text-analysis
Version:
Text tokenization, transformation & analysis transducers, utilities, stop words, porter stemming, vector encodings, similarities
76 lines (75 loc) • 1.8 kB
JavaScript
import { distCosine } from "@thi.ng/vectors/dist-cosine";
import { jaccardSimilarity as $jaccard } from "@thi.ng/vectors/dist-jaccard";
import { distSq as $distSq } from "@thi.ng/vectors/distsq";
import { dot as $dot } from "@thi.ng/vectors/dot";
/**
 * Dense-vector cosine similarity. This is a direct alias (same function
 * object) of `distCosine` imported from `@thi.ng/vectors/dist-cosine`; the
 * exact argument and return semantics are defined by that package.
 */
const cosineSimilarityDense = distCosine;
/**
 * Cosine similarity of two sparse vectors given as arrays of component
 * indices.
 *
 * The numerator is `dotProductSparse(a, b)` (count of shared indices) and
 * each vector's magnitude is taken as the square root of its length, i.e.
 * every listed index is treated as having value 1.
 *
 * @param a - first index array
 * @param b - second index array
 * @returns similarity value, or 0 when the sparse dot product is not positive
 */
const cosineSimilaritySparse = (a, b) => {
  const shared = dotProductSparse(a, b);
  // Guard first: avoids dividing by zero when either input is empty.
  if (!(shared > 0)) {
    return 0;
  }
  const magA = Math.sqrt(a.length);
  const magB = Math.sqrt(b.length);
  return shared / (magA * magB);
};
/**
 * Dense-vector Jaccard similarity. This is a direct alias of
 * `jaccardSimilarity` imported from `@thi.ng/vectors/dist-jaccard`; the exact
 * argument and return semantics are defined by that package.
 */
const jaccardSimilarityDense = $jaccard;
/**
 * Jaccard similarity (|A ∩ B| / |A ∪ B|) of two sparse vectors given as
 * arrays of component indices.
 *
 * Uses a two-pointer merge walk, so both inputs are assumed to be sorted in
 * ascending order without duplicates.
 *
 * @param a - first index array
 * @param b - second index array
 * @returns ratio of shared to total distinct indices; 0 if both are empty
 */
const jaccardSimilaritySparse = (a, b) => {
  const lenA = a.length;
  const lenB = b.length;
  let shared = 0;
  let ia = 0;
  let ib = 0;
  while (ia < lenA && ib < lenB) {
    const x = a[ia];
    const y = b[ib];
    if (x === y) {
      // Index present in both inputs: count it and advance both cursors.
      shared++;
      ia++;
      ib++;
      continue;
    }
    // Advance whichever cursor currently points at the smaller index.
    if (x < y) {
      ia++;
    } else {
      ib++;
    }
  }
  // Inclusion-exclusion: |A ∪ B| = |A| + |B| - |A ∩ B|.
  const total = lenA + lenB - shared;
  if (total > 0) {
    return shared / total;
  }
  return 0;
};
/**
 * Dense-vector dot product. This is a direct alias of `dot` imported from
 * `@thi.ng/vectors/dot`; the exact argument and return semantics are defined
 * by that package.
 */
const dotProductDense = $dot;
/**
 * Dot product of two sparse vectors given as arrays of component indices.
 * Each listed index counts as 1, so the result is the number of indices
 * present in both inputs.
 *
 * Uses a two-pointer merge walk, so both inputs are assumed to be sorted in
 * ascending order without duplicates.
 *
 * @param a - first index array
 * @param b - second index array
 * @returns number of shared indices
 */
const dotProductSparse = (a, b) => {
  const lenA = a.length;
  const lenB = b.length;
  let matches = 0;
  let ia = 0;
  let ib = 0;
  while (ia < lenA && ib < lenB) {
    const x = a[ia];
    const y = b[ib];
    if (x === y) {
      matches++;
      ia++;
      ib++;
    } else if (x < y) {
      // `a` is behind: move its cursor forward.
      ia++;
    } else {
      ib++;
    }
  }
  return matches;
};
/**
 * Dense-vector squared distance. This is a direct alias of `distSq` imported
 * from `@thi.ng/vectors/distsq`; the exact argument and return semantics are
 * defined by that package.
 */
const distSqDense = $distSq;
/**
 * Squared Euclidean distance of two sparse vectors given as arrays of
 * component indices. Each listed index counts as 1, so the result is the
 * number of indices that occur in exactly one of the two inputs (the size of
 * their symmetric difference).
 *
 * Uses a two-pointer merge walk, so both inputs are assumed to be sorted in
 * ascending order without duplicates.
 *
 * @param a - first index array
 * @param b - second index array
 * @returns number of indices present in only one of `a` / `b`
 */
const distSqSparse = (a, b) => {
  const na = a.length;
  const nb = b.length;
  let res = 0;
  let i = 0;
  let j = 0;
  while (i < na && j < nb) {
    const apos = a[i];
    const bpos = b[j];
    if (apos === bpos) {
      // Shared index contributes (1 - 1)^2 = 0.
      i++;
      j++;
    } else {
      // Index present in only one input contributes (1 - 0)^2 = 1.
      res++;
      if (apos < bpos) {
        i++;
      } else {
        j++;
      }
    }
  }
  // The walk stops once either input is exhausted; every index left over in
  // the other input has no counterpart and must still be counted.
  return res + (na - i) + (nb - j);
};
export {
cosineSimilarityDense,
cosineSimilaritySparse,
distSqDense,
distSqSparse,
dotProductDense,
dotProductSparse,
jaccardSimilarityDense,
jaccardSimilaritySparse
};