@thi.ng/text-analysis
Version:
Text tokenization, transformation & analysis transducers, utilities, stop words, porter stemming, vector encodings, similarities
64 lines (63 loc) • 2.01 kB
JavaScript
import { transduce } from "@thi.ng/transducers/transduce";
import { frequencies, normFrequencies } from "./frequencies.js";
import { vocabOnly } from "./xform.js";
import { defVocab } from "./vocab.js";
// Local alias for the base-10 logarithm used by the TF / IDF formulas below.
const log10 = Math.log10;
/**
 * Term-frequency function: reduces `docTokens` through the `vocabOnly(vocab)`
 * transducer using the `frequencies()` reducer and returns the result.
 *
 * NOTE(review): presumably yields raw occurrence counts for in-vocab tokens
 * only; confirm against `./xform.js` and `./frequencies.js`.
 *
 * @param vocab - vocabulary passed to `vocabOnly()`
 * @param docTokens - tokens of a single document
 * @returns whatever `transduce()` produces for the `frequencies()` reducer
 */
const tfCount = (vocab, docTokens) => {
  const xform = vocabOnly(vocab);
  const reducer = frequencies();
  return transduce(xform, reducer, docTokens);
};
/**
 * Term-frequency function: reduces `docTokens` through the `vocabOnly(vocab)`
 * transducer using the `normFrequencies()` reducer and returns the result.
 *
 * NOTE(review): presumably yields normalized (relative) frequencies for
 * in-vocab tokens only; confirm against `./frequencies.js`.
 *
 * @param vocab - vocabulary passed to `vocabOnly()`
 * @param docTokens - tokens of a single document
 * @returns whatever `transduce()` produces for the `normFrequencies()` reducer
 */
const tfNormalized = (vocab, docTokens) => {
  const xform = vocabOnly(vocab);
  const reducer = normFrequencies();
  return transduce(xform, reducer, docTokens);
};
/**
 * Log-scaled term frequency: computes the same result as reducing
 * `docTokens` via `vocabOnly(vocab)` and `frequencies()`, then replaces every
 * value `count` in that result with `log10(1 + count)`.
 *
 * The collection returned by `transduce()` is updated in place and returned.
 *
 * @param vocab - vocabulary passed to `vocabOnly()`
 * @param docTokens - tokens of a single document
 * @returns the frequency collection with log-scaled values
 */
const tfLog = (vocab, docTokens) => {
  const xform = vocabOnly(vocab);
  const reducer = frequencies();
  const scaled = transduce(xform, reducer, docTokens);
  // Only existing keys are overwritten, so iterating while updating is safe.
  for (const [term, occurrences] of scaled) {
    scaled.set(term, log10(1 + occurrences));
  }
  return scaled;
};
/**
 * Higher-order function which turns an IDF weighting formula into a function
 * computing that weight for every word of a vocabulary.
 *
 * The returned function counts, for each key of `vocab`, the number of
 * documents in `tokenizedDocs` containing that word at least once and then
 * calls `fnIDF(docsWithTerm, numDocs)` to obtain the word's weight.
 *
 * @param fnIDF - formula receiving `(docsWithTerm, numDocs)`, returns the weight
 * @returns function `(vocab, tokenizedDocs) => Map` of word -> weight, in the
 * iteration order of `vocab.keys()`
 */
const defIDF = (fnIDF) => (vocab, tokenizedDocs) => {
  // Index each document's tokens once, so the per-word membership test is a
  // constant-time lookup instead of a linear scan of the token array
  // (`Set.has` and `Array.includes` both use SameValueZero comparison).
  const docSets = Array.from(tokenizedDocs, (doc) => new Set(doc));
  const numDocs = tokenizedDocs.length;
  const acc = /* @__PURE__ */ new Map();
  for (const word of vocab.keys()) {
    let count = 0;
    for (const tokens of docSets) {
      if (tokens.has(word)) count++;
    }
    acc.set(word, fnIDF(count, numDocs));
  }
  return acc;
};
/**
 * IDF function built via `defIDF()` using the formula
 * `log10(numDocs / docsWithTerm)`.
 *
 * Note: there is no guard against `docsWithTerm` being 0, i.e. the division
 * is performed as-is for words which occur in none of the documents.
 */
const idfClassic = defIDF((matching, total) => {
  const ratio = total / matching;
  return log10(ratio);
});
/**
 * IDF function built via `defIDF()` using the formula
 * `1 + log10(numDocs / (1 + docsWithTerm))`.
 *
 * The `1 + docsWithTerm` denominator is never 0 for non-negative counts.
 */
const idfSmooth = defIDF((matching, total) => {
  const ratio = total / (1 + matching);
  return 1 + log10(ratio);
});
/**
 * IDF function built via `defIDF()` using the formula
 * `log10((numDocs - docsWithTerm) / docsWithTerm)`.
 *
 * Note: there is no guard against `docsWithTerm` being 0, nor against it
 * being equal to `numDocs` (in which case the logarithm's argument is 0).
 */
const idfProbabilistic = defIDF((matching, total) => {
  const odds = (total - matching) / matching;
  return log10(odds);
});
/**
 * Higher-order function combining a term-frequency function and an
 * inverse-document-frequency function into a TF-IDF function.
 *
 * The returned function computes the IDF weights once via
 * `fnIDF(vocab, tokenizedDocs)`. It then produces one result object per
 * document, holding:
 *
 * - `doc`: the original document
 * - `tf`: the result of `fnTF(vocab, doc)`
 * - `idf`: the IDF weights (the same object for every document)
 * - `tfidf`: a new Map of word -> `tf * idf` for each entry in `tf`
 *
 * @param fnTF - `(vocab, doc)` => iterable of `[word, frequency]` pairs
 * @param fnIDF - `(vocab, tokenizedDocs)` => object with `get(word)`
 * @returns function `(vocab, tokenizedDocs)` => array of result objects
 */
const defTFIDF = (fnTF, fnIDF) => (vocab, tokenizedDocs) => {
  const idf = fnIDF(vocab, tokenizedDocs);
  const analyzeDoc = (doc) => {
    const tf = fnTF(vocab, doc);
    const weighted = Array.from(tf, ([term, freq]) => [
      term,
      freq * idf.get(term),
    ]);
    return { doc, tf, idf, tfidf: /* @__PURE__ */ new Map(weighted) };
  };
  return tokenizedDocs.map((doc) => analyzeDoc(doc));
};
/**
 * Default TF-IDF function, created via `defTFIDF()` from `tfNormalized` (term
 * frequency) and `idfClassic` (inverse document frequency).
 *
 * Takes `(vocab, tokenizedDocs)` and returns one `{ doc, tf, idf, tfidf }`
 * object per document.
 */
const tfidf = defTFIDF(tfNormalized, idfClassic);
/**
 * Filters the words of each document using their IDF weight.
 *
 * Computes the IDF weights for `docs` via `fnIDF`, then returns a new array
 * of documents, each keeping only those words which are in the vocabulary
 * AND for which `pred(word, idf)` returns a truthy value.
 *
 * @param docs - array of tokenized documents (arrays of words)
 * @param pred - predicate called with `(word, idfOfWord)`
 * @param vocab - optional vocabulary; if falsy, `defVocab(docs)` is used
 * @param fnIDF - IDF function `(vocab, docs)`, defaults to `idfClassic`
 * @returns array of filtered documents, in the same order as `docs`
 */
const filterDocsIDF = (docs, pred, vocab, fnIDF = idfClassic) => {
  const activeVocab = vocab ? vocab : defVocab(docs);
  const weights = fnIDF(activeVocab, docs);
  // Vocabulary membership is checked first, so `pred` never sees unknown words.
  const isKept = (word) =>
    activeVocab.has(word) && pred(word, weights.get(word));
  return docs.map((doc) => doc.filter((word) => isKept(word)));
};
// Public API: TF functions, IDF functions & factories, TF-IDF and IDF-based
// document filtering.
export {
defIDF,
defTFIDF,
filterDocsIDF,
idfClassic,
idfProbabilistic,
idfSmooth,
tfCount,
tfLog,
tfNormalized,
tfidf
};