UNPKG

@thi.ng/text-analysis

Version:

Text tokenization, transformation & analysis transducers, utilities, stop words, porter stemming, vector encodings, similarities

64 lines (63 loc) 2.01 kB
import { transduce } from "@thi.ng/transducers/transduce";
import { frequencies, normFrequencies } from "./frequencies.js";
import { vocabOnly } from "./xform.js";
import { defVocab } from "./vocab.js";

const { log10 } = Math;

/**
 * Term-frequency function: reduces `docTokens` with the `frequencies()`
 * reducer after passing them through the `vocabOnly(vocab)` transducer.
 *
 * NOTE(review): `vocabOnly` and `frequencies` are defined in sibling modules;
 * presumably tokens outside `vocab` are dropped and the result maps each
 * remaining token to its occurrence count — confirm against those modules.
 *
 * @param vocab - vocabulary passed to `vocabOnly()`
 * @param docTokens - tokens of a single document
 * @returns the result of the `frequencies()` reduction
 */
const tfCount = (vocab, docTokens) =>
  transduce(vocabOnly(vocab), frequencies(), docTokens);

/**
 * Same pipeline as {@link tfCount}, but reduces with `normFrequencies()`
 * instead of `frequencies()`.
 *
 * NOTE(review): presumably yields counts normalized by the number of
 * vocabulary tokens in the document — confirm against `./frequencies.js`.
 *
 * @param vocab - vocabulary passed to `vocabOnly()`
 * @param docTokens - tokens of a single document
 * @returns the result of the `normFrequencies()` reduction
 */
const tfNormalized = (vocab, docTokens) =>
  transduce(vocabOnly(vocab), normFrequencies(), docTokens);

/**
 * Log-scaled term frequency: computes the same reduction as {@link tfCount},
 * then overwrites every entry with `log10(1 + count)`.
 *
 * @param vocab - vocabulary passed to `vocabOnly()`
 * @param docTokens - tokens of a single document
 * @returns the (in-place updated) frequency map
 */
const tfLog = (vocab, docTokens) => {
  const res = transduce(vocabOnly(vocab), frequencies(), docTokens);
  // Only values of already existing keys are replaced, so updating the map
  // while iterating over it does not add entries to the iteration.
  for (const [word, count] of res) res.set(word, log10(1 + count));
  return res;
};

/**
 * Higher-order function: takes a scoring function `fnIDF` and returns an IDF
 * function `(vocab, tokenizedDocs) => Map<word, score>`.
 *
 * For every key yielded by `vocab.keys()`, the returned function counts the
 * documents containing that word and stores
 * `fnIDF(docsWithTerm, tokenizedDocs.length)` under that word.
 *
 * @param fnIDF - `(docsWithTerm, numDocs) => number`
 * @returns IDF function producing a `Map` keyed by vocabulary word
 */
const defIDF = (fnIDF) => (vocab, tokenizedDocs) => {
  // Index the distinct tokens of each document once, so that the membership
  // test per vocabulary word is a Set lookup rather than a rescan of the
  // document's token array. `Set.prototype.has` uses the same SameValueZero
  // comparison as `Array.prototype.includes`, so the counts are unchanged.
  const docTokenSets = Array.from(tokenizedDocs, (doc) => new Set(doc));
  const acc = /* @__PURE__ */ new Map();
  for (const word of vocab.keys()) {
    let count = 0;
    for (const tokens of docTokenSets) {
      if (tokens.has(word)) count++;
    }
    acc.set(word, fnIDF(count, tokenizedDocs.length));
  }
  return acc;
};

/**
 * IDF function computing `log10(numDocs / docsWithTerm)`.
 *
 * Caveat: a vocabulary word occurring in none of the documents divides by
 * zero, so its score is `Infinity` (or `NaN` if there are no documents).
 */
const idfClassic = defIDF(
  (docsWithTerm, numDocs) => log10(numDocs / docsWithTerm)
);

/**
 * IDF function computing `1 + log10(numDocs / (1 + docsWithTerm))`. The
 * `1 + docsWithTerm` denominator avoids a division by zero for words which
 * occur in no document.
 */
const idfSmooth = defIDF(
  (docsWithTerm, numDocs) => 1 + log10(numDocs / (1 + docsWithTerm))
);

/**
 * IDF function computing `log10((numDocs - docsWithTerm) / docsWithTerm)`.
 *
 * Caveat: a word occurring in every document yields `log10(0)`, i.e.
 * `-Infinity`; a word occurring in no document divides by zero.
 */
const idfProbabilistic = defIDF(
  (docsWithTerm, numDocs) => log10((numDocs - docsWithTerm) / docsWithTerm)
);

/**
 * Higher-order function: combines a term-frequency function and an IDF
 * function into a TF-IDF function `(vocab, tokenizedDocs) => results[]`.
 *
 * The IDF map is computed once for the whole corpus. Each document then
 * produces one result object `{ doc, tf, idf, tfidf }`, where `tfidf` maps
 * every word of that document's `tf` result to `tf * idf`. All result
 * objects reference the same `idf` map instance.
 *
 * @param fnTF - `(vocab, docTokens) => iterable of [word, frequency]`
 * @param fnIDF - `(vocab, tokenizedDocs) => Map<word, idf>`
 * @returns TF-IDF function
 */
const defTFIDF = (fnTF, fnIDF) => (vocab, tokenizedDocs) => {
  const idf = fnIDF(vocab, tokenizedDocs);
  return tokenizedDocs.map((doc) => {
    const tf = fnTF(vocab, doc);
    const acc = /* @__PURE__ */ new Map();
    for (const [word, f] of tf) {
      acc.set(word, f * idf.get(word));
    }
    return { doc, tf, idf, tfidf: acc };
  });
};

/**
 * TF-IDF function built from {@link tfNormalized} and {@link idfClassic}.
 */
const tfidf = defTFIDF(tfNormalized, idfClassic);

/**
 * Filters the tokens of every document by their IDF score. A token is kept
 * only if `vocab.has(word)` is true AND `pred(word, idf)` returns a truthy
 * value, where `idf` is the word's score in the map computed by `fnIDF`.
 *
 * @param docs - tokenized documents (array of token arrays)
 * @param pred - `(word, idf) => boolean`
 * @param vocab - optional vocabulary; if falsy, `defVocab(docs)` is used
 * @param fnIDF - IDF function, defaults to {@link idfClassic}
 * @returns new array of filtered token arrays (inputs are not modified)
 */
const filterDocsIDF = (docs, pred, vocab, fnIDF = idfClassic) => {
  if (!vocab) vocab = defVocab(docs);
  const idf = fnIDF(vocab, docs);
  return docs.map(
    (doc) => doc.filter((word) => vocab.has(word) && pred(word, idf.get(word)))
  );
};

export {
  defIDF,
  defTFIDF,
  filterDocsIDF,
  idfClassic,
  idfProbabilistic,
  idfSmooth,
  tfCount,
  tfLog,
  tfNormalized,
  tfidf
};