UNPKG

@thi.ng/text-analysis

Version:

Text tokenization, transformation & analysis transducers, utilities, stop words, porter stemming, vector encodings, similarities

50 lines (49 loc) 1.13 kB
import { defVocab } from "./vocab.js";

/** Numeric comparator used to sort IDs in ascending order. */
const byAscending = (a, b) => a - b;

/**
 * Encodes a single document as a dense vector of length `vocab.size`.
 *
 * The IDs obtained from `vocab.getAll(doc)` are passed to {@link toDense}, so
 * the result holds 1 at each of those positions and 0 everywhere else.
 *
 * @param vocab - vocabulary providing `size` and `getAll()`
 *   (presumably maps tokens to numeric IDs — confirm against `./vocab.js`)
 * @param doc - document passed as-is to `vocab.getAll()`
 * @returns dense array of 0/1 values
 */
const encodeDense = (vocab, doc) => {
	const ids = vocab.getAll(doc);
	return toDense(vocab.size, ids);
};

/**
 * Builds a vocabulary from all given documents via `defVocab(docs)` and then
 * encodes each document with {@link encodeDense}.
 *
 * @param docs - array of documents
 * @returns object with the created `vocab` and the encoded `docs`
 */
const encodeAllDense = (docs) => {
	const vocab = defVocab(docs);
	const encoded = docs.map((doc) => encodeDense(vocab, doc));
	return { vocab, docs: encoded };
};

/**
 * Encodes a single document as a sparse vector: the values yielded by
 * `vocab.getAllUnique(src)` collected into a new array and sorted in ascending
 * numeric order.
 *
 * @param vocab - vocabulary providing `getAllUnique()`
 * @param src - document passed as-is to `vocab.getAllUnique()`
 * @returns sorted array of IDs
 */
const encodeSparse = (vocab, src) => {
	const ids = [...vocab.getAllUnique(src)];
	// sorting the fresh copy in place; the comparator avoids the default
	// lexicographic ordering
	ids.sort(byAscending);
	return ids;
};

/**
 * Builds a vocabulary from all given documents via `defVocab(docs)` and then
 * encodes each document with {@link encodeSparse}.
 *
 * @param docs - array of documents
 * @returns object with the created `vocab` and the encoded `docs`
 */
const encodeAllSparse = (docs) => {
	const vocab = defVocab(docs);
	const encoded = docs.map((doc) => encodeSparse(vocab, doc));
	return { vocab, docs: encoded };
};

/**
 * Decodes a dense vector: for every truthy component at position `i`, the
 * result of `vocab.getID(i)` is appended to the output (in positional order).
 *
 * @param vocab - vocabulary providing `getID()`
 * @param vec - iterable of vector components
 * @returns array of `vocab.getID()` results for all non-zero positions
 */
const decodeDense = (vocab, vec) => {
	const decoded = [];
	let pos = 0;
	for (const component of vec) {
		if (component) {
			decoded.push(vocab.getID(pos));
		}
		pos += 1;
	}
	return decoded;
};

/**
 * Decodes a sparse vector by delegating to `vocab.getAllIDs(vec)`.
 *
 * @param vocab - vocabulary providing `getAllIDs()`
 * @param vec - sparse vector (collection of IDs)
 * @returns whatever `vocab.getAllIDs()` returns for the given IDs
 */
const decodeSparse = (vocab, vec) => vocab.getAllIDs(vec);

/**
 * Converts a sparse vector into a dense one: creates an array of `dim` zeros
 * and writes 1 at every index yielded by `sparse`.
 *
 * @param dim - length of the dense vector
 * @param sparse - iterable of component indices to set
 * @returns dense array of 0/1 values
 */
const toDense = (dim, sparse) => {
	const dense = new Array(dim).fill(0);
	for (const idx of sparse) {
		dense[idx] = 1;
	}
	return dense;
};

/**
 * Converts a dense vector into a sparse one: collects the indices of all
 * truthy components, in ascending order.
 *
 * @param dense - indexable vector with a `length`
 * @returns array of indices of the non-zero components
 */
const toSparse = (dense) => {
	const indices = [];
	const total = dense.length;
	let pos = 0;
	while (pos < total) {
		if (dense[pos]) {
			indices.push(pos);
		}
		pos++;
	}
	return indices;
};

export {
	decodeDense,
	decodeSparse,
	encodeAllDense,
	encodeAllSparse,
	encodeDense,
	encodeSparse,
	toDense,
	toSparse
};