@thi.ng/text-analysis
Version:
Text tokenization, transformation & analysis transducers, utilities, stop words, porter stemming, vector encodings, similarities
50 lines (49 loc) • 1.13 kB
JavaScript
import { defVocab } from "./vocab.js";
/**
 * Encodes a single document as a dense vector whose length is `vocab.size`.
 *
 * The document is first resolved via `vocab.getAll(doc)` (presumably yielding
 * the numeric IDs of the document's tokens — confirm against `vocab.js`) and
 * the result is then handed to {@link toDense}, which marks each of those
 * positions with `1`.
 *
 * @param vocab - vocabulary object exposing `size` and `getAll()`
 * @param doc - document to encode (passed through to `vocab.getAll()`)
 * @returns dense array of `0`/`1` components
 */
const encodeDense = (vocab, doc) => {
  const dim = vocab.size;
  const ids = vocab.getAll(doc);
  return toDense(dim, ids);
};
/**
 * Builds a vocabulary from all given documents via `defVocab()`, then encodes
 * every document as a dense vector with {@link encodeDense} using that shared
 * vocabulary.
 *
 * @param docs - array of documents (must support `.map()`)
 * @returns object holding the created `vocab` and the encoded `docs`, the
 * latter in the same order as the input
 */
const encodeAllDense = (docs) => {
  const vocab = defVocab(docs);
  const encoded = docs.map((doc) => encodeDense(vocab, doc));
  return { vocab, docs: encoded };
};
/**
 * Encodes a single document as a sparse vector: the values produced by
 * `vocab.getAllUnique(src)` are copied into a new array and sorted in
 * ascending numeric order.
 *
 * @param vocab - vocabulary object exposing `getAllUnique()`, which must
 * return an iterable of numbers
 * @param src - document to encode (passed through to `vocab.getAllUnique()`)
 * @returns new, numerically sorted array of the returned values
 */
const encodeSparse = (vocab, src) => {
  const ids = [...vocab.getAllUnique(src)];
  // Explicit comparator: the default sort would order numbers lexicographically.
  const ascending = (lhs, rhs) => lhs - rhs;
  return ids.sort(ascending);
};
/**
 * Builds a vocabulary from all given documents via `defVocab()`, then encodes
 * every document as a sparse vector with {@link encodeSparse} using that
 * shared vocabulary.
 *
 * @param docs - array of documents (must support `.map()`)
 * @returns object holding the created `vocab` and the encoded `docs`, the
 * latter in the same order as the input
 */
const encodeAllSparse = (docs) => {
  const vocab = defVocab(docs);
  const encoded = docs.map((doc) => encodeSparse(vocab, doc));
  return { vocab, docs: encoded };
};
/**
 * Decodes a dense vector: for every truthy component, the component's
 * position is looked up via `vocab.getID()` and the result is collected.
 *
 * @param vocab - vocabulary object exposing `getID(index)`
 * @param vec - iterable of vector components; any truthy value counts as set
 * @returns array of `vocab.getID()` results, in order of position
 */
const decodeDense = (vocab, vec) => {
  const decoded = [];
  // Starts at -1 so the increment can happen up front, before the skip check.
  let index = -1;
  for (const component of vec) {
    index += 1;
    if (!component) continue;
    decoded.push(vocab.getID(index));
  }
  return decoded;
};
/**
 * Decodes a sparse vector by delegating to `vocab.getAllIDs()`.
 *
 * @param vocab - vocabulary object exposing `getAllIDs()`
 * @param vec - sparse vector, handed to `vocab.getAllIDs()` unchanged
 * @returns whatever `vocab.getAllIDs(vec)` returns
 */
const decodeSparse = (vocab, vec) => {
  const decoded = vocab.getAllIDs(vec);
  return decoded;
};
/**
 * Converts a sparse vector (an iterable of component indices) into a dense
 * array of length `dim`: every component starts out as `0` and each listed
 * index is set to `1`.
 *
 * @param dim - length of the array to create
 * @param sparse - iterable of indices to set
 * @returns newly allocated dense array
 */
const toDense = (dim, sparse) => {
  const dense = new Array(dim);
  dense.fill(0);
  for (const index of sparse) {
    dense[index] = 1;
  }
  return dense;
};
/**
 * Converts a dense vector into a sparse one by collecting, in ascending
 * order, the indices of all truthy components.
 *
 * @param dense - indexable collection with a `length` property
 * @returns new array of indices whose component is truthy
 */
const toSparse = (dense) => {
  const indices = [];
  // Length is read once up front, before any component is inspected.
  const size = dense.length;
  let pos = 0;
  while (pos < size) {
    if (dense[pos]) {
      indices.push(pos);
    }
    pos += 1;
  }
  return indices;
};
export {
decodeDense,
decodeSparse,
encodeAllDense,
encodeAllSparse,
encodeDense,
encodeSparse,
toDense,
toSparse
};