/*
 * @thi.ng/text-analysis
 * Version:
 * Text tokenization, transformation & analysis transducers, utilities, stop words, porter stemming, vector encodings, similarities
 * 31 lines (30 loc) • 937 B
 * JavaScript
 */
import { comp } from "@thi.ng/transducers/comp";
import { filter } from "@thi.ng/transducers/filter";
import { map } from "@thi.ng/transducers/map";
import { stemWord } from "./stem.js";
/**
 * Matches every character that is NOT a letter from the supported set:
 * ASCII `A-Z` / `a-z` plus the letters in U+00C0..U+017F (Latin-1
 * Supplement letters and Latin Extended-A).
 *
 * U+00D7 (multiplication sign) and U+00F7 (division sign) lie inside that
 * code point range but are math symbols, not letters, so the range is split
 * around them and they are treated as non-alphabetic.
 *
 * Global flag: intended for use with `String.prototype.replace` to strip all
 * matches in one pass.
 */
const RE_NON_ALPHA = /[^A-Za-z\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u017f]/g;
/**
 * Same as {@link RE_NON_ALPHA}, but ASCII digits `0-9` are also kept (i.e.
 * not matched).
 */
const RE_NON_ALPHANUM = /[^0-9A-Za-z\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u017f]/g;
/**
 * Transducer (built with `map`) which replaces each input with the result
 * of calling its `toLowerCase()` method.
 */
const lowercase = map((token) => {
  return token.toLowerCase();
});
/**
 * Transducer (built with `map`) which rewrites each input so that every run
 * of one or more whitespace characters becomes a single space character.
 * Leading and trailing whitespace runs are collapsed as well, not removed.
 */
const collapseWS = map((text) => {
  const collapsed = text.replace(/\s+/g, " ");
  return collapsed;
});
/**
 * Transducer (built with `filter`) which only keeps inputs that contain at
 * least one non-whitespace character. Empty strings and whitespace-only
 * strings are rejected.
 */
const removeEmpty = filter((token) => {
  const isBlank = /^\s*$/.test(token);
  return !isBlank;
});
/**
 * Composed transducer: deletes every character matched by `RE_NON_ALPHA`
 * from each input, combined (via `comp`) with `removeEmpty` so that inputs
 * with nothing left are dropped.
 * NOTE(review): assumes `comp` applies its transducers in the order given
 * (strip first, then filter) — confirm against @thi.ng/transducers.
 */
const removeNonAlpha = comp(
  map((token) => {
    return token.replace(RE_NON_ALPHA, "");
  }),
  removeEmpty
);
/**
 * Composed transducer: deletes every character matched by `RE_NON_ALPHANUM`
 * from each input, combined (via `comp`) with `removeEmpty` so that inputs
 * with nothing left are dropped.
 * NOTE(review): assumes `comp` applies its transducers in the order given
 * (strip first, then filter) — confirm against @thi.ng/transducers.
 */
const removeNonAlphaNum = comp(
  map((token) => {
    return token.replace(RE_NON_ALPHANUM, "");
  }),
  removeEmpty
);
/**
 * Returns a transducer (built with `filter`) which only keeps inputs whose
 * `length` lies within the closed interval `[min, max]`.
 *
 * @param min - smallest accepted length (inclusive)
 * @param max - largest accepted length (inclusive)
 */
const minMaxLength = (min, max) => {
  const isWithinBounds = (token) => {
    const size = token.length;
    return size >= min && size <= max;
  };
  return filter(isWithinBounds);
};
/**
 * Transducer (built with `map`) which replaces each input with the result
 * of `stemWord` (imported from `./stem.js`). `stemWord` is handed to `map`
 * directly, so it is called with whatever arguments `map` supplies.
 * NOTE(review): presumably a Porter stemmer, going by the package
 * description — confirm in ./stem.js.
 */
const stemOnly = map(stemWord);
/**
 * Returns a transducer (built with `filter`) which only keeps inputs for
 * which `vocab.has(input)` is truthy.
 *
 * @param vocab - any object exposing a `has()` membership method
 */
const vocabOnly = (vocab) => {
  const isKnown = (token) => vocab.has(token);
  return filter(isKnown);
};
// Public API of this module: every binding is exported under its local name.
// The two regex constants are not part of this list and stay module-private.
export {
collapseWS,
lowercase,
minMaxLength,
removeEmpty,
removeNonAlpha,
removeNonAlphaNum,
stemOnly,
vocabOnly
};