UNPKG

@thi.ng/text-analysis

Version:

Text tokenization, transformation & analysis transducers, utilities, stop words, porter stemming, vector encodings, similarities

31 lines (30 loc) 937 B
import { comp } from "@thi.ng/transducers/comp";
import { filter } from "@thi.ng/transducers/filter";
import { map } from "@thi.ng/transducers/map";
import { stemWord } from "./stem.js";

/**
 * Matches every character that is neither A-Z / a-z nor inside the code
 * point range U+00C0..U+017F.
 *
 * NOTE(review): that range also contains U+00D7 (×) and U+00F7 (÷), so those
 * two symbols are NOT matched (i.e. they are kept) — confirm this is intended.
 */
const RE_NON_ALPHA = /[^A-Za-z\u00c0-\u017f]/g;

/** Same as {@link RE_NON_ALPHA}, but additionally leaves the digits 0-9 alone. */
const RE_NON_ALPHANUM = /[^0-9A-Za-z\u00c0-\u017f]/g;

/** One or more consecutive whitespace characters. */
const RE_WS_RUN = /\s+/g;

/** A string that is empty or consists of whitespace only. */
const RE_BLANK = /^\s*$/;

/**
 * Builds a transducer which deletes every match of `pattern` from each
 * incoming string (private helper, not exported).
 *
 * @param {RegExp} pattern - regex whose matches are replaced with ""
 */
const stripMatches = (pattern) => map((token) => token.replace(pattern, ""));

/** Transducer: converts each incoming string via `toLowerCase()`. */
export const lowercase = map((token) => token.toLowerCase());

/**
 * Transducer: replaces each run of whitespace in a string with a single
 * space character.
 */
export const collapseWS = map((text) => text.replace(RE_WS_RUN, " "));

/**
 * Transducer: drops values which are empty or contain nothing but
 * whitespace; everything else passes through unchanged.
 */
export const removeEmpty = filter((token) => !RE_BLANK.test(token));

/**
 * Transducer: removes all characters matched by {@link RE_NON_ALPHA} from
 * each string, then applies {@link removeEmpty} to the result (relies on
 * `comp` applying its arguments left-to-right).
 */
export const removeNonAlpha = comp(stripMatches(RE_NON_ALPHA), removeEmpty);

/**
 * Transducer: like {@link removeNonAlpha}, but uses
 * {@link RE_NON_ALPHANUM}, i.e. digits are retained as well.
 */
export const removeNonAlphaNum = comp(
	stripMatches(RE_NON_ALPHANUM),
	removeEmpty
);

/**
 * Returns a filter transducer which only keeps values whose `length` lies
 * within `[min, max]` (both bounds inclusive).
 *
 * @param {number} min - smallest accepted length
 * @param {number} max - largest accepted length
 */
export const minMaxLength = (min, max) =>
	filter((token) => {
		const size = token.length;
		return size >= min && size <= max;
	});

/**
 * Transducer: maps each incoming value through `stemWord` (imported from
 * `./stem.js`).
 */
export const stemOnly = map(stemWord);

/**
 * Returns a filter transducer which only keeps values for which
 * `vocab.has(value)` is truthy.
 *
 * @param vocab - any object providing a `has(x)` method, e.g. a `Set`
 */
export const vocabOnly = (vocab) => filter((token) => vocab.has(token));