UNPKG

@thi.ng/text-analysis

Version:

Text tokenization, transformation & analysis transducers, utilities, stop words, Porter stemming, vector encodings, similarities

51 lines 1.54 kB
import type { Transducer } from "@thi.ng/transducers";
/**
 * Customizable string tokenizer with optional transducer-based token
 * transformation(s). Yields an iterator of tokens.
 *
 * @remarks
 * The package provides a number of composable string transducers which can be
 * listed here and will be applied in sequence for each input token.
 *
 * @example
 * ```ts tangle:../export/tokenize.ts
 * import * as ta from "@thi.ng/text-analysis";
 *
 * const input = `Do not go gentle into that good night,
 * Old age should burn and rave at close of day;
 * Rage, rage against the dying of the light.
 *
 * Though wise men at their end know dark is right,
 * Because their words had forked no lightning they
 * Do not go gentle into that good night.`;
 *
 * // tokenize input with given token transforms
 * // collect tokens into array
 * const tokens = [...ta.tokenize(
 *   input,
 *   [
 *     ta.lowercase,
 *     ta.removeNonAlphaNum,
 *     ta.removeStopWords()
 *   ]
 * )];
 *
 * console.log(tokens);
 * // [
 * //   "do", "not", "go", "gentle", "good", "night", "old", "age",
 * //   "burn", "rave", "close", "day", "rage", "rage", "dying", "light",
 * //   ...
 * // ]
 *
 * console.log(
 *   [...ta.tokenize(input, [ta.ngrams(2)])]
 * );
 * ```
 *
 * @param src - Input string to tokenize.
 * @param xforms - Optional list of string-to-string transducers, applied in
 * sequence (in the given order) for each input token.
 * @param delim - Optional token delimiter, given either as a regular
 * expression or a string. The default used when omitted is not visible in
 * this declaration (NOTE(review): confirm against the implementation).
 * @param includeDelim - Optional flag; presumably controls whether the
 * delimiters themselves are emitted alongside the tokens (NOTE(review):
 * confirm against the implementation, including its default).
 * @returns An iterable iterator over the resulting string tokens. Spread it
 * (as in the example) or use `for..of` to collect/consume the tokens.
 */
export declare const tokenize: (src: string, xforms?: Transducer<string, string>[], delim?: RegExp | string, includeDelim?: boolean) => IterableIterator<string>;
//# sourceMappingURL=tokenize.d.ts.map