@thi.ng/text-analysis
Version:
Text tokenization, transformation & analysis transducers, utilities, stop words, porter stemming, vector encodings, similarities
51 lines • 1.54 kB
TypeScript
import type { Transducer } from "@thi.ng/transducers";
/**
 * Customizable string tokenizer with optional transducer-based token
 * transformation(s). Yields an iterator of tokens.
 *
 * @remarks
 * The package provides a number of composable string transducers which can be
 * passed via `xforms` and will be applied in sequence for each input token.
 *
 * All parameters other than `src` are optional. Their default values are set
 * by the implementation and are not visible in this declaration.
 *
 * @example
 * ```ts tangle:../export/tokenize.ts
 * import * as ta from "@thi.ng/text-analysis";
 *
 * const input = `Do not go gentle into that good night,
 * Old age should burn and rave at close of day;
 * Rage, rage against the dying of the light.
 *
 * Though wise men at their end know dark is right,
 * Because their words had forked no lightning they
 * Do not go gentle into that good night.`;
 *
 * // tokenize input with given token transforms
 * // collect tokens into array
 * const tokens = [...ta.tokenize(
 *   input,
 *   [
 *     ta.lowercase,
 *     ta.removeNonAlphaNum,
 *     ta.removeStopWords()
 *   ]
 * )];
 *
 * console.log(tokens);
 * // [
 * //   "do", "not", "go", "gentle", "good", "night", "old", "age",
 * //   "burn", "rave", "close", "day", "rage", "rage", "dying", "light",
 * //   ...
 * // ]
 *
 * // transducers can also regroup tokens, e.g. via `ngrams`
 * console.log(
 *   [...ta.tokenize(input, [ta.ngrams(2)])]
 * );
 * ```
 *
 * @param src - input string to tokenize
 * @param xforms - optional array of string-to-string transducers, applied in
 * sequence to each token
 * @param delim - optional token delimiter, given as regexp or string;
 * presumably used to split `src` into tokens (NOTE(review): confirm the
 * default and exact matching semantics against the implementation)
 * @param includeDelim - optional flag; presumably controls whether the
 * delimiters themselves are also emitted as tokens (NOTE(review): confirm
 * against the implementation)
 *
 * @returns Iterator of the resulting string tokens.
 */
export declare const tokenize: (src: string, xforms?: Transducer<string, string>[], delim?: RegExp | string, includeDelim?: boolean) => IterableIterator<string>;
//# sourceMappingURL=tokenize.d.ts.map