@thi.ng/text-analysis
Version:
Text tokenization, transformation & analysis transducers, utilities, stop words, porter stemming, vector encodings, similarities
53 lines • 1.42 kB
TypeScript
import type { SerializedVocab, Vocab } from "./api.js";
/**
 * Creates a bi-directional index storing the unique tokens from given `src`,
 * optionally using a custom `start` ID offset (default: 0). This index can
 * then be used with {@link encodeDense}, {@link encodeSparse} and related
 * functions.
 *
 * @remarks
 * This function is syntax sugar for
 * [thi.ng/bidir-index](https://thi.ng/bidir-index).
 *
 * Each distinct token is only indexed once: in the example below the second
 * occurrence of "the" does not produce a new entry.
 *
 * The vocab/index can be serialized to JSON and later re-created by passing
 * the serialized form to `defVocab()`.
 *
 * @example
 * ```ts tangle:../export/def-vocab.ts
 * import { defVocab, tokenize } from "@thi.ng/text-analysis";
 *
 * const vocab = defVocab(
 *   tokenize("the quick brown fox jumps over the lazy dog")
 * );
 *
 * console.log([...vocab.entries()]);
 * // [
 * //   [ "the", 0 ],
 * //   [ "quick", 1 ],
 * //   [ "brown", 2 ],
 * //   [ "fox", 3 ],
 * //   [ "jumps", 4 ],
 * //   [ "over", 5 ],
 * //   [ "lazy", 6 ],
 * //   [ "dog", 7 ]
 * // ]
 *
 * // token => ID
 * console.log(vocab.get("fox"))
 * // 3
 *
 * // ID => token
 * console.log(vocab.getID(3))
 * // "fox"
 * ```
 *
 * @param src - iterable of tokens, or an array of such iterables
 * (NOTE(review): presumably all iterables in the array are indexed into the
 * same vocab — confirm against the implementation)
 * @param start - optional ID assigned to the first indexed token (default: 0)
 * @returns the vocab (bi-directional token/ID index)
 */
export declare function defVocab(src: Iterable<string> | Iterable<string>[], start?: number): Vocab;
/**
 * (Re)creates a bi-directional vocab index from a previously serialized state
 * (e.g. one obtained via `vocab.toJSON()`).
 *
 * @param vocab - serialized vocab state to restore the index from
 * @returns the re-created vocab (bi-directional token/ID index)
 */
export declare function defVocab(vocab: SerializedVocab): Vocab;
//# sourceMappingURL=vocab.d.ts.map