UNPKG

@thi.ng/text-analysis

Version:

Text tokenization, transformation & analysis transducers, utilities, stop words, porter stemming, vector encodings, similarities

53 lines 1.42 kB
import type { SerializedVocab, Vocab } from "./api.js";
/**
 * Creates a bi-directional index storing unique tokens from given `src`
 * iterable, optionally using custom `start` ID offset (default: 0). This index
 * can then be used with {@link encodeDense}, {@link encodeSparse} and related
 * functions.
 *
 * @remarks
 * This function is syntax sugar for
 * [thi.ng/bidir-index](https://thi.ng/bidir-index).
 *
 * The vocab/index can be serialized to JSON and then re-created via
 * `defVocab()`, using the overload which accepts a {@link SerializedVocab}.
 *
 * Only unique tokens are indexed: in the example below the token `"the"`
 * occurs twice in the input, but has only a single entry in the index.
 *
 * NOTE(review): the signature also accepts an array of iterables. Presumably
 * the tokens of all of them are added to the same index, but this cannot be
 * confirmed from the declaration alone — verify against the implementation.
 *
 * @example
 * ```ts tangle:../export/def-vocab.ts
 * import { defVocab, tokenize } from "@thi.ng/text-analysis";
 *
 * const vocab = defVocab(
 *   tokenize("the quick brown fox jumps over the lazy dog")
 * );
 *
 * console.log([...vocab.entries()]);
 * // [
 * //   [ "the", 0 ],
 * //   [ "quick", 1 ],
 * //   [ "brown", 2 ],
 * //   [ "fox", 3 ],
 * //   [ "jumps", 4 ],
 * //   [ "over", 5 ],
 * //   [ "lazy", 6 ],
 * //   [ "dog", 7 ]
 * // ]
 *
 * console.log(vocab.get("fox"))
 * // 3
 *
 * console.log(vocab.getID(3))
 * // "fox"
 * ```
 *
 * @param src - iterable of string tokens (or an array of such iterables) to
 * index
 * @param start - optional ID offset, i.e. the ID assigned to the first indexed
 * token (default: 0)
 * @returns the newly created vocab index
 */
export declare function defVocab(src: Iterable<string> | Iterable<string>[], start?: number): Vocab;
/**
 * (Re)creates bi-directional vocab index from previous serialized state (e.g.
 * via `vocab.toJSON()`).
 *
 * @param vocab - previously serialized vocab state, e.g. the result of
 * `vocab.toJSON()`
 * @returns the (re)created vocab index
 */
export declare function defVocab(vocab: SerializedVocab): Vocab;
//# sourceMappingURL=vocab.d.ts.map