UNPKG

@thi.ng/text-analysis

Version:

Text tokenization, transformation & analysis transducers, utilities, stop words, porter stemming, vector encodings, similarities

150 lines 4.49 kB
import type { ReadonlyVec } from "@thi.ng/vectors";
import type { Vocab } from "./api.js";
/**
 * Encodes the given `doc` tokens into a dense multi-hot vector using provided
 * `vocab` (e.g. created via {@link defVocab}). The vector size is the number of
 * items in the vocab.
 *
 * @remarks
 * Also see {@link encodeSparse}.
 *
 * @example
 * ```ts tangle:../export/encode-dense.ts
 * import { defVocab, encodeDense, tokenize } from "@thi.ng/text-analysis";
 *
 * const vocab = defVocab(
 *   tokenize("the quick brown fox jumps over the lazy dog")
 * );
 *
 * console.log(encodeDense(vocab, tokenize("the brown dog jumps")));
 * // [ 1, 0, 1, 0, 1, 0, 0, 1 ]
 *
 * console.log(encodeDense(vocab, tokenize("the lazy fox")));
 * // [ 1, 0, 0, 1, 0, 0, 1, 0 ]
 * ```
 *
 * @param vocab - vocabulary used to look up tokens
 * @param doc - tokens of the document to encode
 * @returns dense multi-hot vector with one component per vocab item
 */
export declare const encodeDense: (vocab: Vocab, doc: Iterable<string>) => ReadonlyVec;
/**
 * Convenience function to create a vocabulary from given docs and encode each
 * doc into a dense multi-hot vector (using {@link encodeDense}).
 *
 * @param docs - documents, each given as an array of tokens
 * @returns object with the created `vocab` and the encoded `docs` (one vector
 * per input doc)
 */
export declare const encodeAllDense: (docs: string[][]) => { vocab: Vocab; docs: ReadonlyVec[]; };
/**
 * Encodes the given `src` tokens into a sparse vector using provided `vocab`
 * (created via {@link defVocab}). Only the IDs of matched tokens are stored.
 * The returned vector size depends on the number of used/matched tokens, at
 * most `vocab.size` (if entire vocab is used by `src`).
 *
 * @remarks
 * Also see {@link encodeDense} for alternative encoding.
 *
 * @example
 * ```ts tangle:../export/encode-sparse.ts
 * import { defVocab, encodeSparse, tokenize } from "@thi.ng/text-analysis";
 *
 * const vocab = defVocab(
 *   tokenize("the quick brown fox jumps over the lazy dog")
 * );
 *
 * console.log(encodeSparse(vocab, tokenize("the brown dog jumps")));
 * // [ 0, 2, 4, 7 ]
 *
 * console.log(encodeSparse(vocab, tokenize("the lazy fox")));
 * // [ 0, 3, 6 ]
 * ```
 *
 * @param vocab - vocabulary used to look up tokens
 * @param src - tokens to encode
 * @returns sparse vector holding the vocab IDs of the matched tokens
 */
export declare const encodeSparse: (vocab: Vocab, src: Iterable<string>) => ReadonlyVec;
/**
 * Convenience function to create a vocabulary from given docs and encode each
 * doc into a sparse multi-hot vector (using {@link encodeSparse}).
 *
 * @param docs - documents, each given as an array of tokens
 * @returns object with the created `vocab` and the encoded `docs` (one vector
 * per input doc)
 */
export declare const encodeAllSparse: (docs: string[][]) => { vocab: Vocab; docs: ReadonlyVec[]; };
/**
 * Reverse op of {@link encodeDense}. Decodes dense multi-hot vector to extract
 * tokens from provided `vocab` (created via {@link defVocab}). The returned
 * array only contains the corresponding tokens of the vector's non-zero
 * components.
 *
 * @remarks
 * Also see {@link decodeSparse}.
 *
 * @example
 * ```ts tangle:../export/decode-dense.ts
 * import { defVocab, decodeDense, tokenize } from "@thi.ng/text-analysis";
 *
 * const vocab = defVocab(
 *   tokenize("the quick brown fox jumps over the lazy dog")
 * );
 *
 * console.log(decodeDense(vocab, [1, 0, 1, 0, 1, 0, 0, 1]));
 * // [ "the", "brown", "jumps", "dog" ]
 *
 * console.log(decodeDense(vocab, [1, 0, 0, 1, 0, 0, 1, 0]));
 * // [ "the", "fox", "lazy" ]
 * ```
 *
 * @param vocab - vocabulary used to look up tokens
 * @param vec - dense multi-hot vector to decode
 * @returns tokens corresponding to the non-zero components of `vec`
 */
export declare const decodeDense: (vocab: Vocab, vec: Iterable<number>) => string[];
/**
 * Reverse op of {@link encodeSparse}. Decodes sparse vector (created via
 * {@link encodeSparse}) to extract tokens from provided `vocab` (created via
 * {@link defVocab}).
 *
 * @remarks
 * Also see {@link decodeDense}.
 *
 * @example
 * ```ts tangle:../export/decode-sparse.ts
 * import { defVocab, decodeSparse, tokenize } from "@thi.ng/text-analysis";
 *
 * const vocab = defVocab(
 *   tokenize("the quick brown fox jumps over the lazy dog")
 * );
 *
 * console.log(decodeSparse(vocab, [0, 2, 4, 7]));
 * // [ "the", "brown", "jumps", "dog" ]
 *
 * console.log(decodeSparse(vocab, [0, 3, 6]));
 * // [ "the", "fox", "lazy" ]
 * ```
 *
 * @param vocab - vocabulary used to look up tokens
 * @param vec - sparse vector of vocab IDs to decode
 * @returns tokens corresponding to the IDs in `vec`
 */
export declare const decodeSparse: (vocab: Vocab, vec: Iterable<number>) => string[];
/**
 * Converts given multi-hot sparse vector (e.g. created via {@link encodeSparse})
 * into a dense representation.
 *
 * @param dim - dimension of the dense vector (presumably the size of the
 * vocab used for encoding — TODO confirm against implementation)
 * @param sparse - sparse multi-hot vector to convert
 * @returns dense representation of `sparse`
 */
export declare const toDense: (dim: number, sparse: ReadonlyVec) => ReadonlyVec;
/**
 * Converts given multi-hot dense vector (e.g. created via {@link encodeDense})
 * into a sparse representation.
 *
 * @param dense - dense multi-hot vector to convert
 * @returns sparse representation of `dense`
 */
export declare const toSparse: (dense: ReadonlyVec) => ReadonlyVec;
//# sourceMappingURL=vec.d.ts.map