@thi.ng/text-analysis
Version:
Text tokenization, transformation & analysis transducers, utilities, stop words, porter stemming, vector encodings, similarities
150 lines • 4.49 kB
TypeScript
import type { ReadonlyVec } from "@thi.ng/vectors";
import type { Vocab } from "./api.js";
/**
* Encodes the given `doc` tokens into a dense multi-hot vector using provided
* `vocab` (e.g. created via {@link defVocab}). The vector size is the number of
* items in the vocab.
*
* @remarks
* Also see {@link encodeSparse}.
*
* @example
* ```ts tangle:../export/encode-dense.ts
* import { defVocab, encodeDense, tokenize } from "@thi.ng/text-analysis";
*
* const vocab = defVocab(
* tokenize("the quick brown fox jumps over the lazy dog")
* );
*
* console.log(encodeDense(vocab, tokenize("the brown dog jumps")));
* // [ 1, 0, 1, 0, 1, 0, 0, 1 ]
*
* console.log(encodeDense(vocab, tokenize("the lazy fox")));
* // [ 1, 0, 0, 1, 0, 0, 1, 0 ]
* ```
*
* @param vocab - vocabulary to encode the tokens against
* @param doc - tokens of the document to encode
* @returns dense vector with one component per vocab item
*/
export declare const encodeDense: (vocab: Vocab, doc: Iterable<string>) => ReadonlyVec;
/**
* Convenience function to create a vocabulary from given docs and encode each
* doc into a dense multi-hot vector (using {@link encodeDense}).
*
* @param docs - documents to process, each given as an array of tokens
* @returns object containing the created `vocab` and the encoded dense
* vectors (`docs`)
*/
export declare const encodeAllDense: (docs: string[][]) => {
vocab: Vocab;
docs: ReadonlyVec[];
};
/**
* Encodes the given `src` tokens into a sparse vector using provided `vocab`
* (created via {@link defVocab}). Only the IDs of matched tokens are stored.
* The returned vector size depends on the number of used/matched tokens, at
* most `vocab.size` (if entire vocab is used by `src`).
*
* @remarks
* Also see {@link encodeDense} for alternative encoding.
*
* @example
* ```ts tangle:../export/encode-sparse.ts
* import { defVocab, encodeSparse, tokenize } from "@thi.ng/text-analysis";
*
* const vocab = defVocab(
* tokenize("the quick brown fox jumps over the lazy dog")
* );
*
* console.log(encodeSparse(vocab, tokenize("the brown dog jumps")));
* // [ 0, 2, 4, 7 ]
*
* console.log(encodeSparse(vocab, tokenize("the lazy fox")));
* // [ 0, 3, 6 ]
* ```
*
* @param vocab - vocabulary to encode the tokens against
* @param src - tokens to encode
* @returns sparse vector holding the vocab IDs of the matched tokens
*/
export declare const encodeSparse: (vocab: Vocab, src: Iterable<string>) => ReadonlyVec;
/**
* Convenience function to create a vocabulary from given docs and encode each
* doc into a sparse multi-hot vector (using {@link encodeSparse}).
*
* @param docs - documents to process, each given as an array of tokens
* @returns object containing the created `vocab` and the encoded sparse
* vectors (`docs`)
*/
export declare const encodeAllSparse: (docs: string[][]) => {
vocab: Vocab;
docs: ReadonlyVec[];
};
/**
* Reverse op of {@link encodeDense}. Decodes dense multi-hot vector to extract
* tokens from provided `vocab` (created via {@link defVocab}). The returned
* array only contains the corresponding tokens of the vector's non-zero
* components.
*
* @remarks
* Also see {@link decodeSparse}.
*
* @example
* ```ts tangle:../export/decode-dense.ts
* import { defVocab, decodeDense, tokenize } from "@thi.ng/text-analysis";
*
* const vocab = defVocab(
* tokenize("the quick brown fox jumps over the lazy dog")
* );
*
* console.log(decodeDense(vocab, [1, 0, 1, 0, 1, 0, 0, 1]));
* // [ "the", "brown", "jumps", "dog" ]
*
* console.log(decodeDense(vocab, [1, 0, 0, 1, 0, 0, 1, 0]));
* // [ "the", "fox", "lazy" ]
* ```
*
* @param vocab - vocabulary used to look up the tokens
* @param vec - dense multi-hot vector to decode
* @returns tokens corresponding to the non-zero components of `vec`
*/
export declare const decodeDense: (vocab: Vocab, vec: Iterable<number>) => string[];
/**
* Reverse op of {@link encodeSparse}. Decodes sparse vector (created via
* {@link encodeSparse}) to extract tokens from provided `vocab` (created via
* {@link defVocab}).
*
* @remarks
* Also see {@link decodeDense}.
*
* @example
* ```ts tangle:../export/decode-sparse.ts
* import { defVocab, decodeSparse, tokenize } from "@thi.ng/text-analysis";
*
* const vocab = defVocab(
* tokenize("the quick brown fox jumps over the lazy dog")
* );
*
* console.log(decodeSparse(vocab, [0, 2, 4, 7]));
* // [ "the", "brown", "jumps", "dog" ]
*
* console.log(decodeSparse(vocab, [0, 3, 6]));
* // [ "the", "fox", "lazy" ]
* ```
*
* @param vocab - vocabulary used to look up the tokens
* @param vec - sparse vector of token IDs to decode
* @returns tokens extracted from `vocab` for the given `vec`
*/
export declare const decodeSparse: (vocab: Vocab, vec: Iterable<number>) => string[];
/**
* Converts given multi-hot sparse vector (e.g. created via {@link encodeSparse})
* into a dense representation.
*
* @param dim - presumably the size of the resulting dense vector (e.g. the
* vocab size); TODO confirm against implementation
* @param sparse - sparse multi-hot vector to convert
* @returns dense representation of `sparse`
*/
export declare const toDense: (dim: number, sparse: ReadonlyVec) => ReadonlyVec;
/**
* Converts given multi-hot dense vector (e.g. created via {@link encodeDense})
* into a sparse representation.
*
* @param dense - dense multi-hot vector to convert
* @returns sparse representation of `dense`
*/
export declare const toSparse: (dense: ReadonlyVec) => ReadonlyVec;
//# sourceMappingURL=vec.d.ts.map