@thi.ng/text-analysis

Text tokenization, transformation & analysis transducers, utilities, stop words, Porter stemming, vector encodings, similarities

import type { Fn2 } from "@thi.ng/api";
import type { Vocab } from "./api.js";
/**
 * TF weighting function for {@link defTFIDF}. Computes {@link frequencies} for
 * given words/tokens (only includes those defined in `vocab`).
 */
export declare const tfCount: (vocab: Vocab, docTokens: Iterable<string>) => Map<string, number>;
/**
 * TF weighting function for {@link defTFIDF}. Computes {@link normFrequencies}
 * for given words/tokens (only includes those defined in `vocab`).
 */
export declare const tfNormalized: (vocab: Vocab, docTokens: Iterable<string>) => Map<string, number>;
/**
 * TF weighting function for {@link defTFIDF}. First computes
 * {@link frequencies} for given words/tokens (only includes those defined in
 * `vocab`), then transforms each value via `log10(1 + count)`.
 */
export declare const tfLog: (vocab: Vocab, docTokens: Iterable<string>) => Map<string, number>;
/**
 * Higher-order Inverse Document Frequency (IDF) function, using the provided
 * weighting strategy function.
 *
 * @remarks
 * Also see {@link defTFIDF} for the full tf-idf implementation.
 *
 * References:
 *
 * - https://en.wikipedia.org/wiki/Tf%E2%80%93idf
 * - https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency
 *
 * Provided IDF impls for use with this function:
 *
 * - {@link idfClassic}
 * - {@link idfSmooth}
 * - {@link idfProbabilistic}
 *
 * @param fnIDF
 */
export declare const defIDF: (fnIDF: (docsWithTerm: number, numDocs: number) => number) => (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
/**
 * IDF weighting function for {@link defIDF} and {@link defTFIDF}. Computes:
 * `log10(numDocs / docsWithTerm)`
 */
export declare const idfClassic: (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
/**
 * IDF weighting function for {@link defIDF} and {@link defTFIDF}. Computes:
 * `1 + log10(numDocs / (1 + docsWithTerm))`
 */
export declare const idfSmooth: (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
/**
 * IDF weighting function for {@link defIDF} and {@link defTFIDF}. Computes:
 * `log10((numDocs - docsWithTerm) / docsWithTerm)`
 */
export declare const idfProbabilistic: (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
/**
 * Higher-order, customizable tf-idf implementation, using the provided
 * functions for term frequency and inverse document frequency weighting.
 *
 * @remarks
 * See {@link tfidf} for the default impl.
 *
 * Also see:
 *
 * - {@link tfCount}, {@link tfNormalized}, {@link tfLog}
 * - {@link idfClassic}, {@link idfSmooth}, {@link idfProbabilistic}
 *
 * References:
 *
 * - https://en.wikipedia.org/wiki/Tf%E2%80%93idf
 *
 * @param fnTF
 * @param fnIDF
 */
export declare const defTFIDF: (fnTF: Fn2<Vocab, string[], Map<string, number>>, fnIDF: Fn2<Vocab, string[][], Map<string, number>>) => (vocab: Vocab, tokenizedDocs: string[][]) => {
    doc: string[];
    tf: Map<string, number>;
    idf: Map<string, number>;
    tfidf: Map<string, number>;
}[];
/**
 * Default tf-idf implementation, using {@link tfNormalized} for term
 * frequency and {@link idfClassic} for inverse document frequency
 * calculation.
 *
 * @remarks
 * References:
 *
 * - https://en.wikipedia.org/wiki/Tf%E2%80%93idf
 *
 * Also see {@link defTFIDF}, {@link defIDF}.
 *
 * @param vocab
 * @param tokenizedDocs
 */
export declare const tfidf: (vocab: Vocab, tokenizedDocs: string[][]) => {
    doc: string[];
    tf: Map<string, number>;
    idf: Map<string, number>;
    tfidf: Map<string, number>;
}[];
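/**
 * Minimal usage sketch for {@link tfidf}, assuming {@link Vocab} is a map-like
 * word → index structure (see ./api.js for its actual definition); the vocab
 * below is built by hand purely for illustration.
 *
 * @example
 * ```ts
 * import { tfidf } from "@thi.ng/text-analysis";
 *
 * // three tokenized documents
 * const docs = [
 *     ["the", "cat", "sat"],
 *     ["the", "dog", "sat"],
 *     ["the", "cat", "ran"]
 * ];
 *
 * // hand-built vocab (word → index), an assumption made for this sketch
 * const vocab = new Map([
 *     ["the", 0], ["cat", 1], ["dog", 2], ["sat", 3], ["ran", 4]
 * ]);
 *
 * // per-document results: { doc, tf, idf, tfidf }
 * const results = tfidf(vocab, docs);
 *
 * // "the" occurs in all 3 docs => idfClassic = log10(3/3) = 0, hence its
 * // tf-idf weight is zero in every document
 * console.log(results[0].tfidf.get("the")); // 0
 *
 * // "dog" occurs in only 1 doc => idf = log10(3/1) ≈ 0.477, so its tf-idf
 * // weight in the 2nd document is non-zero
 * console.log(results[1].tfidf.get("dog"));
 * ```
 */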
/**
 * Takes a vocab, an array of tokenized documents and a predicate function.
 * Computes the IDF (Inverse Document Frequency, default: {@link idfClassic})
 * and then filters each document using the supplied predicate, which is called
 * with a single word/token and its computed IDF. Only words for which the
 * predicate succeeds are kept.
 *
 * @remarks
 * The IDF of common words is close to zero. This function can be used as a
 * pre-processing step for improved and more efficient vocabulary construction,
 * vector encoding (e.g. via {@link encodeDense}), clustering etc., by
 * pre-excluding tokens which contribute little information.
 *
 * @example
 * ```ts tangle:../export/filter-docs-idf.ts
 * import { filterDocsIDF } from "@thi.ng/text-analysis";
 *
 * const docs = [
 *     ["a", "b", "c"],
 *     ["a", "b", "d", "e"],
 *     ["b", "f", "g"],
 *     ["a", "b", "c", "f"],
 *     ["a", "g", "h"]
 * ];
 *
 * // remove common words, i.e. those with an IDF below the given threshold
 * const filtered = filterDocsIDF(docs, (_, x) => x > 0.3);
 *
 * // show before & after
 * for (let i = 0; i < docs.length; i++) console.log(docs[i], "=>", filtered[i]);
 *
 * // [ "a", "b", "c" ] => [ "c" ]
 * // [ "a", "b", "d", "e" ] => [ "d", "e" ]
 * // [ "b", "f", "g" ] => [ "f", "g" ]
 * // [ "a", "b", "c", "f" ] => [ "c", "f" ]
 * // [ "a", "g", "h" ] => [ "g", "h" ]
 * ```
 *
 * @param docs
 * @param pred
 * @param vocab
 * @param fnIDF
 */
export declare const filterDocsIDF: (docs: string[][], pred: Fn2<string, number, boolean>, vocab?: Vocab, fnIDF?: Fn2<Vocab, string[][], Map<string, number>>) => string[][];
//# sourceMappingURL=tf-idf.d.ts.map
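/**
 * Minimal sketch for composing a custom tf-idf variant via {@link defTFIDF},
 * here combining {@link tfLog} (log-scaled term frequencies) with
 * {@link idfSmooth} (smoothed IDF). As above, the vocab is assumed to be a
 * map-like word → index structure and is built by hand for illustration.
 *
 * @example
 * ```ts
 * import { defTFIDF, tfLog, idfSmooth } from "@thi.ng/text-analysis";
 *
 * // custom tf-idf function with alternative weighting strategies
 * const tfidfLogSmooth = defTFIDF(tfLog, idfSmooth);
 *
 * const docs = [["a", "b", "b"], ["a", "c"], ["b", "c", "d"]];
 * const vocab = new Map([["a", 0], ["b", 1], ["c", 2], ["d", 3]]);
 *
 * // same result shape as tfidf(): { doc, tf, idf, tfidf }[]
 * for (const { doc, tfidf } of tfidfLogSmooth(vocab, docs)) {
 *     console.log(doc, tfidf);
 * }
 * ```
 */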