@thi.ng/text-analysis
Text tokenization, transformation & analysis transducers, utilities, stop words, porter stemming, vector encodings, similarities
import type { Fn2 } from "@thi.ng/api";
import type { Vocab } from "./api.js";
/**
* TF weighting function for {@link defTFIDF}. Computes {@link frequencies} for
* given words/tokens (only includes those defined in `vocab`).
*/
export declare const tfCount: (vocab: Vocab, docTokens: Iterable<string>) => Map<string, number>;
/**
* TF weighting function for {@link defTFIDF}. Computes {@link normFrequencies}
* for given words/tokens (only includes those defined in `vocab`).
*/
export declare const tfNormalized: (vocab: Vocab, docTokens: Iterable<string>) => Map<string, number>;
/**
* TF weighting function for {@link defTFIDF}. First computes
* {@link frequencies} for given words/tokens (only includes those defined in
* `vocab`), then transforms each value via `log10(1 + count)`.
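 *
 * The example below compares the three TF presets; the token → index `Map`
 * used as `vocab` is an illustrative assumption (see {@link Vocab} for the
 * actual definition).
 *
 * @example
 * ```ts
 * import { tfCount, tfNormalized, tfLog } from "@thi.ng/text-analysis";
 *
 * // hypothetical vocab (token → index), for illustration only
 * const vocab = new Map([["a", 0], ["b", 1], ["c", 2]]);
 *
 * // "x" is not defined in the vocab and will be ignored
 * const doc = ["a", "b", "a", "a", "c", "x"];
 *
 * tfCount(vocab, doc);
 * // e.g. Map { "a" => 3, "b" => 1, "c" => 1 }
 *
 * tfNormalized(vocab, doc);
 * // counts normalized via normFrequencies
 *
 * tfLog(vocab, doc);
 * // counts mapped via log10(1 + count), e.g. "a" => log10(4) ≈ 0.602
 * ```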
*/
export declare const tfLog: (vocab: Vocab, docTokens: Iterable<string>) => Map<string, number>;
/**
 * Higher-order Inverse Document Frequency (IDF) function: returns an IDF
 * computation which applies the provided weighting strategy function.
*
* @remarks
 * Also see {@link defTFIDF} for a full tf-idf implementation.
*
* References:
*
* - https://en.wikipedia.org/wiki/Tf%E2%80%93idf
* - https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency
*
 * Bundled IDF implementations (of the same form as the functions created by
 * `defIDF`):
*
* - {@link idfClassic}
* - {@link idfSmooth}
* - {@link idfProbabilistic}
*
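 * The example below is a minimal sketch, recreating the classic weighting via
 * a custom `fnIDF`; the token → index `vocab` map is an illustrative
 * assumption (see {@link Vocab}).
 *
 * @example
 * ```ts
 * import { defIDF } from "@thi.ng/text-analysis";
 *
 * // custom strategy, same formula as used by idfClassic
 * const myIDF = defIDF(
 *     (docsWithTerm, numDocs) => Math.log10(numDocs / docsWithTerm)
 * );
 *
 * const vocab = new Map([["a", 0], ["b", 1]]);
 * const docs = [["a", "b"], ["a"], ["b", "a"]];
 *
 * myIDF(vocab, docs);
 * // "a" occurs in 3/3 docs => log10(3 / 3) = 0
 * // "b" occurs in 2/3 docs => log10(3 / 2) ≈ 0.176
 * ```
 *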
 * @param fnIDF - weighting strategy, called with `docsWithTerm` and `numDocs`
*/
export declare const defIDF: (fnIDF: (docsWithTerm: number, numDocs: number) => number) => (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
/**
* IDF weighting function for {@link defIDF} and {@link defTFIDF}. Computes:
* `log10(numDocs / docsWithTerm)`
*/
export declare const idfClassic: (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
/**
* IDF weighting function for {@link defIDF} and {@link defTFIDF}. Computes:
* `1 + log10(numDocs / (1 + docsWithTerm))`
*/
export declare const idfSmooth: (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
/**
* IDF weighting function for {@link defIDF} and {@link defTFIDF}. Computes:
* `log10((numDocs - docsWithTerm) / docsWithTerm)`
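 *
 * The example below compares the three bundled IDF presets on the same
 * corpus; the token → index `vocab` map is an illustrative assumption (see
 * {@link Vocab}).
 *
 * @example
 * ```ts
 * import { idfClassic, idfSmooth, idfProbabilistic } from "@thi.ng/text-analysis";
 *
 * const vocab = new Map([["a", 0], ["b", 1]]);
 * const docs = [["a", "b"], ["a"], ["b", "a"], ["a"]];
 *
 * // "b" occurs in 2 of 4 docs:
 * idfClassic(vocab, docs);       // log10(4 / 2) ≈ 0.301
 * idfSmooth(vocab, docs);        // 1 + log10(4 / 3) ≈ 1.125
 * idfProbabilistic(vocab, docs); // log10((4 - 2) / 2) = 0
 * ```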
*/
export declare const idfProbabilistic: (vocab: Vocab, tokenizedDocs: string[][]) => Map<string, number>;
/**
 * Higher-order, customizable tf-idf implementation, using the provided
 * functions for term frequency (TF) and inverse document frequency (IDF).
*
* @remarks
 * See {@link tfidf} for the default implementation.
*
* Also see:
*
* - {@link tfCount}, {@link tfNormalized}, {@link tfLog}
* - {@link idfClassic}, {@link idfSmooth}, {@link idfProbabilistic}.
*
* References:
*
* - https://en.wikipedia.org/wiki/Tf%E2%80%93idf
 *
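 * The example below composes a custom pipeline from bundled presets; the
 * token → index `vocab` map is an illustrative assumption (see {@link Vocab}).
 *
 * @example
 * ```ts
 * import { defTFIDF, tfLog, idfSmooth } from "@thi.ng/text-analysis";
 *
 * // custom pipeline: log-scaled TF, smoothed IDF
 * const myTFIDF = defTFIDF(tfLog, idfSmooth);
 *
 * const vocab = new Map([["a", 0], ["b", 1], ["c", 2]]);
 * const docs = [["a", "b"], ["b", "c"], ["a", "c", "c"]];
 *
 * // each result has the shape: { doc, tf, idf, tfidf }
 * for (const { doc, tfidf } of myTFIDF(vocab, docs)) {
 *     console.log(doc, tfidf);
 * }
 * ```
 *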
 * @param fnTF - term frequency weighting function, e.g. {@link tfNormalized}
 * @param fnIDF - inverse document frequency weighting function, e.g.
 * {@link idfClassic}
*/
export declare const defTFIDF: (fnTF: Fn2<Vocab, string[], Map<string, number>>, fnIDF: Fn2<Vocab, string[][], Map<string, number>>) => (vocab: Vocab, tokenizedDocs: string[][]) => {
doc: string[];
tf: Map<string, number>;
idf: Map<string, number>;
tfidf: Map<string, number>;
}[];
/**
* Default tf-idf implementation, using {@link tfNormalized} for term
* frequency and {@link idfClassic} for inverse document frequency
* calculation.
*
* @remarks
* References:
*
* - https://en.wikipedia.org/wiki/Tf%E2%80%93idf
*
* Also see {@link defTFIDF}, {@link defIDF}.
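 *
 * The example below is a minimal usage sketch; the token → index `vocab` map
 * is an illustrative assumption (see {@link Vocab}).
 *
 * @example
 * ```ts
 * import { tfidf } from "@thi.ng/text-analysis";
 *
 * const vocab = new Map([["a", 0], ["b", 1]]);
 * const docs = [["a", "b", "a"], ["b"], ["a"]];
 *
 * const results = tfidf(vocab, docs);
 * // each result has the shape: { doc, tf, idf, tfidf }
 * console.log(results[0].tfidf);
 * ```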
*
* @param vocab
* @param tokenizedDocs
*/
export declare const tfidf: (vocab: Vocab, tokenizedDocs: string[][]) => {
doc: string[];
tf: Map<string, number>;
idf: Map<string, number>;
tfidf: Map<string, number>;
}[];
/**
* Takes a vocab, an array of tokenized documents and a predicate function.
* Computes the IDF (Inverse Document Frequency, default: {@link idfClassic})
 * and then filters each document using the supplied predicate, which is called
 * with a single word/token and its computed IDF. Only words for which the
 * predicate succeeds are kept.
*
* @remarks
* The IDF for common words is close to zero. This function can be used as a
* pre-processing step for improved and more efficient vocabulary construction,
* vector encoding (e.g. via {@link encodeDense}), clustering etc. by pre-excluding
* tokens which do not contribute much information.
*
* @example
* ```ts tangle:../export/filter-docs-idf.ts
* import { filterDocsIDF } from "@thi.ng/text-analysis";
*
* const docs = [
* ["a", "b", "c"],
* ["a", "b", "d", "e"],
* ["b", "f", "g"],
* ["a", "b", "c", "f"],
* ["a", "g", "h"]
* ];
*
* // remove common words, i.e. those with an IDF below given threshold
* const filtered = filterDocsIDF(docs, (_, x) => x > 0.3);
*
* // show before & after
 * for (let i = 0; i < docs.length; i++) console.log(docs[i], "=>", filtered[i]);
*
* // [ "a", "b", "c" ] => [ "c" ]
* // [ "a", "b", "d", "e" ] => [ "d", "e" ]
* // [ "b", "f", "g" ] => [ "f", "g" ]
* // [ "a", "b", "c", "f" ] => [ "c", "f" ]
* // [ "a", "g", "h" ] => [ "g", "h" ]
* ```
*
* @param docs
* @param pred
* @param vocab
* @param fnIDF
*/
export declare const filterDocsIDF: (docs: string[][], pred: Fn2<string, number, boolean>, vocab?: Vocab, fnIDF?: Fn2<Vocab, string[][], Map<string, number>>) => string[][];
//# sourceMappingURL=tf-idf.d.ts.map