@thi.ng/text-analysis

Version:

Text tokenization, transformation & analysis transducers, utilities, stop words, porter stemming, vector encodings, similarities

thi.ng/text-analysis

thi-ng/umbrella

102 lines • 3.72 kB

TypeScript

import type { Fn, Fn2 } from "@thi.ng/api"; import { frequencies as $freq } from "@thi.ng/transducers/frequencies"; import { normFrequenciesAuto as $norm } from "@thi.ng/transducers/norm-frequencies-auto"; import { sortedFrequencies as $sorted } from "@thi.ng/transducers/sorted-frequencies"; /** * Reducer. Computes histogram of given inputs. Returns a Map with unique inputs * as keys and their occurrences as values. * * @remarks * Re-export of [eponymous function in * thi.ng/transducers](https://docs.thi.ng/umbrella/transducers/functions/frequencies.html) * * @example * ```ts tangle:../export/frequencies.ts * import { frequencies, tokenize } from "@thi.ng/text-analysis"; * * console.log( * frequencies(tokenize("to be or not to be")) * ); * // Map(4) { "to": 2, "be": 2, "or": 1, "not": 1 } * ``` */ export declare const frequencies: typeof $freq; /** * Similar to {@link frequencies}, but with each all values normalized (based on * total number of inputs received). * * @remarks * Re-export of [`normFrequenciesAuto()` in * thi.ng/transducers](https://docs.thi.ng/umbrella/transducers/functions/normFrequenciesAuto.html) * * @example * ```ts tangle:../export/norm-frequencies.ts * import { normFrequencies, tokenize } from "@thi.ng/text-analysis"; * * console.log( * normFrequencies(tokenize("to be or not to be")) * ); * // Map(4) { "to": 0.333, "be": 0.333, "or": 0.166, "not": 0.166 } * ``` */ export declare const normFrequencies: typeof $norm; /** * Reducer. Similar to {@link frequencies}, but instead of a Map produces an * array of `[key, frequency]`-pairs, sorted by the descending number of * occurrences of each distinct key/token. * * @remarks * Re-export of [eponymous function in * thi.ng/transducers](https://docs.thi.ng/umbrella/transducers/functions/sortedFrequencies.html) * * @example * ```ts tangle:../export/sorted-frequencies.ts * import { sortedFrequencies, tokenize } from "@thi.ng/text-analysis"; * * console.log( * sortedFrequencies(tokenize("to be or not to be")) * ); * // [ [ "to", 2 ], [ "be", 2 ], [ "or", 1 ], [ "not", 1 ] ] * ``` */ export declare const sortedFrequencies: typeof $sorted; /** * Takes an array of tokenized documents, a histogram function (`frequencies`) * and a predicate function (`pred`). First computes the combined histogram of * terms/works in all given docs using `frequencies`, then filters each document * using supplied predicate, which is called with a single word/token and its * computed frequency. Only words are kept for which the predicate succeeds. * * @remarks * See {@link frequencies} and {@link normFrequencies} for histogram creation. * * @example * ```ts tangle:../export/filter-docs-frequency.ts * import { filterDocsFrequency, frequencies } from "@thi.ng/text-analysis"; * * const docs = [ * ["a", "b", "c"], * ["a", "b", "d", "e"], * ["b", "f", "g"], * ["a", "b", "c", "f"], * ["a", "g", "h"] * ]; * * // only keep words which occur more than once * const filtered = filterDocsFrequency(docs, frequencies, (_, x) => x > 1); * * // show before & after * for(let i = 0; i < docs.length; i++) console.log(docs[i], "=>", filtered[i]); * // [ "a", "b", "c" ] => [ "a", "b", "c" ] * // [ "a", "b", "d", "e" ] => [ "a", "b" ] * // [ "b", "f", "g" ] => [ "b", "f", "g" ] * // [ "a", "b", "c", "f" ] => [ "a", "b", "c", "f" ] * // [ "a", "g", "h" ] => [ "a", "g" ] * ``` * * @param docs * @param frequencies * @param pred */ export declare const filterDocsFrequency: (docs: string[][], frequencies: Fn<Iterable<string>, Map<string, number>>, pred: Fn2<string, number, boolean>) => string[][]; //# sourceMappingURL=frequencies.d.ts.map