UNPKG

@thi.ng/text-analysis

Version:

Text tokenization, transformation & analysis transducers, utilities, stop words, Porter stemming, vector encodings, similarities

103 lines 3.49 kB
import { Untransformed } from "@thi.ng/distance/untransformed";
import { type KMeansOpts } from "@thi.ng/k-means";
import type { ReadonlyVec } from "@thi.ng/vectors";
import type { Vocab } from "./api.js";
/**
 * Jaccard distance metric wrapper for {@link kmeansDense}.
 *
 * @remarks
 * Intended to be passed via the clustering options as an alternative to the
 * default distance metric (see {@link kmeansDense}).
 */
export declare const JACCARD_DIST_DENSE: Untransformed<ReadonlyVec>;
/**
 * k-means clustering for dense multi-hot vectors. Uses thi.ng/k-means for
 * actual clustering and squared L2 as default distance metric.
 * Default max. iterations = 100.
 *
 * @remarks
 * Use {@link JACCARD_DIST_DENSE} for alternative distance metric.
 *
 * @param k - requested number of clusters (the `k` in k-means)
 * @param docs - documents encoded as dense multi-hot vectors
 * @param opts - optional k-means options (any subset of `KMeansOpts`)
 *
 * @returns Array of cluster descriptors, each with an `id`, a `centroid`
 * vector, an `items` array and a `docs` array. NOTE(review): `items` are
 * presumably indices into the input `docs` and the per-cluster `docs` the
 * corresponding vectors — confirm against the implementation, which is not
 * visible in this declaration file.
 */
export declare const kmeansDense: (k: number, docs: ReadonlyVec[], opts?: Partial<KMeansOpts>) => {
    docs: ReadonlyVec[];
    id: number;
    centroid: ReadonlyVec;
    items: number[];
}[];
/**
 * k-means clustering for sparse multi-hot vectors. First converts vectors into
 * dense versions (using {@link toDense}), then calls {@link kmeansDense} to
 * perform the clustering.
 *
 * @remarks
 * Since sparse vector sizes vary, the number of dimensions used (aka the
 * vocabulary size) MUST be given via `opts`.
 *
 * @param k - requested number of clusters (the `k` in k-means)
 * @param docs - documents encoded as sparse multi-hot vectors
 * @param opts - k-means options plus the mandatory `dim` (vocabulary size)
 *
 * @returns Array of cluster descriptors with the same shape as returned by
 * {@link kmeansDense}.
 */
export declare const kmeansSparse: (k: number, docs: ReadonlyVec[], opts: Partial<KMeansOpts> & {
    dim: number;
}) => {
    docs: ReadonlyVec[];
    id: number;
    centroid: ReadonlyVec;
    items: number[];
}[];
/**
 * Returns a `centroid` vector and a `radius` for the given docs.
 *
 * @remarks
 * NOTE(review): the implementation is not visible in this declaration file.
 * Presumably the centroid is the mean of the given vectors and the radius the
 * largest distance of any of them from that centroid — confirm against the
 * implementation before relying on this.
 *
 * @param docs - document vectors
 */
export declare function clusterBounds(docs: ReadonlyVec[]): {
    centroid: ReadonlyVec;
    radius: number;
};
/**
 * Overload of `clusterBounds` taking an additional `ids` array.
 *
 * @remarks
 * NOTE(review): `ids` presumably selects (by index) the subset of `docs` to
 * compute the bounds for — confirm against the implementation.
 *
 * @param docs - document vectors
 * @param ids - numeric IDs (presumably indices into `docs`)
 */
export declare function clusterBounds(docs: ReadonlyVec[], ids: number[]): {
    centroid: ReadonlyVec;
    radius: number;
};
/**
 * Takes a vocab and array of docs encoded as dense multi-hot vectors. Computes
 * centroid of given docs and then calls {@link centralTermsVec} to return the
 * `k`-most central terms (or fewer if there are insufficient non-zero vector
 * components).
 *
 * @example
 * ```ts tangle:../export/central-terms.ts
 * import { centralTerms, encodeAllDense } from "@thi.ng/text-analysis";
 *
 * const inputs = [
 *   ["a", "b", "c"],
 *   ["a", "b", "d", "e"],
 *   ["b", "f", "g"],
 *   ["a", "b", "c", "f"],
 *   ["a", "g", "h"]
 * ];
 *
 * // create vocab & encode documents into multi-hot vectors
 * const { vocab, docs } = encodeAllDense(inputs);
 *
 * // extract top-4 common terms
 * console.log(centralTerms(vocab, 4, docs));
 * // [ "b", "a", "g", "f" ]
 * ```
 *
 * @param vocab - vocabulary used to encode the docs
 * @param k - max. number of terms to return
 * @param docs - documents encoded as dense multi-hot vectors
 *
 * @returns Array of at most `k` terms.
 */
export declare const centralTerms: (vocab: Vocab, k: number, docs: ReadonlyVec[]) => string[];
/**
 * Takes a vocab and dense vector representing a point in the n-dimensional
 * space of the given vocab. Returns an array of terms corresponding to the `k`
 * largest non-zero components of the vector (or fewer if there are
 * insufficient non-zero vector components).
 *
 * @remarks
 * Also see {@link centralTerms} (incl. code example).
 *
 * @param vocab - vocabulary defining the vector space
 * @param k - max. number of terms to return
 * @param centroid - dense vector in the vocab's vector space
 *
 * @returns Array of at most `k` terms.
 */
export declare const centralTermsVec: (vocab: Vocab, k: number, centroid: ReadonlyVec) => string[];
/**
 * Returns a thi.ng/distance `KNearest` instance for the given query vector.
 *
 * @remarks
 * NOTE(review): the implementation is not visible in this declaration file;
 * the parameter descriptions below are inferred from names and types only and
 * should be confirmed.
 *
 * @param query - query vector
 * @param k - presumably the max. number of neighbors to collect
 * @param r - optional; presumably the max. search radius
 * @param dist - optional distance metric (e.g. {@link JACCARD_DIST_DENSE})
 * @param sorted - optional; presumably whether results are kept sorted
 */
export declare const knearest: (query: ReadonlyVec, k: number, r?: number, dist?: Untransformed<ReadonlyVec>, sorted?: boolean) => import("@thi.ng/distance").KNearest<ReadonlyVec, unknown>;
/**
 * Takes a query vector and an array of docs and returns an array of
 * `[vector, number]` tuples.
 *
 * @remarks
 * NOTE(review): the implementation is not visible in this declaration file.
 * Presumably this selects (up to) the `k` docs nearest to `query`; whether the
 * number in each tuple is a distance or a doc index cannot be determined from
 * the signature alone — confirm against the implementation.
 *
 * @param query - query vector
 * @param k - presumably the max. number of neighbors to return
 * @param docs - document vectors to search
 * @param r - optional; presumably the max. search radius
 * @param dist - optional distance metric (e.g. {@link JACCARD_DIST_DENSE})
 * @param sorted - optional; presumably whether results are sorted
 */
export declare const knearestDocs: (query: ReadonlyVec, k: number, docs: ReadonlyVec[], r?: number, dist?: Untransformed<ReadonlyVec>, sorted?: boolean) => [ReadonlyVec, number][];
//# sourceMappingURL=cluster.d.ts.map