@thi.ng/text-analysis
Version:
Text tokenization, transformation & analysis transducers, utilities, stop words, Porter stemming, vector encodings, similarities
103 lines • 3.49 kB
TypeScript
import { Untransformed } from "@thi.ng/distance/untransformed";
import { type KMeansOpts } from "@thi.ng/k-means";
import type { ReadonlyVec } from "@thi.ng/vectors";
import type { Vocab } from "./api.js";
/**
* Jaccard distance metric wrapper for {@link kmeansDense}.
*
* @remarks
* Intended as an alternative to the squared L2 metric which
* {@link kmeansDense} uses by default. The value has the same
* `Untransformed<ReadonlyVec>` type as the optional `dist` parameter of
* {@link knearest} and {@link knearestDocs}.
*
* NOTE(review): presumably supplied to k-means via the distance option of
* `KMeansOpts` — confirm the exact option name against thi.ng/k-means.
*/
export declare const JACCARD_DIST_DENSE: Untransformed<ReadonlyVec>;
/**
* k-means clustering for dense multi-hot vectors. Uses thi.ng/k-means for the
* actual clustering, with squared L2 as the default distance metric and a
* default maximum of 100 iterations.
*
* @remarks
* Use {@link JACCARD_DIST_DENSE} as an alternative distance metric.
*
* @param k - Number of clusters to compute.
* @param docs - Documents encoded as dense multi-hot vectors.
* @param opts - Optional (partial) thi.ng/k-means options, overriding the
* defaults described above.
*
* @returns Array of cluster objects, each with a numeric `id`, a `centroid`
* vector, an `items` array of numbers and a `docs` array of vectors.
* NOTE(review): `items` are presumably indices into the input `docs`, and the
* per-cluster `docs` presumably the corresponding input vectors — confirm
* against the implementation.
*/
export declare const kmeansDense: (k: number, docs: ReadonlyVec[], opts?: Partial<KMeansOpts>) => {
docs: ReadonlyVec[];
id: number;
centroid: ReadonlyVec;
items: number[];
}[];
/**
* k-means clustering for sparse multi-hot vectors. First converts the vectors
* into dense versions (using {@link toDense}), then calls {@link kmeansDense}
* to perform the clustering.
*
* @remarks
* Since sparse vector sizes vary, the number of dimensions to use (aka the
* vocabulary size) MUST be given via `opts.dim`. Unlike with
* {@link kmeansDense}, the `opts` argument is therefore required.
*
* @param k - Number of clusters to compute.
* @param docs - Documents encoded as sparse multi-hot vectors.
* @param opts - Partial thi.ng/k-means options plus the mandatory `dim`
* (number of dimensions of the dense vectors).
*
* @returns Array of cluster objects with the same shape as returned by
* {@link kmeansDense} (`id`, `centroid`, `items`, `docs`).
* NOTE(review): whether the per-cluster `docs` hold the original sparse
* vectors or their dense conversions is not visible here — confirm against
* the implementation.
*/
export declare const kmeansSparse: (k: number, docs: ReadonlyVec[], opts: Partial<KMeansOpts> & {
dim: number;
}) => {
docs: ReadonlyVec[];
id: number;
centroid: ReadonlyVec;
items: number[];
}[];
/**
* Returns a `centroid` vector and a numeric `radius` for the given docs.
*
* @remarks
* Two call forms are declared: one taking only `docs`, and one taking an
* additional `ids` array.
*
* NOTE(review): presumably the centroid is the mean of the vectors and the
* radius the max. distance of any vector from it, and `ids` presumably
* selects a subset of `docs` by index (e.g. the `items` of a cluster returned
* by {@link kmeansDense}) — confirm against the implementation.
*
* @param docs - Document vectors.
*
* @returns Object with the `centroid` vector and `radius`.
*/
export declare function clusterBounds(docs: ReadonlyVec[]): {
centroid: ReadonlyVec;
radius: number;
};
/**
* Same as the single-argument form, but with an additional `ids` array.
*
* NOTE(review): `ids` presumably are indices into `docs`, restricting the
* computation to those documents — confirm against the implementation.
*
* @param docs - Document vectors.
* @param ids - Array of numeric IDs (see note above).
*
* @returns Object with the `centroid` vector and `radius`.
*/
export declare function clusterBounds(docs: ReadonlyVec[], ids: number[]): {
centroid: ReadonlyVec;
radius: number;
};
/**
* Takes a vocab and an array of docs encoded as dense multi-hot vectors.
* Computes the centroid of the given docs and then calls
* {@link centralTermsVec} to return the `k` most central terms (or fewer if
* there are insufficient non-zero vector components).
*
* @example
* ```ts tangle:../export/central-terms.ts
* import { centralTerms, encodeAllDense } from "@thi.ng/text-analysis";
*
* const inputs = [
*   ["a", "b", "c"],
*   ["a", "b", "d", "e"],
*   ["b", "f", "g"],
*   ["a", "b", "c", "f"],
*   ["a", "g", "h"]
* ];
*
* // create vocab & encode documents into multi-hot vectors
* const { vocab, docs } = encodeAllDense(inputs);
*
* // extract top-4 common terms
* console.log(centralTerms(vocab, 4, docs));
* // [ "b", "a", "g", "f" ]
* ```
*
* @param vocab - Vocabulary used to encode the docs.
* @param k - Max. number of terms to return.
* @param docs - Documents encoded as dense multi-hot vectors.
*
* @returns Array of at most `k` terms.
*/
export declare const centralTerms: (vocab: Vocab, k: number, docs: ReadonlyVec[]) => string[];
/**
* Takes a vocab and a dense vector representing a point in the n-dimensional
* space of the given vocab. Returns an array of terms corresponding to the `k`
* largest non-zero components of the vector (or fewer if there are
* insufficient non-zero vector components).
*
* @remarks
* Also see {@link centralTerms} (incl. code example).
*
* @param vocab - Vocabulary defining the vector space.
* @param k - Max. number of terms to return.
* @param centroid - Dense vector in the vocab's space (e.g. the centroid of a
* group of docs, as computed by {@link centralTerms}).
*
* @returns Array of at most `k` terms.
*/
export declare const centralTermsVec: (vocab: Vocab, k: number, centroid: ReadonlyVec) => string[];
/**
* Returns a thi.ng/distance `KNearest` instance for the given `query` vector.
*
* @remarks
* NOTE(review): the semantics below are inferred from the parameter names and
* the `KNearest` return type only — confirm against the implementation and
* the thi.ng/distance docs (incl. the defaults of all optional parameters).
*
* @param query - Query vector.
* @param k - Presumably the max. number of nearest neighbors to collect.
* @param r - Optional; presumably the max. search radius.
* @param dist - Optional distance metric (e.g. {@link JACCARD_DIST_DENSE}).
* @param sorted - Optional; presumably controls whether results are sorted
* by distance.
*/
export declare const knearest: (query: ReadonlyVec, k: number, r?: number, dist?: Untransformed<ReadonlyVec>, sorted?: boolean) => import("@thi.ng/distance").KNearest<ReadonlyVec, unknown>;
/**
* Nearest-neighbor query over the given `docs`. Takes the same `query`, `k`,
* `r`, `dist` and `sorted` parameters as {@link knearest}, plus the document
* vectors to search, and returns an array of `[vector, number]` tuples.
*
* @remarks
* NOTE(review): each result tuple presumably holds a matching doc vector and
* its distance to `query`, with at most `k` results — confirm against the
* implementation (incl. the defaults of all optional parameters).
*
* @param query - Query vector.
* @param k - Presumably the max. number of nearest neighbors to return.
* @param docs - Document vectors to search.
* @param r - Optional; presumably the max. search radius.
* @param dist - Optional distance metric (e.g. {@link JACCARD_DIST_DENSE}).
* @param sorted - Optional; presumably controls whether results are sorted
* by distance.
*
* @returns Array of `[vector, number]` tuples.
*/
export declare const knearestDocs: (query: ReadonlyVec, k: number, docs: ReadonlyVec[], r?: number, dist?: Untransformed<ReadonlyVec>, sorted?: boolean) => [ReadonlyVec, number][];
//# sourceMappingURL=cluster.d.ts.map