UNPKG

@thi.ng/text-analysis

Version:

Text tokenization, transformation & analysis transducers, utilities, stop words, porter stemming, vector encodings, similarities

55 lines (54 loc) 2 kB
import { argSort } from "@thi.ng/arrays/arg-sort";
import { lookup, lookupUnsafe } from "@thi.ng/arrays/lookup";
import { knearest as $knearest } from "@thi.ng/distance/knearest";
import { Untransformed } from "@thi.ng/distance/untransformed";
import { kmeans } from "@thi.ng/k-means";
import { map } from "@thi.ng/transducers/map";
import { max } from "@thi.ng/transducers/max";
import { transduce } from "@thi.ng/transducers/transduce";
import { distJaccard } from "@thi.ng/vectors/dist-jaccard";
import { distSq } from "@thi.ng/vectors/distsq";
import { mean } from "@thi.ng/vectors/mean";
import { toDense } from "./vec.js";

/**
 * `distJaccard` wrapped in an `Untransformed` distance metric. Used as the
 * default `dist` argument of {@link knearest} and {@link knearestDocs}.
 */
const JACCARD_DIST_DENSE = new Untransformed(distJaccard);

/**
 * Clusters the given vectors via `kmeans` and augments every resulting cluster
 * with a `docs` property holding the input vectors selected by the cluster's
 * `items` (presumably indices into `docs` - confirm against `@thi.ng/k-means`).
 *
 * @param k - number of clusters, passed through to `kmeans`
 * @param docs - array of document vectors
 * @param opts - optional `kmeans` options; `maxIter` defaults to 100 unless
 * overridden here
 * @returns array of cluster objects, each extended with `docs`
 */
const kmeansDense = (k, docs, opts) =>
  kmeans(k, docs, { maxIter: 100, ...opts }).map((cluster) => ({
    ...cluster,
    docs: lookupUnsafe(docs, cluster.items),
  }));

/**
 * Like {@link kmeansDense}, but first converts every document via
 * `toDense(opts.dim, doc)`.
 *
 * @param k - number of clusters
 * @param docs - array of documents accepted by `toDense`
 * @param opts - required; must provide `dim` (passed to `toDense`). The same
 * object is also forwarded as options to {@link kmeansDense}.
 * @returns same result shape as {@link kmeansDense}
 */
const kmeansSparse = (k, docs, opts) =>
  kmeansDense(
    k,
    docs.map((x) => toDense(opts.dim, x)),
    opts
  );

/**
 * Computes the centroid (via `mean`) of the given vectors and the largest
 * `distSq` value between that centroid and any of the vectors.
 *
 * NOTE(review): `radius` is the maximum of `distSq()` results and no square
 * root is applied, i.e. it is presumably a *squared* distance. Kept as is for
 * backward compatibility.
 *
 * @param docs - array of vectors
 * @param ids - optional indices; if given (truthy), only `lookup(docs, ids)`
 * is considered
 * @returns object with `centroid` and `radius`
 */
function clusterBounds(docs, ids) {
  // Use a local binding rather than re-assigning the `docs` parameter.
  const selected = ids ? lookup(docs, ids) : docs;
  const centroid = mean([], selected);
  return {
    centroid,
    radius: transduce(
      map((x) => distSq(centroid, x)),
      max(),
      selected
    ),
  };
}

/**
 * Computes the mean vector of `docs` and delegates to {@link centralTermsVec}.
 *
 * @param vocab - vocabulary object providing `getAllIDs()`
 * @param k - max. number of terms
 * @param docs - array of vectors
 */
const centralTerms = (vocab, k, docs) =>
  centralTermsVec(vocab, k, mean([], docs));

/**
 * Selects the indices of the `k` largest components of `centroid` (sorted in
 * descending order), drops those whose component is exactly zero, and passes
 * the remaining indices to `vocab.getAllIDs()`.
 *
 * The zero filter runs after taking the top `k`, so fewer than `k` indices may
 * be passed on.
 *
 * @param vocab - vocabulary object providing `getAllIDs()`
 * @param k - max. number of terms
 * @param centroid - vector of numeric component weights
 * @returns result of `vocab.getAllIDs()` for the selected indices
 */
const centralTermsVec = (vocab, k, centroid) =>
  vocab.getAllIDs(
    argSort(centroid, (a, b) => b - a)
      .slice(0, k)
      .filter((i) => centroid[i] !== 0)
  );

/**
 * Thin wrapper for `knearest` of `@thi.ng/distance`, supplying defaults for
 * the optional arguments.
 *
 * @param query - query vector
 * @param k - max. number of neighbors
 * @param r - radius argument forwarded to `knearest` (default: `Infinity`)
 * @param dist - distance metric (default: {@link JACCARD_DIST_DENSE})
 * @param sorted - sort flag forwarded to `knearest` (default: `false`)
 * @returns the neighborhood object created by `knearest`
 */
const knearest = (
  query,
  k,
  r = Infinity,
  dist = JACCARD_DIST_DENSE,
  sorted = false
) => $knearest(query, k, r, dist, sorted);

/**
 * Creates a k-nearest neighborhood for `query`, offers every item of `docs`
 * to it (using the item's array index as the associated value) and returns
 * the accepted neighbors.
 *
 * @param query - query vector
 * @param k - max. number of neighbors
 * @param docs - array of vectors to consider
 * @param r - radius argument forwarded to `knearest` (default: `Infinity`)
 * @param dist - distance metric (default: {@link JACCARD_DIST_DENSE})
 * @param sorted - sort flag forwarded to `knearest` (default: `false`)
 * @returns array of `[doc, n0]` tuples, where `doc` is the matching item of
 * `docs` and `n0` is the first element of the respective neighborhood entry
 * (presumably the distance - confirm against `@thi.ng/distance`)
 */
const knearestDocs = (
  query,
  k,
  docs,
  r = Infinity,
  dist = JACCARD_DIST_DENSE,
  sorted = false
) => {
  const neighborhood = $knearest(query, k, r, dist, sorted);
  for (let i = 0; i < docs.length; i++) neighborhood.consider(docs[i], i);
  // Each entry is indexed as [n0, docIndex]; map indices back to the documents.
  return neighborhood.deref().map((n) => [docs[n[1]], n[0]]);
};

export {
  JACCARD_DIST_DENSE,
  centralTerms,
  centralTermsVec,
  clusterBounds,
  kmeansDense,
  kmeansSparse,
  knearest,
  knearestDocs,
};