@thi.ng/text-analysis
Version:
Text tokenization, transformation & analysis transducers, utilities, stop words, porter stemming, vector encodings, similarities
55 lines (54 loc) • 2 kB
JavaScript
import { argSort } from "@thi.ng/arrays/arg-sort";
import { lookup, lookupUnsafe } from "@thi.ng/arrays/lookup";
import { knearest as $knearest } from "@thi.ng/distance/knearest";
import { Untransformed } from "@thi.ng/distance/untransformed";
import { kmeans } from "@thi.ng/k-means";
import { map } from "@thi.ng/transducers/map";
import { max } from "@thi.ng/transducers/max";
import { transduce } from "@thi.ng/transducers/transduce";
import { distJaccard } from "@thi.ng/vectors/dist-jaccard";
import { distSq } from "@thi.ng/vectors/distsq";
import { mean } from "@thi.ng/vectors/mean";
import { toDense } from "./vec.js";
/**
 * Distance metric built by wrapping `distJaccard` in an `Untransformed`
 * instance. Used as the default `dist` argument of `knearest()` and
 * `knearestDocs()` in this module.
 *
 * NOTE(review): going by the class name, `Untransformed` presumably applies
 * the wrapped function without any value conversion; confirm against
 * `@thi.ng/distance`.
 */
const JACCARD_DIST_DENSE = new Untransformed(distJaccard);
/**
 * Runs `kmeans()` on the given dense document vectors and attaches the
 * matching documents to every resulting cluster.
 *
 * @param {number} k - number of clusters requested from `kmeans()`
 * @param {Array} docs - dense document vectors
 * @param {object} [opts] - extra options for `kmeans()`; merged over the
 * `maxIter: 100` default, so a user-supplied `maxIter` takes precedence
 * @returns {Array<object>} the clusters returned by `kmeans()`, each copied
 * and extended with a `docs` property holding the entries of `docs` selected
 * by the cluster's `items`
 */
const kmeansDense = (k, docs, opts) => {
	const config = { maxIter: 100, ...opts };
	const clusters = kmeans(k, docs, config);
	return clusters.map((cluster) => {
		const members = lookupUnsafe(docs, cluster.items);
		return { ...cluster, docs: members };
	});
};
/**
 * Variant of `kmeansDense()` for sparse document vectors. Every document is
 * first converted via `toDense(opts.dim, doc)`, then the converted vectors
 * are clustered with the very same `opts`.
 *
 * @param {number} k - number of clusters, forwarded to `kmeansDense()`
 * @param {Array} docs - sparse document vectors
 * @param {object} opts - options; `opts.dim` is handed to `toDense()` for
 * each document, and the whole object is forwarded to `kmeansDense()`
 * @returns {Array<object>} result of `kmeansDense()`; note that the `docs`
 * attached to each cluster are the converted (dense) vectors
 */
const kmeansSparse = (k, docs, opts) => {
	// `opts.dim` is intentionally read per document (not up front), so that
	// an empty `docs` array never touches `opts` here.
	const dense = docs.map((doc) => toDense(opts.dim, doc));
	return kmeansDense(k, dense, opts);
};
/**
 * Computes the centroid of a set of document vectors together with a
 * bounding measure around it.
 *
 * @param {Array} docs - document vectors
 * @param {Array<number>} [ids] - if given (truthy), only the entries of
 * `docs` selected via `lookup(docs, ids)` are considered
 * @returns {{ centroid: Array<number>, radius: number }} `centroid` is the
 * result of `mean([], docs)`; `radius` is the maximum of
 * `distSq(centroid, doc)` over all considered documents
 *
 * NOTE(review): `radius` is taken straight from `distSq()`, so it is
 * presumably a squared distance rather than a true radius; confirm what
 * callers expect before comparing it against plain distances.
 */
function clusterBounds(docs, ids) {
	const members = ids ? lookup(docs, ids) : docs;
	const centroid = mean([], members);
	const radius = transduce(
		map((doc) => distSq(centroid, doc)),
		max(),
		members
	);
	return { centroid, radius };
}
/**
 * Convenience wrapper for `centralTermsVec()`: first computes the centroid of
 * the given document vectors via `mean([], docs)`, then delegates.
 *
 * @param {object} vocab - vocabulary, forwarded to `centralTermsVec()`
 * @param {number} k - maximum number of terms to select
 * @param {Array} docs - document vectors to average
 * @returns whatever `centralTermsVec()` returns for the computed centroid
 */
const centralTerms = (vocab, k, docs) => {
	const centroid = mean([], docs);
	return centralTermsVec(vocab, k, centroid);
};
/**
 * Selects the (up to) `k` strongest components of a centroid vector and
 * passes their indices to `vocab.getAllIDs()`.
 *
 * Component indices are ordered by descending component value, cut to the
 * first `k`, and indices whose component is exactly `0` are then dropped.
 * The result can therefore contain fewer than `k` entries.
 *
 * @param {object} vocab - object providing `getAllIDs(ids)`; presumably
 * resolves the indices to vocabulary terms (verify against the vocab
 * implementation)
 * @param {number} k - maximum number of components to select
 * @param {ArrayLike<number>} centroid - numeric vector to rank
 * @returns result of `vocab.getAllIDs()` for the selected indices
 */
const centralTermsVec = (vocab, k, centroid) => {
	// indices of `centroid`, ordered from largest to smallest component
	const ranked = argSort(centroid, (a, b) => b - a);
	// strict comparison: only genuinely zero-valued components are removed
	const termIDs = ranked.slice(0, k).filter((i) => centroid[i] !== 0);
	return vocab.getAllIDs(termIDs);
};
/**
 * Thin wrapper around `knearest()` from `@thi.ng/distance` which only
 * supplies this module's defaults; all arguments are forwarded unchanged.
 *
 * @param query - query position/vector
 * @param {number} k - forwarded as the `k` argument
 * @param {number} [r=Infinity] - forwarded as the `r` argument
 * @param [dist=JACCARD_DIST_DENSE] - distance metric
 * @param {boolean} [sorted=false] - forwarded as the `sorted` argument
 * @returns the object created by the wrapped `knearest()` call
 */
const knearest = (
	query,
	k,
	r = Infinity,
	dist = JACCARD_DIST_DENSE,
	sorted = false
) => {
	return $knearest(query, k, r, dist, sorted);
};
/**
 * Builds a k-nearest accumulator for `query`, feeds it every document
 * (together with its index in `docs`) and returns the selected documents.
 *
 * @param query - query vector
 * @param {number} k - forwarded as the `k` argument of the accumulator
 * @param {Array} docs - candidate document vectors
 * @param {number} [r=Infinity] - forwarded as the `r` argument
 * @param [dist=JACCARD_DIST_DENSE] - distance metric
 * @param {boolean} [sorted=false] - forwarded as the `sorted` argument
 * @returns {Array<Array>} one `[doc, value]` pair per accumulated result,
 * where `doc` is looked up in `docs` via the stored index and `value` is the
 * first item of the accumulator's result tuple (presumably the distance to
 * `query`; confirm against `@thi.ng/distance`)
 */
const knearestDocs = (
	query,
	k,
	docs,
	r = Infinity,
	dist = JACCARD_DIST_DENSE,
	sorted = false
) => {
	const acc = $knearest(query, k, r, dist, sorted);
	for (let idx = 0; idx < docs.length; idx++) {
		// the index is stored as the neighbor's value, so that the document
		// can be recovered from `docs` afterwards
		acc.consider(docs[idx], idx);
	}
	return acc.deref().map((entry) => {
		const docIndex = entry[1];
		const value = entry[0];
		return [docs[docIndex], value];
	});
};
// Named exports of this module: the default Jaccard distance metric, the
// k-means clustering helpers, cluster bounds / central term utilities and the
// k-nearest-neighbor helpers defined above.
export {
JACCARD_DIST_DENSE,
centralTerms,
centralTermsVec,
clusterBounds,
kmeansDense,
kmeansSparse,
knearest,
knearestDocs
};