related-documents
Version:
Find related text documents.
148 lines (145 loc) • 4.13 kB
JavaScript
// Short aliases for the Object reflection primitives that the bundler
// helpers in this file call.
const {
  defineProperty: __defProp,
  getOwnPropertyDescriptor: __getOwnPropDesc,
  getOwnPropertyNames: __getOwnPropNames,
} = Object;
const { hasOwnProperty: __hasOwnProp } = Object.prototype;
/**
 * Installs every enumerable key of `all` on `target` as an enumerable
 * accessor property whose getter is the function stored under that key.
 *
 * @param {object} target - Object that receives the accessor properties.
 * @param {object} all - Map of export name to getter function.
 */
function __export(target, all) {
  // for...in on purpose: inherited enumerable keys are visited as well.
  for (const exportName in all) {
    const getter = all[exportName];
    __defProp(target, exportName, { enumerable: true, get: getter });
  }
}
/**
 * Mirrors the own string-keyed properties of `from` onto `to` as accessor
 * properties that read through to `from` on every access.
 *
 * Keys that `to` already owns, and the key equal to `except`, are skipped.
 * Each mirrored property keeps the enumerability of its source property.
 *
 * @param {object} to - Object that receives the accessors.
 * @param {*} from - Source; anything that is not an object or function is ignored.
 * @param {string} [except] - A single key to leave out.
 * @param {PropertyDescriptor} [desc] - Scratch slot for the descriptor lookup.
 * @returns {object} `to`.
 */
function __copyProps(to, from, except, desc) {
  const isCopyable = (from && typeof from === "object") || typeof from === "function";
  if (!isCopyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      enumerable: !desc || desc.enumerable,
    });
  }
  return to;
}
/**
 * Builds the object published as `module.exports`: a fresh object carrying
 * a non-enumerable `__esModule: true` marker, onto which the properties of
 * `mod` are then mirrored by `__copyProps`.
 *
 * @param {object} mod - Export table to expose.
 * @returns {object} The marked object returned by `__copyProps`.
 */
function __toCommonJS(mod) {
  const marked = {};
  __defProp(marked, "__esModule", { value: true });
  return __copyProps(marked, mod);
}
// src/index.ts
// Export table for the bundled entry module; populated by __export below.
var src_exports = {};
// `Related` is registered as a getter rather than a value: the class is only
// assigned further down this file, so the lookup has to be deferred until the
// property is actually read.
__export(src_exports, {
Related: () => Related
});
// Publish the table as this file's CommonJS exports (with the __esModule
// marker added by __toCommonJS).
module.exports = __toCommonJS(src_exports);
// src/related.ts
var import_natural = require("natural");
/**
 * Ranks a fixed corpus of documents against a query document.
 *
 * Every document is turned into a list of text "parts" by the configured
 * serializer. Each part is tokenized and (optionally) stemmed, and one TF-IDF
 * index is built per part position. A query is scored against each index and
 * the per-part measures are combined using the configured weights.
 */
var Related = class {
  /** Stemmed tokens per document, per part; unset until prepare() has run. */
  stems_;
  /** One TfIdf index per part position; unset until prepare() has run. */
  tfidfs_;
  /** The corpus array handed to the constructor (not copied). */
  documents_;
  /** Effective options: defaults overlaid with the caller's options. */
  options_;
  /** When true, process() logs its intermediate results to the console. */
  debug = false;

  /**
   * @param {Array} documents - The corpus to rank against.
   * @param {object} [options]
   * @param {Function} [options.serializer] - Maps a document to an array of
   *   strings (its parts). Needed by every operation that processes documents.
   * @param {object} [options.tokenizer] - Object with `tokenize(text)`;
   *   defaults to a new `WordTokenizer` from `natural`.
   * @param {object} [options.stemmer] - Object with `stem(word)`; defaults to
   *   `PorterStemmer` from `natural`. A falsy value disables stemming.
   * @param {number[]} [options.weights] - Multiplier per part position;
   *   defaults to 1 for every part.
   * @throws {Error} If `documents` is not an array.
   */
  constructor(documents, options) {
    if (!Array.isArray(documents)) {
      throw new Error("documents must be an array");
    }
    this.documents_ = documents;
    this.options_ = {
      stemmer: import_natural.PorterStemmer,
      tokenizer: new import_natural.WordTokenizer(),
      ...options,
    };
  }

  /** The corpus array passed to the constructor. */
  get documents() {
    return this.documents_;
  }

  /** Per-part weights; an array of ones (one per part) when none were set. */
  get weights() {
    return this.options_.weights ?? Array(this.numParts).fill(1);
  }

  set weights(weights) {
    // Weights are only read while ranking, so cached indexes stay valid.
    this.options_.weights = weights;
  }

  /**
   * Number of parts the serializer yields for the first corpus document,
   * or 0 when the corpus is empty (there is nothing to serialize).
   */
  get numParts() {
    if (this.documents_.length === 0) {
      return 0;
    }
    return this.serialize(this.documents_[0]).length;
  }

  /** Cached stems built by prepare(), or null/undefined if not built. */
  get stems() {
    return this.stems_;
  }

  /** Cached TfIdf indexes built by prepare(), or null/undefined if not built. */
  get tfidfs() {
    return this.tfidfs_;
  }

  get serializer() {
    return this.options_.serializer;
  }

  set serializer(serializer) {
    // Cached stems/indexes were derived with the previous serializer.
    this.reset();
    this.options_.serializer = serializer;
  }

  get tokenizer() {
    return this.options_.tokenizer;
  }

  set tokenizer(tokenizer) {
    // Cached stems/indexes were derived with the previous tokenizer.
    this.reset();
    this.options_.tokenizer = tokenizer;
  }

  get stemmer() {
    return this.options_.stemmer;
  }

  set stemmer(stemmer) {
    // Cached stems/indexes were derived with the previous stemmer.
    this.reset();
    this.options_.stemmer = stemmer;
  }

  /** Drops the cached stems and indexes so the next prepare() rebuilds them. */
  reset() {
    this.stems_ = null;
    this.tfidfs_ = null;
  }

  /**
   * Builds the per-document stems and the per-part TfIdf indexes, unless both
   * are already cached.
   *
   * @throws Whatever process() throws for any corpus document.
   */
  prepare() {
    if (this.stems_ && this.tfidfs_) {
      return;
    }
    const stems = this.documents_.map((item) => this.process(item));
    // Read once: the getter re-serializes the first document on every access.
    const partCount = this.numParts;
    const tfidfs = Array.from({ length: partCount }, (_, partIndex) => {
      const index = new import_natural.TfIdf();
      for (const parts of stems) {
        index.addDocument(parts[partIndex]);
      }
      return index;
    });
    // Publish only after everything was built, so an exception above can
    // never leave a partially filled cache that later calls would trust.
    this.stems_ = stems;
    this.tfidfs_ = tfidfs;
  }

  /**
   * Runs the configured serializer on a document.
   *
   * @param {*} document
   * @returns {*} Whatever the serializer returns (validated by process()).
   * @throws {TypeError} If no serializer function is configured.
   */
  serialize(document) {
    if (typeof this.serializer !== "function") {
      throw new TypeError("options.serializer must be a function");
    }
    return this.serializer(document);
  }

  /**
   * @param {string[]} parts
   * @returns {Array} One token list per part, as produced by the tokenizer.
   */
  tokenize(parts) {
    return parts.map((part) => this.tokenizer.tokenize(part));
  }

  /**
   * @param {Array<string[]>} parts - Token lists, one per part.
   * @returns {Array<string[]>} Stemmed token lists, or `parts` itself when
   *   no stemmer is configured.
   */
  stem(parts) {
    if (!this.stemmer) {
      return parts;
    }
    return parts.map((words) => words.map((w) => this.stemmer.stem(w)));
  }

  /**
   * Serializes, tokenizes and stems one document.
   *
   * @param {*} document
   * @returns {Array<string[]>} Stemmed tokens, one list per part.
   * @throws {Error} If the serializer does not return an array of strings.
   */
  process(document) {
    const serialized = this.serialize(document);
    if (!Array.isArray(serialized) || serialized.some((s) => typeof s !== "string")) {
      throw new Error(
        `serializer must return an array of strings: ${serialized}`
      );
    }
    const tokens = this.tokenize(serialized);
    const stems = this.stem(tokens);
    if (this.debug) {
      console.log({ serialized, tokens, stems });
    }
    return stems;
  }

  /**
   * Scores every corpus document against `document` and returns them sorted
   * by descending score, without the top-scoring entry.
   *
   * NOTE(review): dropping the first entry presumably assumes the query
   * document is itself part of the corpus and ranks first - confirm with
   * callers; for a query outside the corpus the best match is discarded.
   *
   * @param {*} document - Query document, run through the same pipeline.
   * @returns {Array<{relative: number, absolute: number, document: *}>}
   *   `absolute` is the weighted score; `relative` is that score min-max
   *   normalized over the whole corpus (0 when all scores are equal).
   *   Empty when the corpus is empty.
   */
  rank(document) {
    // Nothing to rank against; also avoids reading scores[0] of an empty list.
    if (this.documents_.length === 0) {
      return [];
    }
    this.prepare();
    const documentStems = this.process(document);
    const measures = Array(this.documents_.length).fill(0);
    this.weights.forEach((w, i) => {
      this.tfidfs_[i].tfidfs(documentStems[i], (j, measure) => {
        measures[j] += measure * w;
      });
    });
    const scores = measures
      .map((score, i) => ({ score, document: this.documents_[i] }))
      .sort((a, b) => b.score - a.score);
    const max = scores[0].score;
    const min = scores[scores.length - 1].score;
    const range = max - min;
    return scores.slice(1).map(({ score, document: ranked }) => ({
      // Guard the 0/0 case: identical scores would otherwise yield NaN.
      relative: range === 0 ? 0 : (score - min) / range,
      absolute: score,
      document: ranked,
    }));
  }
};
//# sourceMappingURL=index.js.map