UNPKG

related-documents

Version:
148 lines (145 loc) 4.13 kB
// Bundler-generated CommonJS interop helpers.
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;
// Defines every entry of `all` on `target` as an enumerable getter.
var __export = (target, all) => {
  for (var name in all)
    __defProp(target, name, { get: all[name], enumerable: true });
};
// Mirrors the own properties of `from` onto `to` as getters, skipping keys
// that `to` already owns and the key named by `except`.
var __copyProps = (to, from, except, desc) => {
  if (from && typeof from === "object" || typeof from === "function") {
    for (let key of __getOwnPropNames(from))
      if (!__hasOwnProp.call(to, key) && key !== except)
        __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
  }
  return to;
};
// Wraps `mod` in a fresh object flagged with `__esModule: true`.
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);

// src/index.ts
var src_exports = {};
__export(src_exports, {
  Related: () => Related
});
module.exports = __toCommonJS(src_exports);

// src/related.ts
var import_natural = require("natural");

/**
 * Ranks the documents of a corpus by TF-IDF similarity to a query document.
 *
 * Each document is turned into an array of string "parts" by the configured
 * serializer; every part is tokenized, optionally stemmed, and indexed in its
 * own TfIdf instance. Per-part measures are combined with `weights`.
 */
var Related = class {
  // Cached stemmed parts for every corpus document (filled by prepare()).
  stems_;
  // Cached TfIdf index, one per part (filled by prepare()).
  tfidfs_;
  documents_;
  options_;
  // When true, process() logs its intermediate results to the console.
  debug = false;

  /**
   * @param {Array} documents - The corpus to rank against.
   * @param {object} [options] - Overrides for `stemmer`, `tokenizer`,
   *   `serializer` and `weights`. Defaults: PorterStemmer and WordTokenizer.
   * @throws {Error} If `documents` is not an array.
   */
  constructor(documents, options) {
    if (!Array.isArray(documents)) {
      throw new Error("documents must be an array");
    }
    this.documents_ = documents;
    this.options_ = {
      stemmer: import_natural.PorterStemmer,
      tokenizer: new import_natural.WordTokenizer(),
      ...options
    };
  }

  /** The corpus array passed to the constructor. */
  get documents() {
    return this.documents_;
  }

  /** Per-part weights; defaults to 1 for each of the `numParts` parts. */
  get weights() {
    return this.options_.weights ?? Array(this.numParts).fill(1);
  }

  set weights(weights) {
    this.options_.weights = weights;
  }

  /** Number of parts the serializer produces for the first corpus document. */
  get numParts() {
    return this.serialize(this.documents[0]).length;
  }

  /** Cached stems built by prepare(), or null/undefined if not prepared. */
  get stems() {
    return this.stems_;
  }

  /** Cached per-part TfIdf instances built by prepare(). */
  get tfidfs() {
    return this.tfidfs_;
  }

  get serializer() {
    return this.options_.serializer;
  }

  // Changing any pipeline stage invalidates the cached index.
  set serializer(serializer) {
    this.reset();
    this.options_.serializer = serializer;
  }

  get tokenizer() {
    return this.options_.tokenizer;
  }

  set tokenizer(tokenizer) {
    this.reset();
    this.options_.tokenizer = tokenizer;
  }

  get stemmer() {
    return this.options_.stemmer;
  }

  set stemmer(stemmer) {
    this.reset();
    this.options_.stemmer = stemmer;
  }

  /** Drops the cached stems and TfIdf indexes so prepare() rebuilds them. */
  reset() {
    this.stems_ = null;
    this.tfidfs_ = null;
  }

  /** Builds the stems and per-part TfIdf indexes unless already cached. */
  prepare() {
    if (this.stems_ && this.tfidfs_) {
      return;
    }
    this.stems_ = this.documents_.map((item) => this.process(item));
    this.tfidfs_ = [];
    for (let i = 0; i < this.numParts; i++) {
      this.tfidfs_.push(new import_natural.TfIdf());
      for (let parts of this.stems_) {
        this.tfidfs_[i].addDocument(parts[i]);
      }
    }
  }

  /**
   * Runs the configured serializer on a document.
   * @throws {TypeError} If no serializer function has been configured.
   */
  serialize(document) {
    if (typeof this.serializer !== "function") {
      // Fail with a clear message instead of "this.serializer is not a function".
      throw new TypeError("serializer must be a function");
    }
    return this.serializer(document);
  }

  /** Tokenizes each part with the configured tokenizer. */
  tokenize(parts) {
    return parts.map((part) => this.tokenizer.tokenize(part));
  }

  /** Stems every word of every part; returns `parts` unchanged if no stemmer is set. */
  stem(parts) {
    if (this.stemmer) {
      return parts.map((words) => words.map((w) => this.stemmer.stem(w)));
    } else {
      return parts;
    }
  }

  /**
   * Serializes, tokenizes and stems a single document.
   * @returns {string[][]} One array of stems per part.
   * @throws {Error} If the serializer does not return an array of strings.
   */
  process(document) {
    const serialized = this.serialize(document);
    if (!Array.isArray(serialized) || serialized.some((s) => typeof s !== "string")) {
      throw new Error(
        `serializer must return an array of strings: ${serialized}`
      );
    }
    const tokens = this.tokenize(serialized);
    const stems = this.stem(tokens);
    if (this.debug) {
      console.log({ serialized, tokens, stems });
    }
    return stems;
  }

  /**
   * Scores every corpus document against `document` and returns them sorted
   * by descending score, with the top-scoring entry removed.
   * NOTE(review): dropping the first entry presumably assumes `document` is
   * itself part of the corpus and ranks first — confirm against callers.
   *
   * @returns {Array<{relative: number, absolute: number, document: *}>}
   *   `absolute` is the weighted score; `relative` is the score min-max
   *   normalized over the corpus (0 when all scores are equal).
   */
  rank(document) {
    // An empty corpus has nothing to rank; previously this threw a TypeError.
    if (this.documents_.length === 0) {
      return [];
    }
    this.prepare();
    const documentStems = this.process(document);
    const measures = Array(this.documents.length).fill(0);
    this.weights.forEach((w, i) => {
      this.tfidfs[i].tfidfs(documentStems[i], (j, measure) => {
        measures[j] += measure * w;
      });
    });
    const scores = measures
      .map((score, i) => ({ score, document: this.documents_[i] }))
      .sort((a, b) => b.score - a.score);
    const max = scores[0].score;
    const min = scores[scores.length - 1].score;
    const range = max - min;
    return scores
      .map(({ score, document: candidate }) => ({
        // Avoid 0/0 = NaN when every document has the same score.
        relative: range === 0 ? 0 : (score - min) / range,
        absolute: score,
        document: candidate
      }))
      .slice(1);
  }
};
//# sourceMappingURL=index.js.map