related-documents

/**
 * Copyright 2022 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import { PorterStemmer, WordTokenizer, TfIdf } from "natural";
import type { Stemmer, Tokenizer } from "natural";

/**
 * Converts a document into an ordered list of text parts (for example
 * `[title, summary]`). Every document must yield the same number of parts.
 */
export type serializer = (item: any) => string[];

/**
 * Configuration for {@link Related}.
 */
export interface Options {
  /** Splits a document into the text parts that are compared. */
  serializer: serializer;
  /** `weights[i]` multiplies the tf-idf measure of serialized part `i`. */
  weights: number[];
  /** Stemmer applied to every token; defaults to `PorterStemmer`. */
  stemmer?: Stemmer;
  /** Tokenizer applied to every part; defaults to a `WordTokenizer`. */
  tokenizer?: Tokenizer;
}

/**
 * Ranks a corpus of documents by tf-idf similarity to a query document.
 *
 * Each document is serialized into parts, every part is tokenized and
 * (optionally) stemmed, and one `TfIdf` index is built per part. The indexes
 * are built lazily on the first call to {@link Related.rank} and discarded
 * whenever the serializer, tokenizer or stemmer is replaced.
 */
export class Related {
  /** Cached stems, indexed `[document][part][word]`; `null` until built. */
  private stems_: string[][][] | null = null;
  /** Cached tf-idf indexes, one per serialized part; `null` until built. */
  private tfidfs_: TfIdf[] | null = null;
  private documents_: any[];
  private options_: Options & { tokenizer: Tokenizer };
  private debug = false;

  /**
   * @param documents The corpus to rank against.
   * @param options Serializer, weights and optional stemmer/tokenizer.
   * @throws Error if `documents` is not an array.
   */
  constructor(documents: any[], options: Options) {
    if (!Array.isArray(documents)) {
      throw new Error("documents must be an array");
    }

    this.documents_ = documents;
    this.options_ = {
      stemmer: PorterStemmer,
      ...options,
      // Resolved after the spread so an explicit `tokenizer: undefined`
      // cannot erase the default and break `tokenize` later.
      tokenizer: options.tokenizer ?? new WordTokenizer(),
    };
  }

  get documents() {
    return this.documents_;
  }

  get weights(): number[] {
    return this.options_.weights ?? new Array<number>(this.numParts).fill(1);
  }

  set weights(weights: number[]) {
    // Weights are applied at rank time, so the cached indexes stay valid.
    this.options_.weights = weights;
  }

  /**
   * Number of parts the serializer produces, measured on the first document.
   * Returns 0 for an empty corpus instead of serializing `undefined`.
   */
  get numParts(): number {
    if (this.documents_.length === 0) {
      return 0;
    }
    return this.serialize(this.documents_[0]).length;
  }

  /** Cached stems, or `null` if the indexes have not been built yet. */
  get stems(): string[][][] | null {
    return this.stems_;
  }

  /** Cached tf-idf indexes, or `null` if they have not been built yet. */
  get tfidfs(): TfIdf[] | null {
    return this.tfidfs_;
  }

  get serializer() {
    return this.options_.serializer;
  }

  set serializer(serializer: serializer) {
    this.reset();
    this.options_.serializer = serializer;
  }

  get tokenizer(): Tokenizer {
    return this.options_.tokenizer;
  }

  set tokenizer(tokenizer: Tokenizer) {
    this.reset();
    this.options_.tokenizer = tokenizer;
  }

  get stemmer(): Stemmer | undefined {
    return this.options_.stemmer;
  }

  set stemmer(stemmer: Stemmer | undefined) {
    this.reset();
    this.options_.stemmer = stemmer;
  }

  /** Drops the cached stems and indexes so the next rank rebuilds them. */
  private reset(): void {
    this.stems_ = null;
    this.tfidfs_ = null;
  }

  /**
   * Builds (or reuses) the per-part tf-idf indexes and returns them.
   *
   * The caches are only assigned once everything has been built, so a
   * failure part-way through never leaves a half-populated index behind.
   */
  private prepare(): TfIdf[] {
    if (this.stems_ && this.tfidfs_) {
      return this.tfidfs_;
    }

    // documents -> parts -> words -> stems
    const stems = this.documents_.map((item) => this.process(item));

    const tfidfs: TfIdf[] = [];
    for (let i = 0; i < this.numParts; i++) {
      const index = new TfIdf();
      // Document order is preserved, so index j in the TfIdf matches
      // index j in `documents_`.
      for (const parts of stems) {
        index.addDocument(parts[i]);
      }
      tfidfs.push(index);
    }

    this.stems_ = stems;
    this.tfidfs_ = tfidfs;
    return tfidfs;
  }

  /**
   * Serialize according the {@link Options.serializer} where an object is
   * serialized into an array of strings based upon specific parts of the
   * document such as `title`, `summary`, etc.
   */
  serialize(document: any): string[] {
    return this.serializer(document);
  }

  /**
   * Tokenize the individual serialized parts of the document.
   */
  tokenize(parts: string[]): string[][] {
    const tokenizer = this.tokenizer;
    return parts.map((part) => tokenizer.tokenize(part));
  }

  /**
   * Run the stemmer over the words. When no stemmer is configured the
   * words are returned unchanged.
   */
  stem(parts: string[][]): string[][] {
    const stemmer = this.stemmer;
    if (!stemmer) {
      return parts;
    }
    return parts.map((words) => words.map((w) => stemmer.stem(w)));
  }

  /**
   * Convert the document into parts, each having an array of stems or words.
   *
   * @throws Error if the serializer does not return an array of strings.
   */
  private process(document: any): string[][] {
    const serialized = this.serialize(document);

    // validate an array of strings is returned by the serializer
    if (
      !Array.isArray(serialized) ||
      serialized.some((s) => typeof s !== "string")
    ) {
      throw new Error(
        `serializer must return an array of strings: ${serialized}`
      );
    }

    const tokens = this.tokenize(serialized);
    const stems = this.stem(tokens);

    if (this.debug) {
      console.log({ serialized, tokens, stems });
    }

    return stems;
  }

  /**
   * Rank the corpus by weighted tf-idf similarity to `document`.
   *
   * Results are sorted by descending score. The single highest-scoring entry
   * is dropped from the result; NOTE(review): this presumably assumes
   * `document` is itself part of the corpus and therefore ranks first --
   * confirm against callers before changing.
   *
   * @param document The query document; it is passed through the serializer.
   * @returns For each remaining corpus document: `absolute`, the weighted
   *   score, and `relative`, the score min-max normalised over the whole
   *   corpus (0 when every score is equal). Empty for an empty corpus.
   * @throws Error if the serializer output is invalid or if there are more
   *   weights than serialized parts.
   */
  public rank(
    document: any
  ): { relative: number; absolute: number; document: any }[] {
    // Nothing to rank against; avoids serializing `undefined` and reading
    // the score of a non-existent first element.
    if (this.documents_.length === 0) {
      return [];
    }

    const tfidfs = this.prepare();
    const documentStems = this.process(document);
    const weights = this.weights;

    if (weights.length > tfidfs.length) {
      throw new Error(
        `received ${weights.length} weights but documents serialize into ${tfidfs.length} parts`
      );
    }

    const measures = new Array<number>(this.documents_.length).fill(0);

    weights.forEach((w, i) => {
      tfidfs[i].tfidfs(documentStems[i], (j, measure) => {
        measures[j] += measure * w;
      });
    });

    const scores = measures
      .map((score, i) => ({ score, document: this.documents_[i] }))
      .sort((a, b) => b.score - a.score);

    const max = scores[0].score;
    const min = scores[scores.length - 1].score;
    const range = max - min;

    return scores
      .map(({ score, document }) => ({
        // Guard the division: equal scores would otherwise yield NaN.
        relative: range === 0 ? 0 : (score - min) / range,
        absolute: score,
        document,
      }))
      .slice(1);
  }
}