UNPKG

@rr0/cms

Version:

RR0 Content Management System (CMS)

106 lines (105 loc) 3.96 kB
import fs from "fs"; /** * Builds an index of pages. */ export class SearchVisitor { constructor(config, timeTextBuilder) { this.config = config; this.timeTextBuilder = timeTextBuilder; this.index = { pages: [], words: {} }; const indexContent = this.config.indexContent; if (indexContent) { this.contentStream = fs.createWriteStream(indexContent); } } async contentStepEnd() { const contentStream = this.contentStream; if (contentStream) { contentStream.write("\n]"); contentStream.end(); } } async visit(context) { const file = context.file; const title = file.title; const outDir = "out/"; const url = file.name.startsWith(outDir) ? file.name.substring(outDir.length) : file.name; if (title && !this.config.notIndexedUrls.includes(url)) { const indexedPages = this.index.pages; const titleIndexed = indexedPages.find(page => page.title === title && page.url !== url); if (titleIndexed) { this.handleAlreadyIndexed(title, url, titleIndexed); } const time = this.timeTextBuilder.build(context, { year: "numeric", month: "short", day: "numeric" }).toLowerCase(); indexedPages.push({ title, url, time }); } if (this.config.indexWords) { this.indexWords(context, file); } if (this.config.indexContent) { this.indexContent(context, file); } } handleAlreadyIndexed(title, url, titleIndexed) { throw new Error(`Title "${title}" with URL ${url} is already indexed with URL ${titleIndexed.url}`); } getContents(doc) { const div = doc.createElement("div"); div.append(doc.body); this.removeTags(div, "script"); this.removeTags(div, "nav"); this.removeTags(div, "footer"); return div.textContent; } indexContent(context, outputFile) { const contents = this.getContents(outputFile.document); const contentsRecord = { title: outputFile.title, url: context.file.name, time: context.time.toString(), html: contents }; const prefix = this.contentStream.bytesWritten === 0 ? "[\n" : ",\n"; const str = prefix + JSON.stringify(contentsRecord); this.contentStream.write(str); } indexWords(context, outputFile) { const pageIndex = this.index.pages.length; const nonSignificant = context.messages.nonSignificantWords; const contents = this.getContents(outputFile.document); const pageText = contents.toLowerCase(); const pageWords = pageText.split(/[ \t,.…'’\-" :!?;()\[\]\n]/g) .filter(w => w.length > 1) .filter(w => !nonSignificant.includes(w)) .filter(w => { const num = parseInt(w, 10); return Number.isNaN(num) || num > 1000; }); const pageWordsCount = new Map(); for (const pageWord of pageWords) { let pageWordCount = pageWordsCount.get(pageWord); if (!pageWordCount) { pageWordCount = 0; } pageWordsCount.set(pageWord, pageWordCount + 1); } for (const word of new Set(pageWords)) { let existingWordCounts = this.index.words[word]; if (!existingWordCounts) { existingWordCounts = this.index.words[word] = []; } const pageWordCount = pageWordsCount.get(word); existingWordCounts.push({ pageIndex, count: pageWordCount }); } } removeTags(div, selector) { const found = div.querySelectorAll(selector); let i = found.length; while (i--) { found[i].parentNode.removeChild(found[i]); } } }