// @rr0/cms — RR0 Content Management System (CMS)
// Package-viewer header (version not captured): 106 lines (105 loc) • 3.96 kB, JavaScript.
import fs from "fs";
/**
 * Builds a search index of the visited pages.
 *
 * Depending on the configuration, up to three kinds of data are collected:
 * - `index.pages`: one `{title, url, time}` entry per titled page whose URL is not in `config.notIndexedUrls`;
 * - `index.words` (when `config.indexWords` is truthy): for each word, the pages using it and how often;
 * - a JSON array of page text records, streamed to the file named by `config.indexContent` (when set).
 */
export class SearchVisitor {
  /**
   * @param config Search settings; `notIndexedUrls`, `indexWords` and `indexContent` are read.
   * @param timeTextBuilder Object whose `build(context, options)` returns the text of a page's time.
   */
  constructor(config, timeTextBuilder) {
    this.config = config;
    this.timeTextBuilder = timeTextBuilder;
    this.index = {
      pages: [],
      words: {}
    };
    // Number of records handed to the content stream. The stream's own `bytesWritten` cannot be
    // used to detect the first record because it ignores data that is still queued for writing.
    this.contentRecordCount = 0;
    const indexContent = this.config.indexContent;
    if (indexContent) {
      this.contentStream = fs.createWriteStream(indexContent);
    }
  }

  /**
   * Closes the JSON array of the content stream (if any) and waits until the stream has finished.
   *
   * @returns {Promise<void>} Settles once the stream has ended; rejects if the stream reports an error.
   */
  async contentStepEnd() {
    const contentStream = this.contentStream;
    if (!contentStream) {
      return;
    }
    // When no record was ever written the opening bracket is still missing.
    const closing = this.contentRecordCount === 0 ? "[\n]" : "\n]";
    await new Promise((resolve, reject) => {
      contentStream.once("error", reject);
      contentStream.end(closing, (err) => (err ? reject(err) : resolve()));
    });
  }

  /**
   * Indexes one page: registers it in `index.pages` (when it has a title and is not excluded),
   * then indexes its words and/or its text content as configured.
   *
   * @param context Visit context; `context.file` supplies `title`, `name` and `document`.
   * @throws {Error} Through `handleAlreadyIndexed()` when the title is already indexed under another URL.
   */
  async visit(context) {
    const file = context.file;
    const title = file.title;
    const outDir = "out/";
    // Indexed URLs are relative to the output directory.
    const url = file.name.startsWith(outDir) ? file.name.substring(outDir.length) : file.name;
    if (title && !this.config.notIndexedUrls.includes(url)) {
      const indexedPages = this.index.pages;
      const titleIndexed = indexedPages.find((page) => page.title === title && page.url !== url);
      if (titleIndexed) {
        this.handleAlreadyIndexed(title, url, titleIndexed);
      }
      const time = this.timeTextBuilder
        .build(context, { year: "numeric", month: "short", day: "numeric" })
        .toLowerCase();
      indexedPages.push({ title, url, time });
    }
    if (this.config.indexWords) {
      this.indexWords(context, file);
    }
    if (this.config.indexContent) {
      this.indexContent(context, file);
    }
  }

  /**
   * Called when a title is found twice with different URLs. Always throws in this class.
   *
   * @param title The duplicated title.
   * @param url URL of the page being visited.
   * @param titleIndexed The page entry already indexed with that title.
   * @throws {Error} Always.
   */
  handleAlreadyIndexed(title, url, titleIndexed) {
    throw new Error(`Title "${title}" with URL ${url} is already indexed with URL ${titleIndexed.url}`);
  }

  /**
   * Extracts the text of a document body, ignoring `script`, `nav` and `footer` elements.
   *
   * The body is cloned first: appending the live body to another element would detach it from
   * the document, corrupting it for the caller and making any later call see no body at all.
   *
   * @param doc The DOM document to read.
   * @returns {string} The text content of the cleaned-up body ("" when the document has no body).
   */
  getContents(doc) {
    const div = doc.createElement("div");
    const body = doc.body;
    if (body) {
      div.append(body.cloneNode(true));
    }
    for (const selector of ["script", "nav", "footer"]) {
      this.removeTags(div, selector);
    }
    return div.textContent;
  }

  /**
   * Appends a `{title, url, time, html}` record for a page to the content stream.
   * Despite its name, `html` holds the plain text returned by `getContents()`.
   *
   * @param context Visit context; `file.name` and `time` are read.
   * @param outputFile The page file; `title` and `document` are read.
   */
  indexContent(context, outputFile) {
    const contentsRecord = {
      title: outputFile.title,
      // NOTE(review): unlike `index.pages`, this URL keeps any "out/" prefix — confirm this is intended.
      url: context.file.name,
      time: context.time.toString(),
      html: this.getContents(outputFile.document)
    };
    // The first record opens the JSON array, the following ones are comma-separated.
    const prefix = this.contentRecordCount === 0 ? "[\n" : ",\n";
    this.contentRecordCount += 1;
    this.contentStream.write(`${prefix}${JSON.stringify(contentsRecord)}`);
  }

  /**
   * Counts the significant words of a page and records them in `index.words`,
   * as `{pageIndex, count}` entries appended to each word's list.
   *
   * A word is kept when it is longer than one character, is not listed in
   * `context.messages.nonSignificantWords`, and either does not start with a number
   * or starts with a number greater than 1000.
   *
   * @param context Visit context; `messages.nonSignificantWords` is read.
   * @param outputFile The page file; `document` is read.
   */
  indexWords(context, outputFile) {
    // NOTE(review): visit() pushes the page before calling this method, so for an indexed page this
    // value is its position in `index.pages` plus one — confirm that consumers expect that.
    const pageIndex = this.index.pages.length;
    const nonSignificant = context.messages.nonSignificantWords;
    const pageText = this.getContents(outputFile.document).toLowerCase();
    const pageWords = pageText.split(/[ \t,.…'’\-" :!?;()\[\]\n]/g).filter((w) => {
      if (w.length <= 1 || nonSignificant.includes(w)) {
        return false;
      }
      const num = parseInt(w, 10);
      return Number.isNaN(num) || num > 1000;
    });
    // A Map keeps the first-occurrence order of the words.
    const pageWordsCount = new Map();
    for (const pageWord of pageWords) {
      pageWordsCount.set(pageWord, (pageWordsCount.get(pageWord) || 0) + 1);
    }
    const words = this.index.words;
    for (const [word, count] of pageWordsCount) {
      // Only own properties count: page words such as "constructor" or "__proto__" must not
      // resolve to members inherited from Object.prototype.
      let existingWordCounts = Object.prototype.hasOwnProperty.call(words, word) ? words[word] : undefined;
      if (!existingWordCounts) {
        existingWordCounts = [];
        // defineProperty (not assignment) so that "__proto__" becomes a regular key.
        Object.defineProperty(words, word, {
          value: existingWordCounts,
          writable: true,
          enumerable: true,
          configurable: true
        });
      }
      existingWordCounts.push({ pageIndex, count });
    }
  }

  /**
   * Removes from `div` every descendant matching `selector`.
   *
   * @param div The element to clean up.
   * @param selector A CSS selector for the elements to remove.
   */
  removeTags(div, selector) {
    for (const node of Array.from(div.querySelectorAll(selector))) {
      const parent = node.parentNode;
      if (parent) {
        parent.removeChild(node);
      }
    }
  }
}