vexify
Version:
Portable vector database with in-process ONNX embeddings. Zero-config semantic search via SQLite. No external servers required.
109 lines (90 loc) • 3 kB
JavaScript
;
const crypto = require('crypto');
const { BaseProcessor } = require('./base');
// Lazy-load heavy dependencies to avoid blocking on module import
let JSDOM, Readability, NodeHtmlMarkdown;
function loadDependencies() {
if (!JSDOM) {
({ JSDOM } = require('jsdom'));
({ Readability } = require('@mozilla/readability'));
({ NodeHtmlMarkdown } = require('node-html-markdown'));
}
}
/**
 * Processor that turns an HTML document into a single plain-text record.
 *
 * Extraction is attempted with JSDOM + Readability first. When that throws or
 * yields no article content, the raw HTML is converted with NodeHtmlMarkdown
 * instead. Either way the text is passed through removeBoilerplate().
 */
class HtmlProcessor extends BaseProcessor {
  /** File extensions handled by this processor. */
  static get extensions() {
    return ['.html', '.htm'];
  }

  /**
   * Read an HTML file from disk (as UTF-8) and process it.
   *
   * @param {string} filePath - path of the file to read.
   * @param {object} [options] - forwarded unchanged to processContent().
   * @returns {Promise<Array<object>>} result of processContent().
   */
  async process(filePath, options = {}) {
    const fs = require('fs');
    // Non-blocking read: this method is async, so do not stall the event loop.
    const content = await fs.promises.readFile(filePath, 'utf-8');
    return this.processContent(content, filePath, options);
  }

  /**
   * Process HTML that is already in memory.
   *
   * @param {Buffer} buffer - HTML bytes, decoded as UTF-8.
   * @param {string} fileName - name used for the document id and metadata.
   * @param {object} [options] - forwarded unchanged to processContent().
   * @returns {Promise<Array<object>>} result of processContent().
   */
  async processBuffer(buffer, fileName, options = {}) {
    const content = buffer.toString('utf-8');
    return this.processContent(content, fileName, options);
  }

  /**
   * Strip a fixed set of boilerplate phrases and collapse whitespace.
   *
   * Each pattern is case-insensitive and lazy; because `.` does not match
   * newlines, a phrase pair is only removed when both ends are on one line.
   *
   * @param {string} text - extracted text.
   * @returns {string} cleaned text with runs of whitespace reduced to one space.
   */
  removeBoilerplate(text) {
    const patterns = [
      /important:.*?please read/gi,
      /disclaimer.*?policy/gi,
      /privacy policy.*?terms/gi,
      /copyright.*?\d{4}/gi,
      /all rights reserved/gi,
      /watch.*?online.*?dub.*?anime/gi,
      /log-?in.*?register/gi,
      /mobile html.*?player/gi
    ];
    let cleaned = text;
    for (const pattern of patterns) {
      cleaned = cleaned.replace(pattern, '');
    }
    return cleaned.replace(/\s+/g, ' ').trim();
  }

  /**
   * Extract readable text from an HTML string and wrap it as a document.
   *
   * @param {string} htmlContent - raw HTML.
   * @param {string} filePath - path or name passed to generateDocumentId()
   *   and createMetadata() (not defined in this class; presumably inherited
   *   from BaseProcessor).
   * @param {object} [options]
   * @param {string} [options.url] - base URL handed to JSDOM and recorded as
   *   `crawlUrl`; defaults to 'http://localhost' for parsing only.
   * @returns {Array<object>} an empty array when no text remains, otherwise a
   *   one-element array of `{ id, content, metadata }`.
   */
  processContent(htmlContent, filePath, options = {}) {
    loadDependencies();

    let article = null;
    // Try JSDOM first, but fall back to markdown if parsing fails
    try {
      const dom = new JSDOM(htmlContent, { url: options.url || 'http://localhost' });
      article = new Readability(dom.window.document).parse();
    } catch (jsdomError) {
      // Keep the reason in the log: the failure is not always the parse5 ES Module issue.
      const reason = jsdomError && jsdomError.message ? jsdomError.message : String(jsdomError);
      console.warn(`JSDOM parsing failed for ${filePath}, using markdown fallback: ${reason}`);
    }

    // Single source of truth for which extraction path ran, so that
    // metadata.format always matches the branch actually taken.
    const usedReadability = Boolean(article && article.content);

    let text;
    let title = '';
    if (usedReadability) {
      text = this.removeBoilerplate((article.textContent || '').trim());
      title = article.title || '';
    } else {
      // Use markdown extraction as fallback
      const markdown = NodeHtmlMarkdown.translate(htmlContent);
      text = this.removeBoilerplate(markdown.trim());
    }

    if (!text) {
      return [];
    }

    const metadata = {
      type: 'html',
      format: usedReadability ? 'readability' : 'markdown',
      originalLength: htmlContent.length,
      contentLength: text.length
    };
    if (title) {
      metadata.title = title;
    }
    if (options.url) {
      metadata.crawlUrl = options.url;
    }

    const finalContent = title ? `${title}\n\n${text}` : text;
    // The hash covers exactly what is stored as `content` (title included).
    metadata.contentHash = crypto.createHash('sha256').update(finalContent).digest('hex');

    return [{
      id: this.generateDocumentId(filePath),
      content: finalContent,
      metadata: this.createMetadata(filePath, metadata)
    }];
  }
}
// Public API of this module: the HtmlProcessor class, exported by name.
module.exports = { HtmlProcessor };