UNPKG

vexify

Version:

Portable vector database with in-process ONNX embeddings. Zero-config semantic search via SQLite. No external servers required.

109 lines (90 loc) 3 kB
'use strict';

const crypto = require('crypto');
const fs = require('fs');
const { BaseProcessor } = require('./base');

// Lazy-load heavy dependencies to avoid blocking on module import.
let JSDOM;
let Readability;
let NodeHtmlMarkdown;

// The Readability stack (jsdom + @mozilla/readability) is optional: when it
// cannot be loaded we still want the markdown fallback to work. The attempt is
// made only once so a broken install does not re-run (and re-log) a failing
// require on every document.
let readabilityLoadAttempted = false;

// Phrases stripped from extracted text. `.` does not match newlines, so each
// pattern only removes a span within a single line of the input.
const BOILERPLATE_PATTERNS = Object.freeze([
  /important:.*?please read/gi,
  /disclaimer.*?policy/gi,
  /privacy policy.*?terms/gi,
  /copyright.*?\d{4}/gi,
  /all rights reserved/gi,
  /watch.*?online.*?dub.*?anime/gi,
  /log-?in.*?register/gi,
  /mobile html.*?player/gi
]);

/**
 * Loads the HTML conversion dependencies on first use.
 *
 * `node-html-markdown` is mandatory (it backs the fallback path), so a failure
 * to load it propagates to the caller. `jsdom` and `@mozilla/readability` are
 * loaded together inside a guarded block: if either require throws (for
 * example the parse5 ES Module error raised while jsdom is being loaded),
 * `JSDOM`/`Readability` stay undefined and callers use the markdown fallback.
 */
function loadDependencies() {
  if (!NodeHtmlMarkdown) {
    ({ NodeHtmlMarkdown } = require('node-html-markdown'));
  }

  if (readabilityLoadAttempted) {
    return;
  }
  readabilityLoadAttempted = true;

  try {
    const jsdomModule = require('jsdom');
    const readabilityModule = require('@mozilla/readability');
    // Assign only after both requires succeeded so the pair is never half-set.
    JSDOM = jsdomModule.JSDOM;
    Readability = readabilityModule.Readability;
  } catch (loadError) {
    console.warn(
      'Could not load jsdom/@mozilla/readability, using markdown fallback',
      loadError && loadError.message
    );
  }
}

/**
 * Processor that turns `.html` / `.htm` input into a single text document.
 *
 * Extraction is attempted with Readability on a JSDOM document first; if that
 * is unavailable, throws, or yields no content, the raw HTML is converted with
 * NodeHtmlMarkdown instead.
 */
class HtmlProcessor extends BaseProcessor {
  /** @returns {string[]} File extensions handled by this processor. */
  static get extensions() {
    return ['.html', '.htm'];
  }

  /**
   * Reads an HTML file as UTF-8 and processes its content.
   *
   * @param {string} filePath - Path of the file to read.
   * @param {object} [options] - Passed through to `processContent`.
   * @returns {Promise<object[]>} Result of `processContent`; rejects if the
   *   file cannot be read.
   */
  async process(filePath, options = {}) {
    // Non-blocking read: this method is async, so avoid stalling the event loop.
    const content = await fs.promises.readFile(filePath, 'utf-8');
    return this.processContent(content, filePath, options);
  }

  /**
   * Decodes a buffer as UTF-8 and processes its content.
   *
   * @param {Buffer} buffer - Raw HTML bytes.
   * @param {string} fileName - Name used as the document's file path.
   * @param {object} [options] - Passed through to `processContent`.
   * @returns {Promise<object[]>} Result of `processContent`.
   */
  async processBuffer(buffer, fileName, options = {}) {
    const content = buffer.toString('utf-8');
    return this.processContent(content, fileName, options);
  }

  /**
   * Removes known boilerplate phrases, then collapses every whitespace run to
   * a single space and trims the result.
   *
   * @param {string} text - Extracted text.
   * @returns {string} Cleaned, single-line text (may be empty).
   */
  removeBoilerplate(text) {
    const stripped = BOILERPLATE_PATTERNS.reduce(
      (current, pattern) => current.replace(pattern, ''),
      text
    );
    return stripped.replace(/\s+/g, ' ').trim();
  }

  /**
   * Extracts text from an HTML string and wraps it in a document record.
   *
   * @param {string} htmlContent - HTML markup to process.
   * @param {string} filePath - Path/name used for the document id, metadata
   *   and log messages.
   * @param {object} [options]
   * @param {string} [options.url] - Source URL; used as the JSDOM document URL
   *   (default `http://localhost`) and stored as `metadata.crawlUrl`.
   * @returns {object[]} An empty array when no text remains after cleaning,
   *   otherwise a one-element array of `{ id, content, metadata }`.
   */
  processContent(htmlContent, filePath, options = {}) {
    loadDependencies();

    let article = null;

    // Try JSDOM + Readability first; any failure falls through to markdown.
    if (JSDOM && Readability) {
      try {
        const dom = new JSDOM(htmlContent, { url: options.url || 'http://localhost' });
        article = new Readability(dom.window.document).parse();
      } catch (jsdomError) {
        console.warn(
          `JSDOM parsing failed for ${filePath}, using markdown fallback`,
          jsdomError && jsdomError.message
        );
      }
    }

    let text;
    let title = '';
    let format;

    if (article && article.content) {
      text = this.removeBoilerplate(article.textContent.trim());
      title = article.title || '';
      format = 'readability';
    } else {
      // Use markdown extraction as fallback. `format` is set here (rather than
      // derived from `article`) because Readability can return an article
      // object with empty content, in which case this branch produced the text.
      const markdown = NodeHtmlMarkdown.translate(htmlContent);
      text = this.removeBoilerplate(markdown.trim());
      format = 'markdown';
    }

    if (!text) {
      return [];
    }

    const metadata = {
      type: 'html',
      format,
      originalLength: htmlContent.length,
      contentLength: text.length
    };

    if (title) {
      metadata.title = title;
    }

    if (options.url) {
      metadata.crawlUrl = options.url;
    }

    const finalContent = title ? `${title}\n\n${text}` : text;
    // Hash covers exactly what is stored as `content` (title included).
    metadata.contentHash = crypto.createHash('sha256').update(finalContent).digest('hex');

    // generateDocumentId / createMetadata are not defined in this file;
    // presumably inherited from BaseProcessor in ./base — verify there.
    return [{
      id: this.generateDocumentId(filePath),
      content: finalContent,
      metadata: this.createMetadata(filePath, metadata)
    }];
  }
}

module.exports = { HtmlProcessor };