UNPKG

@devilsdev/rag-pipeline-utils

Version:

A modular toolkit for building RAG (Retrieval-Augmented Generation) pipelines in Node.js

58 lines (52 loc) 1.38 kB
/** * Version: 0.1.0 * Path: /src/loader/html-loader.js * Description: Loader for HTML (.html) documents * Author: Ali Kahwaji */ import fs from 'fs/promises'; import path from 'path'; import { JSDOM } from 'jsdom'; /** * HTMLLoader reads HTML files and extracts visible text. * Implements the Loader interface. */ export class HTMLLoader { /** * Load and extract text from an HTML file * @param {string} filePath - Path to .html file * @returns {Promise<Array<{ chunk(): string[] }>>} */ async load(filePath) { const absPath = path.resolve(filePath); const raw = await fs.readFile(absPath, 'utf-8'); const dom = new JSDOM(raw); const text = dom.window.document.body.textContent || ''; return [ { chunk: () => this._chunkText(text) } ]; } /** * Chunk HTML text content by sentence * @param {string} input * @param {number} maxLen * @returns {string[]} */ _chunkText(input, maxLen = 500) { const sentences = input.split(/(?<=[.!?])\s+/); const chunks = []; let buffer = ''; for (const sentence of sentences) { if ((buffer + sentence).length <= maxLen) { buffer += sentence + ' '; } else { chunks.push(buffer.trim()); buffer = sentence + ' '; } } if (buffer.trim()) chunks.push(buffer.trim()); return chunks; } }