UNPKG

@devilsdev/rag-pipeline-utils

Version:

A modular toolkit for building RAG (Retrieval-Augmented Generation) pipelines in Node.js

59 lines (52 loc) 1.39 kB
/**
 * Version: 0.1.0
 * Path: /src/loader/markdown-loader.js
 * Description: Loader for Markdown (.md) documents
 * Author: Ali Kahwaji
 */

import fs from 'fs/promises';
import path from 'path';
import { marked } from 'marked';

/**
 * Named character references that are decoded back to literal characters
 * once the HTML tags have been stripped. Anything not listed here is left
 * untouched in the output.
 */
const NAMED_ENTITIES = new Map([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', '\u00A0'],
]);

/** Matches decimal (`&#39;`), hexadecimal (`&#x27;`) and named (`&amp;`) references. */
const ENTITY_PATTERN = /&(?:#(\d+)|#x([0-9a-f]+)|([a-z]+));/gi;

/** Largest valid Unicode code point; larger values make String.fromCodePoint throw. */
const MAX_CODE_POINT = 0x10ffff;

/**
 * Decode HTML character references in a single pass.
 * A single pass matters: decoding `&amp;` first and then re-scanning would
 * turn `&amp;lt;` into `<` instead of the intended `&lt;`.
 * @param {string} input - Text that may contain HTML character references
 * @returns {string} Text with the recognised references replaced
 */
function decodeEntities(input) {
  return input.replace(ENTITY_PATTERN, (match, dec, hex, name) => {
    if (name !== undefined) {
      // Unknown (or differently-cased) names are kept verbatim.
      return NAMED_ENTITIES.get(name) ?? match;
    }
    const codePoint =
      dec !== undefined ? Number.parseInt(dec, 10) : Number.parseInt(hex, 16);
    return codePoint <= MAX_CODE_POINT ? String.fromCodePoint(codePoint) : match;
  });
}

/**
 * MarkdownLoader reads and parses .md files into plain text chunks.
 * Implements the Loader interface.
 */
export class MarkdownLoader {
  /**
   * Load and parse Markdown file.
   *
   * The file is rendered to HTML with `marked`, the tags are removed, and
   * the remaining character references are decoded so the result is plain
   * text. Chunking is deferred until `chunk()` is called on the returned
   * document.
   *
   * @param {string} filePath - Path to .md file (resolved against the current working directory)
   * @returns {Promise<Array<{ chunk(): string[] }>>} A single-element array wrapping the document
   */
  async load(filePath) {
    const absPath = path.resolve(filePath);
    const raw = await fs.readFile(absPath, 'utf-8');
    const html = marked(raw);
    // Strip tags BEFORE decoding: decoding first would turn an escaped
    // `&lt;b&gt;` into `<b>`, which the tag regex would then remove.
    const stripped = html.replace(/<[^>]+>/g, '');
    const text = decodeEntities(stripped);

    return [{ chunk: () => this._chunkText(text) }];
  }

  /**
   * Simple sentence-based chunking strategy.
   *
   * Sentences (split after `.`, `!` or `?` followed by whitespace) are
   * packed greedily into chunks of at most `maxLen` characters. A single
   * sentence longer than `maxLen` is not split; it becomes its own
   * oversized chunk. Empty chunks are never emitted.
   *
   * @param {string} input - Plain text to split
   * @param {number} maxLen - Target maximum chunk length in characters
   * @returns {string[]} Trimmed, non-empty chunks in document order
   */
  _chunkText(input, maxLen = 500) {
    const sentences = input.split(/(?<=[.!?])\s+/);
    const chunks = [];
    let buffer = '';

    for (const sentence of sentences) {
      if ((buffer + sentence).length <= maxLen) {
        buffer += `${sentence} `;
        continue;
      }
      // The buffer is still empty when the very first sentence is already
      // over the limit; flushing it then would emit a spurious '' chunk.
      const pending = buffer.trim();
      if (pending) {
        chunks.push(pending);
      }
      buffer = `${sentence} `;
    }

    const tail = buffer.trim();
    if (tail) {
      chunks.push(tail);
    }
    return chunks;
  }
}