UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applicatio

501 lines 16.8 kB
/** * Document Loaders * * Provides loaders for various document formats including: * - Text files * - Markdown files * - HTML files and web pages * - JSON files * - CSV files * - PDF files * * @example * ```typescript * import { loadDocument, WebLoader, PDFLoader } from 'neurolink/rag'; * * // Load from file path * const doc = await loadDocument('/path/to/document.md'); * * // Load from URL * const webDoc = await WebLoader.load('https://example.com/article'); * * // Load PDF * const pdfDoc = await PDFLoader.load('/path/to/document.pdf'); * ``` */ import { existsSync } from "fs"; import { readFile } from "fs/promises"; import { basename, extname } from "path"; import { logger } from "../../utils/logger.js"; import { MDocument } from "./MDocument.js"; /** * Text file loader */ export class TextLoader { async load(source, options) { const content = await this.loadContent(source, options?.encoding); return MDocument.fromText(content, { source: this.getSourceName(source), ...options?.metadata, }); } canHandle(source) { const ext = extname(source).toLowerCase(); return ext === ".txt" || ext === ""; } async loadContent(source, encoding = "utf-8") { if (existsSync(source)) { return await readFile(source, encoding); } // Assume source is content if not a file return source; } getSourceName(source) { return existsSync(source) ? basename(source) : "inline-content"; } } /** * Markdown file loader */ export class MarkdownLoader extends TextLoader { async load(source, options) { const content = await this.loadContent(source, options?.encoding); return MDocument.fromMarkdown(content, { source: this.getSourceName(source), ...options?.metadata, }); } canHandle(source) { const ext = extname(source).toLowerCase(); return ext === ".md" || ext === ".markdown" || ext === ".mdx"; } } /** * HTML file loader */ export class HTMLLoader extends TextLoader { async load(source, options) { const content = await this.loadContent(source, options?.encoding); return MDocument.fromHTML(content, { source: this.getSourceName(source), ...options?.metadata, }); } canHandle(source) { const ext = extname(source).toLowerCase(); return ext === ".html" || ext === ".htm" || ext === ".xhtml"; } } /** * JSON file loader */ export class JSONLoader extends TextLoader { async load(source, options) { const content = await this.loadContent(source, options?.encoding); // Validate JSON try { JSON.parse(content); } catch (error) { throw new Error(`Invalid JSON: ${error instanceof Error ? error.message : String(error)}`, { cause: error }); } return MDocument.fromJSONContent(content, { source: this.getSourceName(source), ...options?.metadata, }); } canHandle(source) { const ext = extname(source).toLowerCase(); return ext === ".json" || ext === ".jsonl"; } } /** * CSV file loader */ export class CSVLoader extends TextLoader { async load(source, options) { const content = await this.loadContent(source, options?.encoding); const { delimiter = ",", hasHeader = true, columns, outputFormat = "text", } = options || {}; const lines = content.split("\n").filter((line) => line.trim()); const headers = hasHeader ? this.parseCSVLine(lines[0], delimiter) : columns || lines[0]?.split(delimiter).map((_, i) => `col${i + 1}`); const dataLines = hasHeader ? lines.slice(1) : lines; const rows = dataLines.map((line) => this.parseCSVLine(line, delimiter)); let formattedContent; switch (outputFormat) { case "json": formattedContent = JSON.stringify(rows.map((row) => Object.fromEntries(headers.map((h, i) => [h, row[i]]))), null, 2); break; case "markdown": formattedContent = this.toMarkdownTable(headers, rows); break; default: formattedContent = this.toTextTable(headers, rows); } return MDocument.fromCSV(formattedContent, { source: this.getSourceName(source), rowCount: rows.length, columnCount: headers.length, columns: headers, ...options?.metadata, }); } canHandle(source) { const ext = extname(source).toLowerCase(); return ext === ".csv" || ext === ".tsv"; } parseCSVLine(line, delimiter) { const result = []; let current = ""; let inQuotes = false; for (let i = 0; i < line.length; i++) { const char = line[i]; if (char === '"' && (i === 0 || line[i - 1] !== "\\")) { inQuotes = !inQuotes; } else if (char === delimiter && !inQuotes) { result.push(current.trim()); current = ""; } else { current += char; } } result.push(current.trim()); return result; } toMarkdownTable(headers, rows) { const headerRow = `| ${headers.join(" | ")} |`; const separator = `| ${headers.map(() => "---").join(" | ")} |`; const dataRows = rows.map((row) => `| ${row.join(" | ")} |`); return [headerRow, separator, ...dataRows].join("\n"); } toTextTable(headers, rows) { const allRows = [headers, ...rows]; const colWidths = headers.map((_, i) => Math.max(...allRows.map((row) => (row[i] || "").length))); const formatRow = (row) => row.map((cell, i) => (cell || "").padEnd(colWidths[i])).join(" | "); return [ formatRow(headers), colWidths.map((w) => "-".repeat(w)).join("-+-"), ...rows.map(formatRow), ].join("\n"); } } /** * PDF file loader * * Note: Requires external PDF processing library for full functionality. * Falls back to placeholder implementation if pdf-parse is not available. */ export class PDFLoader { async load(source, options) { if (!existsSync(source)) { throw new Error(`PDF file not found: ${source}`); } logger.debug("[PDFLoader] Loading PDF", { source, pageRange: options?.pageRange, }); try { // Try to use pdf-parse if available const pdfParse = await this.loadPdfParser(); const buffer = await readFile(source); const data = await pdfParse(buffer); const text = data.text; // Handle page range if specified if (options?.pageRange) { const _pages = this.parsePageRange(options.pageRange, data.numpages); // Note: pdf-parse doesn't support page selection directly // This is a placeholder for more sophisticated page handling logger.debug("[PDFLoader] Page range requested but not fully supported", { pageRange: options.pageRange, totalPages: data.numpages, }); } return new MDocument(text, { type: "pdf", metadata: { source: basename(source), pageCount: data.numpages, info: data.info, ...options?.metadata, }, }); } catch (error) { // Fallback: Return placeholder document logger.warn("[PDFLoader] pdf-parse not available, using fallback", { error: error instanceof Error ? error.message : String(error), }); return new MDocument(`[PDF Document: ${basename(source)}]\n\nNote: PDF parsing requires the 'pdf-parse' package. Install it with:\n npm install pdf-parse`, { type: "pdf", metadata: { source: basename(source), parseError: "pdf-parse not available", ...options?.metadata, }, }); } } canHandle(source) { const ext = extname(source).toLowerCase(); return ext === ".pdf"; } async loadPdfParser() { try { // pdf-parse is an optional dependency - use dynamic import with type assertion const pdfParse = (await import("pdf-parse")); return pdfParse.default || pdfParse; } catch { throw new Error("pdf-parse module not available"); } } parsePageRange(range, totalPages) { const pages = []; const parts = range.split(","); for (const part of parts) { if (part.includes("-")) { const [start, end] = part.split("-").map(Number); for (let i = start; i <= Math.min(end, totalPages); i++) { pages.push(i); } } else { const page = Number(part); if (page <= totalPages) { pages.push(page); } } } return [...new Set(pages)].sort((a, b) => a - b); } } /** * Web page loader * * Fetches and extracts content from web pages. * Supports basic HTML parsing without external dependencies. */ export class WebLoader { defaultUserAgent = "Mozilla/5.0 (compatible; NeuroLink/1.0; +https://github.com/juspay/neurolink)"; async load(source, options) { if (!this.canHandle(source)) { throw new Error(`Invalid URL: ${source}`); } logger.debug("[WebLoader] Fetching URL", { url: source, timeout: options?.timeout, }); const response = await fetch(source, { signal: options?.timeout ? AbortSignal.timeout(options.timeout) : undefined, headers: { "User-Agent": options?.userAgent || this.defaultUserAgent, Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", ...options?.headers, }, }); if (!response.ok) { throw new Error(`HTTP error ${response.status}: ${response.statusText}`); } const html = await response.text(); let content = html; // Extract main content if requested if (options?.extractMainContent) { content = this.extractMainContent(html, options.contentSelector); } // Convert HTML to plain text for better processing const text = this.htmlToText(content); return new MDocument(text, { type: "html", metadata: { source, url: source, fetchedAt: new Date().toISOString(), contentType: response.headers.get("content-type") || "text/html", ...options?.metadata, }, }); } canHandle(source) { try { const url = new URL(source); return url.protocol === "http:" || url.protocol === "https:"; } catch { return false; } } /** * Extract main content from HTML */ extractMainContent(html, selector) { // Simple extraction based on common content patterns // For production use, consider using a library like cheerio // Try to extract content from common containers const patterns = selector ? [`<${selector}[^>]*>([\\s\\S]*?)</${selector}>`] : [ /<main[^>]*>([\s\S]*?)<\/main>/i, /<article[^>]*>([\s\S]*?)<\/article>/i, /<div[^>]*class="[^"]*content[^"]*"[^>]*>([\s\S]*?)<\/div>/i, /<div[^>]*id="content"[^>]*>([\s\S]*?)<\/div>/i, /<body[^>]*>([\s\S]*?)<\/body>/i, ]; for (const pattern of patterns) { const match = html.match(new RegExp(pattern, "i")); if (match) { return match[1] || match[0]; } } return html; } /** * Convert HTML to plain text */ htmlToText(html) { return (html // Remove script and style elements .replace(/<script[\s\S]*?<\/script>/gi, "") .replace(/<style[\s\S]*?<\/style>/gi, "") // Remove HTML comments .replace(/<!--[\s\S]*?-->/g, "") // Replace common block elements with newlines .replace(/<\/(p|div|h[1-6]|br|li|tr|blockquote)>/gi, "\n") .replace(/<(br|hr)\s*\/?>/gi, "\n") // Remove remaining tags .replace(/<[^>]+>/g, "") // Decode common HTML entities .replace(/&nbsp;/gi, " ") .replace(/&amp;/gi, "&") .replace(/&lt;/gi, "<") .replace(/&gt;/gi, ">") .replace(/&quot;/gi, '"') .replace(/&#039;/gi, "'") .replace(/&apos;/gi, "'") // Decode numeric entities .replace(/&#(\d+);/g, (_, code) => String.fromCharCode(Number(code))) // Normalize whitespace .replace(/\n\s*\n/g, "\n\n") .replace(/[ \t]+/g, " ") .trim()); } } /** * Registry of document loaders */ const loaderRegistry = [ new MarkdownLoader(), new HTMLLoader(), new JSONLoader(), new CSVLoader(), new PDFLoader(), new WebLoader(), new TextLoader(), // Default fallback ]; /** * Detect document type from source */ function _detectDocumentType(source) { const ext = extname(source).toLowerCase(); const typeMap = { ".md": "markdown", ".markdown": "markdown", ".mdx": "markdown", ".html": "html", ".htm": "html", ".xhtml": "html", ".json": "json", ".jsonl": "json", ".csv": "csv", ".tsv": "csv", ".tex": "latex", ".latex": "latex", ".pdf": "pdf", }; // Check if it's a URL try { const url = new URL(source); if (url.protocol === "http:" || url.protocol === "https:") { return "html"; } } catch { // Not a URL } return typeMap[ext] || "text"; } /** * Load document from file path, URL, or content * * Automatically detects the document type and uses the appropriate loader. * * @param source - File path, URL, or raw content * @param options - Loader options * @returns Promise resolving to MDocument * * @example * ```typescript * // Load from file * const doc = await loadDocument('/path/to/document.md'); * * // Load from URL * const webDoc = await loadDocument('https://example.com/article'); * * // Load with options * const pdfDoc = await loadDocument('/path/to/doc.pdf', { * pageRange: '1-5', * metadata: { project: 'research' } * }); * ``` */ export async function loadDocument(source, options) { // Find appropriate loader const loader = loaderRegistry.find((l) => l.canHandle(source)); if (!loader) { // Fall back to text loader return new TextLoader().load(source, options); } logger.debug("[loadDocument] Loading document", { source: source.slice(0, 100), loaderType: loader.constructor.name, }); return loader.load(source, options); } /** * Load multiple documents * * @param sources - Array of file paths, URLs, or content * @param options - Loader options (applied to all) * @returns Promise resolving to array of MDocuments */ export async function loadDocuments(sources, options) { const results = await Promise.allSettled(sources.map((source) => loadDocument(source, options))); const documents = []; const errors = []; results.forEach((result, index) => { if (result.status === "fulfilled") { documents.push(result.value); } else { errors.push({ source: sources[index], error: result.reason instanceof Error ? result.reason.message : String(result.reason), }); } }); if (errors.length > 0) { logger.warn("[loadDocuments] Some documents failed to load", { loaded: documents.length, failed: errors.length, errors, }); } return documents; } //# sourceMappingURL=loaders.js.map