UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications…

168 lines (167 loc) 5.18 kB
/**
 * HTML File Processor
 *
 * Processes HTML files with text extraction and security analysis.
 * HTML files are processed as text content for AI analysis, with
 * extraction of plain text content (tags stripped) for easier processing.
 *
 * Features:
 * - Original HTML content preservation
 * - Text extraction (all tags stripped)
 * - Script and style tag detection
 * - Title extraction
 * - Security warnings for dangerous content
 *
 * Security: Uses OWASP-compliant HTML sanitization utilities
 *
 * Note: this file contains ambient type declarations only (`export declare`);
 * no implementation is visible here.
 *
 * @module processors/markup/HtmlProcessor
 *
 * @example
 * ```typescript
 * import { htmlProcessor, processHtml, isHtmlFile } from "./markup/HtmlProcessor.js";
 *
 * // Check if file is HTML
 * if (isHtmlFile(mimetype, filename)) {
 *   const result = await processHtml(fileInfo);
 *   if (result.success) {
 *     console.log('Text content:', result.data.textContent);
 *     console.log('Has scripts:', result.data.hasScripts);
 *     if (result.data.title) {
 *       console.log('Page title:', result.data.title);
 *     }
 *   }
 * }
 * ```
 */
import { BaseFileProcessor } from "../base/BaseFileProcessor.js";
import type { FileInfo, ProcessorFileProcessingResult, ProcessOptions, ProcessedHtml } from "../../types/index.js";
/**
 * HTML Processor - processes HTML files with text extraction.
 *
 * This processor extracts both the original HTML content and a plain text
 * version with all tags stripped. It also performs security analysis to
 * detect potentially dangerous content.
 *
 * Priority: 20 (after SVG at priority 5, before generic text)
 *
 * @example
 * ```typescript
 * const processor = new HtmlProcessor();
 *
 * const result = await processor.processFile({
 *   id: 'html-123',
 *   name: 'page.html',
 *   mimetype: 'text/html',
 *   size: 8192,
 *   url: 'https://example.com/page.html',
 * });
 *
 * if (result.success) {
 *   console.log('Title:', result.data.title);
 *   console.log('Text content:', result.data.textContent);
 * }
 * ```
 */
export declare class HtmlProcessor extends BaseFileProcessor<ProcessedHtml> {
    constructor();
    /**
     * Validate downloaded HTML file.
     * Performs basic validation to ensure content appears to be HTML.
     *
     * @param buffer - Downloaded file content
     * @param _fileInfo - Original file information
     * @returns A promise that resolves to null if the content is valid,
     *          or to an error message string if it is invalid
     */
    protected validateDownloadedFile(buffer: Buffer, _fileInfo: FileInfo): Promise<string | null>;
    /**
     * Build processed HTML result with text extraction.
     *
     * Processing steps:
     * 1. Preserve original HTML content
     * 2. Extract plain text (strip all tags)
     * 3. Detect script and style tags
     * 4. Extract page title if present
     * 5. Check for dangerous content
     *
     * @param buffer - Downloaded file content
     * @param fileInfo - Original file information
     * @returns Processed HTML result (returned synchronously, not as a promise)
     */
    protected buildProcessedResult(buffer: Buffer, fileInfo: FileInfo): ProcessedHtml;
}
/**
 * Singleton HTML processor instance.
 * Use this for most processing needs.
 *
 * @example
 * ```typescript
 * import { htmlProcessor } from "./markup/HtmlProcessor.js";
 *
 * const result = await htmlProcessor.processFile(fileInfo);
 * ```
 */
export declare const htmlProcessor: HtmlProcessor;
/**
 * Check if a file is an HTML file.
 *
 * @param mimetype - MIME type of the file
 * @param filename - Filename (for extension-based detection)
 * @returns true if the file is an HTML file
 *
 * @example
 * ```typescript
 * if (isHtmlFile('text/html', 'page.html')) {
 *   // Handle as HTML
 * }
 *
 * // Also works with just filename
 * if (isHtmlFile('', 'index.htm')) {
 *   // Handle as HTML based on extension
 * }
 * ```
 */
export declare function isHtmlFile(mimetype: string, filename: string): boolean;
/**
 * Validate HTML file size against configured limit.
 *
 * @param sizeBytes - File size in bytes
 * @returns true if size is within the allowed limit
 *
 * @example
 * ```typescript
 * if (!validateHtmlSize(fileInfo.size)) {
 *   console.error('HTML file is too large');
 * }
 * ```
 */
export declare function validateHtmlSize(sizeBytes: number): boolean;
/**
 * Process a single HTML file.
 * Convenience function that uses the singleton processor.
 *
 * @param fileInfo - File information (can include URL or buffer)
 * @param options - Optional processing options (auth headers, timeout, retry config)
 * @returns A promise that resolves to the processing result with HTML content
 *          and extracted text
 *
 * @example
 * ```typescript
 * const result = await processHtml({
 *   id: 'html-123',
 *   name: 'page.html',
 *   mimetype: 'text/html',
 *   size: 8192,
 *   buffer: htmlBuffer,
 * });
 *
 * if (result.success) {
 *   console.log('Page title:', result.data.title);
 *   console.log('Text content:', result.data.textContent);
 *   if (result.data.hasDangerousContent) {
 *     console.warn('HTML contains potentially dangerous content');
 *   }
 * } else {
 *   console.error('Processing failed:', result.error.userMessage);
 * }
 * ```
 */
export declare function processHtml(fileInfo: FileInfo, options?: ProcessOptions): Promise<ProcessorFileProcessingResult<ProcessedHtml>>;