@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications.
168 lines (167 loc) • 5.18 kB
TypeScript
/**
* HTML File Processor
*
* Processes HTML files with text extraction and security analysis.
* HTML files are processed as text content for AI analysis, with
* extraction of plain text content (tags stripped) for easier processing.
*
* Features:
* - Original HTML content preservation
* - Text extraction (all tags stripped)
* - Script and style tag detection
* - Title extraction
* - Security warnings for dangerous content
*
* Security: Uses OWASP-compliant HTML sanitization utilities
*
* @module processors/markup/HtmlProcessor
*
* @example
* ```typescript
* import { htmlProcessor, processHtml, isHtmlFile } from "./markup/HtmlProcessor.js";
*
* // Check if file is HTML
* if (isHtmlFile(mimetype, filename)) {
* const result = await processHtml(fileInfo);
* if (result.success) {
* console.log('Text content:', result.data.textContent);
* console.log('Has scripts:', result.data.hasScripts);
* if (result.data.title) {
* console.log('Page title:', result.data.title);
* }
* }
* }
* ```
*/
import { BaseFileProcessor } from "../base/BaseFileProcessor.js";
import type { FileInfo, ProcessorFileProcessingResult, ProcessOptions, ProcessedHtml } from "../../types/index.js";
/**
* HTML Processor - processes HTML files with text extraction.
*
* This is an ambient declaration (`declare class`): it describes the type
* surface only and contains no method bodies.
*
* The processor keeps the original HTML content and also produces a plain text
* version with all tags stripped. It additionally performs security analysis to
* detect potentially dangerous content.
*
* Priority: 20 (after SVG at priority 5, before generic text)
*
* NOTE(review): `processFile`, used in the example below, is not declared on
* this class; presumably it is inherited from `BaseFileProcessor` - confirm.
*
* @example
* ```typescript
* const processor = new HtmlProcessor();
*
* const result = await processor.processFile({
* id: 'html-123',
* name: 'page.html',
* mimetype: 'text/html',
* size: 8192,
* url: 'https://example.com/page.html',
* });
*
* if (result.success) {
* console.log('Title:', result.data.title);
* console.log('Text content:', result.data.textContent);
* }
* ```
*/
export declare class HtmlProcessor extends BaseFileProcessor<ProcessedHtml> {
/**
* Create a new HTML processor. Takes no arguments.
*
* NOTE(review): whatever configuration is handed to `BaseFileProcessor`
* (size limits, supported types, priority) is not visible in this declaration.
*/
constructor();
/**
* Validate downloaded HTML file.
* Performs basic validation to ensure content appears to be HTML.
*
* @param buffer - Downloaded file content
* @param _fileInfo - Original file information (the leading underscore
* suggests it is unused by the validation - confirm in the implementation)
* @returns Promise resolving to null if valid, or to an error message if invalid
*/
protected validateDownloadedFile(buffer: Buffer, _fileInfo: FileInfo): Promise<string | null>;
/**
* Build processed HTML result with text extraction.
*
* Unlike `validateDownloadedFile`, this method is synchronous: it returns the
* result directly rather than a Promise.
*
* Processing steps:
* 1. Preserve original HTML content
* 2. Extract plain text (strip all tags)
* 3. Detect script and style tags
* 4. Extract page title if present
* 5. Check for dangerous content
*
* @param buffer - Downloaded file content
* @param fileInfo - Original file information
* @returns Processed HTML result; the examples in this file read `title`,
* `textContent`, `hasScripts` and `hasDangerousContent` from it
*/
protected buildProcessedResult(buffer: Buffer, fileInfo: FileInfo): ProcessedHtml;
}
/**
* Singleton HTML processor instance.
* Use this for most processing needs instead of constructing a new
* `HtmlProcessor`.
*
* Declared as a `const` of type `HtmlProcessor`; how and when the instance is
* created is not visible in this declaration.
*
* @example
* ```typescript
* import { htmlProcessor } from "./markup/HtmlProcessor.js";
*
* const result = await htmlProcessor.processFile(fileInfo);
* ```
*/
export declare const htmlProcessor: HtmlProcessor;
/**
* Check if a file is an HTML file.
*
* Both arguments are required strings. As the second example shows, an empty
* MIME type may be passed so that detection relies on the filename.
*
* NOTE(review): the exact set of MIME types and file extensions recognised as
* HTML is not visible in this declaration - confirm in the implementation.
*
* @param mimetype - MIME type of the file
* @param filename - Filename (for extension-based detection)
* @returns true if the file is an HTML file
*
* @example
* ```typescript
* if (isHtmlFile('text/html', 'page.html')) {
* // Handle as HTML
* }
*
* // Also works with just filename
* if (isHtmlFile('', 'index.htm')) {
* // Handle as HTML based on extension
* }
* ```
*/
export declare function isHtmlFile(mimetype: string, filename: string): boolean;
/**
* Validate HTML file size against configured limit.
*
* NOTE(review): the value of the limit, where it is configured, and whether a
* size exactly equal to the limit passes are not visible in this declaration.
*
* @param sizeBytes - File size in bytes
* @returns true if size is within the allowed limit, false otherwise
*
* @example
* ```typescript
* if (!validateHtmlSize(fileInfo.size)) {
* console.error('HTML file is too large');
* }
* ```
*/
export declare function validateHtmlSize(sizeBytes: number): boolean;
/**
* Process a single HTML file.
* Convenience function that uses the singleton processor.
*
* The call is asynchronous. As the example shows, the resolved result is
* checked through its `success` flag: the success branch reads `data`, and the
* failure branch reads `error`.
*
* @param fileInfo - File information (can include URL or buffer)
* @param options - Optional processing options (auth headers, timeout, retry config);
* may be omitted
* @returns Promise resolving to the processing result with HTML content and
* extracted text
*
* @example
* ```typescript
* const result = await processHtml({
* id: 'html-123',
* name: 'page.html',
* mimetype: 'text/html',
* size: 8192,
* buffer: htmlBuffer,
* });
*
* if (result.success) {
* console.log('Page title:', result.data.title);
* console.log('Text content:', result.data.textContent);
* if (result.data.hasDangerousContent) {
* console.warn('HTML contains potentially dangerous content');
* }
* } else {
* console.error('Processing failed:', result.error.userMessage);
* }
* ```
*/
export declare function processHtml(fileInfo: FileInfo, options?: ProcessOptions): Promise<ProcessorFileProcessingResult<ProcessedHtml>>;