UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applicatio

364 lines 12.2 kB
/** * RTF Document Processor * * Processes Rich Text Format (.rtf) files by extracting plain text content * from RTF control codes. Uses a lightweight text extraction approach * without requiring external dependencies. * * Key features: * - RTF control code stripping * - Text content extraction * - Raw content preservation for debugging * - No external dependencies required * * Priority: ~110 (document format, processed after binary formats) * * @module processors/document/RtfProcessor * * @example * ```typescript * import { rtfProcessor, processRtf, isRtfFile } from "./document/index.js"; * * // Check if a file is an RTF file * if (isRtfFile("application/rtf", "document.rtf")) { * const result = await processRtf({ * id: "file-123", * name: "document.rtf", * mimetype: "application/rtf", * size: 10240, * buffer: rtfBuffer, * }); * * if (result.success) { * console.log(`Text content: ${result.data.textContent}`); * } * } * ``` */ import { BaseFileProcessor } from "../base/BaseFileProcessor.js"; import { SIZE_LIMITS } from "../config/index.js"; // Re-export for consumers who import from this module // Import for local use // ============================================================================= // CONSTANTS // ============================================================================= /** * Supported MIME types for RTF documents */ const SUPPORTED_RTF_MIME_TYPES = [ "application/rtf", "text/rtf", "text/richtext", ]; /** * Supported file extensions for RTF documents */ const SUPPORTED_RTF_EXTENSIONS = [".rtf"]; /** * Default timeout for RTF processing (30 seconds) */ const RTF_TIMEOUT_MS = 30000; // ============================================================================= // RTF PROCESSOR CLASS // ============================================================================= /** * RTF Processor - handles Rich Text Format files. * * Extracts plain text from RTF documents by stripping RTF control codes. * This is a lightweight implementation that doesn't require external * RTF parsing libraries. * * Priority: ~110 (document format) * * @example * ```typescript * const processor = new RtfProcessor(); * * const result = await processor.processFile({ * id: "file-123", * name: "report.rtf", * mimetype: "application/rtf", * size: 5120, * buffer: rtfBuffer, * }); * * if (result.success) { * console.log("Extracted text:", result.data.textContent); * } * ``` */ export class RtfProcessor extends BaseFileProcessor { constructor() { super({ maxSizeMB: SIZE_LIMITS.DOCUMENT_MAX_MB, timeoutMs: RTF_TIMEOUT_MS, supportedMimeTypes: SUPPORTED_RTF_MIME_TYPES, supportedExtensions: SUPPORTED_RTF_EXTENSIONS, fileTypeName: "RTF", defaultFilename: "document.rtf", }); } /** * Validate downloaded RTF document. * Checks for RTF header signature "{\\rtf". * * @param buffer - Downloaded file content * @param fileInfo - Original file information * @returns null if valid, error message if invalid */ async validateDownloadedFile(buffer, _fileInfo) { if (buffer.length < 5) { return "Invalid RTF document - file too small"; } // RTF files should start with "{\rtf" const header = buffer.subarray(0, 10).toString("ascii"); if (!header.startsWith("{\\rtf")) { // Check if it might be HTML error page const preview = buffer .subarray(0, 100) .toString("utf8") .substring(0, 100); if (preview.includes("<!DOCTYPE") || preview.includes("<html")) { return "Invalid RTF document - received HTML response instead of file content"; } return "Invalid RTF document - missing RTF header signature"; } return null; } /** * Build the processed RTF result. * Extracts plain text by stripping RTF control codes. * * @param buffer - Raw file content * @param fileInfo - Original file information * @returns Processed RTF with extracted text content */ buildProcessedResult(buffer, fileInfo) { const rawContent = buffer.toString("utf-8"); const textContent = this.extractText(rawContent); return { textContent, rawContent, buffer, mimetype: fileInfo.mimetype || "application/rtf", size: fileInfo.size, filename: this.getFilename(fileInfo), }; } /** * Extract plain text from RTF content. * Strips RTF control codes, groups, and formatting commands. * * This is a basic RTF parser that handles common RTF constructs: * - Control groups like {\fonttbl...} * - Control words like \par, \b, \i * - Special characters like \' hex escapes * - Newlines from \par and \line commands * * @param rtf - Raw RTF content * @returns Extracted plain text */ extractText(rtf) { const text = rtf; let result = ""; let depth = 0; let skipGroup = false; let skipGroupDepth = 0; let i = 0; // Groups that should be skipped entirely (metadata, not content) const skipGroupNames = [ "fonttbl", "colortbl", "stylesheet", "info", "pict", "object", "header", "footer", ]; while (i < text.length) { const char = text[i]; if (char === "{") { depth++; // Check if this is a group we should skip const nextChars = text.substring(i + 1, i + 20); const groupMatch = nextChars.match(/^\\([a-z]+)/); if (groupMatch && skipGroupNames.includes(groupMatch[1]) && !skipGroup) { skipGroup = true; skipGroupDepth = depth; } i++; continue; } if (char === "}") { depth--; if (skipGroup && depth < skipGroupDepth) { skipGroup = false; skipGroupDepth = 0; } i++; continue; } if (skipGroup) { i++; continue; } if (char === "\\") { // Control word or symbol const remaining = text.substring(i); // Handle special escapes if (remaining.startsWith("\\\\")) { result += "\\"; i += 2; continue; } if (remaining.startsWith("\\{")) { result += "{"; i += 2; continue; } if (remaining.startsWith("\\}")) { result += "}"; i += 2; continue; } // Handle hex escapes like \'e9 (é) const hexMatch = remaining.match(/^\\'([0-9a-f]{2})/i); if (hexMatch) { const charCode = parseInt(hexMatch[1], 16); result += String.fromCharCode(charCode); i += 4; continue; } // Handle Unicode escapes like \u233? (é) const unicodeMatch = remaining.match(/^\\u(-?\d+)\??/); if (unicodeMatch) { let charCode = parseInt(unicodeMatch[1], 10); if (charCode < 0) { charCode += 65536; // Convert negative to positive } result += String.fromCharCode(charCode); i += unicodeMatch[0].length; continue; } // Handle control words const controlMatch = remaining.match(/^\\([a-z]+)(-?\d*)[ ]?/i); if (controlMatch) { const controlWord = controlMatch[1].toLowerCase(); // Convert some control words to text if (controlWord === "par" || controlWord === "line") { result += "\n"; } else if (controlWord === "tab") { result += "\t"; } else if (controlWord === "emdash") { result += "—"; } else if (controlWord === "endash") { result += "–"; } else if (controlWord === "bullet") { result += "•"; } else if (controlWord === "lquote") { result += "'"; } else if (controlWord === "rquote") { result += "'"; } else if (controlWord === "ldblquote") { result += '"'; } else if (controlWord === "rdblquote") { result += '"'; } i += controlMatch[0].length; continue; } // Unknown control sequence, skip the backslash and control word i++; continue; } // Regular character if (char !== "\r" && char !== "\n") { result += char; } i++; } // Clean up the result result = result .replace(/\s+/g, " ") // Normalize whitespace .replace(/ +\n/g, "\n") // Remove trailing spaces before newlines .replace(/\n +/g, "\n") // Remove leading spaces after newlines .replace(/\n{3,}/g, "\n\n") // Collapse multiple newlines .trim(); return result; } } // ============================================================================= // SINGLETON INSTANCE // ============================================================================= /** * Singleton instance of the RtfProcessor. * Use this for all RTF document processing to share configuration. */ export const rtfProcessor = new RtfProcessor(); // ============================================================================= // HELPER FUNCTIONS // ============================================================================= /** * Check if a file is an RTF document. * * @param mimetype - MIME type of the file * @param filename - Filename for detection * @returns true if the file is a supported RTF document * * @example * ```typescript * if (isRtfFile("application/rtf", "document.rtf")) { * console.log("This is an RTF document"); * } * ``` */ export function isRtfFile(mimetype, filename) { return rtfProcessor.isFileSupported(mimetype, filename); } /** * Validate RTF document size against configured limit. * * @param sizeBytes - File size in bytes * @returns true if size is within the allowed limit */ export function validateRtfSize(sizeBytes) { const maxBytes = SIZE_LIMITS.DOCUMENT_MAX_MB * 1024 * 1024; return sizeBytes <= maxBytes; } /** * Process an RTF document. * * @param fileInfo - File information (can include URL or buffer) * @param options - Optional processing options * @returns Processing result with success flag and either data or error * * @example * ```typescript * const result = await processRtf({ * id: "file-123", * name: "report.rtf", * mimetype: "application/rtf", * size: 10240, * buffer: rtfBuffer, * }); * * if (result.success) { * console.log("Extracted text:", result.data.textContent); * } * ``` */ export async function processRtf(fileInfo, options) { return rtfProcessor.processFile(fileInfo, options); } //# sourceMappingURL=RtfProcessor.js.map