@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications.

github.com/juspay/neurolink

juspay/neurolink

167 lines (166 loc) • 5.96 kB

TypeScript

/**
 * Word Document Processing Utility
 *
 * Handles downloading, validating, and processing Word (.docx, .doc) files.
 * Uses mammoth library to extract text and HTML content from Word documents.
 *
 * Features:
 * - DOCX format validation via ZIP/PK signature check
 * - Text extraction using mammoth.extractRawText()
 * - HTML conversion using mammoth.convertToHtml()
 * - Warning collection from mammoth processing
 * - Support for both URL downloads and direct buffer input
 *
 * This file contains ambient declarations only (`export declare ...`); the
 * behavior described in these comments lives in the implementation module.
 *
 * NOTE(review): legacy binary .doc files are OLE compound documents rather
 * than ZIP archives, so they would not start with the PK signature. How .doc
 * input is handled is not visible from these declarations - confirm against
 * the implementation.
 *
 * @module processors/document/WordProcessor
 *
 * @example
 * ```typescript
 * import { wordProcessor, processWord, isWordFile } from "./WordProcessor.js";
 *
 * // Check if file is supported
 * if (isWordFile(fileInfo.mimetype, fileInfo.name)) {
 *   const result = await processWord(fileInfo, {
 *     authHeaders: { Authorization: "Bearer token" },
 *   });
 *
 *   if (result.success) {
 *     console.log("Text:", result.data.textContent);
 *     console.log("HTML:", result.data.htmlContent);
 *     console.log("Warnings:", result.data.warnings);
 *   }
 * }
 * ```
 */
import { BaseFileProcessor } from "../base/BaseFileProcessor.js";
import type { FileInfo, ProcessorFileProcessingResult, ProcessOptions, ProcessedWord } from "../../types/index.js";
/**
 * Word Processor - handles .docx and .doc files
 *
 * Uses mammoth library for both text and HTML extraction. The processor
 * validates DOCX files by checking for the ZIP/PK signature (since DOCX
 * files are actually ZIP archives).
 *
 * Extends `BaseFileProcessor<ProcessedWord>` and declares two protected
 * hooks (`validateDownloadedFile`, `buildProcessedResult`) plus the public
 * `processFile` entry point.
 *
 * @example
 * ```typescript
 * const processor = new WordProcessor();
 *
 * // Check if file is supported
 * if (processor.isFileSupported("application/msword", "report.doc")) {
 *   const result = await processor.processFile(fileInfo);
 *   if (result.success) {
 *     console.log("Extracted text:", result.data.textContent);
 *   }
 * }
 * ```
 */
export declare class WordProcessor extends BaseFileProcessor<ProcessedWord> {
    /**
     * Create a Word processor. Takes no arguments.
     */
    constructor();
    /**
     * Validate downloaded Word document has correct magic bytes.
     * DOCX files are ZIP archives starting with PK signature (0x50 0x4B).
     *
     * @param buffer - Downloaded file content
     * @param _fileInfo - Original file information (the underscore prefix
     *   suggests it is unused by this implementation - confirm)
     * @returns Promise resolving to null if valid, or an error message if invalid
     */
    protected validateDownloadedFile(buffer: Buffer, _fileInfo: FileInfo): Promise<string | null>;
    /**
     * Build processed Word result with extracted text and HTML content.
     * This is a stub that returns an empty result - actual processing
     * happens in the overridden processFile method since mammoth
     * operations are asynchronous, while this hook returns its result
     * synchronously.
     *
     * @param buffer - Downloaded file content
     * @param fileInfo - Original file information
     * @returns Processed Word result (placeholder)
     */
    protected buildProcessedResult(buffer: Buffer, fileInfo: FileInfo): ProcessedWord;
    /**
     * Override processFile for async mammoth extraction.
     *
     * The mammoth library's extractRawText and convertToHtml methods are
     * asynchronous, so we need to override the entire processFile method
     * rather than just buildProcessedResult.
     *
     * Processing steps:
     * 1. Validate file type and size
     * 2. Get buffer (download from URL or use provided buffer)
     * 3. Validate downloaded file (check PK signature)
     * 4. Extract text with mammoth.extractRawText()
     * 5. Convert to HTML with mammoth.convertToHtml()
     * 6. Collect any warnings from mammoth
     * 7. Return structured result
     *
     * @param fileInfo - File information with URL or buffer
     * @param options - Optional processing options
     * @returns Promise resolving to the processing result with text, HTML, and warnings
     */
    processFile(fileInfo: FileInfo, options?: ProcessOptions): Promise<ProcessorFileProcessingResult<ProcessedWord>>;
}
/**
 * Singleton Word processor instance.
 * Use this for most use cases to avoid creating multiple instances.
 */
export declare const wordProcessor: WordProcessor;
/**
 * Check if a file is a Word document (.docx or .doc).
 *
 * @param mimetype - MIME type of the file
 * @param filename - Filename (for extension-based detection)
 * @returns true if the file is a supported Word document
 *
 * @example
 * ```typescript
 * if (isWordFile(file.mimetype, file.name)) {
 *   const result = await processWord(file);
 * }
 * ```
 */
export declare function isWordFile(mimetype: string, filename: string): boolean;
/**
 * Validate Word document size against configured limit.
 *
 * @param sizeBytes - File size in bytes
 * @returns true if size is within the allowed limit
 *
 * @example
 * ```typescript
 * // NOTE(review): SIZE_LIMITS is not declared in this file; presumably it
 * // is exported by a shared configuration module - confirm the import path.
 * if (!validateWordSize(file.size)) {
 *   throw new Error(`File exceeds ${SIZE_LIMITS.WORD_MAX_MB}MB limit`);
 * }
 * ```
 */
export declare function validateWordSize(sizeBytes: number): boolean;
/**
 * Process a single Word document.
 *
 * Convenience function that uses the singleton wordProcessor instance.
 *
 * @param fileInfo - File information with URL or buffer
 * @param options - Optional processing options (auth headers, timeout, retry config)
 * @returns Promise resolving to the processing result with extracted text, HTML, and warnings
 *
 * @example
 * ```typescript
 * const result = await processWord({
 *   id: "doc-123",
 *   name: "report.docx",
 *   mimetype: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
 *   size: 12345,
 *   url: "https://example.com/files/report.docx",
 * }, {
 *   authHeaders: { Authorization: "Bearer token" },
 * });
 *
 * if (result.success) {
 *   console.log("Text content:", result.data.textContent);
 *   console.log("HTML content:", result.data.htmlContent);
 *   if (result.data.warnings.length > 0) {
 *     console.warn("Warnings:", result.data.warnings);
 *   }
 * } else {
 *   console.error("Failed:", result.error.userMessage);
 * }
 * ```
 */
export declare function processWord(fileInfo: FileInfo, options?: ProcessOptions): Promise<ProcessorFileProcessingResult<ProcessedWord>>;