@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications…
167 lines (166 loc) • 5.96 kB
TypeScript
/**
* Word Document Processing Utility
*
* Handles downloading, validating, and processing Word (.docx, .doc) files.
* Uses mammoth library to extract text and HTML content from Word documents.
*
* Features:
* - DOCX format validation via ZIP/PK signature check
* - Text extraction using mammoth.extractRawText()
* - HTML conversion using mammoth.convertToHtml()
* - Warning collection from mammoth processing
* - Support for both URL downloads and direct buffer input
*
* @module processors/document/WordProcessor
*
* @example
* ```typescript
* import { wordProcessor, processWord, isWordFile } from "./WordProcessor.js";
*
* // Check if file is supported
* if (isWordFile(fileInfo.mimetype, fileInfo.name)) {
*   const result = await processWord(fileInfo, {
*     authHeaders: { Authorization: "Bearer token" },
*   });
*
*   if (result.success) {
*     console.log("Text:", result.data.textContent);
*     console.log("HTML:", result.data.htmlContent);
*     console.log("Warnings:", result.data.warnings);
*   }
* }
* ```
*/
import { BaseFileProcessor } from "../base/BaseFileProcessor.js";
import type { FileInfo, ProcessorFileProcessingResult, ProcessOptions, ProcessedWord } from "../../types/index.js";
/**
* Word Processor - file processor specialised for Word documents
* (.docx and .doc), producing a `ProcessedWord` result.
*
* Uses the mammoth library for both text and HTML extraction. The processor
* validates DOCX files by checking for the ZIP/PK signature (since DOCX
* files are actually ZIP archives).
*
* NOTE(review): legacy binary .doc files are OLE compound documents and do
* not begin with the PK signature. This declaration does not show how .doc
* input is treated by the validation step - confirm against the
* implementation before relying on .doc support.
*
* `isFileSupported`, used in the example below, is not declared on this
* class; presumably it is inherited from `BaseFileProcessor` - verify there.
*
* @example
* ```typescript
* const processor = new WordProcessor();
*
* // Check if file is supported
* if (processor.isFileSupported("application/msword", "report.doc")) {
*   const result = await processor.processFile(fileInfo);
*   if (result.success) {
*     console.log("Extracted text:", result.data.textContent);
*   }
* }
* ```
*/
export declare class WordProcessor extends BaseFileProcessor<ProcessedWord> {
/**
* Create a Word processor. Takes no arguments; any configuration passed to
* the base class is not visible in this declaration.
*/
constructor();
/**
* Validate that a downloaded Word document has the correct magic bytes.
* DOCX files are ZIP archives starting with the PK signature (0x50 0x4B).
*
* @param buffer - Downloaded file content
* @param _fileInfo - Original file information (the underscore prefix
* suggests it is not used by this check - confirm in the implementation)
* @returns Promise resolving to null if valid, or an error message if invalid
*/
protected validateDownloadedFile(buffer: Buffer, _fileInfo: FileInfo): Promise<string | null>;
/**
* Build processed Word result with extracted text and HTML content.
*
* This is a stub that returns an empty (placeholder) result. Its return
* type is synchronous, whereas mammoth operations are asynchronous, so
* the actual processing happens in the overridden processFile method.
*
* @param buffer - Downloaded file content
* @param fileInfo - Original file information
* @returns Processed Word result (placeholder)
*/
protected buildProcessedResult(buffer: Buffer, fileInfo: FileInfo): ProcessedWord;
/**
* Override processFile for async mammoth extraction.
*
* The mammoth library's extractRawText and convertToHtml methods are
* asynchronous, so the entire processFile method is overridden
* rather than just buildProcessedResult.
*
* Processing steps:
* 1. Validate file type and size
* 2. Get buffer (download from URL or use provided buffer)
* 3. Validate downloaded file (check PK signature)
* 4. Extract text with mammoth.extractRawText()
* 5. Convert to HTML with mammoth.convertToHtml()
* 6. Collect any warnings from mammoth
* 7. Return structured result
*
* @param fileInfo - File information with URL or buffer
* @param options - Optional processing options
* @returns Promise resolving to the processing result with text, HTML,
* and warnings
*/
processFile(fileInfo: FileInfo, options?: ProcessOptions): Promise<ProcessorFileProcessingResult<ProcessedWord>>;
}
/**
* Singleton Word processor instance, exported as a shared `WordProcessor`.
* Use this for most use cases to avoid creating multiple instances.
*/
export declare const wordProcessor: WordProcessor;
/**
* Check if a file is a Word document (.docx or .doc).
*
* Both arguments are required by the signature.
*
* @param mimetype - MIME type of the file
* @param filename - Filename (for extension-based detection)
* @returns true if the file is a supported Word document
*
* @example
* ```typescript
* if (isWordFile(file.mimetype, file.name)) {
*   const result = await processWord(file);
* }
* ```
*/
export declare function isWordFile(mimetype: string, filename: string): boolean;
/**
* Validate Word document size against configured limit.
*
* Returns a boolean rather than throwing; the caller decides how to react
* to an oversized file (see the example).
*
* NOTE(review): the example references `SIZE_LIMITS.WORD_MAX_MB`, which is
* not declared or exported by this module - presumably that constant is the
* configured limit; confirm where it is defined and import it from there.
*
* @param sizeBytes - File size in bytes
* @returns true if size is within the allowed limit
*
* @example
* ```typescript
* if (!validateWordSize(file.size)) {
*   throw new Error(`File exceeds ${SIZE_LIMITS.WORD_MAX_MB}MB limit`);
* }
* ```
*/
export declare function validateWordSize(sizeBytes: number): boolean;
/**
* Process a single Word document.
*
* Convenience function that uses the singleton wordProcessor instance.
* As the example shows, the resolved result is checked via `success`:
* extracted content is read from `result.data`, and failure details from
* `result.error`.
*
* @param fileInfo - File information with URL or buffer
* @param options - Optional processing options (auth headers, timeout, retry config)
* @returns Promise resolving to the processing result with extracted text,
* HTML, and warnings
*
* @example
* ```typescript
* const result = await processWord({
*   id: "doc-123",
*   name: "report.docx",
*   mimetype: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
*   size: 12345,
*   url: "https://example.com/files/report.docx",
* }, {
*   authHeaders: { Authorization: "Bearer token" },
* });
*
* if (result.success) {
*   console.log("Text content:", result.data.textContent);
*   console.log("HTML content:", result.data.htmlContent);
*   if (result.data.warnings.length > 0) {
*     console.warn("Warnings:", result.data.warnings);
*   }
* } else {
*   console.error("Failed:", result.error.userMessage);
* }
* ```
*/
export declare function processWord(fileInfo: FileInfo, options?: ProcessOptions): Promise<ProcessorFileProcessingResult<ProcessedWord>>;