@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applicatio
373 lines • 14 kB
JavaScript
/**
* Word Document Processing Utility
*
* Handles downloading, validating, and processing Word (.docx, .doc) files.
* Uses mammoth library to extract text and HTML content from Word documents.
*
* Features:
* - DOCX format validation via ZIP/PK signature check
* - Text extraction using mammoth.extractRawText()
* - HTML conversion using mammoth.convertToHtml()
* - Warning collection from mammoth processing
* - Support for both URL downloads and direct buffer input
*
* @module processors/document/WordProcessor
*
* @example
* ```typescript
* import { wordProcessor, processWord, isWordFile } from "./WordProcessor.js";
*
* // Check if file is supported
* if (isWordFile(file.mimetype, file.name)) {
* const result = await processWord(fileInfo, {
* authHeaders: { Authorization: "Bearer token" },
* });
*
* if (result.success) {
* console.log("Text:", result.data.textContent);
* console.log("HTML:", result.data.htmlContent);
* console.log("Warnings:", result.data.warnings);
* }
* }
* ```
*/
import { BaseFileProcessor } from "../base/BaseFileProcessor.js";
import { SIZE_LIMITS } from "../config/index.js";
import { FileErrorCode } from "../errors/index.js";
let _mammoth = null;
async function loadMammoth() {
if (_mammoth) {
return _mammoth;
}
try {
_mammoth = await import(/* @vite-ignore */ "mammoth");
return _mammoth;
}
catch (err) {
const e = err instanceof Error ? err : null;
if (e?.code === "ERR_MODULE_NOT_FOUND" && e.message.includes("mammoth")) {
throw new Error('Word document processing requires the "mammoth" package. Install it with:\n pnpm add mammoth', { cause: err });
}
throw err;
}
}
// Re-export for consumers who import from this module
// Import for local use
// =============================================================================
// CONSTANTS
// =============================================================================
/**
* Supported MIME types for Word documents
*/
const SUPPORTED_WORD_MIME_TYPES = [
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/msword",
];
/**
* Supported file extensions for Word documents
*/
const SUPPORTED_WORD_EXTENSIONS = [".docx", ".doc"];
/**
* Default timeout for Word processing (60 seconds)
* Word documents can be larger due to embedded images and complex formatting
*/
const WORD_TIMEOUT_MS = 60000;
// =============================================================================
// WORD PROCESSOR CLASS
// =============================================================================
/**
* Word Processor - handles .docx and .doc files
*
* Uses mammoth library for both text and HTML extraction. The processor
* validates DOCX files by checking for the ZIP/PK signature (since DOCX
* files are actually ZIP archives).
*
* @example
* ```typescript
* const processor = new WordProcessor();
*
* // Check if file is supported
* if (processor.isFileSupported("application/msword", "report.doc")) {
* const result = await processor.processFile(fileInfo);
* if (result.success) {
* console.log("Extracted text:", result.data.textContent);
* }
* }
* ```
*/
export class WordProcessor extends BaseFileProcessor {
constructor() {
super({
maxSizeMB: SIZE_LIMITS.WORD_MAX_MB,
timeoutMs: WORD_TIMEOUT_MS,
supportedMimeTypes: SUPPORTED_WORD_MIME_TYPES,
supportedExtensions: SUPPORTED_WORD_EXTENSIONS,
fileTypeName: "Word",
defaultFilename: "document.docx",
});
}
/**
* Validate downloaded Word document has correct magic bytes.
* DOCX files are ZIP archives starting with PK signature (0x50 0x4B).
*
* @param buffer - Downloaded file content
* @param fileInfo - Original file information
* @returns null if valid, error message if invalid
*/
async validateDownloadedFile(buffer, _fileInfo) {
// Minimum size check
if (buffer.length < 4) {
return "Invalid Word document - file too small";
}
// DOCX files are ZIP archives (PK signature: 0x50 0x4B)
const pkSignature = buffer.subarray(0, 2).toString("ascii");
if (pkSignature !== "PK") {
// Log what we actually received to help debug
const preview = buffer
.subarray(0, 100)
.toString("utf8")
.substring(0, 100);
const looksLikeHtml = preview.includes("<!DOCTYPE") || preview.includes("<html");
// Provide more specific error message
if (looksLikeHtml) {
return "Invalid Word document - received HTML response instead of file content (possibly an error page)";
}
return "Invalid Word document - not a valid DOCX format (expected ZIP/PK signature)";
}
return null;
}
/**
* Build processed Word result with extracted text and HTML content.
* This is a stub that returns an empty result - actual processing
* happens in the overridden processFile method since mammoth
* operations are asynchronous.
*
* @param buffer - Downloaded file content
* @param fileInfo - Original file information
* @returns Processed Word result (placeholder)
*/
buildProcessedResult(buffer, fileInfo) {
// Note: This is a synchronous placeholder since buildProcessedResult is sync
// The actual mammoth extraction happens in the overridden processFile method
return {
textContent: "",
htmlContent: "",
warnings: [],
buffer,
mimetype: fileInfo.mimetype ||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
size: fileInfo.size,
filename: this.getFilename(fileInfo),
};
}
/**
* Override processFile for async mammoth extraction.
*
* The mammoth library's extractRawText and convertToHtml methods are
* asynchronous, so we need to override the entire processFile method
* rather than just buildProcessedResult.
*
* Processing steps:
* 1. Validate file type and size
* 2. Get buffer (download from URL or use provided buffer)
* 3. Validate downloaded file (check PK signature)
* 4. Extract text with mammoth.extractRawText()
* 5. Convert to HTML with mammoth.convertToHtml()
* 6. Collect any warnings from mammoth
* 7. Return structured result
*
* @param fileInfo - File information with URL or buffer
* @param options - Optional processing options
* @returns Processing result with text, HTML, and warnings
*/
async processFile(fileInfo, options) {
try {
// Step 1: Validate file type and size
const validationResult = this.validateFileWithResult(fileInfo);
if (!validationResult.success) {
return {
success: false,
error: validationResult.error,
};
}
// Step 2: Get file buffer (from direct buffer or download from URL)
let buffer;
if (fileInfo.buffer) {
// Direct buffer provided - skip download
buffer = fileInfo.buffer;
}
else if (fileInfo.url) {
// Download from URL
const downloadResult = await this.downloadFileWithRetry(fileInfo, options);
if (!downloadResult.success) {
return {
success: false,
error: downloadResult.error,
};
}
if (!downloadResult.data) {
return {
success: false,
error: this.createError(FileErrorCode.DOWNLOAD_FAILED, {
reason: "Download succeeded but returned no data",
}),
};
}
buffer = downloadResult.data;
}
else {
// No buffer or URL provided
return {
success: false,
error: this.createError(FileErrorCode.DOWNLOAD_FAILED, {
reason: "No buffer or URL provided for file",
}),
};
}
// Step 3: Validate downloaded file (check magic bytes)
const postValidationError = await this.validateDownloadedFile(buffer, fileInfo);
if (postValidationError) {
return {
success: false,
error: this.createError(FileErrorCode.INVALID_FORMAT, {
reason: postValidationError,
}),
};
}
// Step 4 & 5: Extract text and HTML content using mammoth
let textContent = "";
let htmlContent = "";
const warnings = [];
try {
const mammoth = await loadMammoth();
// Extract plain text
const textResult = await mammoth.extractRawText({ buffer });
textContent = textResult.value;
// Collect warnings from text extraction
if (textResult.messages && textResult.messages.length > 0) {
warnings.push(...textResult.messages.map((m) => `[text] ${m.message}`));
}
// Convert to HTML for richer formatting
const htmlResult = await mammoth.convertToHtml({ buffer });
htmlContent = htmlResult.value;
// Collect warnings from HTML conversion
if (htmlResult.messages && htmlResult.messages.length > 0) {
warnings.push(...htmlResult.messages.map((m) => `[html] ${m.message}`));
}
}
catch (extractError) {
return {
success: false,
error: this.createError(FileErrorCode.PROCESSING_FAILED, {
reason: "Failed to extract Word document content",
fileType: "Word",
}, extractError instanceof Error ? extractError : undefined),
};
}
// Step 6: Return structured result
return {
success: true,
data: {
buffer,
mimetype: fileInfo.mimetype ||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
size: fileInfo.size,
filename: this.getFilename(fileInfo),
textContent,
htmlContent,
warnings,
},
};
}
catch (error) {
// Catch any unexpected errors
return {
success: false,
error: this.createError(FileErrorCode.UNKNOWN_ERROR, {
error: error instanceof Error ? error.message : String(error),
}, error instanceof Error ? error : undefined),
};
}
}
}
// =============================================================================
// SINGLETON INSTANCE
// =============================================================================
/**
* Singleton Word processor instance.
* Use this for most use cases to avoid creating multiple instances.
*/
export const wordProcessor = new WordProcessor();
// =============================================================================
// HELPER FUNCTIONS
// =============================================================================
/**
* Check if a file is a Word document (.docx or .doc).
*
* @param mimetype - MIME type of the file
* @param filename - Filename (for extension-based detection)
* @returns true if the file is a supported Word document
*
* @example
* ```typescript
* if (isWordFile(file.mimetype, file.name)) {
* const result = await processWord(file);
* }
* ```
*/
export function isWordFile(mimetype, filename) {
return wordProcessor.isFileSupported(mimetype, filename);
}
/**
* Validate Word document size against configured limit.
*
* @param sizeBytes - File size in bytes
* @returns true if size is within the allowed limit
*
* @example
* ```typescript
* if (!validateWordSize(file.size)) {
* throw new Error(`File exceeds ${SIZE_LIMITS.WORD_MAX_MB}MB limit`);
* }
* ```
*/
export function validateWordSize(sizeBytes) {
const maxBytes = SIZE_LIMITS.WORD_MAX_MB * 1024 * 1024;
return sizeBytes <= maxBytes;
}
/**
* Process a single Word document.
*
* Convenience function that uses the singleton wordProcessor instance.
*
* @param fileInfo - File information with URL or buffer
* @param options - Optional processing options (auth headers, timeout, retry config)
* @returns Processing result with extracted text, HTML, and warnings
*
* @example
* ```typescript
* const result = await processWord({
* id: "doc-123",
* name: "report.docx",
* mimetype: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
* size: 12345,
* url: "https://example.com/files/report.docx",
* }, {
* authHeaders: { Authorization: "Bearer token" },
* });
*
* if (result.success) {
* console.log("Text content:", result.data.textContent);
* console.log("HTML content:", result.data.htmlContent);
* if (result.data.warnings.length > 0) {
* console.warn("Warnings:", result.data.warnings);
* }
* } else {
* console.error("Failed:", result.error.userMessage);
* }
* ```
*/
export async function processWord(fileInfo, options) {
return wordProcessor.processFile(fileInfo, options);
}
//# sourceMappingURL=WordProcessor.js.map