UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications.

611 lines (610 loc) 22.5 kB
/**
 * Excel Processor
 *
 * Handles downloading, validating, and processing Excel files.
 * Uses the exceljs library; the whole file buffer is loaded into memory and
 * parsed with `workbook.xlsx.load`.
 *
 * Key features:
 * - Accepts the .xlsx / .xls MIME types and extensions at the type-check stage
 *   (content that is not a ZIP/XLSX container is rejected during validation)
 * - Extracts worksheet data with headers
 * - Handles complex cell types (formulas, rich text, dates, errors, hyperlinks)
 * - Respects configurable row and sheet limits
 * - Provides truncation metadata when limits are exceeded
 *
 * @module processors/document/ExcelProcessor
 *
 * @example
 * ```typescript
 * import { excelProcessor, processExcel, isExcelFile } from "./ExcelProcessor.js";
 *
 * // Check if a file is an Excel file
 * if (isExcelFile(fileInfo.mimetype, fileInfo.name)) {
 *   // Process the Excel file
 *   const result = await processExcel(fileInfo, {
 *     authHeaders: { Authorization: "Bearer token" },
 *   });
 *
 *   if (result.success) {
 *     console.log(`Processed ${result.data.sheetCount} sheets`);
 *     console.log(`Total rows: ${result.data.totalRows}`);
 *
 *     for (const sheet of result.data.worksheets) {
 *       console.log(`Sheet: ${sheet.name}, Rows: ${sheet.rowCount}`);
 *     }
 *   }
 * }
 * ```
 */
import { BaseFileProcessor } from "../base/BaseFileProcessor.js";
import { SIZE_LIMITS } from "../config/index.js";
import { FileErrorCode } from "../errors/index.js";

/** Cached exceljs module object so the dynamic import is resolved only once. */
let _exceljs = null;

/**
 * Lazily load the optional "exceljs" dependency.
 *
 * @returns The object exposing `Workbook` (module namespace or its default export)
 * @throws Error with install instructions when the package is not installed;
 *         any other import failure is rethrown unchanged
 */
async function loadExcelJS() {
  if (_exceljs) {
    return _exceljs;
  }
  try {
    const mod = await import(/* @vite-ignore */ "exceljs");
    // exceljs is published as CommonJS. Depending on the loader/bundler the
    // `Workbook` constructor may only be reachable through the default export,
    // so prefer whichever object actually exposes it.
    const candidate = typeof mod.Workbook === "function" ? mod : mod.default;
    _exceljs = typeof candidate?.Workbook === "function" ? candidate : mod;
    return _exceljs;
  } catch (err) {
    const e = err instanceof Error ? err : null;
    if (e?.code === "ERR_MODULE_NOT_FOUND" && e.message.includes("exceljs")) {
      throw new Error(
        'Excel file processing requires the "exceljs" package. Install it with:\n pnpm add exceljs',
        { cause: err },
      );
    }
    throw err;
  }
}

// =============================================================================
// CONSTANTS
// =============================================================================

/** Supported MIME types for Excel files */
const SUPPORTED_EXCEL_TYPES = [
  "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", // .xlsx
  "application/vnd.ms-excel", // .xls
];

/** Supported file extensions for Excel files */
const SUPPORTED_EXCEL_EXTENSIONS = [".xlsx", ".xls"];

/** MIME type reported when the caller did not supply one (the .xlsx type). */
const DEFAULT_EXCEL_MIME_TYPE = SUPPORTED_EXCEL_TYPES[0];

/**
 * Convert a spreadsheet column label ("A", "Z", "AA", ...) to its 1-based index.
 *
 * @param col - Column letters, case-insensitive
 * @returns 1-based column number
 */
function columnLetterToIndex(col) {
  const upper = col.toUpperCase();
  let index = 0;
  for (let i = 0; i < upper.length; i++) {
    // "A" is char code 65, so subtracting 64 maps A..Z to 1..26 (base-26 digits).
    index = index * 26 + upper.charCodeAt(i) - 64;
  }
  return index;
}

// =============================================================================
// EXCEL PROCESSOR CLASS
// =============================================================================

/**
 * Excel Processor - handles Excel workbook files.
 * Uses the exceljs library for parsing.
 *
 * Features:
 * - ZIP format validation (XLSX files are ZIP archives)
 * - Sheet count limiting (SIZE_LIMITS.EXCEL_MAX_SHEETS)
 * - Row count limiting per sheet (SIZE_LIMITS.EXCEL_MAX_ROWS)
 * - Cell type handling (text, numbers, formulas, dates, rich text)
 *
 * @example
 * ```typescript
 * const processor = new ExcelProcessor();
 *
 * // Process a file
 * const result = await processor.processFile(fileInfo, {
 *   authHeaders: { Authorization: "Bearer token" },
 * });
 *
 * if (result.success) {
 *   console.log(`Sheets: ${result.data.sheetCount}`);
 *   console.log(`Truncated: ${result.data.truncated}`);
 * }
 * ```
 */
export class ExcelProcessor extends BaseFileProcessor {
  constructor() {
    super({
      maxSizeMB: SIZE_LIMITS.EXCEL_MAX_MB,
      timeoutMs: 60000, // Excel parsing can take longer than text files
      supportedMimeTypes: [...SUPPORTED_EXCEL_TYPES],
      supportedExtensions: [...SUPPORTED_EXCEL_EXTENSIONS],
      fileTypeName: "Excel",
      defaultFilename: "spreadsheet.xlsx",
    });
  }

  // ===========================================================================
  // VALIDATION
  // ===========================================================================

  /**
   * Validate that the downloaded content looks like an XLSX file.
   * XLSX files are ZIP archives starting with the "PK" signature.
   *
   * @param buffer - Downloaded file content
   * @param _fileInfo - Original file information (unused but required by interface)
   * @returns null if valid, error message if invalid
   */
  async validateDownloadedFile(buffer, _fileInfo) {
    // Check minimum size
    if (buffer.length < 4) {
      return "Invalid Excel file - file too small";
    }

    // XLSX files are ZIP archives (PK signature: 0x50 0x4B)
    const pkSignature = buffer.subarray(0, 2).toString("ascii");
    if (pkSignature === "PK") {
      return null;
    }

    // Legacy binary .xls workbooks are OLE2 compound files (D0 CF 11 E0 ...).
    // They never carry a PK signature, so give a targeted message instead of
    // the generic "missing PK signature" one.
    const isOle2 =
      buffer[0] === 0xd0 &&
      buffer[1] === 0xcf &&
      buffer[2] === 0x11 &&
      buffer[3] === 0xe0;
    if (isOle2) {
      return "Invalid Excel file - legacy binary .xls format is not supported (convert the file to .xlsx)";
    }

    // Provide helpful error for common issues (e.g. a login/error page was
    // downloaded instead of the file). Compared case-insensitively so that
    // "<!doctype html>" and "<HTML>" are recognised too.
    const preview = buffer
      .subarray(0, 100)
      .toString("utf8")
      .substring(0, 100)
      .toLowerCase();
    if (preview.includes("<!doctype") || preview.includes("<html")) {
      return "Invalid Excel file - received HTML response instead of file content";
    }
    return "Invalid Excel file - not a valid XLSX format (missing PK signature)";
  }

  // ===========================================================================
  // PROCESSING
  // ===========================================================================

  /**
   * Build processed result stub.
   * Note: This is a synchronous stub - actual parsing happens in the
   * processFile override.
   *
   * @param buffer - Downloaded file content
   * @param fileInfo - Original file information
   * @returns Empty ProcessedExcel structure (populated by processFile)
   */
  buildProcessedResult(buffer, fileInfo) {
    return {
      worksheets: [],
      buffer,
      mimetype: fileInfo.mimetype || DEFAULT_EXCEL_MIME_TYPE,
      size: fileInfo.size,
      filename: this.getFilename(fileInfo),
      sheetCount: 0,
      totalRows: 0,
      truncated: false,
      truncatedSheets: [],
    };
  }

  /**
   * Override processFile for async Excel parsing with exceljs.
   * This override is necessary because exceljs uses async parsing.
   *
   * Never throws: every failure is returned as `{ success: false, error }`.
   *
   * @param fileInfo - File information (can include URL or buffer)
   * @param options - Optional processing options (auth headers, timeout, etc.)
   * @returns Processing result with parsed Excel data or error
   */
  async processFile(fileInfo, options) {
    try {
      // Step 1: Validate file type and size
      const validationResult = this.validateFileWithResult(fileInfo);
      if (!validationResult.success) {
        return {
          success: false,
          error: validationResult.error,
        };
      }

      // Step 2: Get file buffer (from direct buffer or download from URL)
      let buffer;
      if (fileInfo.buffer) {
        buffer = fileInfo.buffer;
      } else if (fileInfo.url) {
        const downloadResult = await this.downloadFileWithRetry(fileInfo, options);
        if (!downloadResult.success) {
          return {
            success: false,
            error: downloadResult.error,
          };
        }
        if (!downloadResult.data) {
          return {
            success: false,
            error: this.createError(FileErrorCode.DOWNLOAD_FAILED, {
              reason: "Download succeeded but returned no data",
            }),
          };
        }
        buffer = downloadResult.data;
      } else {
        return {
          success: false,
          error: this.createError(FileErrorCode.DOWNLOAD_FAILED, {
            reason: "No buffer or URL provided for file",
          }),
        };
      }

      // Step 3: Validate downloaded file (magic bytes check)
      const postValidationResult = await this.validateDownloadedFileWithResult(buffer, fileInfo);
      if (!postValidationResult.success) {
        return {
          success: false,
          error: postValidationResult.error,
        };
      }

      // Step 4: Parse Excel file asynchronously using exceljs
      const workbook = await this.parseWorkbook(buffer);

      // Step 5: Extract worksheet data with limits
      const { worksheets, truncated, truncatedSheets } = this.extractWorksheets(workbook);

      // Calculate total rows across all worksheets
      const totalRows = worksheets.reduce((sum, sheet) => sum + sheet.rowCount, 0);

      // Build final result
      return {
        success: true,
        data: {
          worksheets,
          buffer,
          mimetype: fileInfo.mimetype || DEFAULT_EXCEL_MIME_TYPE,
          size: fileInfo.size,
          filename: this.getFilename(fileInfo),
          sheetCount: worksheets.length,
          totalRows,
          truncated,
          truncatedSheets,
        },
      };
    } catch (error) {
      return {
        success: false,
        error: this.createError(
          FileErrorCode.PROCESSING_FAILED,
          {
            fileType: "Excel",
            error: error instanceof Error ? error.message : String(error),
          },
          error instanceof Error ? error : undefined,
        ),
      };
    }
  }

  // ===========================================================================
  // PRIVATE HELPER METHODS
  // ===========================================================================

  /**
   * Parse Excel buffer into workbook using exceljs.
   *
   * @param buffer - Excel file content
   * @returns Parsed ExcelJS Workbook
   */
  async parseWorkbook(buffer) {
    const ExcelJS = await loadExcelJS();
    const workbook = new ExcelJS.Workbook();
    // A Buffer may be a view onto a larger ArrayBuffer, so copy out exactly the
    // bytes of this view (byteOffset .. byteOffset + byteLength) before loading.
    const bytes = buffer.buffer.slice(
      buffer.byteOffset,
      buffer.byteOffset + buffer.byteLength,
    );
    await workbook.xlsx.load(bytes);
    return workbook;
  }

  /**
   * Extract worksheet data from workbook with row and sheet limits.
   *
   * Every extracted row is a dense array: cells that are empty in the sheet
   * are represented as `null` (and as "" in `headers`).
   *
   * @param workbook - Parsed ExcelJS Workbook
   * @returns Extracted worksheets with truncation metadata
   */
  extractWorksheets(workbook) {
    const worksheets = [];
    const truncatedSheets = [];
    let truncated = false;
    const maxRows = SIZE_LIMITS.EXCEL_MAX_ROWS;
    const maxSheets = SIZE_LIMITS.EXCEL_MAX_SHEETS;

    for (const worksheet of workbook.worksheets) {
      // Check sheet limit
      if (worksheets.length >= maxSheets) {
        truncated = true;
        break;
      }

      const rows = [];
      let headers = [];
      let hitLimit = false;

      // eachRow offers no way to stop early, so once the limit is hit the
      // remaining callbacks return immediately.
      worksheet.eachRow((row, rowNumber) => {
        if (hitLimit) {
          return;
        }
        // Check row limit
        if (rows.length >= maxRows) {
          if (!truncatedSheets.includes(worksheet.name)) {
            truncatedSheets.push(worksheet.name);
          }
          truncated = true;
          hitLimit = true;
          return;
        }

        // row.values is 1-indexed (index 0 is unused) and sparse: empty cells
        // are holes. Array.from visits holes as undefined, which getCellValue
        // turns into null, so the resulting row has no holes. (A plain
        // slice().map() would keep the holes and skip them.)
        const cleanRow = Array.from(row.values.slice(1), (cell) =>
          this.getCellValue(cell),
        );

        // Extract headers from first row
        if (rowNumber === 1) {
          headers = cleanRow.map((v) => String(v ?? ""));
        }
        rows.push(cleanRow);
      });

      worksheets.push({
        name: worksheet.name,
        rows,
        headers,
        rowCount: rows.length,
        columnCount: headers.length || (rows[0]?.length ?? 0),
      });
    }

    return { worksheets, truncated, truncatedSheets };
  }

  /**
   * Convert an Excel cell value to a primitive type.
   * Handles various cell types including formulas, rich text, dates,
   * hyperlinks, and error values.
   *
   * @param cell - ExcelJS cell value (can be various types)
   * @returns Primitive value (string, number, boolean, or null)
   */
  getCellValue(cell) {
    if (cell === null || cell === undefined) {
      return null;
    }

    // Handle primitive types directly
    if (typeof cell === "string" || typeof cell === "number" || typeof cell === "boolean") {
      return cell;
    }

    // Handle Date objects; toISOString() throws on an invalid Date, which
    // would otherwise abort processing of the whole workbook.
    if (cell instanceof Date) {
      return Number.isNaN(cell.getTime()) ? null : cell.toISOString();
    }

    // Handle ExcelJS cell objects
    if (typeof cell === "object") {
      const cellObj = cell;

      // Formula result (prioritize result over formula string). Recursing
      // applies the same conversion rules to the result: Date -> ISO string,
      // { error } -> error text, null -> null.
      if ("result" in cellObj && cellObj.result !== undefined) {
        return this.getCellValue(cellObj.result);
      }

      // Error values like { error: '#VALUE!' }
      if ("error" in cellObj && cellObj.error !== undefined) {
        return String(cellObj.error);
      }

      // Rich text
      if ("richText" in cellObj && Array.isArray(cellObj.richText)) {
        return this.extractRichText(cellObj.richText);
      }

      // Text value (also the display text of a hyperlink). The text may itself
      // be a rich-text object, so convert it rather than returning it raw.
      if ("text" in cellObj && cellObj.text !== undefined) {
        return this.getCellValue(cellObj.text);
      }

      // Hyperlink without display text: return the URL
      if ("hyperlink" in cellObj && cellObj.hyperlink) {
        return cellObj.hyperlink;
      }

      // Formula that has no cached result: fall back to the formula string.
      if ("formula" in cellObj && cellObj.formula !== undefined) {
        return String(cellObj.formula);
      }

      // Shared-formula reference without a cached result: there is neither a
      // value nor a formula text to report.
      if ("sharedFormula" in cellObj) {
        return null;
      }
    }

    // Fallback: convert to string
    return String(cell);
  }

  /**
   * Extract text from rich text cell format.
   * Rich text cells contain an array of text fragments with formatting.
   *
   * @param richText - Array of rich text fragments
   * @returns Concatenated plain text ("" when the input is not an array)
   */
  extractRichText(richText) {
    if (!Array.isArray(richText)) {
      return "";
    }
    return richText
      .map((rt) => {
        if (typeof rt === "object" && rt !== null && "text" in rt) {
          return rt.text || "";
        }
        return "";
      })
      .join("");
  }

  // ===========================================================================
  // TARGETED EXTRACTION API
  // ===========================================================================

  /**
   * Extract a specific range from a spreadsheet.
   *
   * Called by the `extract_file_content` tool for targeted data access.
   * Returns TSV-formatted text for the specified sheet, row range, and columns.
   *
   * @param buffer - Excel file buffer
   * @param sheet - Sheet name or 0-based index (default: first sheet)
   * @param rowStart - Starting row (1-indexed, default: 1)
   * @param rowEnd - Ending row (1-indexed, default: all rows)
   * @param columns - Specific column letters to include (e.g., ["A", "B", "D"])
   * @returns TSV-formatted string with the extracted data, or a
   *          "Sheet not found" message listing the available sheet names
   */
  async extractSheetRange(buffer, sheet, rowStart = 1, rowEnd, columns) {
    const workbook = await this.parseWorkbook(buffer);

    // Resolve the target worksheet
    let worksheet;
    if (typeof sheet === "number") {
      // `workbook.worksheets` is a plain array, so the index is 0-based.
      worksheet = workbook.worksheets[sheet];
    } else if (typeof sheet === "string") {
      worksheet = workbook.getWorksheet(sheet);
    } else {
      worksheet = workbook.worksheets[0];
    }

    if (!worksheet) {
      const sheetNames = workbook.worksheets
        .map((ws) => ws.name)
        .join(", ");
      return `Sheet not found. Available sheets: ${sheetNames}`;
    }

    // Convert column letters to 1-based column indices if specified.
    // A Set gives constant-time membership checks in the per-cell callback.
    const wantedColumns = columns
      ? new Set(columns.map((col) => columnLetterToIndex(col)))
      : null;

    const lines = [];
    lines.push(`## Sheet: ${worksheet.name}`);

    const actualRowEnd = rowEnd ?? worksheet.rowCount;
    let rowCount = 0;

    worksheet.eachRow({ includeEmpty: false }, (row, rowNumber) => {
      if (rowNumber < rowStart || rowNumber > actualRowEnd) {
        return;
      }
      rowCount++;

      const values = [];
      row.eachCell({ includeEmpty: true }, (cell, colNumber) => {
        if (wantedColumns && !wantedColumns.has(colNumber)) {
          return;
        }
        const val = this.getCellValue(cell.value);
        values.push(val === null ? "" : String(val));
      });

      // Add row number prefix for easy reference
      lines.push(`${rowNumber}\t${values.join("\t")}`);
    });

    if (rowCount === 0) {
      lines.push(`(No data in rows ${rowStart}-${actualRowEnd})`);
    } else {
      lines.push(`\n(${rowCount} rows, range ${rowStart}-${actualRowEnd})`);
    }

    return lines.join("\n");
  }
}

// =============================================================================
// SINGLETON INSTANCE
// =============================================================================

/**
 * Singleton Excel processor instance.
 * Use this for standard Excel processing operations.
 *
 * @example
 * ```typescript
 * import { excelProcessor } from "./ExcelProcessor.js";
 *
 * const result = await excelProcessor.processFile(fileInfo);
 * ```
 */
export const excelProcessor = new ExcelProcessor();

// =============================================================================
// HELPER FUNCTIONS
// =============================================================================

/**
 * Check if a file is an Excel file.
 * Delegates to the singleton processor's `isFileSupported`.
 *
 * @param mimetype - MIME type of the file
 * @param filename - Filename (for extension-based detection)
 * @returns true if the file is an Excel file
 *
 * @example
 * ```typescript
 * if (isExcelFile("application/vnd.ms-excel", "data.xls")) {
 *   // Process as Excel
 * }
 *
 * if (isExcelFile("", "report.xlsx")) {
 *   // Also matches by extension
 * }
 * ```
 */
export function isExcelFile(mimetype, filename) {
  return excelProcessor.isFileSupported(mimetype, filename);
}

/**
 * Validate Excel file size against configured limit.
 *
 * @param sizeBytes - File size in bytes
 * @returns true if size is within the Excel file limit
 *
 * @example
 * ```typescript
 * if (!validateExcelSize(fileInfo.size)) {
 *   console.error(`File too large: max ${SIZE_LIMITS.EXCEL_MAX_MB}MB`);
 * }
 * ```
 */
export function validateExcelSize(sizeBytes) {
  const maxBytes = SIZE_LIMITS.EXCEL_MAX_MB * 1024 * 1024;
  return sizeBytes <= maxBytes;
}

/**
 * Process a single Excel file.
 * Convenience function that uses the singleton processor.
 *
 * @param fileInfo - File information (can include URL or buffer)
 * @param options - Optional processing options (auth headers, timeout, etc.)
 * @returns Processing result with parsed Excel data or error
 *
 * @example
 * ```typescript
 * import { processExcel } from "./ExcelProcessor.js";
 *
 * const result = await processExcel(fileInfo, {
 *   authHeaders: { Authorization: "Bearer token" },
 *   timeout: 120000, // 2 minutes for large files
 * });
 *
 * if (result.success) {
 *   const { worksheets, totalRows, truncated } = result.data;
 *   console.log(`Extracted ${totalRows} rows from ${worksheets.length} sheets`);
 *
 *   if (truncated) {
 *     console.warn("Some data was truncated due to size limits");
 *   }
 * } else {
 *   console.error(`Processing failed: ${result.error?.userMessage}`);
 * }
 * ```
 */
export async function processExcel(fileInfo, options) {
  return excelProcessor.processFile(fileInfo, options);
}

/**
 * Get Excel max size in MB.
 *
 * @returns Maximum Excel file size in megabytes (SIZE_LIMITS.EXCEL_MAX_MB)
 *
 * @example
 * ```typescript
 * const maxSize = getExcelMaxSizeMB();
 * console.log(`Maximum Excel file size: ${maxSize}MB`);
 * ```
 */
export function getExcelMaxSizeMB() {
  return SIZE_LIMITS.EXCEL_MAX_MB;
}

/**
 * Get Excel max rows per sheet.
 *
 * @returns Maximum rows to process per worksheet (SIZE_LIMITS.EXCEL_MAX_ROWS)
 *
 * @example
 * ```typescript
 * const maxRows = getExcelMaxRows();
 * console.log(`Maximum rows per sheet: ${maxRows}`);
 * ```
 */
export function getExcelMaxRows() {
  return SIZE_LIMITS.EXCEL_MAX_ROWS;
}

/**
 * Get Excel max sheets to process.
 *
 * @returns Maximum number of worksheets to process (SIZE_LIMITS.EXCEL_MAX_SHEETS)
 *
 * @example
 * ```typescript
 * const maxSheets = getExcelMaxSheets();
 * console.log(`Maximum sheets to process: ${maxSheets}`);
 * ```
 */
export function getExcelMaxSheets() {
  return SIZE_LIMITS.EXCEL_MAX_SHEETS;
}