UNPKG

@xynehq/jaf

Version:

Juspay Agent Framework - A purely functional agent framework with immutable state and composable tools

265 lines 9.93 kB
import * as XLSX from 'xlsx'; import mammoth from 'mammoth'; import Papa from 'papaparse'; import yauzl from 'yauzl'; const FETCH_TIMEOUT = 30000; const MAX_DOCUMENT_SIZE = 25 * 1024 * 1024; const MAX_CSV_PREVIEW_ROWS = 10; const MAX_EXCEL_SHEETS = 3; const MAX_EXCEL_ROWS_PER_SHEET = 20; class DocumentProcessingError extends Error { cause; constructor(message, cause) { super(message); this.cause = cause; this.name = 'DocumentProcessingError'; } } class NetworkError extends Error { statusCode; constructor(message, statusCode) { super(message); this.statusCode = statusCode; this.name = 'NetworkError'; } } /** * Fetch content from URL and return as buffer */ async function fetchUrlContent(url) { try { const response = await fetch(url, { method: 'GET', headers: { 'User-Agent': 'JAF-DocumentProcessor/1.0' }, // 30 second timeout for large files signal: AbortSignal.timeout(FETCH_TIMEOUT) }); if (!response.ok) { throw new NetworkError(`HTTP ${response.status}: ${response.statusText}`, response.status); } const arrayBuffer = await response.arrayBuffer(); const buffer = Buffer.from(arrayBuffer); const contentType = response.headers.get('content-type') || undefined; // Basic size check (25MB limit) if (buffer.length > MAX_DOCUMENT_SIZE) { throw new DocumentProcessingError(`File size (${Math.round(buffer.length / 1024 / 1024)}MB) exceeds maximum allowed size (${Math.round(MAX_DOCUMENT_SIZE / 1024 / 1024)}MB)`); } return { buffer, contentType }; } catch (error) { if (error instanceof Error) { throw new NetworkError(`Failed to fetch URL content: ${error.message}`); } throw new NetworkError('Failed to fetch URL content: Unknown error'); } } /** * Extract text content from various document formats */ export async function extractDocumentContent(attachment) { let buffer; let mimeType = attachment.mimeType?.toLowerCase(); // Handle URL-based attachments if (attachment.url && !attachment.data) { const urlData = await fetchUrlContent(attachment.url); buffer = urlData.buffer; // Use content type from response if mimeType wasn't provided if (!mimeType && urlData.contentType) { mimeType = urlData.contentType.toLowerCase(); } } // Handle base64 data attachments else if (attachment.data) { buffer = Buffer.from(attachment.data, 'base64'); } // Error if neither URL nor data provided else { throw new DocumentProcessingError('No document data or URL provided'); } switch (mimeType) { case 'application/pdf': throw new DocumentProcessingError('PDF processing is not supported'); case 'text/plain': case 'text/csv': return extractTextContent(buffer, mimeType); case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': case 'application/vnd.ms-excel': return extractExcelContent(buffer); case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': return await extractDocxContent(buffer); case 'application/json': return extractJsonContent(buffer); case 'application/zip': return await extractZipContent(buffer); default: // Fallback: try to extract as text return extractTextContent(buffer, 'text/plain'); } } function extractTextContent(buffer, mimeType) { const content = buffer.toString('utf-8').trim(); if (mimeType === 'text/csv') { // Parse CSV to provide structured overview try { const parsed = Papa.parse(content, { header: true, skipEmptyLines: true }); const rows = parsed.data.length; const columns = parsed.meta.fields?.length || 0; return { content: `CSV File Content:\nRows: ${rows}, Columns: ${columns}\nColumns: ${parsed.meta.fields?.join(', ') || 'N/A'}\n\nFirst few rows:\n${content.split('\n').slice(0, MAX_CSV_PREVIEW_ROWS).join('\n')}`, metadata: { rows, columns, fields: parsed.meta.fields } }; } catch (error) { // Fallback to raw text if CSV parsing fails return { content }; } } return { content }; } function extractExcelContent(buffer) { try { const workbook = XLSX.read(buffer, { type: 'buffer' }); const sheetNames = workbook.SheetNames; let content = `Excel File Content:\nSheets: ${sheetNames.join(', ')}\n\n`; // Extract content from each sheet sheetNames.forEach((sheetName, index) => { if (index < MAX_EXCEL_SHEETS) { // Limit to first 3 sheets to avoid overwhelming const worksheet = workbook.Sheets[sheetName]; const csvContent = XLSX.utils.sheet_to_csv(worksheet); content += `Sheet: ${sheetName}\n`; content += csvContent.split('\n').slice(0, MAX_EXCEL_ROWS_PER_SHEET).join('\n'); // First 20 rows content += '\n\n'; } }); return { content: content.trim(), metadata: { sheets: sheetNames } }; } catch (error) { throw new DocumentProcessingError(`Failed to extract Excel content: ${error instanceof Error ? error.message : 'Unknown error'}`, error); } } async function extractDocxContent(buffer) { try { const result = await mammoth.extractRawText({ buffer }); return { content: result.value.trim(), metadata: { messages: result.messages.length > 0 ? result.messages : undefined } }; } catch (error) { throw new DocumentProcessingError(`Failed to extract DOCX content: ${error instanceof Error ? error.message : 'Unknown error'}`, error); } } function extractJsonContent(buffer) { try { const jsonStr = buffer.toString('utf-8'); const jsonObj = JSON.parse(jsonStr); // Pretty print JSON with some metadata const content = `JSON File Content:\n${JSON.stringify(jsonObj, null, 2)}`; return { content, metadata: { keys: typeof jsonObj === 'object' && jsonObj !== null ? Object.keys(jsonObj) : [], type: Array.isArray(jsonObj) ? 'array' : typeof jsonObj } }; } catch (error) { // Fallback to raw text if JSON parsing fails return { content: buffer.toString('utf-8').trim() }; } } async function extractZipContent(buffer) { return new Promise((resolve, reject) => { yauzl.fromBuffer(buffer, { lazyEntries: true }, (err, zipfile) => { if (err) { reject(new DocumentProcessingError(`Failed to read ZIP file: ${err.message}`, err)); return; } if (!zipfile) { reject(new DocumentProcessingError('Failed to read ZIP file: No zipfile')); return; } const files = []; let content = 'ZIP File Contents:\n\n'; zipfile.readEntry(); zipfile.on('entry', (entry) => { files.push(entry.fileName); if (entry.fileName.endsWith('/')) { content += `DIR: ${entry.fileName}\n`; } else { const size = entry.uncompressedSize; content += `FILE: ${entry.fileName} (${size} bytes)\n`; } zipfile.readEntry(); }); zipfile.on('end', () => { resolve({ content: content.trim(), metadata: { files, totalFiles: files.length } }); }); zipfile.on('error', (error) => { reject(new DocumentProcessingError(`Failed to process ZIP file: ${error.message}`, error)); }); }); }); } /** * Check if a MIME type is supported for content extraction */ export function isDocumentSupported(mimeType) { if (!mimeType) return false; const supportedTypes = [ 'text/plain', 'text/csv', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'application/vnd.ms-excel', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'application/json', 'application/zip' ]; return supportedTypes.includes(mimeType.toLowerCase()); } /** * Get a human-readable description of what content will be extracted */ export function getDocumentDescription(mimeType) { switch (mimeType?.toLowerCase()) { case 'application/pdf': return 'PDF processing not supported'; case 'text/plain': return 'plain text content'; case 'text/csv': return 'CSV data structure and sample rows'; case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': case 'application/vnd.ms-excel': return 'Excel spreadsheet data'; case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': return 'Word document text content'; case 'application/json': return 'JSON data structure'; case 'application/zip': return 'ZIP file listing'; default: return 'document content'; } } //# sourceMappingURL=document-processor.js.map