UNPKG

uns-mcp-server

Version:

Pure JavaScript MCP server for Unstructured.io — no Python required.

208 lines (180 loc) 5.94 kB
/**
 * Invoice Processing Example
 * Extract data from invoices stored in cloud storage.
 */

/**
 * End-to-end example: register source/destination connectors, create a
 * scheduled extraction workflow, run it, and poll the job until it reaches
 * a terminal state.
 */
async function processInvoices() {
  // Step 1: Register one source connector per storage system. Connectors
  // are created sequentially so a configuration error surfaces with the
  // connector that caused it.
  const sources = {};

  sources.s3 = await create_source_connector({
    name: "invoice-s3-source",
    type: "s3",
    config: {
      bucket: "company-invoices",
      prefix: "2024/",
      aws_key: process.env.AWS_KEY,
      aws_secret: process.env.AWS_SECRET,
    },
  });

  sources.azure = await create_source_connector({
    name: "invoice-azure-source",
    type: "azure",
    config: {
      container: "financial-docs",
      connection_string: process.env.AZURE_CONNECTION_STRING,
    },
  });

  sources.gdrive = await create_source_connector({
    name: "invoice-gdrive-source",
    type: "googledrive",
    config: {
      folder_id: "1234567890",
      service_account_key: process.env.GOOGLEDRIVE_SERVICE_ACCOUNT_KEY,
    },
  });

  // Step 2: Destination for the processed, structured output.
  const destination = await create_destination_connector({
    name: "invoice-mongodb",
    type: "mongodb",
    config: {
      connection_string: process.env.MONGO_DB_CONNECTION_STRING,
      database: "finance",
      collection: "invoices",
    },
  });

  // Step 3: Processing workflow with advanced extraction options.
  // Only the S3 source feeds this pipeline; the other connectors above
  // illustrate additional source types.
  const workflow = await create_workflow({
    name: "invoice-extraction-pipeline",
    source: sources.s3.id,
    destination: destination.id,
    schedule: "0 9 * * *", // Daily at 9 AM
    settings: {
      strategy: "auto",
      ocr_enabled: true, // Enable OCR for scanned invoices
      pdf_infer_table_structure: true,
      extract_metadata: true,
      chunking_strategy: "by_page",
      max_characters: 1500,
      // Custom extraction rules
      extraction_config: {
        extract_tables: true,
        extract_forms: true,
        extract_amounts: true,
        extract_dates: true,
        extract_entities: ["ORGANIZATION", "PERSON", "MONEY", "DATE"],
        // Field mapping for structured output
        field_mapping: {
          "invoice_number": "metadata.invoice_id",
          "vendor": "entities.organization[0]",
          "amount": "extracted.total_amount",
          "date": "extracted.invoice_date",
          "line_items": "tables[0].rows",
        },
      },
    },
  });

  // Step 4: Kick off the workflow.
  const job = await run_workflow(workflow.id);
  console.log(`Processing job started: ${job.id}`);

  // Step 5: Poll the job every 5 seconds until it leaves the "running" state.
  let status = await get_job_info(job.id);
  while (status.state === "running") {
    console.log(`Progress: ${status.progress}% - ${status.current_step}`);
    await new Promise((resolve) => setTimeout(resolve, 5000)); // Wait 5 seconds
    status = await get_job_info(job.id);
  }

  if (status.state !== "completed") {
    console.error("❌ Processing failed:", status.error);
  } else {
    console.log("✅ Invoice processing completed successfully!");
    console.log(`Processed ${status.documents_processed} invoices`);
    console.log(`Extracted ${status.entities_extracted} entities`);
    console.log(`Generated ${status.tables_extracted} tables`);
  }

  // Step 6: Query processed data from MongoDB (done with a MongoDB client).
  // Example structure of a processed invoice document:
  /*
  {
    "_id": "doc_12345",
    "source_file": "invoice_2024_001.pdf",
    "invoice_number": "INV-2024-001",
    "vendor": "Acme Corp",
    "amount": 1500.00,
    "date": "2024-01-15",
    "line_items": [
      { "description": "Consulting Services", "amount": 1000.00 },
      { "description": "Travel Expenses", "amount": 500.00 }
    ],
    "metadata": {
      "processed_at": "2024-01-20T10:30:00Z",
      "confidence_score": 0.95,
      "ocr_applied": false,
      "page_count": 2
    }
  }
  */
}

/**
 * Example: batch processing — one workflow per quarter, created and run
 * in parallel.
 */
async function batchProcessInvoices() {
  const quarters = ["q1-invoices", "q2-invoices", "q3-invoices", "q4-invoices"];

  // Create the per-quarter workflows concurrently.
  const workflows = await Promise.all(
    quarters.map((quarter) =>
      create_workflow({
        name: `process-${quarter}`,
        source: quarter,
        destination: "invoice-warehouse",
        settings: {
          parallel_processing: true,
          max_workers: 4,
          batch_size: 50,
        },
      })
    )
  );

  // Launch every workflow at once.
  const jobs = await Promise.all(workflows.map((workflow) => run_workflow(workflow.id)));
  console.log(`Started ${jobs.length} parallel processing jobs`);
}

// Example: Error handling and retry
// Example: Error handling and retry logic

/**
 * Run a workflow, retrying when the job fails or an error is thrown.
 *
 * Fixes over the naive version: no 10-second sleep after the FINAL attempt
 * (previously the caller waited a full backoff period just to receive the
 * failure), and the catch path now backs off like the failed-result path
 * instead of retrying immediately.
 *
 * @param {string} workflowId - ID of the workflow to run.
 * @param {number} [maxRetries=3] - Maximum number of attempts.
 * @returns {Promise<object>} The job status once it completes successfully.
 * @throws {Error} When no attempt completes within maxRetries tries.
 */
async function processWithRetry(workflowId, maxRetries = 3) {
  let attempt = 0;
  while (attempt < maxRetries) {
    try {
      const job = await run_workflow(workflowId);
      const result = await waitForCompletion(job.id);
      if (result.state === "completed") {
        return result;
      }
      // Job reached a terminal state other than "completed".
      attempt++;
      console.log(`Attempt ${attempt} failed. Retrying...`);
    } catch (error) {
      console.error(`Error on attempt ${attempt}:`, error);
      attempt++;
    }
    // Back off before the next attempt — but only if one remains.
    if (attempt < maxRetries) {
      await new Promise((resolve) => setTimeout(resolve, 10000));
    }
  }
  throw new Error(`Failed after ${maxRetries} attempts`);
}

/**
 * Poll a job until it reaches a terminal state or the timeout elapses.
 *
 * @param {string} jobId - ID of the job to monitor.
 * @param {number} [timeout=300000] - Maximum wait in milliseconds (5 min).
 * @returns {Promise<object>} Terminal job status ("completed" or "failed").
 * @throws {Error} "Job timeout exceeded" when the deadline passes first.
 */
async function waitForCompletion(jobId, timeout = 300000) {
  const startTime = Date.now();
  while (Date.now() - startTime < timeout) {
    const status = await get_job_info(jobId);
    if (status.state === "completed" || status.state === "failed") {
      return status;
    }
    // Poll every 2 seconds.
    await new Promise((resolve) => setTimeout(resolve, 2000));
  }
  throw new Error("Job timeout exceeded");
}

// Guarded so the file does not crash if evaluated outside CommonJS
// (e.g. by ESM-aware tooling); under CommonJS the exports are unchanged.
if (typeof module !== "undefined" && module.exports) {
  module.exports = { processInvoices, batchProcessInvoices, processWithRetry };
}