uns-mcp-server
Pure JavaScript MCP server for Unstructured.io - No Python required!
/**
* Invoice Processing Example
* Extract data from invoices stored in cloud storage
*/
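// NOTE: create_source_connector, create_destination_connector, create_workflow,
// run_workflow, and get_job_info below refer to the uns-mcp-server tools; this
// example assumes they are bound as async functions in scope via your MCP client.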
// Example workflow for processing invoices from multiple sources
async function processInvoices() {
// Step 1: Set up source connectors for different storage systems
const sources = {
s3: await create_source_connector({
name: "invoice-s3-source",
type: "s3",
config: {
bucket: "company-invoices",
prefix: "2024/",
aws_key: process.env.AWS_KEY,
aws_secret: process.env.AWS_SECRET
}
}),
azure: await create_source_connector({
name: "invoice-azure-source",
type: "azure",
config: {
container: "financial-docs",
connection_string: process.env.AZURE_CONNECTION_STRING
}
}),
gdrive: await create_source_connector({
name: "invoice-gdrive-source",
type: "googledrive",
config: {
folder_id: "1234567890",
service_account_key: process.env.GOOGLEDRIVE_SERVICE_ACCOUNT_KEY
}
})
};
// Step 2: Set up destination for processed data
const destination = await create_destination_connector({
name: "invoice-mongodb",
type: "mongodb",
config: {
connection_string: process.env.MONGO_DB_CONNECTION_STRING,
database: "finance",
collection: "invoices"
}
});
// Step 3: Create processing workflow with advanced options
const workflow = await create_workflow({
name: "invoice-extraction-pipeline",
source: sources.s3.id,
destination: destination.id,
schedule: "0 9 * * *", // Daily at 9 AM
settings: {
strategy: "auto",
ocr_enabled: true, // Enable OCR for scanned invoices
pdf_infer_table_structure: true,
extract_metadata: true,
chunking_strategy: "by_page",
max_characters: 1500,
// Custom extraction rules
extraction_config: {
extract_tables: true,
extract_forms: true,
extract_amounts: true,
extract_dates: true,
extract_entities: ["ORGANIZATION", "PERSON", "MONEY", "DATE"],
// Field mapping for structured output
field_mapping: {
"invoice_number": "metadata.invoice_id",
"vendor": "entities.organization[0]",
"amount": "extracted.total_amount",
"date": "extracted.invoice_date",
"line_items": "tables[0].rows"
}
}
}
});
// Step 4: Run the workflow
const job = await run_workflow(workflow.id);
console.log(`Processing job started: ${job.id}`);
// Step 5: Monitor job progress
let status = await get_job_info(job.id);
while (status.state === "running") {
console.log(`Progress: ${status.progress}% - ${status.current_step}`);
await new Promise(resolve => setTimeout(resolve, 5000)); // Wait 5 seconds
status = await get_job_info(job.id);
}
if (status.state === "completed") {
console.log("✅ Invoice processing completed successfully!");
console.log(`Processed ${status.documents_processed} invoices`);
console.log(`Extracted ${status.entities_extracted} entities`);
console.log(`Generated ${status.tables_extracted} tables`);
} else {
console.error("❌ Processing failed:", status.error);
}
// Step 6: Query processed data from MongoDB
// This would be done with a MongoDB client; see the queryProcessedInvoices sketch below this function
// Example structure of processed invoice:
/*
{
"_id": "doc_12345",
"source_file": "invoice_2024_001.pdf",
"invoice_number": "INV-2024-001",
"vendor": "Acme Corp",
"amount": 1500.00,
"date": "2024-01-15",
"line_items": [
{ "description": "Consulting Services", "amount": 1000.00 },
{ "description": "Travel Expenses", "amount": 500.00 }
],
"metadata": {
"processed_at": "2024-01-20T10:30:00Z",
"confidence_score": 0.95,
"ocr_applied": false,
"page_count": 2
}
}
*/
}
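// A minimal sketch of querying the processed invoices with the official MongoDB
// Node.js driver (the "mongodb" npm package is assumed to be installed). The
// connection string, database, and collection mirror the destination connector
// above; the "amount" and "date" fields assume the field_mapping in the workflow.
async function queryProcessedInvoices(minAmount = 1000) {
  const { MongoClient } = require("mongodb");
  const client = new MongoClient(process.env.MONGO_DB_CONNECTION_STRING);
  try {
    await client.connect();
    const invoices = client.db("finance").collection("invoices");
    // Find invoices at or above the threshold, newest first
    const results = await invoices
      .find({ amount: { $gte: minAmount } })
      .sort({ date: -1 })
      .toArray();
    console.log(`Found ${results.length} invoices with amount >= ${minAmount}`);
    return results;
  } finally {
    await client.close();
  }
}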
// Example: Batch processing with parallel workflows
async function batchProcessInvoices() {
const sources = ["q1-invoices", "q2-invoices", "q3-invoices", "q4-invoices"]; // existing source connector names, one per quarter
// Create parallel workflows for each quarter
const workflows = await Promise.all(
sources.map(async (source) => {
return create_workflow({
name: `process-${source}`,
source: source,
destination: "invoice-warehouse",
settings: {
parallel_processing: true,
max_workers: 4,
batch_size: 50
}
});
})
);
// Run all workflows in parallel
const jobs = await Promise.all(
workflows.map(w => run_workflow(w.id))
);
console.log(`Started ${jobs.length} parallel processing jobs`);
}
// Example: Error handling and retry logic
async function processWithRetry(workflowId, maxRetries = 3) {
let attempt = 0;
while (attempt < maxRetries) {
try {
const job = await run_workflow(workflowId);
const result = await waitForCompletion(job.id);
if (result.state === "completed") {
return result;
}
// If the job did not complete, wait before retrying
attempt++;
console.log(`Attempt ${attempt} failed. Retrying...`);
await new Promise(resolve => setTimeout(resolve, 10000));
} catch (error) {
attempt++;
console.error(`Error on attempt ${attempt}:`, error);
await new Promise(resolve => setTimeout(resolve, 10000)); // Back off before retrying
}
}
throw new Error(`Failed after ${maxRetries} attempts`);
}
// Helper function to wait for job completion
async function waitForCompletion(jobId, timeout = 300000) {
const startTime = Date.now();
while (Date.now() - startTime < timeout) {
const status = await get_job_info(jobId);
if (status.state === "completed" || status.state === "failed") {
return status;
}
await new Promise(resolve => setTimeout(resolve, 2000));
}
throw new Error("Job timeout exceeded");
}
module.exports = {
processInvoices,
batchProcessInvoices,
processWithRetry
};
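// A minimal usage sketch, assuming this file is run directly with Node and the
// MCP tool functions referenced above have already been bound in scope:
if (require.main === module) {
  processInvoices().catch(err => {
    console.error("Invoice pipeline failed:", err);
    process.exit(1);
  });
}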