UNPKG

uns-mcp-server

Version:

Pure JavaScript MCP server for Unstructured.io - No Python required!

556 lines (503 loc) 13.5 kB
#!/usr/bin/env node

/**
 * Pure JavaScript MCP Server for Unstructured.io
 * No Python dependencies required!
 *
 * Transport: newline-delimited JSON-RPC 2.0 messages, read from stdin and
 * written to stdout. Diagnostics go to stderr so they never corrupt the
 * protocol stream.
 */

const readline = require('readline');
const axios = require('axios');
const FormData = require('form-data');
const fs = require('fs');
const path = require('path');

/** MCP protocol revision this server answers `initialize` with. */
const MCP_PROTOCOL_VERSION = '2024-11-05';

/**
 * Identity reported in the `initialize` result.
 * NOTE(review): the real package version is not visible from this file; it is
 * taken from npm's environment when available — wire it to package.json.
 */
const SERVER_INFO = Object.freeze({
  name: 'uns-mcp-server',
  version: process.env.npm_package_version ?? '0.0.0'
});

/** JSON-RPC 2.0 error codes used by this server. */
const JSON_RPC_METHOD_NOT_FOUND = -32601;
const JSON_RPC_INTERNAL_ERROR = -32603;

/**
 * Escape text so it can be embedded safely inside HTML element content.
 *
 * @param {*} text - Value to escape; coerced to a string.
 * @returns {string} Text with `&`, `<` and `>` replaced by entities.
 */
function escapeHtml(text) {
  return String(text)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
}

/**
 * Quote a single CSV cell when it contains a delimiter, quote or line break
 * (RFC 4180 rules); plain cells are returned unchanged.
 *
 * @param {string} cell - Raw cell text.
 * @returns {string} CSV-safe cell.
 */
function toCsvField(cell) {
  return /[",\r\n]/.test(cell) ? `"${cell.replace(/"/g, '""')}"` : cell;
}

class UnstructuredMCPServer {
  /**
   * Reads configuration from the environment and prepares the stdio
   * line reader plus the in-memory registries.
   *
   * Exits the process with code 1 when UNSTRUCTURED_API_KEY is not set.
   */
  constructor() {
    this.apiKey = process.env.UNSTRUCTURED_API_KEY;
    this.apiUrl =
      process.env.UNSTRUCTURED_API_URL ||
      'https://api.unstructured.io/general/v0/general';

    if (!this.apiKey) {
      this.error('UNSTRUCTURED_API_KEY environment variable is required');
      process.exit(1);
    }

    // Initialize MCP protocol
    this.rl = readline.createInterface({
      input: process.stdin,
      output: process.stdout,
      terminal: false
    });

    // Connectors, workflows and jobs live only in memory for the lifetime
    // of this process.
    this.sources = new Map();
    this.destinations = new Map();
    this.workflows = new Map();
    this.jobs = new Map();

    // Monotonic suffix so two IDs minted in the same millisecond differ.
    this.idCounter = 0;
  }

  /**
   * Start the MCP server: begin consuming JSON-RPC messages from stdin.
   *
   * The server does not announce itself. In MCP the client opens the session
   * with an `initialize` request and the server only replies to it.
   */
  start() {
    this.rl.on('line', (line) => {
      if (line.trim() === '') {
        return;
      }

      let message;
      try {
        message = JSON.parse(line);
      } catch (e) {
        this.error(`Failed to parse message: ${e.message}`);
        return;
      }

      // handleMessage is async; a rejection must not become an unhandled
      // rejection, which would terminate the process.
      this.handleMessage(message).catch((e) => {
        this.error(`Failed to handle message: ${e.message}`);
      });
    });
  }

  /**
   * Get the catalogue of available tools.
   *
   * @returns {Object<string, {description: string, inputSchema: object}>}
   *   Map from tool name to its description and JSON Schema for arguments.
   */
  getAvailableTools() {
    return {
      'process_document': {
        description: 'Process a document using Unstructured.io API',
        inputSchema: {
          type: 'object',
          properties: {
            file_path: { type: 'string', description: 'Path to the document' },
            strategy: {
              type: 'string',
              enum: ['auto', 'hi_res', 'ocr_only', 'fast'],
              default: 'auto'
            },
            ocr_enabled: { type: 'boolean', default: true },
            output_format: {
              type: 'string',
              enum: ['json', 'text', 'markdown'],
              default: 'json'
            }
          },
          required: ['file_path']
        }
      },
      'list_sources': {
        description: 'List configured source connectors',
        inputSchema: { type: 'object', properties: {} }
      },
      'create_source_connector': {
        description: 'Create a new source connector',
        inputSchema: {
          type: 'object',
          properties: {
            name: { type: 'string' },
            type: {
              type: 'string',
              enum: ['local', 's3', 'azure', 'googledrive']
            },
            config: { type: 'object' }
          },
          required: ['name', 'type']
        }
      },
      'list_destinations': {
        description: 'List configured destination connectors',
        inputSchema: { type: 'object', properties: {} }
      },
      'create_destination_connector': {
        description: 'Create a new destination connector',
        inputSchema: {
          type: 'object',
          properties: {
            name: { type: 'string' },
            type: {
              type: 'string',
              enum: ['local', 's3', 'mongodb', 'weaviate', 'pinecone']
            },
            config: { type: 'object' }
          },
          required: ['name', 'type']
        }
      },
      'create_workflow': {
        description: 'Create a document processing workflow',
        inputSchema: {
          type: 'object',
          properties: {
            name: { type: 'string' },
            source: { type: 'string' },
            destination: { type: 'string' },
            settings: { type: 'object' }
          },
          required: ['name', 'source', 'destination']
        }
      },
      'run_workflow': {
        description: 'Execute a workflow',
        inputSchema: {
          type: 'object',
          properties: { workflow_id: { type: 'string' } },
          required: ['workflow_id']
        }
      },
      'list_jobs': {
        description: 'List processing jobs',
        inputSchema: {
          type: 'object',
          properties: { workflow_id: { type: 'string' } }
        }
      },
      'get_job_info': {
        description: 'Get job status and details',
        inputSchema: {
          type: 'object',
          properties: { job_id: { type: 'string' } },
          required: ['job_id']
        }
      },
      'extract_text': {
        description: 'Extract text from a document',
        inputSchema: {
          type: 'object',
          properties: {
            file_path: { type: 'string' },
            include_metadata: { type: 'boolean', default: true }
          },
          required: ['file_path']
        }
      },
      'extract_tables': {
        description: 'Extract tables from a document',
        inputSchema: {
          type: 'object',
          properties: {
            file_path: { type: 'string' },
            format: {
              type: 'string',
              enum: ['json', 'csv', 'html'],
              default: 'json'
            }
          },
          required: ['file_path']
        }
      }
    };
  }

  /**
   * Tool catalogue in the array shape expected by a `tools/list` result.
   *
   * @returns {Array<{name: string, description: string, inputSchema: object}>}
   */
  listTools() {
    return Object.entries(this.getAvailableTools()).map(([name, tool]) => ({
      name,
      ...tool
    }));
  }

  /**
   * Handle one incoming MCP message.
   *
   * Requests (messages carrying an `id`) always receive a reply; unknown
   * notifications and client responses are ignored.
   *
   * @param {*} message - Parsed JSON value read from stdin.
   * @returns {Promise<void>}
   */
  async handleMessage(message) {
    // JSON.parse can yield null, numbers, strings...; only objects are messages.
    if (message === null || typeof message !== 'object') {
      this.error('Ignoring message that is not a JSON object');
      return;
    }

    const { id, method } = message;

    switch (method) {
      case 'initialize':
        this.send({
          jsonrpc: '2.0',
          id,
          result: {
            protocolVersion: MCP_PROTOCOL_VERSION,
            // The tool catalogue itself is served by `tools/list`; the
            // capability entry only declares that tools are supported.
            capabilities: { tools: {} },
            serverInfo: SERVER_INFO
          }
        });
        return;

      case 'tools/list':
        this.send({
          jsonrpc: '2.0',
          id,
          result: { tools: this.listTools() }
        });
        return;

      case 'tools/call':
        await this.handleToolCall(message);
        return;

      case 'ping':
        this.send({ jsonrpc: '2.0', id, result: {} });
        return;

      default:
        // Only requests need an answer; notifications have no id.
        if (typeof method === 'string' && id !== undefined && id !== null) {
          this.send({
            jsonrpc: '2.0',
            id,
            error: {
              code: JSON_RPC_METHOD_NOT_FOUND,
              message: `Method not found: ${method}`
            }
          });
        }
    }
  }

  /**
   * Handle a `tools/call` request: dispatch to the named tool and send
   * either its JSON-serialised result or a JSON-RPC error.
   *
   * @param {{id: *, params: {name: string, arguments: (object|undefined)}}} message
   * @returns {Promise<void>}
   */
  async handleToolCall(message) {
    try {
      // Read params inside the try so a malformed request produces an error
      // reply instead of an unhandled rejection.
      const params = message.params ?? {};
      const name = params.name;
      const args = params.arguments ?? {};

      let result;

      switch (name) {
        case 'process_document':
          result = await this.processDocument(args);
          break;
        case 'list_sources':
          result = await this.listSources();
          break;
        case 'create_source_connector':
          result = await this.createSourceConnector(args);
          break;
        case 'list_destinations':
          result = await this.listDestinations();
          break;
        case 'create_destination_connector':
          result = await this.createDestinationConnector(args);
          break;
        case 'create_workflow':
          result = await this.createWorkflow(args);
          break;
        case 'run_workflow':
          result = await this.runWorkflow(args);
          break;
        case 'list_jobs':
          result = await this.listJobs(args);
          break;
        case 'get_job_info':
          result = await this.getJobInfo(args);
          break;
        case 'extract_text':
          result = await this.extractText(args);
          break;
        case 'extract_tables':
          result = await this.extractTables(args);
          break;
        default:
          throw new Error(`Unknown tool: ${name}`);
      }

      this.send({
        jsonrpc: '2.0',
        id: message.id,
        result: {
          content: [
            {
              type: 'text',
              text: JSON.stringify(result, null, 2)
            }
          ]
        }
      });
    } catch (error) {
      this.send({
        jsonrpc: '2.0',
        id: message.id,
        error: {
          code: JSON_RPC_INTERNAL_ERROR,
          message: error.message
        }
      });
    }
  }

  /**
   * Process a document using the Unstructured.io API.
   *
   * @param {object} args
   * @param {string} args.file_path - Path of the file to upload.
   * @param {string} [args.strategy='auto'] - Partitioning strategy.
   * @param {boolean} [args.ocr_enabled=true] - Adds the hi-res model hint.
   * @param {string} [args.output_format='json'] - 'json', 'text' or 'markdown'.
   * @returns {Promise<Array<object>|string>} API elements for 'json', otherwise
   *   a single string.
   * @throws {Error} When file_path is missing, the file does not exist, or the
   *   API call fails.
   */
  async processDocument(args = {}) {
    const {
      file_path,
      strategy = 'auto',
      ocr_enabled = true,
      output_format = 'json'
    } = args;

    if (typeof file_path !== 'string' || file_path === '') {
      throw new Error('file_path is required');
    }

    if (!fs.existsSync(file_path)) {
      throw new Error(`File not found: ${file_path}`);
    }

    const form = new FormData();
    form.append('files', fs.createReadStream(file_path));
    form.append('strategy', strategy);
    form.append('ocr_languages', 'eng');
    form.append('pdf_infer_table_structure', 'true');

    // NOTE(review): ocr_enabled only toggles the hi-res model name here;
    // confirm this is the intended API parameter for enabling OCR.
    if (ocr_enabled) {
      form.append('hi_res_model_name', 'detectron2_onnx');
    }

    try {
      const response = await axios.post(this.apiUrl, form, {
        headers: {
          ...form.getHeaders(),
          'unstructured-api-key': this.apiKey,
          'Accept': 'application/json'
        },
        // Documents are routinely larger than the default upload cap applied
        // by older axios releases.
        maxBodyLength: Infinity,
        maxContentLength: Infinity
      });

      if (output_format === 'text') {
        return response.data.map(el => el.text).join('\n');
      } else if (output_format === 'markdown') {
        return this.convertToMarkdown(response.data);
      }

      return response.data;
    } catch (error) {
      if (error.response) {
        const body = error.response.data;
        // The body may be absent or not an object; `detail` is presumably what
        // the API uses for error text — verify against the API reference.
        const detail =
          (body && typeof body === 'object' && (body.error || body.detail)) ||
          error.response.statusText;
        const text = typeof detail === 'string' ? detail : JSON.stringify(detail);
        throw new Error(`API Error: ${text}`, { cause: error });
      }
      throw error;
    }
  }

  /**
   * Extract plain text from a document.
   *
   * @param {object} args - Same arguments as processDocument; output_format is
   *   forced to 'text'.
   * @returns {Promise<{text: string, file: string}>}
   */
  async extractText(args = {}) {
    const result = await this.processDocument({
      ...args,
      output_format: 'text'
    });

    return { text: result, file: args.file_path };
  }

  /**
   * Extract the table elements from a document.
   *
   * @param {object} args
   * @param {string} args.file_path - Path of the file to process.
   * @param {string} [args.format] - 'csv' or 'html'; anything else returns the
   *   raw table elements.
   * @returns {Promise<Array<object|string>>}
   */
  async extractTables(args = {}) {
    const result = await this.processDocument({
      ...args,
      output_format: 'json'
    });

    const tables = result.filter(el => el.type === 'Table');

    if (args.format === 'csv') {
      return tables.map(t => this.tableToCSV(t));
    } else if (args.format === 'html') {
      return tables.map(t => this.tableToHTML(t));
    }

    return tables;
  }

  /**
   * Build an identifier that stays unique within this process even when
   * several are requested in the same millisecond.
   *
   * @param {string} prefix - Kind of entity, e.g. 'source' or 'job'.
   * @returns {string}
   */
  nextId(prefix) {
    this.idCounter += 1;
    return `${prefix}_${Date.now()}_${this.idCounter}`;
  }

  /**
   * List source connectors.
   *
   * @returns {Promise<Array<object>>}
   */
  async listSources() {
    return Array.from(this.sources.values());
  }

  /**
   * Create a source connector and keep it in memory.
   *
   * @param {{name: string, type: string, config: (object|undefined)}} args
   * @returns {Promise<object>} The stored connector record.
   */
  async createSourceConnector(args = {}) {
    const id = this.nextId('source');
    const source = {
      id,
      name: args.name,
      type: args.type,
      config: args.config || {},
      created_at: new Date().toISOString()
    };

    this.sources.set(id, source);
    return source;
  }

  /**
   * List destination connectors.
   *
   * @returns {Promise<Array<object>>}
   */
  async listDestinations() {
    return Array.from(this.destinations.values());
  }

  /**
   * Create a destination connector and keep it in memory.
   *
   * @param {{name: string, type: string, config: (object|undefined)}} args
   * @returns {Promise<object>} The stored connector record.
   */
  async createDestinationConnector(args = {}) {
    const id = this.nextId('dest');
    const destination = {
      id,
      name: args.name,
      type: args.type,
      config: args.config || {},
      created_at: new Date().toISOString()
    };

    this.destinations.set(id, destination);
    return destination;
  }

  /**
   * Create a workflow and keep it in memory.
   *
   * @param {{name: string, source: string, destination: string,
   *          settings: (object|undefined)}} args
   * @returns {Promise<object>} The stored workflow record.
   */
  async createWorkflow(args = {}) {
    const id = this.nextId('workflow');
    const workflow = {
      id,
      name: args.name,
      source: args.source,
      destination: args.destination,
      settings: args.settings || {},
      created_at: new Date().toISOString()
    };

    this.workflows.set(id, workflow);
    return workflow;
  }

  /**
   * Run a workflow. The job is simulated: it is recorded as 'running' and
   * flipped to 'completed' by a timer two seconds later.
   *
   * @param {{workflow_id: string}} args
   * @returns {Promise<object>} The job record (mutated later by the timer).
   * @throws {Error} When the workflow id is unknown.
   */
  async runWorkflow(args = {}) {
    const workflow = this.workflows.get(args.workflow_id);

    if (!workflow) {
      throw new Error(`Workflow not found: ${args.workflow_id}`);
    }

    const id = this.nextId('job');
    const job = {
      id,
      workflow_id: args.workflow_id,
      status: 'running',
      started_at: new Date().toISOString(),
      progress: 0
    };

    this.jobs.set(id, job);

    // Simulate job processing
    setTimeout(() => {
      job.status = 'completed';
      job.completed_at = new Date().toISOString();
      job.progress = 100;
      job.documents_processed = 1;
    }, 2000);

    return job;
  }

  /**
   * List jobs, optionally restricted to one workflow.
   *
   * @param {{workflow_id: (string|undefined)}} [args] - May be omitted; the
   *   tool declares no required arguments.
   * @returns {Promise<Array<object>>}
   */
  async listJobs(args = {}) {
    const jobs = Array.from(this.jobs.values());

    if (args.workflow_id) {
      return jobs.filter(j => j.workflow_id === args.workflow_id);
    }

    return jobs;
  }

  /**
   * Get job status and details.
   *
   * @param {{job_id: string}} args
   * @returns {Promise<object>}
   * @throws {Error} When the job id is unknown.
   */
  async getJobInfo(args = {}) {
    const job = this.jobs.get(args.job_id);

    if (!job) {
      throw new Error(`Job not found: ${args.job_id}`);
    }

    return job;
  }

  /**
   * Convert a list of elements to Markdown.
   *
   * @param {Array<{type: string, text: (string|undefined)}>} elements
   * @returns {string}
   */
  convertToMarkdown(elements) {
    let markdown = '';

    for (const el of elements) {
      // Elements without text contribute an empty string rather than the
      // literal word "undefined".
      const text = el.text ?? '';

      switch (el.type) {
        case 'Title':
          markdown += `# ${text}\n\n`;
          break;
        case 'Header':
          markdown += `## ${text}\n\n`;
          break;
        case 'ListItem':
          markdown += `- ${text}\n`;
          break;
        case 'Table':
          markdown += `${this.tableToMarkdown(el) ?? ''}\n\n`;
          break;
        case 'NarrativeText':
        case 'Text':
        default:
          markdown += `${text}\n\n`;
      }
    }

    return markdown;
  }

  /**
   * Convert a table element to Markdown.
   *
   * Currently returns the table's plain text whether or not
   * `metadata.text_as_html` is present.
   * TODO: parse `metadata.text_as_html` with a proper HTML parser and emit a
   * real Markdown table.
   *
   * @param {{text: string, metadata: (object|undefined)}} table
   * @returns {string}
   */
  tableToMarkdown(table) {
    return table.text;
  }

  /**
   * Convert a table element's tab-separated text to CSV, quoting any cell
   * that contains a comma, double quote or line break.
   *
   * @param {{text: (string|undefined)}} table
   * @returns {string}
   */
  tableToCSV(table) {
    const text = table.text ?? '';

    return text
      .split(/\r?\n/)
      .map(row => row.split('\t').map(toCsvField).join(','))
      .join('\n');
  }

  /**
   * Convert a table element to HTML: the API-provided markup when present,
   * otherwise the plain text wrapped in `<pre>` with HTML escaped.
   *
   * @param {{text: (string|undefined), metadata: (object|undefined)}} table
   * @returns {string}
   */
  tableToHTML(table) {
    return table.metadata?.text_as_html ||
      `<pre>${escapeHtml(table.text ?? '')}</pre>`;
  }

  /**
   * Send a message to stdout as one line of JSON.
   *
   * @param {object} message
   */
  send(message) {
    console.log(JSON.stringify(message));
  }

  /**
   * Send a diagnostic to stderr.
   *
   * @param {string} message
   */
  error(message) {
    console.error(message);
  }
}

// Start the server when executed directly (not when required as a library).
if (require.main === module) {
  const server = new UnstructuredMCPServer();
  server.start();
}

module.exports = UnstructuredMCPServer;