UNPKG

uns-mcp-server

Version:

Pure JavaScript MCP server for Unstructured.io - No Python required!

463 lines (420 loc) 11.5 kB
#!/usr/bin/env node

/**
 * Pure JavaScript MCP Server for Unstructured.io
 * Implements proper MCP protocol with stdio communication
 *
 * Messages are newline-delimited JSON-RPC 2.0 objects read from stdin;
 * every reply is written to stdout as a single JSON line. Diagnostics go to
 * stderr so they never corrupt the protocol stream.
 */

const axios = require('axios');
const FormData = require('form-data');
const fs = require('fs');
const path = require('path');

// JSON-RPC 2.0 error codes used by this server.
const METHOD_NOT_FOUND = -32601;
const INVALID_PARAMS = -32602;
const INTERNAL_ERROR = -32603;

/**
 * Tell whether a JSON-RPC message is a request (carries an id) rather than a
 * notification. Uses an explicit null/undefined check so that id `0` counts.
 *
 * @param {object} message - Parsed JSON-RPC message.
 * @returns {boolean} True when a reply is expected.
 */
function expectsReply(message) {
  return message.id !== undefined && message.id !== null;
}

/**
 * Escape text so it can be embedded in HTML markup.
 *
 * @param {*} text - Value to escape; coerced to a string.
 * @returns {string} Text with `&`, `<`, `>` and `"` replaced by entities.
 */
function escapeHtml(text) {
  return String(text)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;');
}

/**
 * Quote a single CSV cell when it contains a comma, quote or line break.
 *
 * @param {string} cell - Raw cell text.
 * @returns {string} Cell text, wrapped in double quotes if required.
 */
function quoteCsvCell(cell) {
  if (!/[",\r\n]/.test(cell)) {
    return cell;
  }
  return `"${cell.replace(/"/g, '""')}"`;
}

/**
 * Convert tab-separated table text into CSV.
 * Cells without special characters are emitted unchanged, so simple tables
 * produce the same output as a plain tab-to-comma replacement.
 *
 * @param {*} text - Table text with tab-separated cells, one row per line.
 * @returns {string} CSV text with rows joined by `\n`.
 */
function tabSeparatedToCsv(text) {
  return String(text)
    .split(/\r?\n/)
    .map((row) => row.split('\t').map(quoteCsvCell).join(','))
    .join('\n');
}

/**
 * Build a human-readable message from a failed axios request.
 *
 * @param {object} error - Error thrown by `axios.post`.
 * @returns {string} Message describing the failure.
 */
function describeRequestError(error) {
  if (error.response) {
    const { status, statusText, data } = error.response;
    // NOTE(review): `detail` is the usual FastAPI error field; confirm it
    // matches the Unstructured API error schema.
    const reason = data?.error || data?.detail;
    let text = statusText;
    if (typeof reason === 'string') {
      text = reason;
    } else if (reason) {
      // Avoid rendering structured errors as "[object Object]".
      text = JSON.stringify(reason);
    }
    return `API Error (${status}): ${text}`;
  }
  if (error.request) {
    return 'No response from Unstructured API. Check your API key and network connection.';
  }
  return `Request failed: ${error.message}`;
}

class UnstructuredMCPServer {
  /**
   * Read configuration from the environment and set up in-memory state.
   * Exits the process with code 1 when UNSTRUCTURED_API_KEY is not set.
   */
  constructor() {
    this.apiKey = process.env.UNSTRUCTURED_API_KEY;
    this.apiUrl = process.env.UNSTRUCTURED_API_URL || 'https://api.unstructuredapp.io/general/v0/general';

    if (!this.apiKey) {
      console.error('UNSTRUCTURED_API_KEY environment variable is required');
      console.error('Get your API key at: https://unstructured.io');
      process.exit(1);
    }

    this.sources = new Map();
    this.destinations = new Map();
    this.workflows = new Map();
    this.jobs = new Map();

    // Buffer for incomplete messages
    this.buffer = '';

    // Messages still being handled; awaited before exiting on stdin close.
    this.pending = new Set();
  }

  /**
   * Start the MCP server: read newline-delimited JSON messages from stdin
   * and exit with code 0 once stdin closes and all in-flight work is done.
   */
  start() {
    process.stdin.setEncoding('utf8');

    process.stdin.on('data', (chunk) => {
      this.buffer += chunk;

      // Process complete messages (each ends with newline)
      const lines = this.buffer.split('\n');
      this.buffer = lines.pop() || ''; // Keep incomplete line in buffer

      for (const line of lines) {
        this.processLine(line);
      }
    });

    process.stdin.on('end', () => {
      // A final message may arrive without a trailing newline.
      const remainder = this.buffer;
      this.buffer = '';
      this.processLine(remainder);

      // Let in-flight tool calls send their replies before exiting.
      Promise.allSettled([...this.pending]).then(() => {
        process.exit(0);
      });
    });
  }

  /**
   * Parse one input line and dispatch it to `handleMessage`.
   * Blank lines are skipped; unparseable or non-object input is logged to
   * stderr and ignored.
   *
   * @param {string} line - One line of input without its newline.
   */
  processLine(line) {
    if (!line.trim()) {
      return;
    }

    let message;
    try {
      message = JSON.parse(line);
    } catch (e) {
      console.error('Failed to parse message:', e.message);
      return;
    }

    // `null`, numbers, strings and arrays are valid JSON but not messages
    // this server can handle; reading `.method` / `.id` on them would throw.
    if (message === null || typeof message !== 'object' || Array.isArray(message)) {
      console.error('Ignoring JSON-RPC message that is not an object');
      return;
    }

    const task = this.handleMessage(message)
      .catch((err) => {
        if (expectsReply(message)) {
          this.sendError(message.id, INTERNAL_ERROR, err.message);
        } else {
          // Notifications must never be answered, so only log the failure.
          console.error('Failed to handle notification:', err.message);
        }
      })
      .finally(() => {
        this.pending.delete(task);
      });
    this.pending.add(task);
  }

  /**
   * Handle incoming MCP messages by routing on `message.method`.
   * Unknown requests are answered with "Method not found"; unknown
   * notifications and messages without a method are ignored.
   *
   * @param {object} message - Parsed JSON-RPC message.
   * @returns {Promise<void>}
   */
  async handleMessage(message) {
    const { method } = message;

    if (typeof method !== 'string') {
      // Response to a request we made (not expected for server) or malformed input
      return;
    }

    switch (method) {
      case 'initialize':
        this.handleInitialize(message);
        return;
      case 'initialized':
      case 'notifications/initialized':
        // Client confirmed initialization
        return;
      case 'ping':
        if (expectsReply(message)) {
          this.send({ jsonrpc: '2.0', id: message.id, result: {} });
        }
        return;
      case 'tools/list':
        this.handleToolsList(message);
        return;
      case 'tools/call':
        await this.handleToolCall(message);
        return;
      case 'completion/complete':
        this.handleCompletion(message);
        return;
      default:
        if (expectsReply(message)) {
          this.sendError(message.id, METHOD_NOT_FOUND, `Method not found: ${method}`);
        }
    }
  }

  /**
   * Handle initialization request by announcing protocol version,
   * capabilities and server info.
   *
   * @param {object} message - The `initialize` request.
   */
  handleInitialize(message) {
    this.send({
      jsonrpc: '2.0',
      id: message.id,
      result: {
        protocolVersion: '2024-11-05',
        capabilities: {
          tools: {}
        },
        serverInfo: {
          name: 'uns-mcp-server',
          version: '2.0.0'
        }
      }
    });
  }

  /**
   * Handle tools/list request by returning the tool definitions.
   *
   * @param {object} message - The `tools/list` request.
   */
  handleToolsList(message) {
    const tools = [
      {
        name: 'process_document',
        description: 'Process a document using Unstructured.io API',
        inputSchema: {
          type: 'object',
          properties: {
            file_path: {
              type: 'string',
              description: 'Path to the document to process'
            },
            strategy: {
              type: 'string',
              enum: ['auto', 'hi_res', 'ocr_only', 'fast'],
              default: 'auto',
              description: 'Processing strategy to use'
            },
            ocr_enabled: {
              type: 'boolean',
              default: true,
              description: 'Enable OCR for scanned documents'
            },
            output_format: {
              type: 'string',
              enum: ['json', 'text', 'markdown'],
              default: 'json',
              description: 'Output format for results'
            }
          },
          required: ['file_path']
        }
      },
      {
        name: 'extract_text',
        description: 'Extract plain text from a document',
        inputSchema: {
          type: 'object',
          properties: {
            file_path: {
              type: 'string',
              description: 'Path to the document'
            },
            include_metadata: {
              type: 'boolean',
              default: true,
              description: 'Include metadata in response'
            }
          },
          required: ['file_path']
        }
      },
      {
        name: 'extract_tables',
        description: 'Extract tables from a document',
        inputSchema: {
          type: 'object',
          properties: {
            file_path: {
              type: 'string',
              description: 'Path to the document'
            },
            format: {
              type: 'string',
              enum: ['json', 'csv', 'html'],
              default: 'json',
              description: 'Output format for tables'
            }
          },
          required: ['file_path']
        }
      },
      {
        name: 'list_sources',
        description: 'List configured source connectors',
        inputSchema: {
          type: 'object',
          properties: {}
        }
      },
      {
        name: 'create_source_connector',
        description: 'Create a new source connector',
        inputSchema: {
          type: 'object',
          properties: {
            name: { type: 'string' },
            type: {
              type: 'string',
              enum: ['local', 's3', 'azure', 'googledrive']
            },
            config: { type: 'object' }
          },
          required: ['name', 'type']
        }
      }
    ];

    this.send({
      jsonrpc: '2.0',
      id: message.id,
      result: { tools }
    });
  }

  /**
   * Look up the implementation of a tool.
   *
   * @param {string} name - Tool name from the `tools/call` request.
   * @returns {Function|undefined} Function taking the tool arguments, or
   *   undefined when the tool is unknown.
   */
  toolHandler(name) {
    switch (name) {
      case 'process_document':
        return (args) => this.processDocument(args);
      case 'extract_text':
        return (args) => this.extractText(args);
      case 'extract_tables':
        return (args) => this.extractTables(args);
      case 'list_sources':
        return () => this.listSources();
      case 'create_source_connector':
        return (args) => this.createSourceConnector(args);
      default:
        return undefined;
    }
  }

  /**
   * Handle tool calls.
   * Malformed requests and unknown tools are answered with a JSON-RPC
   * "Invalid params" error. A failure while running a known tool is returned
   * as a tool result flagged `isError: true`.
   *
   * @param {object} message - The `tools/call` request.
   * @returns {Promise<void>}
   */
  async handleToolCall(message) {
    const { params } = message;
    if (params === null || typeof params !== 'object' || typeof params.name !== 'string') {
      this.sendError(message.id, INVALID_PARAMS, 'tools/call requires a tool name in params.name');
      return;
    }

    // Tools without required inputs may be called with no arguments at all.
    const args = params.arguments ?? {};
    if (typeof args !== 'object' || Array.isArray(args)) {
      this.sendError(message.id, INVALID_PARAMS, 'params.arguments must be an object');
      return;
    }

    const runTool = this.toolHandler(params.name);
    if (!runTool) {
      this.sendError(message.id, INVALID_PARAMS, `Unknown tool: ${params.name}`);
      return;
    }

    let result;
    try {
      result = await runTool(args);
    } catch (error) {
      this.send({
        jsonrpc: '2.0',
        id: message.id,
        result: {
          content: [{ type: 'text', text: error.message }],
          isError: true
        }
      });
      return;
    }

    // Send successful result
    this.send({
      jsonrpc: '2.0',
      id: message.id,
      result: {
        content: [
          {
            type: 'text',
            text: typeof result === 'string' ? result : JSON.stringify(result, null, 2)
          }
        ]
      }
    });
  }

  /**
   * Process a document using Unstructured.io API.
   *
   * @param {object} args - Tool arguments.
   * @param {string} args.file_path - Path of the file to upload.
   * @param {string} [args.strategy='auto'] - Value sent as the `strategy` form field.
   * @param {boolean} [args.ocr_enabled=true] - When truthy, adds the
   *   `ocr_languages` and `pdf_infer_table_structure` form fields.
   * @param {string} [args.output_format='json'] - 'json', 'text' or 'markdown'.
   * @returns {Promise<Array<object>|string>} The element list for 'json', or a
   *   string for 'text' / 'markdown'.
   * @throws {Error} If the path is missing or not a file, the request fails,
   *   or the response is not a list of elements.
   */
  async processDocument(args = {}) {
    const {
      file_path,
      strategy = 'auto',
      ocr_enabled = true,
      output_format = 'json'
    } = args;

    if (typeof file_path !== 'string' || file_path === '') {
      throw new Error('file_path is required');
    }

    // Check if file exists
    if (!fs.existsSync(file_path)) {
      throw new Error(`File not found: ${file_path}`);
    }
    // A directory passes existsSync but cannot be streamed as an upload.
    if (!fs.statSync(file_path).isFile()) {
      throw new Error(`Not a file: ${file_path}`);
    }

    // Prepare form data
    const form = new FormData();
    form.append('files', fs.createReadStream(file_path));
    form.append('strategy', strategy);

    if (ocr_enabled) {
      form.append('ocr_languages', 'eng');
      form.append('pdf_infer_table_structure', 'true');
    }

    let response;
    try {
      response = await axios.post(this.apiUrl, form, {
        headers: {
          ...form.getHeaders(),
          'unstructured-api-key': this.apiKey
        },
        maxBodyLength: Infinity,
        timeout: 300000 // 5 minutes timeout for large files
      });
    } catch (error) {
      throw new Error(describeRequestError(error), { cause: error });
    }

    // Formatting happens outside the try block so that a malformed response
    // is not misreported as a failed request.
    const elements = response.data;
    if (!Array.isArray(elements)) {
      throw new Error('Unexpected response from Unstructured API: expected a list of elements');
    }

    // Format output based on requested format
    if (output_format === 'text') {
      return elements.map((el) => el.text).join('\n');
    }
    if (output_format === 'markdown') {
      return this.convertToMarkdown(elements);
    }
    return elements;
  }

  /**
   * Extract text from document.
   *
   * @param {object} args - Tool arguments.
   * @param {string} args.file_path - Path of the document.
   * @param {boolean} [args.include_metadata=true] - When false, only the
   *   extracted text is returned.
   * @returns {Promise<object|string>} `{ text, file, timestamp }`, or the bare
   *   text when `include_metadata` is false.
   */
  async extractText(args = {}) {
    const text = await this.processDocument({
      ...args,
      output_format: 'text'
    });

    if (args.include_metadata === false) {
      return text;
    }

    return {
      text,
      file: args.file_path,
      timestamp: new Date().toISOString()
    };
  }

  /**
   * Extract tables from document.
   *
   * @param {object} args - Tool arguments.
   * @param {string} args.file_path - Path of the document.
   * @param {string} [args.format] - 'csv' or 'html'; any other value returns
   *   the raw table elements.
   * @returns {Promise<Array<object>>} One entry per element of type 'Table'.
   */
  async extractTables(args = {}) {
    const elements = await this.processDocument({
      ...args,
      output_format: 'json'
    });

    // Filter for table elements
    const tables = elements.filter((el) => el.type === 'Table');

    if (args.format === 'csv') {
      return tables.map((t) => ({
        text: tabSeparatedToCsv(t.text ?? ''),
        metadata: t.metadata
      }));
    }

    if (args.format === 'html') {
      return tables.map((t) => ({
        // The plain-text fallback is escaped so table content cannot be
        // interpreted as markup.
        html: t.metadata?.text_as_html || `<pre>${escapeHtml(t.text ?? '')}</pre>`,
        text: t.text
      }));
    }

    return tables;
  }

  /**
   * List source connectors.
   *
   * @returns {Promise<Array<object>>} Connectors stored in this instance.
   */
  async listSources() {
    return Array.from(this.sources.values());
  }

  /**
   * Create source connector and store it in this instance's memory.
   *
   * @param {object} args - Tool arguments.
   * @param {string} args.name - Connector name.
   * @param {string} args.type - Connector type.
   * @param {object} [args.config] - Connector configuration; defaults to `{}`.
   * @returns {Promise<object>} The stored connector record.
   * @throws {Error} If `name` or `type` is missing.
   */
  async createSourceConnector(args = {}) {
    const { name, type, config } = args;
    if (typeof name !== 'string' || name === '') {
      throw new Error('name is required');
    }
    if (typeof type !== 'string' || type === '') {
      throw new Error('type is required');
    }

    // Two connectors created within the same millisecond would otherwise
    // share an id and the second would overwrite the first.
    const baseId = `source_${Date.now()}`;
    let id = baseId;
    for (let suffix = 1; this.sources.has(id); suffix += 1) {
      id = `${baseId}_${suffix}`;
    }

    const source = {
      id,
      name,
      type,
      config: config || {},
      created_at: new Date().toISOString()
    };

    this.sources.set(id, source);
    return source;
  }

  /**
   * Convert elements to Markdown.
   * Elements without text are skipped. A blank line is inserted after a run
   * of list items so the following block is not absorbed into the list.
   *
   * @param {Array<object>} elements - Elements with `type` and `text` fields.
   * @returns {string} Markdown text.
   */
  convertToMarkdown(elements) {
    let markdown = '';
    let inList = false;

    for (const el of elements) {
      if (!el || !el.text) {
        continue;
      }

      const isListItem = el.type === 'ListItem';
      if (inList && !isListItem) {
        markdown += '\n';
      }
      inList = isListItem;

      switch (el.type) {
        case 'Title':
          markdown += `# ${el.text}\n\n`;
          break;
        case 'Header':
          markdown += `## ${el.text}\n\n`;
          break;
        case 'ListItem':
          markdown += `- ${el.text}\n`;
          break;
        case 'Table':
          markdown += `\`\`\`\n${el.text}\n\`\`\`\n\n`;
          break;
        case 'NarrativeText':
        case 'Text':
        default:
          markdown += `${el.text}\n\n`;
      }
    }

    return markdown;
  }

  /**
   * Handle completion request.
   *
   * @param {object} message - The `completion/complete` request.
   */
  handleCompletion(message) {
    // For now, we don't provide completions
    this.send({
      jsonrpc: '2.0',
      id: message.id,
      result: {
        completion: {
          values: []
        }
      }
    });
  }

  /**
   * Send message to stdout as a single JSON line.
   *
   * @param {object} message - JSON-RPC message to serialize.
   */
  send(message) {
    const json = JSON.stringify(message);
    process.stdout.write(json + '\n');
  }

  /**
   * Send error response.
   *
   * @param {string|number} id - Id of the request being answered.
   * @param {number} code - JSON-RPC error code.
   * @param {string} message - Error description.
   */
  sendError(id, code, message) {
    this.send({
      jsonrpc: '2.0',
      id,
      error: {
        code,
        message
      }
    });
  }
}

// Start the server when run directly
if (require.main === module) {
  const server = new UnstructuredMCPServer();
  server.start();
}

module.exports = UnstructuredMCPServer;