// uns-mcp-server
// Version:
// Pure JavaScript MCP server for Unstructured.io - No Python required!
// 463 lines (420 loc) • 11.5 kB
// JavaScript
/**
* Pure JavaScript MCP Server for Unstructured.io
* Implements proper MCP protocol with stdio communication
*/
const axios = require('axios');
const FormData = require('form-data');
const fs = require('fs');
const path = require('path');
class UnstructuredMCPServer {
constructor() {
this.apiKey = process.env.UNSTRUCTURED_API_KEY;
this.apiUrl = process.env.UNSTRUCTURED_API_URL || 'https://api.unstructuredapp.io/general/v0/general';
if (!this.apiKey) {
console.error('UNSTRUCTURED_API_KEY environment variable is required');
console.error('Get your API key at: https://unstructured.io');
process.exit(1);
}
this.sources = new Map();
this.destinations = new Map();
this.workflows = new Map();
this.jobs = new Map();
// Buffer for incomplete messages
this.buffer = '';
}
/**
* Start the MCP server
*/
start() {
// Set up stdin for reading messages
process.stdin.setEncoding('utf8');
// Handle incoming data
process.stdin.on('data', (chunk) => {
this.buffer += chunk;
// Process complete messages (each ends with newline)
let lines = this.buffer.split('\n');
this.buffer = lines.pop() || ''; // Keep incomplete line in buffer
for (const line of lines) {
if (line.trim()) {
try {
const message = JSON.parse(line);
this.handleMessage(message).catch(err => {
this.sendError(message.id, -32603, err.message);
});
} catch (e) {
console.error('Failed to parse message:', e.message);
}
}
}
});
// Handle stdin close
process.stdin.on('end', () => {
process.exit(0);
});
}
/**
* Handle incoming MCP messages
*/
async handleMessage(message) {
// Handle different message types
if (message.method === 'initialize') {
this.handleInitialize(message);
} else if (message.method === 'initialized') {
// Client confirmed initialization
return;
} else if (message.method === 'tools/list') {
this.handleToolsList(message);
} else if (message.method === 'tools/call') {
await this.handleToolCall(message);
} else if (message.method === 'completion/complete') {
this.handleCompletion(message);
} else if (message.id && !message.method) {
// Response to a request we made (not expected for server)
return;
}
}
/**
* Handle initialization request
*/
handleInitialize(message) {
this.send({
jsonrpc: '2.0',
id: message.id,
result: {
protocolVersion: '2024-11-05',
capabilities: {
tools: {}
},
serverInfo: {
name: 'uns-mcp-server',
version: '2.0.0'
}
}
});
}
/**
* Handle tools/list request
*/
handleToolsList(message) {
const tools = [
{
name: 'process_document',
description: 'Process a document using Unstructured.io API',
inputSchema: {
type: 'object',
properties: {
file_path: {
type: 'string',
description: 'Path to the document to process'
},
strategy: {
type: 'string',
enum: ['auto', 'hi_res', 'ocr_only', 'fast'],
default: 'auto',
description: 'Processing strategy to use'
},
ocr_enabled: {
type: 'boolean',
default: true,
description: 'Enable OCR for scanned documents'
},
output_format: {
type: 'string',
enum: ['json', 'text', 'markdown'],
default: 'json',
description: 'Output format for results'
}
},
required: ['file_path']
}
},
{
name: 'extract_text',
description: 'Extract plain text from a document',
inputSchema: {
type: 'object',
properties: {
file_path: {
type: 'string',
description: 'Path to the document'
},
include_metadata: {
type: 'boolean',
default: true,
description: 'Include metadata in response'
}
},
required: ['file_path']
}
},
{
name: 'extract_tables',
description: 'Extract tables from a document',
inputSchema: {
type: 'object',
properties: {
file_path: {
type: 'string',
description: 'Path to the document'
},
format: {
type: 'string',
enum: ['json', 'csv', 'html'],
default: 'json',
description: 'Output format for tables'
}
},
required: ['file_path']
}
},
{
name: 'list_sources',
description: 'List configured source connectors',
inputSchema: {
type: 'object',
properties: {}
}
},
{
name: 'create_source_connector',
description: 'Create a new source connector',
inputSchema: {
type: 'object',
properties: {
name: { type: 'string' },
type: {
type: 'string',
enum: ['local', 's3', 'azure', 'googledrive']
},
config: { type: 'object' }
},
required: ['name', 'type']
}
}
];
this.send({
jsonrpc: '2.0',
id: message.id,
result: {
tools
}
});
}
/**
* Handle tool calls
*/
async handleToolCall(message) {
const { name, arguments: args } = message.params;
try {
let result;
switch (name) {
case 'process_document':
result = await this.processDocument(args);
break;
case 'extract_text':
result = await this.extractText(args);
break;
case 'extract_tables':
result = await this.extractTables(args);
break;
case 'list_sources':
result = await this.listSources();
break;
case 'create_source_connector':
result = await this.createSourceConnector(args);
break;
default:
throw new Error(`Unknown tool: ${name}`);
}
// Send successful result
this.send({
jsonrpc: '2.0',
id: message.id,
result: {
content: [
{
type: 'text',
text: typeof result === 'string' ? result : JSON.stringify(result, null, 2)
}
]
}
});
} catch (error) {
this.sendError(message.id, -32603, error.message);
}
}
/**
* Process a document using Unstructured.io API
*/
async processDocument(args) {
const { file_path, strategy = 'auto', ocr_enabled = true, output_format = 'json' } = args;
// Check if file exists
if (!fs.existsSync(file_path)) {
throw new Error(`File not found: ${file_path}`);
}
// Prepare form data
const form = new FormData();
form.append('files', fs.createReadStream(file_path));
form.append('strategy', strategy);
if (ocr_enabled) {
form.append('ocr_languages', 'eng');
form.append('pdf_infer_table_structure', 'true');
}
try {
const response = await axios.post(this.apiUrl, form, {
headers: {
...form.getHeaders(),
'unstructured-api-key': this.apiKey
},
maxBodyLength: Infinity,
timeout: 300000 // 5 minutes timeout for large files
});
// Format output based on requested format
if (output_format === 'text') {
return response.data.map(el => el.text).join('\n');
} else if (output_format === 'markdown') {
return this.convertToMarkdown(response.data);
}
return response.data;
} catch (error) {
if (error.response) {
throw new Error(`API Error (${error.response.status}): ${error.response.data?.error || error.response.statusText}`);
} else if (error.request) {
throw new Error('No response from Unstructured API. Check your API key and network connection.');
} else {
throw new Error(`Request failed: ${error.message}`);
}
}
}
/**
* Extract text from document
*/
async extractText(args) {
const result = await this.processDocument({
...args,
output_format: 'text'
});
return {
text: result,
file: args.file_path,
timestamp: new Date().toISOString()
};
}
/**
* Extract tables from document
*/
async extractTables(args) {
const result = await this.processDocument({
...args,
output_format: 'json'
});
// Filter for table elements
const tables = result.filter(el => el.type === 'Table');
if (args.format === 'csv') {
return tables.map(t => ({
text: t.text.replace(/\t/g, ','),
metadata: t.metadata
}));
} else if (args.format === 'html') {
return tables.map(t => ({
html: t.metadata?.text_as_html || `<pre>${t.text}</pre>`,
text: t.text
}));
}
return tables;
}
/**
* List source connectors
*/
async listSources() {
return Array.from(this.sources.values());
}
/**
* Create source connector
*/
async createSourceConnector(args) {
const id = `source_${Date.now()}`;
const source = {
id,
name: args.name,
type: args.type,
config: args.config || {},
created_at: new Date().toISOString()
};
this.sources.set(id, source);
return source;
}
/**
* Convert elements to Markdown
*/
convertToMarkdown(elements) {
let markdown = '';
for (const el of elements) {
switch (el.type) {
case 'Title':
markdown += `# ${el.text}\n\n`;
break;
case 'Header':
markdown += `## ${el.text}\n\n`;
break;
case 'NarrativeText':
case 'Text':
markdown += `${el.text}\n\n`;
break;
case 'ListItem':
markdown += `- ${el.text}\n`;
break;
case 'Table':
markdown += `\`\`\`\n${el.text}\n\`\`\`\n\n`;
break;
default:
if (el.text) {
markdown += `${el.text}\n\n`;
}
}
}
return markdown;
}
/**
* Handle completion request
*/
handleCompletion(message) {
// For now, we don't provide completions
this.send({
jsonrpc: '2.0',
id: message.id,
result: {
completion: {
values: []
}
}
});
}
/**
* Send message to stdout
*/
send(message) {
const json = JSON.stringify(message);
process.stdout.write(json + '\n');
}
/**
* Send error response
*/
sendError(id, code, message) {
this.send({
jsonrpc: '2.0',
id,
error: {
code,
message
}
});
}
}
// Launch the server only when this file is the process entry point; when it
// is loaded through require(), only the class is exposed.
if (module === require.main) {
  new UnstructuredMCPServer().start();
}
module.exports = UnstructuredMCPServer;