uns-mcp-server
Version:
Pure JavaScript MCP server for Unstructured.io - No Python required!
556 lines (503 loc) • 13.5 kB
JavaScript
/**
* Pure JavaScript MCP Server for Unstructured.io
* No Python dependencies required!
*/
const readline = require('readline');
const axios = require('axios');
const FormData = require('form-data');
const fs = require('fs');
const path = require('path');
/**
 * MCP server that exposes Unstructured.io document processing as tools.
 *
 * Messages are newline-delimited JSON read from stdin; replies are written to
 * stdout and diagnostics to stderr. Connectors, workflows and jobs are kept
 * in in-memory Maps only, so they do not survive a restart.
 */
class UnstructuredMCPServer {
  /**
   * Reads configuration from the environment and wires up the stdin reader.
   * Terminates the process with exit code 1 when UNSTRUCTURED_API_KEY is unset.
   */
  constructor() {
    this.apiKey = process.env.UNSTRUCTURED_API_KEY;
    this.apiUrl = process.env.UNSTRUCTURED_API_URL || 'https://api.unstructured.io/general/v0/general';
    if (!this.apiKey) {
      this.error('UNSTRUCTURED_API_KEY environment variable is required');
      process.exit(1);
    }
    // Initialize MCP protocol
    this.rl = readline.createInterface({
      input: process.stdin,
      output: process.stdout,
      terminal: false
    });
    this.sources = new Map();
    this.destinations = new Map();
    this.workflows = new Map();
    this.jobs = new Map();
    // Monotonic counter mixed into generated ids (see nextId).
    this.idSequence = 0;
  }

  /**
   * Start the MCP server: announce the tool catalogue, then dispatch every
   * JSON line received on stdin.
   */
  start() {
    // NOTE(review): in MCP the client normally opens the handshake; this
    // unsolicited announcement is kept for existing consumers — confirm it is
    // still wanted.
    this.send({
      jsonrpc: '2.0',
      method: 'initialize',
      params: {
        protocolVersion: '1.0.0',
        capabilities: {
          tools: this.getAvailableTools()
        }
      }
    });
    // Listen for messages
    this.rl.on('line', (line) => {
      if (line.trim() === '') {
        return; // blank keep-alive lines are not messages
      }
      let message;
      try {
        message = JSON.parse(line);
      } catch (e) {
        this.error(`Failed to parse message: ${e.message}`);
        return;
      }
      // handleMessage is async: without this catch a failure would surface as
      // an unhandled promise rejection instead of a log line.
      this.handleMessage(message).catch((e) => {
        this.error(`Failed to handle message: ${e.message}`);
      });
    });
  }

  /**
   * Get list of available tools.
   * @returns {Object} map of tool name to `{ description, inputSchema }`
   */
  getAvailableTools() {
    return {
      'process_document': {
        description: 'Process a document using Unstructured.io API',
        inputSchema: {
          type: 'object',
          properties: {
            file_path: { type: 'string', description: 'Path to the document' },
            strategy: { type: 'string', enum: ['auto', 'hi_res', 'ocr_only', 'fast'], default: 'auto' },
            ocr_enabled: { type: 'boolean', default: true },
            output_format: { type: 'string', enum: ['json', 'text', 'markdown'], default: 'json' }
          },
          required: ['file_path']
        }
      },
      'list_sources': {
        description: 'List configured source connectors',
        inputSchema: {
          type: 'object',
          properties: {}
        }
      },
      'create_source_connector': {
        description: 'Create a new source connector',
        inputSchema: {
          type: 'object',
          properties: {
            name: { type: 'string' },
            type: { type: 'string', enum: ['local', 's3', 'azure', 'googledrive'] },
            config: { type: 'object' }
          },
          required: ['name', 'type']
        }
      },
      'list_destinations': {
        description: 'List configured destination connectors',
        inputSchema: {
          type: 'object',
          properties: {}
        }
      },
      'create_destination_connector': {
        description: 'Create a new destination connector',
        inputSchema: {
          type: 'object',
          properties: {
            name: { type: 'string' },
            type: { type: 'string', enum: ['local', 's3', 'mongodb', 'weaviate', 'pinecone'] },
            config: { type: 'object' }
          },
          required: ['name', 'type']
        }
      },
      'create_workflow': {
        description: 'Create a document processing workflow',
        inputSchema: {
          type: 'object',
          properties: {
            name: { type: 'string' },
            source: { type: 'string' },
            destination: { type: 'string' },
            settings: { type: 'object' }
          },
          required: ['name', 'source', 'destination']
        }
      },
      'run_workflow': {
        description: 'Execute a workflow',
        inputSchema: {
          type: 'object',
          properties: {
            workflow_id: { type: 'string' }
          },
          required: ['workflow_id']
        }
      },
      'list_jobs': {
        description: 'List processing jobs',
        inputSchema: {
          type: 'object',
          properties: {
            workflow_id: { type: 'string' }
          }
        }
      },
      'get_job_info': {
        description: 'Get job status and details',
        inputSchema: {
          type: 'object',
          properties: {
            job_id: { type: 'string' }
          },
          required: ['job_id']
        }
      },
      'extract_text': {
        description: 'Extract text from a document',
        inputSchema: {
          type: 'object',
          properties: {
            file_path: { type: 'string' },
            include_metadata: { type: 'boolean', default: true }
          },
          required: ['file_path']
        }
      },
      'extract_tables': {
        description: 'Extract tables from a document',
        inputSchema: {
          type: 'object',
          properties: {
            file_path: { type: 'string' },
            format: { type: 'string', enum: ['json', 'csv', 'html'], default: 'json' }
          },
          required: ['file_path']
        }
      }
    };
  }

  /**
   * Handle incoming MCP messages.
   *
   * Answers `initialize`, `tools/list`, `ping` and `tools/call`. Any other
   * request that carries an `id` gets a JSON-RPC "method not found" error so
   * the caller is not left waiting; notifications and responses are ignored.
   * @param {Object} message parsed JSON-RPC message
   */
  async handleMessage(message) {
    if (message === null || typeof message !== 'object') {
      this.error('Ignoring message that is not a JSON object');
      return;
    }
    const { id, method } = message;
    switch (method) {
      case 'tools/call':
        await this.handleToolCall(message);
        return;
      case 'initialize':
        // NOTE(review): version string and capabilities shape are kept as
        // originally shipped for compatibility — confirm against the MCP
        // revision the target clients speak.
        this.send({
          jsonrpc: '2.0',
          id,
          result: {
            protocolVersion: '1.0.0',
            capabilities: {
              tools: this.getAvailableTools()
            }
          }
        });
        return;
      case 'tools/list':
        this.send({
          jsonrpc: '2.0',
          id,
          result: {
            tools: Object.entries(this.getAvailableTools()).map(
              ([name, definition]) => ({ name, ...definition })
            )
          }
        });
        return;
      case 'ping':
        this.send({ jsonrpc: '2.0', id, result: {} });
        return;
      default:
        // Only requests (method + id) expect an answer.
        if (method !== undefined && id !== undefined) {
          this.send({
            jsonrpc: '2.0',
            id,
            error: {
              code: -32601,
              message: `Method not found: ${method}`
            }
          });
        }
    }
  }

  /**
   * Handle tool calls: run the named tool and send either its JSON-encoded
   * result or a JSON-RPC error. Never rejects; every failure (including a
   * malformed request) is reported back to the caller.
   * @param {Object} message `tools/call` request with `params.name` and
   *   optional `params.arguments`
   */
  async handleToolCall(message) {
    try {
      // Destructure inside the try so a request without params still yields
      // an error response; tools without required inputs may omit arguments.
      const { name, arguments: rawArgs } = message.params ?? {};
      const args = rawArgs ?? {};
      let result;
      switch (name) {
        case 'process_document':
          result = await this.processDocument(args);
          break;
        case 'list_sources':
          result = await this.listSources();
          break;
        case 'create_source_connector':
          result = await this.createSourceConnector(args);
          break;
        case 'list_destinations':
          result = await this.listDestinations();
          break;
        case 'create_destination_connector':
          result = await this.createDestinationConnector(args);
          break;
        case 'create_workflow':
          result = await this.createWorkflow(args);
          break;
        case 'run_workflow':
          result = await this.runWorkflow(args);
          break;
        case 'list_jobs':
          result = await this.listJobs(args);
          break;
        case 'get_job_info':
          result = await this.getJobInfo(args);
          break;
        case 'extract_text':
          result = await this.extractText(args);
          break;
        case 'extract_tables':
          result = await this.extractTables(args);
          break;
        default:
          throw new Error(`Unknown tool: ${name}`);
      }
      this.send({
        jsonrpc: '2.0',
        id: message.id,
        result: {
          content: [
            {
              type: 'text',
              text: JSON.stringify(result, null, 2)
            }
          ]
        }
      });
    } catch (error) {
      this.send({
        jsonrpc: '2.0',
        id: message.id,
        error: {
          code: -32603,
          message: error.message
        }
      });
    }
  }

  /**
   * Process a document using Unstructured.io API.
   * @param {Object} args
   * @param {string} args.file_path path of the file to upload
   * @param {string} [args.strategy='auto'] forwarded as the `strategy` field
   * @param {boolean} [args.ocr_enabled=true] when true, also sends `hi_res_model_name`
   * @param {string} [args.output_format='json'] 'json', 'text' or 'markdown'
   * @returns {Promise<Array|string>} the API response body for 'json', otherwise
   *   the elements rendered as newline-joined text or as Markdown
   * @throws {Error} when the file does not exist or the request fails
   */
  async processDocument(args) {
    const { file_path, strategy = 'auto', ocr_enabled = true, output_format = 'json' } = args;
    if (!fs.existsSync(file_path)) {
      throw new Error(`File not found: ${file_path}`);
    }
    const form = new FormData();
    form.append('files', fs.createReadStream(file_path));
    form.append('strategy', strategy);
    form.append('ocr_languages', 'eng');
    form.append('pdf_infer_table_structure', 'true');
    if (ocr_enabled) {
      form.append('hi_res_model_name', 'detectron2_onnx');
    }
    let response;
    try {
      response = await axios.post(this.apiUrl, form, {
        headers: {
          ...form.getHeaders(),
          'unstructured-api-key': this.apiKey,
          'Accept': 'application/json'
        }
      });
    } catch (error) {
      if (error.response) {
        throw new Error(`API Error: ${this.describeApiError(error.response)}`, { cause: error });
      }
      throw error;
    }
    const elements = response.data;
    if (output_format === 'text') {
      // Elements without text contribute an empty line instead of "undefined".
      return elements.map((el) => el.text ?? '').join('\n');
    }
    if (output_format === 'markdown') {
      return this.convertToMarkdown(elements);
    }
    return elements;
  }

  /**
   * Pick the most useful message out of a failed HTTP response.
   * @param {Object} response the `response` attached to the request error
   * @returns {string} the body's error text, or the HTTP status text
   */
  describeApiError(response) {
    const body = response.data;
    // NOTE(review): `detail` is assumed to be an alternative error field used
    // by the API — confirm against the service's error format.
    const detail = body?.error || body?.detail;
    if (typeof detail === 'string') {
      return detail;
    }
    if (detail) {
      return JSON.stringify(detail);
    }
    return response.statusText;
  }

  /**
   * Extract text from document.
   * Note: `include_metadata` is declared in the tool schema but is not
   * consumed here.
   * @param {Object} args same as processDocument; `output_format` is forced to 'text'
   * @returns {Promise<{text: string, file: string}>}
   */
  async extractText(args) {
    const result = await this.processDocument({
      ...args,
      output_format: 'text'
    });
    return {
      text: result,
      file: args.file_path
    };
  }

  /**
   * Extract tables from document.
   * @param {Object} args processDocument arguments plus optional `format`
   *   ('json' default, 'csv' or 'html')
   * @returns {Promise<Array>} the `Table` elements, converted per `format`
   */
  async extractTables(args) {
    const result = await this.processDocument({
      ...args,
      output_format: 'json'
    });
    const tables = result.filter(el => el.type === 'Table');
    if (args.format === 'csv') {
      return tables.map(t => this.tableToCSV(t));
    } else if (args.format === 'html') {
      return tables.map(t => this.tableToHTML(t));
    }
    return tables;
  }

  /**
   * Build an id that stays unique within this server instance even when
   * several objects are created during the same millisecond.
   * @param {string} prefix kind of object, e.g. 'source' or 'job'
   * @returns {string} `<prefix>_<epoch ms>_<sequence>`
   */
  nextId(prefix) {
    this.idSequence = (this.idSequence || 0) + 1;
    return `${prefix}_${Date.now()}_${this.idSequence}`;
  }

  /**
   * List source connectors.
   * @returns {Promise<Array>} every stored source, in insertion order
   */
  async listSources() {
    return Array.from(this.sources.values());
  }

  /**
   * Create source connector and store it in memory.
   * @param {Object} args `{ name, type, config? }`
   * @returns {Promise<Object>} the stored connector
   */
  async createSourceConnector(args) {
    const id = this.nextId('source');
    const source = {
      id,
      name: args.name,
      type: args.type,
      config: args.config || {},
      created_at: new Date().toISOString()
    };
    this.sources.set(id, source);
    return source;
  }

  /**
   * List destination connectors.
   * @returns {Promise<Array>} every stored destination, in insertion order
   */
  async listDestinations() {
    return Array.from(this.destinations.values());
  }

  /**
   * Create destination connector and store it in memory.
   * @param {Object} args `{ name, type, config? }`
   * @returns {Promise<Object>} the stored connector
   */
  async createDestinationConnector(args) {
    const id = this.nextId('dest');
    const destination = {
      id,
      name: args.name,
      type: args.type,
      config: args.config || {},
      created_at: new Date().toISOString()
    };
    this.destinations.set(id, destination);
    return destination;
  }

  /**
   * Create workflow and store it in memory. The `source` and `destination`
   * values are recorded as given; they are not checked against the stored
   * connectors.
   * @param {Object} args `{ name, source, destination, settings? }`
   * @returns {Promise<Object>} the stored workflow
   */
  async createWorkflow(args) {
    const id = this.nextId('workflow');
    const workflow = {
      id,
      name: args.name,
      source: args.source,
      destination: args.destination,
      settings: args.settings || {},
      created_at: new Date().toISOString()
    };
    this.workflows.set(id, workflow);
    return workflow;
  }

  /**
   * Run workflow. The job is simulated: it is registered as 'running' and a
   * timer flips it to 'completed' two seconds later.
   * @param {Object} args `{ workflow_id }`
   * @returns {Promise<Object>} the newly registered job
   * @throws {Error} when the workflow id is unknown
   */
  async runWorkflow(args) {
    const workflow = this.workflows.get(args.workflow_id);
    if (!workflow) {
      throw new Error(`Workflow not found: ${args.workflow_id}`);
    }
    const id = this.nextId('job');
    const job = {
      id,
      workflow_id: args.workflow_id,
      status: 'running',
      started_at: new Date().toISOString(),
      progress: 0
    };
    this.jobs.set(id, job);
    // Simulate job processing
    setTimeout(() => {
      job.status = 'completed';
      job.completed_at = new Date().toISOString();
      job.progress = 100;
      job.documents_processed = 1;
    }, 2000);
    return job;
  }

  /**
   * List jobs, optionally restricted to one workflow.
   * @param {Object} [args] `{ workflow_id? }`; may be omitted entirely
   * @returns {Promise<Array>}
   */
  async listJobs(args = {}) {
    const jobs = Array.from(this.jobs.values());
    if (args.workflow_id) {
      return jobs.filter(j => j.workflow_id === args.workflow_id);
    }
    return jobs;
  }

  /**
   * Get job info.
   * @param {Object} args `{ job_id }`
   * @returns {Promise<Object>} the stored job
   * @throws {Error} when the job id is unknown
   */
  async getJobInfo(args) {
    const job = this.jobs.get(args.job_id);
    if (!job) {
      throw new Error(`Job not found: ${args.job_id}`);
    }
    return job;
  }

  /**
   * Convert to Markdown.
   * `Title` becomes `#`, `Header` becomes `##`, `ListItem` a bullet, `Table`
   * goes through tableToMarkdown, and every other type is a plain paragraph.
   * @param {Array<Object>} elements objects with `type` and `text`
   * @returns {string}
   */
  convertToMarkdown(elements) {
    let markdown = '';
    let inList = false;
    for (const el of elements) {
      const text = el.text ?? '';
      if (el.type === 'ListItem') {
        markdown += `- ${text}\n`;
        inList = true;
        continue;
      }
      if (inList) {
        // Close the list with a blank line; otherwise the next block would be
        // read as a continuation of the last bullet.
        markdown += '\n';
        inList = false;
      }
      switch (el.type) {
        case 'Title':
          markdown += `# ${text}\n\n`;
          break;
        case 'Header':
          markdown += `## ${text}\n\n`;
          break;
        case 'Table':
          markdown += `${this.tableToMarkdown(el)}\n\n`;
          break;
        default:
          markdown += `${text}\n\n`;
      }
    }
    return markdown;
  }

  /**
   * Convert table to Markdown.
   * Renders `metadata.text_as_html` as a pipe table whose first row is the
   * header. This is a regex-based reading of simple `<tr>`/`<td>`/`<th>`
   * markup, not a full HTML parser; when no rows can be recognised the plain
   * `text` is returned instead.
   * @param {Object} table element with `text` and optional `metadata.text_as_html`
   * @returns {string}
   */
  tableToMarkdown(table) {
    const fallback = table.text ?? '';
    const html = table.metadata?.text_as_html;
    if (typeof html !== 'string' || html === '') {
      return fallback;
    }
    const rows = [...html.matchAll(/<tr\b[^>]*>([\s\S]*?)<\/tr>/gi)]
      .map(([, rowHtml]) =>
        [...rowHtml.matchAll(/<t[hd]\b[^>]*>([\s\S]*?)<\/t[hd]>/gi)].map(
          ([, cellHtml]) => this.htmlCellToMarkdown(cellHtml)
        )
      )
      .filter((cells) => cells.length > 0);
    if (rows.length === 0) {
      return fallback;
    }
    // Pad short rows so every line has the same number of columns.
    const width = Math.max(...rows.map((cells) => cells.length));
    const renderRow = (cells) =>
      `| ${Array.from({ length: width }, (_, i) => cells[i] ?? '').join(' | ')} |`;
    const [header, ...body] = rows;
    return [
      renderRow(header),
      renderRow(Array(width).fill('---')),
      ...body.map(renderRow)
    ].join('\n');
  }

  /**
   * Reduce the inner HTML of one table cell to single-line Markdown text.
   * @param {string} cellHtml
   * @returns {string}
   */
  htmlCellToMarkdown(cellHtml) {
    const text = cellHtml
      .replace(/<br\s*\/?>/gi, ' ')
      .replace(/<[^>]*>/g, '')
      .replace(/&nbsp;/g, ' ')
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&quot;/g, '"')
      .replace(/&#39;/g, "'")
      // Decode &amp; last so "&amp;lt;" ends up as the literal "&lt;".
      .replace(/&amp;/g, '&')
      .replace(/\s+/g, ' ')
      .trim();
    // A bare pipe would be read as a column separator.
    return text.replace(/\|/g, '\\|');
  }

  /**
   * Convert table to CSV.
   * Treats `text` as tab-separated rows; fields containing a comma, quote or
   * line break are quoted with embedded quotes doubled.
   * @param {Object} table element with `text`
   * @returns {string}
   */
  tableToCSV(table) {
    const quote = (field) =>
      /[",\r\n]/.test(field) ? `"${field.replace(/"/g, '""')}"` : field;
    return String(table.text ?? '')
      .split(/\r?\n/)
      .map((row) => row.split('\t').map(quote).join(','))
      .join('\n');
  }

  /**
   * Convert table to HTML.
   * @param {Object} table element with `text` and optional `metadata.text_as_html`
   * @returns {string} `text_as_html` when present, otherwise the text escaped
   *   inside `<pre>`
   */
  tableToHTML(table) {
    return table.metadata?.text_as_html || `<pre>${this.escapeHtml(table.text ?? '')}</pre>`;
  }

  /**
   * Escape the characters that would otherwise be parsed as markup.
   * @param {string} text
   * @returns {string}
   */
  escapeHtml(text) {
    return String(text)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;');
  }

  /**
   * Send message to stdout as a single JSON line.
   * @param {Object} message
   */
  send(message) {
    console.log(JSON.stringify(message));
  }

  /**
   * Send error to stderr, keeping stdout reserved for protocol messages.
   * @param {string} message
   */
  error(message) {
    console.error(message);
  }
}
// Boot the server only when this file is the process entry point; when it is
// loaded through require() the class is merely exported.
if (module === require.main) {
  new UnstructuredMCPServer().start();
}
module.exports = UnstructuredMCPServer;