// @flexabrain/mcp-server
// Version:
// Advanced electrical schematic analysis MCP server with rail engineering expertise
// 454 lines • 19.8 kB
// JavaScript
/**
* FlexaBrain MCP Server - PDF Document Processor
*
* Advanced PDF processing service for multi-page technical schematics.
* Handles PDF parsing, page extraction, image conversion, and metadata extraction
* optimized for traction generator monitoring control systems.
*/
import * as fs from 'fs';
import * as path from 'path';
import { randomUUID } from 'crypto';
import { SchematicPageType } from '../types/pdf-schematic.js';
/**
 * Processes multi-page PDF schematics: metadata extraction, page extraction,
 * per-page analysis (OCR, classification, component detection) and detection
 * of references between pages.
 *
 * Several stages are mock implementations and are marked as such on the
 * individual methods.
 */
export class PDFProcessor {
    /** Directory used to build the output paths of extracted page images. */
    processingDirectory;
    /** Injected dependencies; stored by the constructor but not read by any method of this class. */
    dependencies;

    /**
     * @param {string} [processingDir='./temp/pdf-processing'] Working directory; created if missing.
     * @param {object} [deps={}] Optional dependencies, stored as-is.
     */
    constructor(processingDir = './temp/pdf-processing', deps = {}) {
        this.processingDirectory = processingDir;
        this.dependencies = deps;
        this.ensureProcessingDirectory();
    }

    /**
     * Process a PDF schematic document end to end.
     *
     * This method does not reject on processing failures: any error thrown by
     * a stage is logged, recorded in `errors` with code `PDF_PROCESSING_FAILED`,
     * and an empty result is returned.
     *
     * @param {string} pdfPath Path of the PDF file to process.
     * @param {object} [options={}] Overrides for the default processing options below.
     * @returns {Promise<object>} Result with `document_id`, `metadata`, `pages`,
     *   `cross_references`, `processing_stats`, `errors` and `warnings`.
     */
    async processPDFSchematic(pdfPath, options = {}) {
        const startTime = Date.now();
        const documentId = randomUUID();
        const errors = [];
        const warnings = [];
        try {
            console.error(`FlexaBrain PDF Processor: Starting processing of ${pdfPath}`);
            // Validate input file
            if (!fs.existsSync(pdfPath)) {
                throw new Error(`PDF file not found: ${pdfPath}`);
            }
            // Defaults first so caller-supplied keys win
            const processingOptions = {
                extract_images: true,
                enhance_quality: true,
                detect_rotation: true,
                extract_cross_references: true,
                enable_line_detection: true,
                processing_dpi: 300,
                ocr_language: 'eng',
                component_confidence_threshold: 0.7,
                ...options
            };
            // Extract PDF metadata
            const metadata = await this.extractPDFMetadata(pdfPath);
            // Extract pages as images
            const pages = await this.extractPagesAsImages(pdfPath, documentId, processingOptions);
            // Process each page for components and text
            const processedPages = [];
            let totalComponents = 0;
            for (const page of pages) {
                try {
                    const processedPage = await this.processSchematicPage(page, processingOptions);
                    processedPages.push(processedPage);
                    totalComponents += processedPage.components.length;
                }
                catch (error) {
                    // Safety net only: processSchematicPage catches its own failures
                    // and returns the page with a lowered quality score.
                    errors.push({
                        code: 'PAGE_PROCESSING_ERROR',
                        message: `Failed to process page ${page.page_number}: ${error instanceof Error ? error.message : String(error)}`,
                        page_number: page.page_number,
                        severity: 'medium',
                        recovery_suggestions: ['Try reducing DPI settings', 'Check page image quality']
                    });
                }
            }
            // Extract cross-page references if enabled
            const crossReferences = processingOptions.extract_cross_references
                ? await this.extractCrossPageReferences(processedPages)
                : [];
            // Calculate processing statistics
            const totalProcessingTime = Date.now() - startTime;
            // "Average confidence" is the mean of the per-page quality scores
            const averageConfidence = processedPages.length > 0
                ? processedPages.reduce((sum, page) => sum + page.quality_score, 0) / processedPages.length
                : 0;
            const result = {
                document_id: documentId,
                metadata,
                pages: processedPages,
                cross_references: crossReferences,
                processing_stats: {
                    total_processing_time: totalProcessingTime,
                    pages_processed: processedPages.length,
                    components_found: totalComponents,
                    cross_references_found: crossReferences.length,
                    average_confidence: averageConfidence
                },
                errors,
                warnings
            };
            console.error(`FlexaBrain PDF Processor: Completed processing in ${totalProcessingTime}ms`);
            console.error(`FlexaBrain PDF Processor: Found ${totalComponents} components across ${processedPages.length} pages`);
            return result;
        }
        catch (error) {
            console.error('FlexaBrain PDF Processor: Processing failed:', error);
            errors.push({
                code: 'PDF_PROCESSING_FAILED',
                message: `PDF processing failed: ${error instanceof Error ? error.message : String(error)}`,
                severity: 'critical',
                recovery_suggestions: [
                    'Verify PDF file is not corrupted',
                    'Check file permissions',
                    'Try converting PDF to individual images first'
                ]
            });
            return {
                document_id: documentId,
                metadata: { filename: path.basename(pdfPath), pages_count: 0, document_type: 'general' },
                pages: [],
                cross_references: [],
                processing_stats: {
                    total_processing_time: Date.now() - startTime,
                    pages_processed: 0,
                    components_found: 0,
                    cross_references_found: 0,
                    average_confidence: 0
                },
                errors,
                warnings
            };
        }
    }

    /**
     * Extract PDF metadata and document information.
     *
     * Mock: values come from file-system stats and the filename, not from the
     * PDF content. On failure a minimal fallback object is returned.
     *
     * @param {string} pdfPath Path of the PDF file.
     * @returns {Promise<object>} Metadata with at least `filename`, `pages_count`, `document_type`.
     */
    async extractPDFMetadata(pdfPath) {
        try {
            // In a real implementation, this would use pdf-parse or similar
            const stats = fs.statSync(pdfPath);
            const filename = path.basename(pdfPath);
            // Mock metadata extraction - in real implementation would parse PDF info
            const extractedVersion = this.extractVersionFromFilename(filename);
            const metadata = {
                filename,
                creation_date: stats.birthtime,
                modification_date: stats.mtime,
                pages_count: await this.estimatePageCount(pdfPath),
                document_type: this.inferDocumentType(filename),
                // These would be extracted from actual PDF metadata in real implementation
                title: this.extractTitleFromFilename(filename),
                // Only add the `version` key when a version was found
                ...(extractedVersion && { version: extractedVersion })
            };
            return metadata;
        }
        catch (error) {
            console.error('Failed to extract PDF metadata:', error);
            return {
                filename: path.basename(pdfPath),
                pages_count: 1,
                document_type: 'general'
            };
        }
    }

    /**
     * Extract pages from PDF as high-quality images.
     *
     * Mock: builds one page record per estimated page with placeholder image
     * data. `image_path` is computed but no file is written here.
     *
     * @param {string} pdfPath Path of the PDF file.
     * @param {string} documentId Identifier used as prefix of the page ids.
     * @param {object} options Processing options; reads `processing_dpi` and `ocr_language`.
     * @returns {Promise<object[]>} Page records, numbered from 1.
     * @throws {Error} Wrapping any failure as `Page extraction failed: ...`.
     */
    async extractPagesAsImages(pdfPath, documentId, options) {
        const pages = [];
        try {
            // Mock implementation - in real version would use pdf2pic or similar
            console.error(`FlexaBrain PDF Processor: Extracting pages at ${options.processing_dpi} DPI`);
            // For demo purposes, simulate extracting pages
            const pageCount = await this.estimatePageCount(pdfPath);
            for (let pageNum = 1; pageNum <= pageCount; pageNum++) {
                const pageId = `${documentId}_page_${pageNum}`;
                const imagePath = path.join(this.processingDirectory, `${pageId}.png`);
                // Mock page extraction - in real implementation would convert PDF page to image
                const mockImageBuffer = Buffer.from('mock-image-data');
                const page = {
                    page_number: pageNum,
                    title: `Page ${pageNum}`,
                    schematic_type: this.inferPageType(pageNum, pageCount),
                    image_data: mockImageBuffer,
                    image_path: imagePath,
                    dimensions: {
                        width: Math.floor(8.5 * options.processing_dpi), // Assume Letter size
                        height: Math.floor(11 * options.processing_dpi),
                        dpi: options.processing_dpi
                    },
                    components: [], // Will be populated during processing
                    connections: [],
                    ocr_results: {
                        text: '',
                        confidence: 0,
                        words: [],
                        processing_time: 0,
                        language: options.ocr_language
                    },
                    processing_time: 0,
                    quality_score: 0.8 // Will be calculated during processing
                };
                pages.push(page);
            }
            return pages;
        }
        catch (error) {
            console.error('Failed to extract PDF pages:', error);
            throw new Error(`Page extraction failed: ${error instanceof Error ? error.message : String(error)}`);
        }
    }

    /**
     * Process individual schematic page for components and connections.
     *
     * Mutates `page` in place and returns the same object. Failures are logged
     * and swallowed: the page is returned with `quality_score` set to 0.3
     * instead of the error being propagated.
     *
     * @param {object} page Page record as produced by `extractPagesAsImages`.
     * @param {object} options Processing options; reads `enhance_quality`,
     *   `detect_rotation` and `enable_line_detection`.
     * @returns {Promise<object>} The same `page` object, updated.
     */
    async processSchematicPage(page, options) {
        const startTime = Date.now();
        try {
            // Apply image enhancements if enabled
            if (options.enhance_quality) {
                await this.enhanceImageQuality(page);
            }
            // Detect and correct rotation if needed
            if (options.detect_rotation) {
                await this.detectAndCorrectRotation(page);
            }
            // Perform OCR text extraction
            page.ocr_results = await this.performEnhancedOCR(page, options);
            // Extract technical drawing features
            const drawingFeatures = await this.extractDrawingFeatures(page);
            // Classify page type based on content
            page.schematic_type = await this.classifyPageType(page, drawingFeatures);
            // Extract electrical components (placeholder - would integrate with existing classifier)
            page.components = await this.extractComponents(page, options);
            // Detect connections and wiring if enabled
            if (options.enable_line_detection) {
                page.connections = await this.detectConnections(page);
            }
            page.processing_time = Date.now() - startTime;
            page.quality_score = this.calculatePageQuality(page);
            return page;
        }
        catch (error) {
            console.error(`Failed to process page ${page.page_number}:`, error);
            page.processing_time = Date.now() - startTime;
            page.quality_score = 0.3; // Low quality due to processing failure
            return page;
        }
    }

    /**
     * Extract cross-page references and component relationships.
     *
     * Returns textual page references found in each page's OCR text, followed
     * by one 'continuation' reference per additional page on which a component
     * id reappears (always pointing from its first occurrence). Returns an
     * empty array if anything throws.
     *
     * @param {object[]} pages Processed pages; reads `page_number`,
     *   `ocr_results.text` and `components`.
     * @returns {Promise<object[]>} Cross-reference records.
     */
    async extractCrossPageReferences(pages) {
        // Mock implementation - would analyze component references across pages
        const crossReferences = [];
        try {
            // Look for cross-references in OCR text
            for (const page of pages) {
                const pageRefs = this.findPageReferences(page.ocr_results.text, page.page_number);
                crossReferences.push(...pageRefs);
            }
            // Group component occurrences by id, keeping page order
            const componentMap = new Map();
            for (const page of pages) {
                for (const component of page.components) {
                    if (!componentMap.has(component.id)) {
                        componentMap.set(component.id, []);
                    }
                    componentMap.get(component.id).push({
                        component,
                        page_number: page.page_number
                    });
                }
            }
            // Create cross-references for components appearing on multiple pages
            for (const [componentId, instances] of componentMap) {
                if (instances.length > 1) {
                    // First occurrence is the source of every continuation
                    for (let i = 1; i < instances.length; i++) {
                        crossReferences.push({
                            id: randomUUID(),
                            source: {
                                component_id: componentId,
                                page_number: instances[0].page_number,
                                component_type: instances[0].component.type,
                                location: instances[0].component.location
                            },
                            target: {
                                component_id: componentId,
                                page_number: instances[i].page_number,
                                component_type: instances[i].component.type,
                                location: instances[i].component.location
                            },
                            reference_type: 'continuation',
                            confidence: 0.85,
                            verified: false
                        });
                    }
                }
            }
            return crossReferences;
        }
        catch (error) {
            console.error('Failed to extract cross-page references:', error);
            return [];
        }
    }

    // Helper methods (mock implementations - would be fully implemented in production)

    /** Create `processingDirectory` (including parents) when it does not exist. */
    ensureProcessingDirectory() {
        if (!fs.existsSync(this.processingDirectory)) {
            fs.mkdirSync(this.processingDirectory, { recursive: true });
        }
    }

    /**
     * Mock page count: always resolves to 3; `pdfPath` is not read.
     * @param {string} pdfPath Path of the PDF file (unused).
     * @returns {Promise<number>}
     */
    async estimatePageCount(pdfPath) {
        // Mock implementation - would actually count PDF pages
        return 3; // Assume 3 pages for demo
    }

    /**
     * Guess the document type from case-insensitive keywords in the filename.
     * Checks run in order, so 'traction' wins over 'generator'/'monitor',
     * which win over 'power'/'distribution'.
     * @param {string} filename
     * @returns {string} 'traction_control', 'generator_monitoring', 'power_distribution' or 'general'.
     */
    inferDocumentType(filename) {
        const lower = filename.toLowerCase();
        if (lower.includes('traction'))
            return 'traction_control';
        if (lower.includes('generator') || lower.includes('monitor'))
            return 'generator_monitoring';
        if (lower.includes('power') || lower.includes('distribution'))
            return 'power_distribution';
        return 'general';
    }

    /**
     * Derive a title by dropping the last extension and turning underscores into spaces.
     * @param {string} filename
     * @returns {string}
     */
    extractTitleFromFilename(filename) {
        return filename.replace(/\.[^/.]+$/, "").replace(/_/g, ' ');
    }

    /**
     * Find the first `major.minor` or `major.minor.patch` number in a filename,
     * with or without a leading `v`/`V`.
     *
     * The patch component is matched as an optional `.digits` group so that the
     * dot preceding the file extension is never captured
     * ('schematic_v1.2.pdf' gives '1.2', not '1.2.').
     *
     * @param {string} filename
     * @returns {string|undefined} The version string, or undefined when none is present.
     */
    extractVersionFromFilename(filename) {
        const versionMatch = filename.match(/[vV]?(\d+\.\d+(?:\.\d+)?)/);
        return versionMatch ? versionMatch[1] : undefined;
    }

    /**
     * Positional default for a page's type: first page is the overview, last
     * page the component list, everything else a detail page. The first-page
     * check runs first, so a single-page document is an overview.
     * @param {number} pageNum 1-based page number.
     * @param {number} totalPages
     * @returns {*} A `SchematicPageType` member.
     */
    inferPageType(pageNum, totalPages) {
        if (pageNum === 1)
            return SchematicPageType.OVERVIEW;
        if (pageNum === totalPages)
            return SchematicPageType.COMPONENT_LIST;
        return SchematicPageType.DETAIL;
    }

    /**
     * Mock: only logs; the page is not modified.
     * @param {object} page
     */
    async enhanceImageQuality(page) {
        // Mock implementation - would use Sharp or similar for image enhancement
        console.error(`Enhancing image quality for page ${page.page_number}`);
    }

    /**
     * Mock: only logs; the page is not modified.
     * @param {object} page
     */
    async detectAndCorrectRotation(page) {
        // Mock implementation - would detect and correct page rotation
        console.error(`Checking rotation for page ${page.page_number}`);
    }

    /**
     * Mock OCR: returns fixed text and two fixed words ('A601', 'CB101'),
     * varying only by page number and `options.ocr_language`.
     * @param {object} page
     * @param {object} options
     * @returns {Promise<object>} OCR result with `text`, `confidence`, `words`, `processing_time`, `language`.
     */
    async performEnhancedOCR(page, options) {
        // Mock implementation - would integrate with enhanced OCR service
        return {
            text: `Mock OCR text for page ${page.page_number} with generator monitoring components A601 CB101`,
            confidence: 0.85,
            words: [
                { text: 'A601', confidence: 0.92, bbox: { x: 100, y: 50, width: 40, height: 20 } },
                { text: 'CB101', confidence: 0.88, bbox: { x: 200, y: 100, width: 50, height: 25 } }
            ],
            processing_time: 1500,
            language: options.ocr_language
        };
    }

    /**
     * Mock drawing-feature extraction: fixed flags, except that
     * `has_component_list` is true only for page 1.
     * @param {object} page
     * @returns {Promise<object>}
     */
    async extractDrawingFeatures(page) {
        // Mock implementation - would analyze drawing features
        return {
            has_title_block: true,
            has_component_list: page.page_number === 1,
            has_wire_numbers: true,
            has_terminal_references: true,
            has_cross_references: true,
            sheet_size: 'A1',
            revision: 'P01'
        };
    }

    /**
     * Classify a page from keywords in its lower-cased OCR text, then from the
     * drawing features; falls back to the page's current `schematic_type`.
     * @param {object} page Reads `ocr_results.text` and `schematic_type`.
     * @param {object} features Reads `has_component_list`.
     * @returns {Promise<*>} A `SchematicPageType` member (or the existing value).
     */
    async classifyPageType(page, features) {
        // Enhanced page classification based on content
        const text = page.ocr_results.text.toLowerCase();
        if (text.includes('generator') && text.includes('monitor')) {
            return SchematicPageType.GENERATOR_MONITORING;
        }
        if (text.includes('control') && text.includes('logic')) {
            return SchematicPageType.CONTROL_LOGIC;
        }
        if (text.includes('wiring') || text.includes('cable')) {
            return SchematicPageType.WIRING_DIAGRAM;
        }
        if (features.has_component_list) {
            return SchematicPageType.COMPONENT_LIST;
        }
        return page.schematic_type; // Keep existing classification
    }

    /**
     * Build component records from OCR words whose text looks like a component id.
     * `options` is accepted but not read by this mock.
     * @param {object} page Reads `ocr_results.words` (each with `text`, `bbox`, `confidence`).
     * @param {object} options
     * @returns {Promise<object[]>}
     */
    async extractComponents(page, options) {
        // Mock implementation - would integrate with existing component classifier
        const mockComponents = [];
        // Extract components from OCR words that match patterns
        for (const word of page.ocr_results.words) {
            if (this.isLikelyComponent(word.text)) {
                mockComponents.push({
                    id: word.text,
                    type: this.inferComponentType(word.text),
                    category: 'traction_power',
                    location: word.bbox,
                    confidence: word.confidence,
                    specifications: {},
                    safety_level: 'medium'
                });
            }
        }
        return mockComponents;
    }

    /**
     * Mock connection detection: always resolves to an empty array.
     * @param {object} page
     * @returns {Promise<object[]>}
     */
    async detectConnections(page) {
        // Mock implementation - would use computer vision for line detection
        return [];
    }

    /**
     * Quality score: 60% OCR confidence plus 40% mean component confidence
     * (the component part is 0 when the page has no components).
     * @param {object} page Reads `ocr_results.confidence` and `components`.
     * @returns {number}
     */
    calculatePageQuality(page) {
        // Calculate quality score based on OCR confidence and component detection
        const ocrQuality = page.ocr_results.confidence;
        const componentQuality = page.components.length > 0
            ? page.components.reduce((sum, comp) => sum + comp.confidence, 0) / page.components.length
            : 0;
        return (ocrQuality * 0.6 + componentQuality * 0.4);
    }

    /**
     * Find textual references to other pages in OCR text.
     * References to `currentPage` itself are skipped.
     * @param {string} text OCR text of the page.
     * @param {number} currentPage Page number the text belongs to.
     * @returns {object[]} Records with `source_page`, `target_page`, `reference_text`, ...
     */
    findPageReferences(text, currentPage) {
        const references = [];
        // Look for patterns like "See Page 2", "Ref: Sheet 3", etc.
        // The regex is created per call, so the stateful /g lastIndex never leaks between calls.
        const pageRefPattern = /(?:see|ref|sheet|page)\s*:?\s*(\d+)/gi;
        let match;
        while ((match = pageRefPattern.exec(text)) !== null) {
            // Group 1 is mandatory in the pattern, so it is always defined here
            const referencedPage = Number.parseInt(match[1], 10);
            if (referencedPage !== currentPage) {
                references.push({
                    id: randomUUID(),
                    source_page: currentPage,
                    target_page: referencedPage,
                    reference_text: match[0],
                    reference_type: 'page_reference',
                    confidence: 0.8
                });
            }
        }
        return references;
    }

    /**
     * Test whether text (trimmed, upper-cased) looks like a component identifier.
     * @param {string} text
     * @returns {boolean}
     */
    isLikelyComponent(text) {
        // Simple pattern matching for component identification
        const patterns = [
            /^[A-Z]\d+[A-Z]?$/, // A601, CB101A
            /^[A-Z]{2,4}\d+$/, // CONV01, TRANS205
            /^\d{6,8}$/ // Signal references
        ];
        return patterns.some(pattern => pattern.test(text.trim().toUpperCase()));
    }

    /**
     * Map a component id to a type by its prefix. The id is trimmed and
     * upper-cased first, matching the normalisation in `isLikelyComponent`,
     * so an id accepted there is typed consistently here.
     * @param {string} componentId
     * @returns {string} 'converter', 'circuit_breaker', 'transformer', 'signal_reference' or 'unknown'.
     */
    inferComponentType(componentId) {
        const id = componentId.trim().toUpperCase();
        if (id.startsWith('A') && /^\d+/.test(id.slice(1)))
            return 'converter';
        if (id.startsWith('CB'))
            return 'circuit_breaker';
        if (id.startsWith('T'))
            return 'transformer';
        if (/^\d{6,8}$/.test(id))
            return 'signal_reference';
        return 'unknown';
    }
}
// Shared module-level instance built with the constructor's default arguments.
// Constructing it runs at import time.
const pdfProcessor = new PDFProcessor();
export { pdfProcessor };
//# sourceMappingURL=pdf-processor.js.map