UNPKG

@flexabrain/mcp-server

Version:

Advanced electrical schematic analysis MCP server with rail engineering expertise

454 lines 19.8 kB
/**
 * FlexaBrain MCP Server - PDF Document Processor
 *
 * Advanced PDF processing service for multi-page technical schematics.
 * Handles PDF parsing, page extraction, image conversion, and metadata extraction
 * optimized for traction generator monitoring control systems.
 *
 * NOTE: most of the heavy lifting (page counting, rasterisation, OCR, line
 * detection) is still a mock in this file; each mock is marked as such below.
 */
import * as fs from 'fs';
import * as path from 'path';
import { randomUUID } from 'crypto';
import { SchematicPageType } from '../types/pdf-schematic.js';

/**
 * Turn anything that was thrown into a printable message.
 * @param {unknown} error - The caught value.
 * @returns {string} `error.message` for Error instances, `String(error)` otherwise.
 */
const toErrorMessage = (error) => (error instanceof Error ? error.message : String(error));

export class PDFProcessor {
  processingDirectory;
  dependencies;

  /**
   * @param {string} [processingDir='./temp/pdf-processing'] - Directory used for
   *   page image paths; created on construction if it does not exist.
   * @param {object} [deps={}] - Stored on the instance; not read anywhere in this file.
   */
  constructor(processingDir = './temp/pdf-processing', deps = {}) {
    this.processingDirectory = processingDir;
    this.dependencies = deps;
    this.ensureProcessingDirectory();
  }

  /**
   * Process a PDF schematic document.
   *
   * Never rejects: any failure is reported through the `errors` array of the
   * returned result, with empty pages and zeroed statistics.
   *
   * @param {string} pdfPath - Path to the PDF file on disk.
   * @param {object} [options={}] - Overrides merged over the defaults declared below.
   * @returns {Promise<object>} Result with `document_id`, `metadata`, `pages`,
   *   `cross_references`, `processing_stats`, `errors` and `warnings`.
   */
  async processPDFSchematic(pdfPath, options = {}) {
    const startTime = Date.now();
    const documentId = randomUUID();
    const errors = [];
    const warnings = [];
    try {
      console.error(`FlexaBrain PDF Processor: Starting processing of ${pdfPath}`);

      // Validate input file
      if (!fs.existsSync(pdfPath)) {
        throw new Error(`PDF file not found: ${pdfPath}`);
      }

      // Set default options; caller-supplied keys win.
      const processingOptions = {
        extract_images: true,
        enhance_quality: true,
        detect_rotation: true,
        extract_cross_references: true,
        enable_line_detection: true,
        processing_dpi: 300,
        ocr_language: 'eng',
        component_confidence_threshold: 0.7,
        ...options
      };

      // Extract PDF metadata
      const metadata = await this.extractPDFMetadata(pdfPath);

      // Extract pages as images
      const pages = await this.extractPagesAsImages(pdfPath, documentId, processingOptions);

      // Process each page for components and text
      const processedPages = [];
      let totalComponents = 0;
      for (const page of pages) {
        try {
          const processedPage = await this.processSchematicPage(page, processingOptions);
          processedPages.push(processedPage);
          totalComponents += processedPage.components.length;
        } catch (error) {
          // A failing page is recorded and skipped; the remaining pages still run.
          errors.push({
            code: 'PAGE_PROCESSING_ERROR',
            message: `Failed to process page ${page.page_number}: ${toErrorMessage(error)}`,
            page_number: page.page_number,
            severity: 'medium',
            recovery_suggestions: ['Try reducing DPI settings', 'Check page image quality']
          });
        }
      }

      // Extract cross-page references if enabled
      const crossReferences = processingOptions.extract_cross_references
        ? await this.extractCrossPageReferences(processedPages)
        : [];

      // Calculate processing statistics
      const totalProcessingTime = Date.now() - startTime;
      // "Average confidence" is the mean of the per-page quality scores.
      const averageConfidence = processedPages.length > 0
        ? processedPages.reduce((sum, page) => sum + page.quality_score, 0) / processedPages.length
        : 0;

      const result = {
        document_id: documentId,
        metadata,
        pages: processedPages,
        cross_references: crossReferences,
        processing_stats: {
          total_processing_time: totalProcessingTime,
          pages_processed: processedPages.length,
          components_found: totalComponents,
          cross_references_found: crossReferences.length,
          average_confidence: averageConfidence
        },
        errors,
        warnings
      };
      console.error(`FlexaBrain PDF Processor: Completed processing in ${totalProcessingTime}ms`);
      console.error(`FlexaBrain PDF Processor: Found ${totalComponents} components across ${processedPages.length} pages`);
      return result;
    } catch (error) {
      console.error('FlexaBrain PDF Processor: Processing failed:', error);
      errors.push({
        code: 'PDF_PROCESSING_FAILED',
        message: `PDF processing failed: ${toErrorMessage(error)}`,
        severity: 'critical',
        recovery_suggestions: [
          'Verify PDF file is not corrupted',
          'Check file permissions',
          'Try converting PDF to individual images first'
        ]
      });
      // path.basename throws a TypeError on non-string input, which would escape
      // this handler and reject the promise; guard so the error result is
      // always returned.
      const fallbackFilename = typeof pdfPath === 'string' ? path.basename(pdfPath) : String(pdfPath);
      return {
        document_id: documentId,
        metadata: {
          filename: fallbackFilename,
          pages_count: 0,
          document_type: 'general'
        },
        pages: [],
        cross_references: [],
        processing_stats: {
          total_processing_time: Date.now() - startTime,
          pages_processed: 0,
          components_found: 0,
          cross_references_found: 0,
          average_confidence: 0
        },
        errors,
        warnings
      };
    }
  }

  /**
   * Extract PDF metadata and document information.
   *
   * Currently derived from file-system stats and the filename only; the PDF
   * itself is not parsed.
   *
   * @param {string} pdfPath - Path to the PDF file.
   * @returns {Promise<object>} Metadata object; on any failure a minimal
   *   `{ filename, pages_count: 1, document_type: 'general' }` fallback.
   */
  async extractPDFMetadata(pdfPath) {
    try {
      // In a real implementation, this would use pdf-parse or similar
      const stats = fs.statSync(pdfPath);
      const filename = path.basename(pdfPath);
      // Mock metadata extraction - in real implementation would parse PDF info
      const extractedVersion = this.extractVersionFromFilename(filename);
      const metadata = {
        filename,
        creation_date: stats.birthtime,
        modification_date: stats.mtime,
        pages_count: await this.estimatePageCount(pdfPath),
        document_type: this.inferDocumentType(filename),
        // These would be extracted from actual PDF metadata in real implementation
        title: this.extractTitleFromFilename(filename),
        // Only add `version` when one was actually found in the filename.
        ...(extractedVersion && { version: extractedVersion })
      };
      return metadata;
    } catch (error) {
      console.error('Failed to extract PDF metadata:', error);
      return {
        filename: path.basename(pdfPath),
        pages_count: 1,
        document_type: 'general'
      };
    }
  }

  /**
   * Extract pages from PDF as high-quality images.
   *
   * Mock: no rasterisation happens. Each page gets a placeholder buffer and
   * Letter-size pixel dimensions computed from `options.processing_dpi`.
   *
   * @param {string} pdfPath - Path to the PDF file.
   * @param {string} documentId - Identifier used to build per-page ids and image paths.
   * @param {object} options - Reads `processing_dpi` and `ocr_language`.
   * @returns {Promise<object[]>} One page record per page.
   * @throws {Error} `Page extraction failed: …` wrapping any underlying failure.
   */
  async extractPagesAsImages(pdfPath, documentId, options) {
    const pages = [];
    try {
      // Mock implementation - in real version would use pdf2pic or similar
      console.error(`FlexaBrain PDF Processor: Extracting pages at ${options.processing_dpi} DPI`);
      // For demo purposes, simulate extracting pages
      const pageCount = await this.estimatePageCount(pdfPath);
      for (let pageNum = 1; pageNum <= pageCount; pageNum++) {
        const pageId = `${documentId}_page_${pageNum}`;
        const imagePath = path.join(this.processingDirectory, `${pageId}.png`);
        // Mock page extraction - in real implementation would convert PDF page to image
        const mockImageBuffer = Buffer.from('mock-image-data');
        const page = {
          page_number: pageNum,
          title: `Page ${pageNum}`,
          schematic_type: this.inferPageType(pageNum, pageCount),
          image_data: mockImageBuffer,
          image_path: imagePath,
          dimensions: {
            width: Math.floor(8.5 * options.processing_dpi), // Assume Letter size
            height: Math.floor(11 * options.processing_dpi),
            dpi: options.processing_dpi
          },
          components: [], // Will be populated during processing
          connections: [],
          ocr_results: {
            text: '',
            confidence: 0,
            words: [],
            processing_time: 0,
            language: options.ocr_language
          },
          processing_time: 0,
          quality_score: 0.8 // Will be calculated during processing
        };
        pages.push(page);
      }
      return pages;
    } catch (error) {
      console.error('Failed to extract PDF pages:', error);
      throw new Error(`Page extraction failed: ${toErrorMessage(error)}`);
    }
  }

  /**
   * Process individual schematic page for components and connections.
   *
   * Mutates and returns the given `page`.
   *
   * NOTE(review): failures are absorbed here (logged, page returned with
   * `quality_score` 0.3), so this method does not reject on a processing
   * failure and the caller's PAGE_PROCESSING_ERROR branch will not normally
   * fire. Confirm whether that best-effort behaviour is intended.
   *
   * @param {object} page - Page record produced by `extractPagesAsImages`.
   * @param {object} options - Reads `enhance_quality`, `detect_rotation`,
   *   `enable_line_detection`; also forwarded to OCR and component extraction.
   * @returns {Promise<object>} The same `page` object, updated in place.
   */
  async processSchematicPage(page, options) {
    const startTime = Date.now();
    try {
      // Apply image enhancements if enabled
      if (options.enhance_quality) {
        await this.enhanceImageQuality(page);
      }
      // Detect and correct rotation if needed
      if (options.detect_rotation) {
        await this.detectAndCorrectRotation(page);
      }
      // Perform OCR text extraction
      page.ocr_results = await this.performEnhancedOCR(page, options);
      // Extract technical drawing features
      const drawingFeatures = await this.extractDrawingFeatures(page);
      // Classify page type based on content
      page.schematic_type = await this.classifyPageType(page, drawingFeatures);
      // Extract electrical components (placeholder - would integrate with existing classifier)
      page.components = await this.extractComponents(page, options);
      // Detect connections and wiring if enabled
      if (options.enable_line_detection) {
        page.connections = await this.detectConnections(page);
      }
      page.processing_time = Date.now() - startTime;
      page.quality_score = this.calculatePageQuality(page);
      return page;
    } catch (error) {
      console.error(`Failed to process page ${page.page_number}:`, error);
      page.processing_time = Date.now() - startTime;
      page.quality_score = 0.3; // Low quality due to processing failure
      return page;
    }
  }

  /**
   * Extract cross-page references and component relationships.
   *
   * Two sources are combined: textual page references found in each page's
   * OCR text, and "continuation" links for any component id that appears on
   * more than one page (first occurrence -> each later occurrence).
   *
   * @param {object[]} pages - Processed pages.
   * @returns {Promise<object[]>} Cross-reference records; `[]` on failure.
   */
  async extractCrossPageReferences(pages) {
    // Mock implementation - would analyze component references across pages
    const crossReferences = [];
    try {
      // Look for cross-references in OCR text
      for (const page of pages) {
        const pageRefs = this.findPageReferences(page.ocr_results.text, page.page_number);
        crossReferences.push(...pageRefs);
      }

      // Look for continued components (same ID on multiple pages)
      const componentMap = new Map();
      for (const page of pages) {
        for (const component of page.components) {
          if (!componentMap.has(component.id)) {
            componentMap.set(component.id, []);
          }
          componentMap.get(component.id).push({ component, page_number: page.page_number });
        }
      }

      // Create cross-references for components appearing on multiple pages
      for (const [componentId, instances] of componentMap) {
        if (instances.length > 1) {
          // The first sighting is the source of every continuation link.
          for (let i = 1; i < instances.length; i++) {
            crossReferences.push({
              id: randomUUID(),
              source: {
                component_id: componentId,
                page_number: instances[0].page_number,
                component_type: instances[0].component.type,
                location: instances[0].component.location
              },
              target: {
                component_id: componentId,
                page_number: instances[i].page_number,
                component_type: instances[i].component.type,
                location: instances[i].component.location
              },
              reference_type: 'continuation',
              confidence: 0.85,
              verified: false
            });
          }
        }
      }
      return crossReferences;
    } catch (error) {
      console.error('Failed to extract cross-page references:', error);
      return [];
    }
  }

  // Helper methods (mock implementations - would be fully implemented in production)

  /** Create the processing directory (and any missing parents) if absent. */
  ensureProcessingDirectory() {
    if (!fs.existsSync(this.processingDirectory)) {
      fs.mkdirSync(this.processingDirectory, { recursive: true });
    }
  }

  /**
   * Mock page counter.
   * @param {string} pdfPath - Unused by the mock.
   * @returns {Promise<number>} Always 3.
   */
  async estimatePageCount(pdfPath) {
    // Mock implementation - would actually count PDF pages
    return 3; // Assume 3 pages for demo
  }

  /**
   * Guess the document type from keywords in the filename (case-insensitive).
   * Checks run in order, so "traction" wins over "generator"/"monitor", which
   * win over "power"/"distribution".
   * @param {string} filename
   * @returns {string} One of 'traction_control', 'generator_monitoring',
   *   'power_distribution', 'general'.
   */
  inferDocumentType(filename) {
    const lower = filename.toLowerCase();
    if (lower.includes('traction')) return 'traction_control';
    if (lower.includes('generator') || lower.includes('monitor')) return 'generator_monitoring';
    if (lower.includes('power') || lower.includes('distribution')) return 'power_distribution';
    return 'general';
  }

  /**
   * Derive a title: strip the last extension and turn underscores into spaces.
   * @param {string} filename
   * @returns {string}
   */
  extractTitleFromFilename(filename) {
    return filename.replace(/\.[^/.]+$/, "").replace(/_/g, ' ');
  }

  /**
   * Find the first `major.minor` or `major.minor.patch` number in a filename.
   *
   * The optional third segment must include its digits, so the dot that
   * introduces the file extension (`v1.2.pdf`) is not captured as part of the
   * version.
   *
   * @param {string} filename
   * @returns {string|undefined} e.g. '1.2' or '1.2.3'; undefined when absent.
   */
  extractVersionFromFilename(filename) {
    const versionMatch = filename.match(/[vV]?(\d+\.\d+(?:\.\d+)?)/);
    return versionMatch ? versionMatch[1] : undefined;
  }

  /**
   * Position-based initial page classification: first page is the overview,
   * last page the component list, everything else a detail sheet.
   * @param {number} pageNum - 1-based page number.
   * @param {number} totalPages
   * @returns {*} A `SchematicPageType` member.
   */
  inferPageType(pageNum, totalPages) {
    if (pageNum === 1) return SchematicPageType.OVERVIEW;
    if (pageNum === totalPages) return SchematicPageType.COMPONENT_LIST;
    return SchematicPageType.DETAIL;
  }

  /** Mock: only logs. @param {object} page */
  async enhanceImageQuality(page) {
    // Mock implementation - would use Sharp or similar for image enhancement
    console.error(`Enhancing image quality for page ${page.page_number}`);
  }

  /** Mock: only logs. @param {object} page */
  async detectAndCorrectRotation(page) {
    // Mock implementation - would detect and correct page rotation
    console.error(`Checking rotation for page ${page.page_number}`);
  }

  /**
   * Mock OCR: returns fixed text and two fixed words (A601, CB101).
   * @param {object} page - Reads `page_number`.
   * @param {object} options - Reads `ocr_language`.
   * @returns {Promise<object>} OCR result record.
   */
  async performEnhancedOCR(page, options) {
    // Mock implementation - would integrate with enhanced OCR service
    return {
      text: `Mock OCR text for page ${page.page_number} with generator monitoring components A601 CB101`,
      confidence: 0.85,
      words: [
        { text: 'A601', confidence: 0.92, bbox: { x: 100, y: 50, width: 40, height: 20 } },
        { text: 'CB101', confidence: 0.88, bbox: { x: 200, y: 100, width: 50, height: 25 } }
      ],
      processing_time: 1500,
      language: options.ocr_language
    };
  }

  /**
   * Mock drawing-feature analysis: fixed flags, except `has_component_list`
   * which is true only for page 1.
   * @param {object} page - Reads `page_number`.
   * @returns {Promise<object>}
   */
  async extractDrawingFeatures(page) {
    // Mock implementation - would analyze drawing features
    return {
      has_title_block: true,
      has_component_list: page.page_number === 1,
      has_wire_numbers: true,
      has_terminal_references: true,
      has_cross_references: true,
      sheet_size: 'A1',
      revision: 'P01'
    };
  }

  /**
   * Refine the page type from OCR text keywords, then drawing features.
   * Falls back to the page's existing `schematic_type` when nothing matches.
   * @param {object} page - Reads `ocr_results.text` and `schematic_type`.
   * @param {object} features - Reads `has_component_list`.
   * @returns {Promise<*>} A `SchematicPageType` member.
   */
  async classifyPageType(page, features) {
    // Enhanced page classification based on content
    const text = page.ocr_results.text.toLowerCase();
    if (text.includes('generator') && text.includes('monitor')) {
      return SchematicPageType.GENERATOR_MONITORING;
    }
    if (text.includes('control') && text.includes('logic')) {
      return SchematicPageType.CONTROL_LOGIC;
    }
    if (text.includes('wiring') || text.includes('cable')) {
      return SchematicPageType.WIRING_DIAGRAM;
    }
    if (features.has_component_list) {
      return SchematicPageType.COMPONENT_LIST;
    }
    return page.schematic_type; // Keep existing classification
  }

  /**
   * Build component records from OCR words that look like component ids.
   *
   * The id is the trimmed word text, so it agrees with what
   * `isLikelyComponent` actually tested and the same component matches
   * across pages regardless of stray OCR whitespace.
   *
   * @param {object} page - Reads `ocr_results.words`.
   * @param {object} options - Currently unused.
   * @returns {Promise<object[]>}
   */
  async extractComponents(page, options) {
    // Mock implementation - would integrate with existing component classifier
    const mockComponents = [];
    // Extract components from OCR words that match patterns
    for (const word of page.ocr_results.words) {
      if (this.isLikelyComponent(word.text)) {
        const componentId = word.text.trim();
        mockComponents.push({
          id: componentId,
          type: this.inferComponentType(componentId),
          category: 'traction_power',
          location: word.bbox,
          confidence: word.confidence,
          specifications: {},
          safety_level: 'medium'
        });
      }
    }
    return mockComponents;
  }

  /**
   * Mock connection detection.
   * @param {object} page - Unused by the mock.
   * @returns {Promise<object[]>} Always `[]`.
   */
  async detectConnections(page) {
    // Mock implementation - would use computer vision for line detection
    return [];
  }

  /**
   * Quality score = 60% OCR confidence + 40% mean component confidence
   * (the component part is 0 when no components were found).
   * @param {object} page - Reads `ocr_results.confidence` and `components`.
   * @returns {number}
   */
  calculatePageQuality(page) {
    // Calculate quality score based on OCR confidence and component detection
    const ocrQuality = page.ocr_results.confidence;
    const componentQuality = page.components.length > 0
      ? page.components.reduce((sum, comp) => sum + comp.confidence, 0) / page.components.length
      : 0;
    return (ocrQuality * 0.6 + componentQuality * 0.4);
  }

  /**
   * Find textual references to other pages in OCR text.
   *
   * References that point at `currentPage` itself are ignored.
   *
   * @param {string} text - OCR text of the page.
   * @param {number} currentPage - Page the text came from.
   * @returns {object[]} Records with `source_page`, `target_page`,
   *   `reference_text`, `reference_type: 'page_reference'`, `confidence`.
   */
  findPageReferences(text, currentPage) {
    const references = [];
    // Look for patterns like "See Page 2", "Ref: Sheet 3", etc.
    // A fresh /g regex per call keeps `lastIndex` state local to this scan.
    const pageRefPattern = /(?:see|ref|sheet|page)\s*:?\s*(\d+)/gi;
    let match;
    while ((match = pageRefPattern.exec(text)) !== null) {
      // Explicit radix: the capture is always a decimal digit run.
      const referencedPage = Number.parseInt(match[1] || '0', 10);
      if (referencedPage !== currentPage) {
        references.push({
          id: randomUUID(),
          source_page: currentPage,
          target_page: referencedPage,
          reference_text: match[0],
          reference_type: 'page_reference',
          confidence: 0.8
        });
      }
    }
    return references;
  }

  /**
   * Decide whether an OCR token looks like a component identifier.
   * Matching is done on the trimmed, upper-cased text.
   * @param {string} text
   * @returns {boolean}
   */
  isLikelyComponent(text) {
    // Simple pattern matching for component identification
    const patterns = [
      /^[A-Z]\d+[A-Z]?$/, // one letter, digits, optional suffix letter: A601, K12B
      /^[A-Z]{2,4}\d+$/, // 2-4 letter prefix then digits: CB101, CONV01
      /^\d{6,8}$/ // Signal references: 6-8 digits
    ];
    const candidate = text.trim().toUpperCase();
    return patterns.some(pattern => pattern.test(candidate));
  }

  /**
   * Map a component id to a coarse type by its prefix.
   *
   * Input is trimmed and upper-cased first, matching how `isLikelyComponent`
   * normalises tokens, so a padded id is not misclassified as 'unknown'.
   *
   * @param {string} componentId
   * @returns {string} 'converter' | 'circuit_breaker' | 'transformer' |
   *   'signal_reference' | 'unknown'.
   */
  inferComponentType(componentId) {
    const id = componentId.trim().toUpperCase();
    if (id.startsWith('A') && /^\d+/.test(id.slice(1))) return 'converter';
    if (id.startsWith('CB')) return 'circuit_breaker';
    if (id.startsWith('T')) return 'transformer';
    if (/^\d{6,8}$/.test(id)) return 'signal_reference';
    return 'unknown';
  }
}

// Export singleton instance
export const pdfProcessor = new PDFProcessor();
//# sourceMappingURL=pdf-processor.js.map