UNPKG

pdf-parse-new

Version:

Pure JavaScript cross-platform module to extract text from PDFs with AI-powered optimization and multi-core processing.

github.com/simonegosetto/pdf-parse-new

simonegosetto/pdf-parse-new

635 lines (535 loc) • 17.9 kB

JavaScript

const os = require('os');
const PDF = require('./pdf-parse');
const PDFStream = require('./pdf-parse-stream');
const PDFAggressive = require('./pdf-parse-aggressive');
const PDFProcesses = require('./pdf-parse-processes');
const PDFWorkers = require('./pdf-parse-workers');

const BYTES_PER_MB = 1024 * 1024;

// Lower bound applied to every computed worker/process count.
const MIN_WORKERS = 2;

// Maps the `parser` string used in smart-parser-rules.json to the parser module.
const PARSERS_BY_NAME = {
  'PDF': PDF,
  'PDFStream': PDFStream,
  'PDFAggressive': PDFAggressive,
  'PDFProcesses': PDFProcesses,
  'PDFWorkers': PDFWorkers
};

/**
 * Build the decision-cache key for a buffer.
 * The page count is unknown when the cache is consulted, so the key is only
 * the size (rounded to 0.1 MB) and the configured CPU count.
 *
 * @param {{byteLength: number}} dataBuffer - Raw PDF data.
 * @param {number} cpuCount - Configured number of CPUs.
 * @returns {string} Cache key.
 */
function buildCacheKey(dataBuffer, cpuCount) {
  const sizeMB = dataBuffer.byteLength / BYTES_PER_MB;
  return `${sizeMB.toFixed(1)}_${cpuCount}`;
}

/**
 * Smart PDF Parser - Automatically selects the optimal parsing method
 * based on PDF characteristics and system resources
 *
 * Rules are loaded from smart-parser-rules.json which is generated by
 * benchmark/train-smart-parser.js based on real-world benchmark data.
 *
 * This approach allows updating decision rules without modifying code,
 * making the system more maintainable and allowing custom rule sets.
 */
class SmartPDFParser {
  /**
   * @param {object} [options] - Parser options.
   * @param {number} [options.maxMemoryUsage] - Defaults to 70% of total RAM.
   * @param {?string} [options.forceMethod] - 'sequential', 'batch', 'stream',
   *   'aggressive', 'processes' or 'workers'; bypasses automatic selection.
   * @param {boolean} [options.enableFastPath=true] - Fast-path for tiny PDFs.
   * @param {boolean} [options.enableCache=true] - Cache decisions.
   * @param {number} [options.oversaturationFactor=1.5] - Workers per core.
   * @param {?number} [options.maxWorkerLimit] - Hard worker limit (null = auto).
   */
  constructor(options = {}) {
    // Load decision rules from JSON configuration
    this.rules = require('./smart-parser-rules.json');

    // Keys explicitly set to `undefined` must not clobber the computed
    // defaults below (e.g. `{ enableCache: cfg.cache }` with cfg.cache unset),
    // so they are dropped before being spread over the defaults.
    const explicitOptions = Object.fromEntries(
      Object.entries(options).filter(([, value]) => value !== undefined)
    );

    this.options = {
      // System configuration
      availableCPUs: os.cpus().length,
      maxMemoryUsage: options.maxMemoryUsage || (os.totalmem() * 0.7), // 70% of total RAM

      // Override options
      forceMethod: options.forceMethod || null,

      // Performance options
      enableFastPath: options.enableFastPath !== false,
      enableCache: options.enableCache !== false,

      // Parallelism optimization
      // Oversaturation factor: use more workers than cores because PDF parsing is I/O bound
      // During I/O wait, cores are idle - oversaturation keeps them busy
      oversaturationFactor: options.oversaturationFactor || 1.5, // 1.5x cores by default
      maxWorkerLimit: options.maxWorkerLimit || null, // Hard limit (null = auto)

      ...explicitOptions
    };

    // Statistics (in-memory only)
    this.stats = {
      totalParses: 0,
      methodUsage: {
        sequential: 0,
        batch: 0,
        stream: 0,
        aggressive: 0,
        processes: 0,
        workers: 0
      },
      averageTimes: {},
      failedParses: 0,
      fastPathHits: 0,
      cacheHits: 0,
      treeNavigations: 0
    };

    // Simple LRU cache for memoization (limit to 100 entries).
    // Map iterates in insertion order, so the first key is the least recently used.
    this.decisionCache = new Map();
    this.cacheMaxSize = 100;

    // Pre-compiled common scenarios (from benchmark analysis)
    this.commonScenarios = [
      // Tiny PDFs
      { pages: [1, 10], sizeMB: [0, 0.5], method: 'sequential', batchSize: null },
      // Small PDFs
      { pages: [11, 50], sizeMB: [0.5, 2], method: 'batch', batchSize: 5 },
      { pages: [11, 50], sizeMB: [2, 5], method: 'batch', batchSize: 10 },
      // Medium PDFs
      { pages: [51, 200], sizeMB: [1, 10], method: 'batch', batchSize: 20 },
      // Large PDFs
      { pages: [201, 500], sizeMB: [5, 20], method: 'batch', batchSize: 50 },
      // X-Large PDFs
      { pages: [501, 1000], sizeMB: [10, 50], method: 'batch', batchSize: 50 }
      // Huge PDFs handled by CPU-normalized threshold
    ];
  }

  /**
   * Quick check for fast-path optimization.
   * Returns a decision immediately for obvious cases (no analysis needed).
   *
   * @param {{byteLength: number}} dataBuffer - Raw PDF data.
   * @returns {?object} Decision `{ name, config, parser, reason }`, or null
   *   when the fast path is disabled or the buffer is 1 MB or larger.
   */
  quickCheck(dataBuffer) {
    if (!this.options.enableFastPath) {
      return null;
    }

    const sizeMB = dataBuffer.byteLength / BYTES_PER_MB;

    // Fast-path 1: Tiny PDFs (< 0.5 MB) always use sequential
    // Overhead of parallel processing > benefit
    if (sizeMB < 0.5) {
      return {
        name: 'sequential',
        config: { parallelizePages: false },
        parser: PDF,
        reason: 'tiny PDF (< 0.5 MB)'
      };
    }

    // Fast-path 2: Very small PDFs (< 1 MB) likely < 50 pages → batch-5
    if (sizeMB < 1) {
      return {
        name: 'batch',
        config: { parallelizePages: true, batchSize: 5 },
        parser: PDF,
        reason: 'small PDF (< 1 MB)'
      };
    }

    return null; // No fast-path, continue to full analysis
  }

  /**
   * Check decision cache for similar PDFs.
   * A hit is re-inserted so it becomes the most recently used entry.
   *
   * @param {{byteLength: number}} dataBuffer - Raw PDF data.
   * @returns {?object} Cached decision, or null on a miss / when caching is disabled.
   */
  getCachedDecision(dataBuffer) {
    if (!this.options.enableCache) {
      return null;
    }

    const cacheKey = buildCacheKey(dataBuffer, this.options.availableCPUs);
    if (!this.decisionCache.has(cacheKey)) {
      return null;
    }

    const decision = this.decisionCache.get(cacheKey);
    // Refresh recency: move the entry to the end of the insertion order.
    this.decisionCache.delete(cacheKey);
    this.decisionCache.set(cacheKey, decision);
    return decision;
  }

  /**
   * Save decision to cache (LRU with size limit).
   *
   * @param {{byteLength: number}} dataBuffer - Raw PDF data.
   * @param {object} decision - Decision to remember for similar buffers.
   * @returns {void}
   */
  cacheDecision(dataBuffer, decision) {
    if (!this.options.enableCache) {
      return;
    }

    const cacheKey = buildCacheKey(dataBuffer, this.options.availableCPUs);

    if (this.decisionCache.has(cacheKey)) {
      // Overwriting an existing key must not evict an unrelated entry;
      // deleting first also moves the key to the most-recent position.
      this.decisionCache.delete(cacheKey);
    } else if (this.decisionCache.size >= this.cacheMaxSize) {
      // LRU: remove oldest if at limit
      const oldestKey = this.decisionCache.keys().next().value;
      this.decisionCache.delete(oldestKey);
    }

    this.decisionCache.set(cacheKey, decision);
  }

  /**
   * Match against pre-compiled common scenarios.
   * The first scenario whose page and size ranges (both inclusive) contain
   * the analysed PDF wins.
   *
   * @param {{pages: number, size: number}} analysis - Result of analyzePDF().
   * @returns {?object} Decision `{ name, config, parser, reason }`, or null.
   */
  matchCommonScenario(analysis) {
    const { pages, size } = analysis;
    const sizeMB = size / BYTES_PER_MB;

    for (const scenario of this.commonScenarios) {
      const [minPages, maxPages] = scenario.pages;
      const [minSize, maxSize] = scenario.sizeMB;

      const pagesInRange = pages >= minPages && pages <= maxPages;
      const sizeInRange = sizeMB >= minSize && sizeMB <= maxSize;
      if (!pagesInRange || !sizeInRange) {
        continue;
      }

      const config = scenario.method === 'batch'
        ? { parallelizePages: true, batchSize: scenario.batchSize }
        : { parallelizePages: false };

      return {
        name: scenario.method,
        config,
        // Every pre-compiled scenario runs on the base parser.
        parser: PDF,
        reason: `${pages} pages, ${sizeMB.toFixed(1)} MB`
      };
    }

    return null; // No common scenario match
  }

  /**
   * Main parsing method - automatically selects optimal strategy.
   *
   * @param {Buffer|Uint8Array} dataBuffer - Raw PDF data.
   * @param {object} [userOptions] - Options forwarded to the selected parser;
   *   they override the strategy's own config.
   * @returns {Promise<object>} The parser result plus a `_meta` object with
   *   `method`, `duration` (ms) and, depending on the path taken, `fastPath`,
   *   `cached`, `analysis` and/or `commonScenario`.
   * @throws Re-throws any error raised during analysis or parsing after
   *   counting it in `stats.failedParses`.
   */
  async parse(dataBuffer, userOptions = {}) {
    const startTime = performance.now();

    try {
      const { decision, meta, logCompletion } = await this.resolveDecision(dataBuffer, userOptions);

      const result = await this.parseWithMethod(dataBuffer, decision, userOptions);
      const duration = performance.now() - startTime;

      // Update stats (in-memory only)
      this.updateStats(decision.name, duration, true);

      if (logCompletion) {
        console.log(`[SmartPDFParser] Completed in ${duration.toFixed(2)}ms\n`);
      }

      return {
        ...result,
        _meta: { method: decision.name, duration, ...meta }
      };
    } catch (error) {
      const duration = performance.now() - startTime;
      this.stats.failedParses++;
      console.error(`[SmartPDFParser] Failed after ${duration.toFixed(2)}ms:`, error?.message ?? error);
      throw error;
    }
  }

  /**
   * Internal helper for parse(): pick the parsing strategy for a buffer.
   *
   * Tries, in order: size-only fast path, decision cache, pre-compiled common
   * scenarios, and finally the rule set. When `options.forceMethod` is set,
   * all shortcuts are skipped so the forced method is always honoured, and the
   * forced decision is not written to the cache.
   *
   * @private
   * @param {Buffer|Uint8Array} dataBuffer - Raw PDF data.
   * @param {object} [userOptions] - Options passed to parse().
   * @returns {Promise<{decision: object, meta: object, logCompletion: boolean}>}
   *   `decision` is the chosen strategy, `meta` the extra `_meta` fields for
   *   the result, `logCompletion` whether parse() should log the total time.
   */
  async resolveDecision(dataBuffer, userOptions = {}) {
    const isForced = Boolean(this.options.forceMethod);

    if (!isForced) {
      // ⚡ FAST-PATH 1: Quick check for obvious cases (no analysis needed)
      const quickDecision = this.quickCheck(dataBuffer);
      if (quickDecision) {
        this.stats.fastPathHits++;
        console.log(`[SmartPDFParser] ⚡ Fast-path: ${quickDecision.reason}`);
        return { decision: quickDecision, meta: { fastPath: true }, logCompletion: false };
      }

      // ⚡ FAST-PATH 2: Check cache for similar PDFs
      const cachedDecision = this.getCachedDecision(dataBuffer);
      if (cachedDecision) {
        this.stats.cacheHits++;
        console.log(`[SmartPDFParser] 💾 Cache hit`);
        return { decision: cachedDecision, meta: { cached: true }, logCompletion: false };
      }
    }

    // Full analysis needed
    const analysis = await this.analyzePDF(dataBuffer, userOptions);

    if (!isForced) {
      // ⚡ FAST-PATH 3: Match common scenario (faster than tree navigation)
      const commonMatch = this.matchCommonScenario(analysis);
      if (commonMatch) {
        console.log(`[SmartPDFParser] 📋 Common scenario: ${commonMatch.reason}`);
        this.cacheDecision(dataBuffer, commonMatch);
        return {
          decision: commonMatch,
          meta: { analysis, commonScenario: true },
          logCompletion: false
        };
      }
    }

    // 🐌 SLOW-PATH: Full decision tree (rare cases only)
    this.stats.treeNavigations++;
    const method = this.selectMethod(analysis, userOptions);

    console.log(`[SmartPDFParser] 🌳 Tree decision: ${method.name}`);
    console.log(`[SmartPDFParser] PDF: ${analysis.pages} pages, ${(analysis.size / 1024 / 1024).toFixed(2)} MB`);
    console.log(`[SmartPDFParser] Config: ${JSON.stringify(method.config)}`);

    // Cache for future similar PDFs (forced decisions are not cached so they
    // cannot leak into later automatic selections).
    if (!isForced) {
      this.cacheDecision(dataBuffer, method);
    }

    return { decision: method, meta: { analysis }, logCompletion: true };
  }

  /**
   * Analyze PDF to extract characteristics.
   *
   * Parses with `max: 1` to read the page count, then rates complexity from
   * the average bytes per page. If that parse fails, the page count is
   * estimated from the size alone (one page per 50,000 bytes, at least 10).
   *
   * @param {Buffer|Uint8Array} dataBuffer - Raw PDF data.
   * @param {object} [userOptions] - `pagerender` / `pagerenderModule` are
   *   forwarded to the metadata parse when present.
   * @returns {Promise<{size: number, pages: number, estimatedComplexity: string,
   *   availableMemory: number, cpuCores: number}>}
   */
  async analyzePDF(dataBuffer, userOptions = {}) {
    const analysis = {
      // byteLength matches quickCheck() and the cache key; `length` is kept
      // as a fallback for inputs that only expose the latter.
      size: dataBuffer.byteLength ?? dataBuffer.length,
      pages: 0,
      estimatedComplexity: 'medium',
      availableMemory: os.freemem(),
      cpuCores: this.options.availableCPUs
    };

    try {
      // Quick metadata extraction (minimal parsing)
      const parseOptions = { max: 1, verbosityLevel: 0 };

      // Include custom pagerender if provided
      if (userOptions.pagerender) {
        parseOptions.pagerender = userOptions.pagerender;
      }

      // Include custom pagerender module if provided (for workers/processes)
      if (userOptions.pagerenderModule) {
        parseOptions.pagerenderModule = userOptions.pagerenderModule;
      }

      const quickParse = await PDF(dataBuffer, parseOptions);
      analysis.pages = quickParse.numpages;

      // Estimate complexity based on size per page. Skipped when the page
      // count is not positive: dividing by it would yield Infinity/NaN and
      // misclassify the document.
      if (analysis.pages > 0) {
        const bytesPerPage = analysis.size / analysis.pages;
        if (bytesPerPage < 10_000) {
          analysis.estimatedComplexity = 'simple'; // Text-heavy
        } else if (bytesPerPage > 100_000) {
          analysis.estimatedComplexity = 'complex'; // Image-heavy
        }
      }
    } catch (error) {
      console.warn('[SmartPDFParser] Failed to analyze PDF:', error.message);
      // Estimate based on size alone
      analysis.pages = Math.max(10, Math.floor(analysis.size / 50000));
    }

    return analysis;
  }

  /**
   * Select optimal parsing method based on analysis.
   *
   * Honours `options.forceMethod` first. Otherwise walks `this.rules.rules`
   * in order and returns the first rule whose page-count condition matches;
   * if none matches, `this.rules.fallback` is used with the base parser.
   *
   * @param {object} analysis - Result of analyzePDF().
   * @param {object} [userOptions] - Currently unused; kept for interface stability.
   * @returns {{name: string, config: object, parser: Function}} Decision.
   */
  selectMethod(analysis, userOptions) {
    // Check for forced method
    if (this.options.forceMethod) {
      return this.getMethodConfig(this.options.forceMethod, analysis);
    }

    const { pages, cpuCores } = analysis;

    // Scale a page threshold by this machine's core count relative to the
    // core count recorded in the rules (`baselineCPU`).
    const cpuNormalizedThreshold = (baselinePages) =>
      Math.floor(baselinePages * (cpuCores / this.rules.baselineCPU));

    const matchesCondition = (condition) => {
      switch (condition.operator) {
        case '<=':
          return pages <= condition.value;
        case '>=':
          return pages >= condition.value;
        case '>':
          return condition.cpuNormalized
            ? pages > cpuNormalizedThreshold(condition.baselineThreshold)
            : pages > condition.value;
        case '<':
          return pages < condition.value;
        case 'range':
          return pages >= condition.min && pages <= condition.max;
        default:
          // Unknown operators never match.
          return false;
      }
    };

    // Iterate through rules and find the first match
    for (const rule of this.rules.rules) {
      if (!matchesCondition(rule.condition)) {
        continue;
      }

      // Copy so the shared rule object is never mutated.
      const config = { ...rule.config };

      // The string 'calculateOptimalWorkers' is a placeholder in the rules
      // for a value computed at decision time.
      if (config.maxProcesses === 'calculateOptimalWorkers') {
        config.maxProcesses = this.calculateOptimalWorkers(analysis);
      }
      if (config.maxWorkers === 'calculateOptimalWorkers') {
        config.maxWorkers = this.calculateOptimalWorkers(analysis);
      }

      return {
        name: rule.method,
        config,
        parser: PARSERS_BY_NAME[rule.parser] || PDF
      };
    }

    // Fallback
    const fallback = this.rules.fallback;
    return {
      name: fallback.method,
      config: fallback.config,
      parser: PDF
    };
  }

  /**
   * Calculate optimal number of workers/processes.
   *
   * Starts from `cpuCores × oversaturationFactor` (at least 2), applies the
   * optional `maxWorkerLimit`, then caps the count so that roughly 2× the PDF
   * size per worker fits into the available memory. The hard limit always
   * wins over the minimum of 2.
   *
   * @param {{cpuCores: number, size: number, availableMemory: number}} analysis
   * @returns {number} Number of workers/processes to use.
   */
  calculateOptimalWorkers(analysis) {
    const { cpuCores } = analysis;
    const { oversaturationFactor, maxWorkerLimit } = this.options;

    const applyHardLimit = (count) =>
      (maxWorkerLimit ? Math.min(count, maxWorkerLimit) : count);

    // Base calculation: cores × oversaturation factor
    // Example: 24 cores × 1.5 = 36 workers
    const scaledCores = Math.floor(cpuCores * oversaturationFactor);

    // Ensure minimum of 2 workers; a non-finite product (missing core count
    // or factor) also falls back to the minimum instead of propagating NaN.
    let optimalWorkers = Number.isFinite(scaledCores)
      ? Math.max(MIN_WORKERS, scaledCores)
      : MIN_WORKERS;

    optimalWorkers = applyHardLimit(optimalWorkers);

    // Memory-based limiting: ensure we don't run out of memory
    const sizeMB = analysis.size / BYTES_PER_MB;
    const availableMemoryMB = analysis.availableMemory / BYTES_PER_MB;

    // Each worker needs ~2x the PDF size in memory (processing overhead)
    const memoryPerWorker = sizeMB * 2;

    if (memoryPerWorker > 0) {
      const maxWorkersByMemory = Math.floor(availableMemoryMB / memoryPerWorker);

      // Use the more conservative limit. A result of 0 (not even one worker
      // fits) is the most constrained case and must also trigger the cap.
      if (maxWorkersByMemory < optimalWorkers) {
        console.log(`[SmartPDFParser] Memory limit: ${maxWorkersByMemory} workers (would need ${optimalWorkers})`);
        optimalWorkers = applyHardLimit(Math.max(MIN_WORKERS, maxWorkersByMemory));
      }
    }

    return optimalWorkers;
  }

  /**
   * Adaptive batch size based on page complexity.
   *
   * @param {{pages: number, estimatedComplexity: string}} analysis
   * @returns {number} 20 for 'simple', 5 for 'complex', otherwise 10; doubled
   *   (capped at 50) for documents with more than 200 pages.
   */
  adaptiveBatchSize(analysis) {
    const { pages, estimatedComplexity } = analysis;

    // Base batch size
    let batchSize = 10;

    // Adjust for complexity
    if (estimatedComplexity === 'simple') {
      batchSize = 20; // Simple PDFs can handle larger batches
    } else if (estimatedComplexity === 'complex') {
      batchSize = 5; // Complex PDFs need smaller batches
    }

    // Adjust for total pages
    if (pages > 200) {
      batchSize = Math.min(50, batchSize * 2);
    }

    return batchSize;
  }

  /**
   * Adaptive chunk size based on available memory.
   *
   * @param {{pages: number, availableMemory: number, size: number}} analysis
   * @returns {number} Chunk size clamped to the range 100..1000.
   */
  adaptiveChunkSize(analysis) {
    const { pages, availableMemory, size } = analysis;

    // Estimate memory per page
    const memoryPerPage = size / pages;

    // Calculate safe chunk size
    const safeChunkSize = Math.floor(availableMemory / (memoryPerPage * 2)); // 2x safety factor

    // 0/0 inputs produce NaN, which Math.max/Math.min would pass through.
    if (Number.isNaN(safeChunkSize)) {
      return 100;
    }

    // Clamp between 100 and 1000
    return Math.max(100, Math.min(1000, safeChunkSize));
  }

  /**
   * Get method configuration by name.
   * Only the requested configuration is built, so the worker/chunk
   * calculations (and their log output) run at most once per call.
   *
   * @param {string} methodName - One of 'sequential', 'batch', 'stream',
   *   'aggressive', 'processes', 'workers'.
   * @param {object} analysis - Result of analyzePDF().
   * @returns {{name: string, config: object, parser: Function}} The matching
   *   configuration; unknown names fall back to 'batch'.
   */
  getMethodConfig(methodName, analysis) {
    const builders = {
      sequential: () => ({
        name: 'sequential',
        config: { parallelizePages: false },
        parser: PDF
      }),
      batch: () => ({
        name: 'batch',
        config: {
          parallelizePages: true,
          batchSize: this.adaptiveBatchSize(analysis)
        },
        parser: PDF
      }),
      stream: () => ({
        name: 'stream',
        config: {
          chunkSize: this.adaptiveChunkSize(analysis),
          batchSize: 10
        },
        parser: PDFStream
      }),
      aggressive: () => ({
        name: 'aggressive',
        config: {
          chunkSize: 500,
          batchSize: 20
        },
        parser: PDFAggressive
      }),
      processes: () => ({
        name: 'processes',
        config: {
          chunkSize: this.adaptiveChunkSize(analysis),
          batchSize: 10,
          maxProcesses: this.calculateOptimalWorkers(analysis)
        },
        parser: PDFProcesses
      }),
      workers: () => ({
        name: 'workers',
        config: {
          chunkSize: this.adaptiveChunkSize(analysis),
          batchSize: 10,
          maxWorkers: this.calculateOptimalWorkers(analysis)
        },
        parser: PDFWorkers
      })
    };

    // Own-property check so names such as 'toString' do not resolve to
    // members inherited from Object.prototype.
    const isKnown = Object.prototype.hasOwnProperty.call(builders, methodName);
    const build = isKnown ? builders[methodName] : builders.batch;
    return build();
  }

  /**
   * Parse with selected method.
   *
   * @param {Buffer|Uint8Array} dataBuffer - Raw PDF data.
   * @param {{config: object, parser: Function}} method - Selected strategy.
   * @param {object} userOptions - Caller options; they override the strategy config.
   * @returns {Promise<object>} Whatever the selected parser resolves to.
   */
  async parseWithMethod(dataBuffer, method, userOptions) {
    const config = {
      verbosityLevel: 0,
      ...method.config,
      ...userOptions
    };

    return await method.parser(dataBuffer, config);
  }

  /**
   * Update statistics.
   *
   * @param {string} method - Name of the strategy that was used.
   * @param {number} duration - Elapsed time in milliseconds.
   * @param {boolean} success - Currently unused; failures are counted in parse().
   * @returns {void}
   */
  updateStats(method, duration, success) {
    this.stats.totalParses++;

    // Method names coming from the rules file may not be pre-registered;
    // start from 0 instead of turning the counter into NaN.
    this.stats.methodUsage[method] = (this.stats.methodUsage[method] || 0) + 1;

    if (!this.stats.averageTimes[method]) {
      this.stats.averageTimes[method] = [];
    }
    this.stats.averageTimes[method].push(duration);
  }

  /**
   * Get statistics (in-memory only for current session).
   *
   * @returns {object} The raw counters, with `averageTimes` reduced to one
   *   mean duration per method, plus `optimizationRate` (share of parses
   *   served by the fast path or cache, as a percentage string) and a coarse
   *   `averageOverhead` label.
   */
  getStats() {
    const avgTimes = {};
    for (const [method, times] of Object.entries(this.stats.averageTimes)) {
      avgTimes[method] = times.reduce((a, b) => a + b, 0) / times.length;
    }

    // Calculate optimization efficiency
    const totalDecisions = this.stats.totalParses;
    const optimizedDecisions = this.stats.fastPathHits + this.stats.cacheHits;
    const optimizationRate = totalDecisions > 0
      ? ((optimizedDecisions / totalDecisions) * 100).toFixed(1)
      : 0;

    let averageOverhead = 'N/A';
    if (this.stats.fastPathHits > 0) {
      averageOverhead = '~0.5ms';
    } else if (this.stats.cacheHits > 0) {
      averageOverhead = '~1ms';
    } else if (this.stats.treeNavigations > 0) {
      averageOverhead = '~25ms';
    }

    return {
      ...this.stats,
      averageTimes: avgTimes,
      optimizationRate: `${optimizationRate}%`,
      averageOverhead
    };
  }
}

module.exports = SmartPDFParser;