pdf-parse-new
Version:
Pure JavaScript cross-platform module to extract text from PDFs with AI-powered optimization and multi-core processing.
635 lines (535 loc) • 17.9 kB
JavaScript
const os = require('os');
const PDF = require('./pdf-parse');
const PDFStream = require('./pdf-parse-stream');
const PDFAggressive = require('./pdf-parse-aggressive');
const PDFProcesses = require('./pdf-parse-processes');
const PDFWorkers = require('./pdf-parse-workers');
/**
* Smart PDF Parser - Automatically selects the optimal parsing method
* based on PDF characteristics and system resources
*
* Rules are loaded from smart-parser-rules.json which is generated by
* benchmark/train-smart-parser.js based on real-world benchmark data.
*
* This approach allows updating decision rules without modifying code,
* making the system more maintainable and allowing custom rule sets.
*/
class SmartPDFParser {
constructor(options = {}) {
// Load decision rules from JSON configuration
this.rules = require('./smart-parser-rules.json');
this.options = {
// System configuration
availableCPUs: os.cpus().length,
maxMemoryUsage: options.maxMemoryUsage || (os.totalmem() * 0.7), // 70% of total RAM
// Override options
forceMethod: options.forceMethod || null, // 'sequential', 'batch', 'stream', 'aggressive', 'processes', 'workers'
// Performance options
enableFastPath: options.enableFastPath !== false, // Fast-path for tiny PDFs
enableCache: options.enableCache !== false, // Cache decisions
// Parallelism optimization
// Oversaturation factor: use more workers than cores because PDF parsing is I/O bound
// During I/O wait, cores are idle - oversaturation keeps them busy
oversaturationFactor: options.oversaturationFactor || 1.5, // 1.5x cores by default
maxWorkerLimit: options.maxWorkerLimit || null, // Hard limit (null = auto)
...options
};
// Statistics (in-memory only)
this.stats = {
totalParses: 0,
methodUsage: {
sequential: 0,
batch: 0,
stream: 0,
aggressive: 0,
processes: 0,
workers: 0
},
averageTimes: {},
failedParses: 0,
fastPathHits: 0,
cacheHits: 0,
treeNavigations: 0
};
// Simple LRU cache for memoization (limit to 100 entries)
this.decisionCache = new Map();
this.cacheMaxSize = 100;
// Pre-compiled common scenarios (from benchmark analysis)
this.commonScenarios = [
// Tiny PDFs
{ pages: [1, 10], sizeMB: [0, 0.5], method: 'sequential', batchSize: null },
// Small PDFs
{ pages: [11, 50], sizeMB: [0.5, 2], method: 'batch', batchSize: 5 },
{ pages: [11, 50], sizeMB: [2, 5], method: 'batch', batchSize: 10 },
// Medium PDFs
{ pages: [51, 200], sizeMB: [1, 10], method: 'batch', batchSize: 20 },
// Large PDFs
{ pages: [201, 500], sizeMB: [5, 20], method: 'batch', batchSize: 50 },
// X-Large PDFs
{ pages: [501, 1000], sizeMB: [10, 50], method: 'batch', batchSize: 50 }
// Huge PDFs handled by CPU-normalized threshold
];
}
/**
* Quick check for fast-path optimization
* Returns decision immediately for obvious cases (no tree navigation needed)
*/
quickCheck(dataBuffer) {
if (!this.options.enableFastPath) {
return null;
}
const sizeMB = dataBuffer.byteLength / (1024 * 1024);
// Fast-path 1: Tiny PDFs (< 0.5 MB) always use sequential
// Overhead of parallel processing > benefit
if (sizeMB < 0.5) {
return {
name: 'sequential',
config: { parallelizePages: false },
parser: PDF,
reason: 'tiny PDF (< 0.5 MB)'
};
}
// Fast-path 2: Very small PDFs (< 1 MB) likely < 50 pages → batch-5
if (sizeMB < 1) {
return {
name: 'batch',
config: { parallelizePages: true, batchSize: 5 },
parser: PDF,
reason: 'small PDF (< 1 MB)'
};
}
return null; // No fast-path, continue to full analysis
}
/**
* Check decision cache for similar PDFs
*/
getCachedDecision(dataBuffer) {
if (!this.options.enableCache) {
return null;
}
// Cache key based on size and CPU (pages unknown at this point)
const sizeMB = dataBuffer.byteLength / (1024 * 1024);
const cacheKey = `${sizeMB.toFixed(1)}_${this.options.availableCPUs}`;
if (this.decisionCache.has(cacheKey)) {
return this.decisionCache.get(cacheKey);
}
return null;
}
/**
* Save decision to cache (LRU with size limit)
*/
cacheDecision(dataBuffer, decision) {
if (!this.options.enableCache) {
return;
}
const sizeMB = dataBuffer.byteLength / (1024 * 1024);
const cacheKey = `${sizeMB.toFixed(1)}_${this.options.availableCPUs}`;
// LRU: remove oldest if at limit
if (this.decisionCache.size >= this.cacheMaxSize) {
const firstKey = this.decisionCache.keys().next().value;
this.decisionCache.delete(firstKey);
}
this.decisionCache.set(cacheKey, decision);
}
/**
* Match against pre-compiled common scenarios
* Much faster than full tree navigation
*/
matchCommonScenario(analysis) {
const { pages, size } = analysis;
const sizeMB = size / (1024 * 1024);
for (const scenario of this.commonScenarios) {
const [minPages, maxPages] = scenario.pages;
const [minSize, maxSize] = scenario.sizeMB;
if (pages >= minPages && pages <= maxPages &&
sizeMB >= minSize && sizeMB <= maxSize) {
// Match found!
const config = scenario.method === 'batch' ? {
parallelizePages: true,
batchSize: scenario.batchSize
} : {
parallelizePages: false
};
return {
name: scenario.method,
config,
parser: scenario.method === 'sequential' ? PDF : PDF,
reason: `${pages} pages, ${sizeMB.toFixed(1)} MB`
};
}
}
return null; // No common scenario match
}
/**
* Main parsing method - automatically selects optimal strategy
* Now with fast-path optimization for minimal overhead on small PDFs
*/
async parse(dataBuffer, userOptions = {}) {
const startTime = performance.now();
try {
// ⚡ FAST-PATH 1: Quick check for obvious cases (no analysis needed)
// Saves ~20-25ms for tiny PDFs
const quickDecision = this.quickCheck(dataBuffer);
if (quickDecision) {
this.stats.fastPathHits++;
console.log(`[SmartPDFParser] ⚡ Fast-path: ${quickDecision.reason}`);
const result = await this.parseWithMethod(dataBuffer, quickDecision, userOptions);
const duration = performance.now() - startTime;
this.updateStats(quickDecision.name, duration, true);
return {
...result,
_meta: {
method: quickDecision.name,
duration,
fastPath: true
}
};
}
// ⚡ FAST-PATH 2: Check cache for similar PDFs
// Saves ~20ms on cache hit
const cachedDecision = this.getCachedDecision(dataBuffer);
if (cachedDecision) {
console.log(`[SmartPDFParser] 💾 Cache hit`);
const result = await this.parseWithMethod(dataBuffer, cachedDecision, userOptions);
const duration = performance.now() - startTime;
this.updateStats(cachedDecision.name, duration, true);
return {
...result,
_meta: {
method: cachedDecision.name,
duration,
cached: true
}
};
}
// Full analysis needed
const analysis = await this.analyzePDF(dataBuffer, userOptions);
// ⚡ FAST-PATH 3: Match common scenario (faster than tree navigation)
// Saves ~15ms vs full tree navigation
const commonMatch = this.matchCommonScenario(analysis);
if (commonMatch) {
console.log(`[SmartPDFParser] 📋 Common scenario: ${commonMatch.reason}`);
this.cacheDecision(dataBuffer, commonMatch);
const result = await this.parseWithMethod(dataBuffer, commonMatch, userOptions);
const duration = performance.now() - startTime;
this.updateStats(commonMatch.name, duration, true);
return {
...result,
_meta: {
method: commonMatch.name,
duration,
analysis,
commonScenario: true
}
};
}
// 🐌 SLOW-PATH: Full decision tree (rare cases only)
this.stats.treeNavigations++;
const method = this.selectMethod(analysis, userOptions);
console.log(`[SmartPDFParser] 🌳 Tree decision: ${method.name}`);
console.log(`[SmartPDFParser] PDF: ${analysis.pages} pages, ${(analysis.size / 1024 / 1024).toFixed(2)} MB`);
console.log(`[SmartPDFParser] Config: ${JSON.stringify(method.config)}`);
// Cache for future similar PDFs
this.cacheDecision(dataBuffer, method);
// Parse with selected method
const result = await this.parseWithMethod(dataBuffer, method, userOptions);
const endTime = performance.now();
const duration = endTime - startTime;
// Update stats (in-memory only)
this.updateStats(method.name, duration, true);
console.log(`[SmartPDFParser] Completed in ${duration.toFixed(2)}ms\n`);
return {
...result,
_meta: {
method: method.name,
duration,
analysis
}
};
} catch (error) {
const endTime = performance.now();
const duration = endTime - startTime;
this.stats.failedParses++;
console.error(`[SmartPDFParser] Failed after ${duration.toFixed(2)}ms:`, error.message);
throw error;
}
}
/**
* Analyze PDF to extract characteristics
*/
async analyzePDF(dataBuffer, userOptions = {}) {
const analysis = {
size: dataBuffer.length,
pages: 0,
estimatedComplexity: 'medium',
availableMemory: os.freemem(),
cpuCores: this.options.availableCPUs
};
try {
// Quick metadata extraction (minimal parsing)
// Pass pagerender if provided by user
const parseOptions = {
max: 1,
verbosityLevel: 0
};
// Include custom pagerender if provided
if (userOptions.pagerender) {
parseOptions.pagerender = userOptions.pagerender;
}
// Include custom pagerender module if provided (for workers/processes)
if (userOptions.pagerenderModule) {
parseOptions.pagerenderModule = userOptions.pagerenderModule;
}
const quickParse = await PDF(dataBuffer, parseOptions);
analysis.pages = quickParse.numpages;
// Estimate complexity based on size per page
const bytesPerPage = analysis.size / analysis.pages;
if (bytesPerPage < 10_000) {
analysis.estimatedComplexity = 'simple'; // Text-heavy
} else if (bytesPerPage > 100_000) {
analysis.estimatedComplexity = 'complex'; // Image-heavy
}
} catch (error) {
console.warn('[SmartPDFParser] Failed to analyze PDF:', error.message);
// Estimate based on size alone
analysis.pages = Math.max(10, Math.floor(analysis.size / 50000));
}
return analysis;
}
/**
* Select optimal parsing method based on analysis
* Rules loaded from smart-parser-rules.json
* Optimized from ${this.rules.benchmarkSamples}+ real-world benchmark samples
* Last trained: ${this.rules.generatedAt}
*/
selectMethod(analysis, userOptions) {
// Check for forced method
if (this.options.forceMethod) {
return this.getMethodConfig(this.options.forceMethod, analysis);
}
const { pages, cpuCores } = analysis;
// Helper: CPU normalization function
const cpuNormalizedThreshold = (baselinePages) => {
return Math.floor(baselinePages * (cpuCores / this.rules.baselineCPU));
};
// Iterate through rules and find the first match
for (const rule of this.rules.rules) {
const condition = rule.condition;
let matches = false;
switch (condition.operator) {
case '<=':
matches = pages <= condition.value;
break;
case '>=':
matches = pages >= condition.value;
break;
case '>':
if (condition.cpuNormalized) {
// Evaluate dynamic threshold
const threshold = cpuNormalizedThreshold(condition.baselineThreshold);
matches = pages > threshold;
} else {
matches = pages > condition.value;
}
break;
case '<':
matches = pages < condition.value;
break;
case 'range':
matches = pages >= condition.min && pages <= condition.max;
break;
}
if (matches) {
// Build config
const config = { ...rule.config };
// Handle dynamic values
if (config.maxProcesses === 'calculateOptimalWorkers') {
config.maxProcesses = this.calculateOptimalWorkers(analysis);
}
if (config.maxWorkers === 'calculateOptimalWorkers') {
config.maxWorkers = this.calculateOptimalWorkers(analysis);
}
// Get parser
const parserMap = {
'PDF': PDF,
'PDFStream': PDFStream,
'PDFAggressive': PDFAggressive,
'PDFProcesses': PDFProcesses,
'PDFWorkers': PDFWorkers
};
return {
name: rule.method,
config,
parser: parserMap[rule.parser] || PDF
};
}
}
// Fallback
const fallback = this.rules.fallback;
return {
name: fallback.method,
config: fallback.config,
parser: PDF
};
}
/**
* Calculate optimal number of workers/processes
* Uses oversaturation to maximize CPU utilization during I/O wait
*/
calculateOptimalWorkers(analysis) {
const { cpuCores } = analysis;
const { oversaturationFactor, maxWorkerLimit } = this.options;
// Base calculation: cores × oversaturation factor
// Example: 24 cores × 1.5 = 36 workers
let optimalWorkers = Math.floor(cpuCores * oversaturationFactor);
// Ensure minimum of 2 workers
optimalWorkers = Math.max(2, optimalWorkers);
// Apply hard limit if set
if (maxWorkerLimit) {
optimalWorkers = Math.min(optimalWorkers, maxWorkerLimit);
}
// Memory-based limiting: ensure we don't run out of memory
const sizeMB = analysis.size / (1024 * 1024);
const availableMemoryMB = analysis.availableMemory / (1024 * 1024);
// Each worker needs ~2x the PDF size in memory (processing overhead)
const memoryPerWorker = sizeMB * 2;
const maxWorkersByMemory = Math.floor(availableMemoryMB / memoryPerWorker);
// Use the more conservative limit
if (maxWorkersByMemory > 0 && maxWorkersByMemory < optimalWorkers) {
console.log(`[SmartPDFParser] Memory limit: ${maxWorkersByMemory} workers (would need ${optimalWorkers})`);
optimalWorkers = Math.max(2, maxWorkersByMemory);
}
return optimalWorkers;
}
/**
* Adaptive batch size based on page complexity
*/
adaptiveBatchSize(analysis) {
const { pages, estimatedComplexity } = analysis;
// Base batch size
let batchSize = 10;
// Adjust for complexity
if (estimatedComplexity === 'simple') {
batchSize = 20; // Simple PDFs can handle larger batches
} else if (estimatedComplexity === 'complex') {
batchSize = 5; // Complex PDFs need smaller batches
}
// Adjust for total pages
if (pages > 200) {
batchSize = Math.min(50, batchSize * 2);
}
return batchSize;
}
/**
* Adaptive chunk size based on available memory
*/
adaptiveChunkSize(analysis) {
const { pages, availableMemory, size } = analysis;
// Estimate memory per page
const memoryPerPage = size / pages;
// Calculate safe chunk size
const safeChunkSize = Math.floor(availableMemory / (memoryPerPage * 2)); // 2x safety factor
// Clamp between 100 and 1000
return Math.max(100, Math.min(1000, safeChunkSize));
}
/**
* Get method configuration by name
*/
getMethodConfig(methodName, analysis) {
const configs = {
sequential: {
name: 'sequential',
config: { parallelizePages: false },
parser: PDF
},
batch: {
name: 'batch',
config: {
parallelizePages: true,
batchSize: this.adaptiveBatchSize(analysis)
},
parser: PDF
},
stream: {
name: 'stream',
config: {
chunkSize: this.adaptiveChunkSize(analysis),
batchSize: 10
},
parser: PDFStream
},
aggressive: {
name: 'aggressive',
config: {
chunkSize: 500,
batchSize: 20
},
parser: PDFAggressive
},
processes: {
name: 'processes',
config: {
chunkSize: this.adaptiveChunkSize(analysis),
batchSize: 10,
maxProcesses: this.calculateOptimalWorkers(analysis)
},
parser: PDFProcesses
},
workers: {
name: 'workers',
config: {
chunkSize: this.adaptiveChunkSize(analysis),
batchSize: 10,
maxWorkers: this.calculateOptimalWorkers(analysis)
},
parser: PDFWorkers
}
};
return configs[methodName] || configs.batch;
}
/**
* Parse with selected method
*/
async parseWithMethod(dataBuffer, method, userOptions) {
const config = {
verbosityLevel: 0,
...method.config,
...userOptions
};
return await method.parser(dataBuffer, config);
}
/**
* Update statistics
*/
updateStats(method, duration, success) {
this.stats.totalParses++;
this.stats.methodUsage[method]++;
if (!this.stats.averageTimes[method]) {
this.stats.averageTimes[method] = [];
}
this.stats.averageTimes[method].push(duration);
}
/**
* Get statistics (in-memory only for current session)
*/
getStats() {
const avgTimes = {};
for (const [method, times] of Object.entries(this.stats.averageTimes)) {
avgTimes[method] = times.reduce((a, b) => a + b, 0) / times.length;
}
// Calculate optimization efficiency
const totalDecisions = this.stats.totalParses;
const optimizedDecisions = this.stats.fastPathHits + this.stats.cacheHits;
const optimizationRate = totalDecisions > 0 ?
((optimizedDecisions / totalDecisions) * 100).toFixed(1) : 0;
return {
...this.stats,
averageTimes: avgTimes,
optimizationRate: `${optimizationRate}%`,
averageOverhead: this.stats.fastPathHits > 0 ? '~0.5ms' :
this.stats.cacheHits > 0 ? '~1ms' :
this.stats.treeNavigations > 0 ? '~25ms' : 'N/A'
};
}
}
// Export the SmartPDFParser class as this module's sole (CommonJS) export.
module.exports = SmartPDFParser;