pdf-parse-new
Version:
Pure javascript cross-platform module to extract text from PDFs with AI-powered optimization and multi-core processing.
346 lines (293 loc) • 10.5 kB
JavaScript
const fs = require('fs');
const os = require('os');
const { URL } = require('url');

const axios = require("axios");

const SmartPDFParser = require('../lib/SmartPDFParser');
const PDF = require('../lib/pdf-parse');
const PDFStream = require('../lib/pdf-parse-stream');
const PDFAggressive = require('../lib/pdf-parse-aggressive');
const PDFProcesses = require('../lib/pdf-parse-processes');
const PDFWorkers = require('../lib/pdf-parse-workers');
// Banner printed once, as soon as the script is loaded.
const BANNER = '=== Intensive Benchmark Collection ===\n';
console.log(BANNER);
/**
 * Download a PDF from a remote URL into memory.
 *
 * @param {string} url - Address passed straight to `axios.get`.
 * @param {object} [options]
 * @param {number} [options.timeoutMs=30000] - Request timeout in milliseconds.
 * @param {number} [options.maxRedirects=5] - Redirect limit handed to axios.
 * @returns {Promise<Buffer>} The response body as a Buffer.
 * @throws {Error} A short, user-facing message ("Download timeout",
 *   "Failed to download: HTTP <status>" or "Download failed: <reason>").
 *   The error that triggered it is attached as `cause`.
 */
async function downloadPDF(url, { timeoutMs = 30000, maxRedirects = 5 } = {}) {
  try {
    console.log(`📥 Downloading PDF from: ${url}`);
    const response = await axios.get(url, {
      responseType: 'arraybuffer',
      timeout: timeoutMs,
      maxRedirects,
    });
    const buffer = Buffer.from(response.data);
    console.log(`✓ Downloaded ${(buffer.length / 1024 / 1024).toFixed(2)} MB`);
    return buffer;
  } catch (error) {
    // Keep the original error reachable through `cause` so callers can still
    // inspect codes, headers, stack traces, etc.
    if (error.code === 'ECONNABORTED') {
      throw new Error(`Download timeout (${timeoutMs / 1000}s)`, { cause: error });
    }
    if (error.response) {
      throw new Error(`Failed to download: HTTP ${error.response.status}`, { cause: error });
    }
    throw new Error(`Download failed: ${error.message}`, { cause: error });
  }
}
/**
 * Tell whether a string parses as an absolute http(s) URL.
 *
 * @param {string} str - Candidate value.
 * @returns {boolean} `true` only when `new URL(str)` succeeds and the
 *   protocol is `http:` or `https:`; `false` for anything else.
 */
function isURL(str) {
  let parsed;
  try {
    parsed = new URL(str);
  } catch {
    // Not parseable as an absolute URL at all.
    return false;
  }
  return ['http:', 'https:'].includes(parsed.protocol);
}
/**
 * Fetch the raw bytes of a PDF given either a local path or an http(s) URL.
 *
 * @param {string} source - File path or URL.
 * @returns {Promise<Buffer>} File contents (read synchronously) or the
 *   downloaded body, depending on what `isURL` reports for `source`.
 */
async function loadPDF(source) {
  const isRemote = isURL(source);
  if (!isRemote) {
    return fs.readFileSync(source);
  }
  return await downloadPDF(source);
}
/**
 * Run every applicable parsing strategy against one PDF and record timings.
 *
 * The PDF is loaded once via `loadPDF`, its page count is probed with a
 * one-page parse, and each strategy is then timed in turn against the same
 * buffer. A strategy that throws is recorded as a failed result instead of
 * aborting the run.
 *
 * @param {string} file - Local path or http(s) URL of the PDF.
 * @returns {Promise<object[]>} One record per attempted strategy. The array is
 *   empty when the page count could not be determined.
 */
async function benchmarkAllMethods(file) {
  const dataBuffer = await loadPDF(file);
  const results = [];
  // NOTE(review): remote sources are labelled with a random number rather than
  // the URL's basename — presumably to keep labels unique; confirm before
  // relying on `file` to identify a URL.
  const fileName = isURL(file) ? Math.random().toString() : file.split(/[/\\]/).pop();
  console.log(`Starting benchmarks for: ${fileName}`);
  console.log(`\n${'='.repeat(80)}`);
  console.log(`📄 Benchmarking: ${file}`);
  console.log(` Size: ${(dataBuffer.length / 1024 / 1024).toFixed(2)} MB`);

  // Probe the page count with a minimal parse.
  let pages = 0;
  try {
    const quick = await PDF(dataBuffer, { max: 1, verbosityLevel: 0 });
    pages = quick.numpages;
    console.log(` Pages: ${pages}`);
  } catch (error) {
    console.log(` ⚠️ Could not determine page count: ${error.message}`);
    return results;
  }
  console.log('='.repeat(80));

  const cpuCores = os.cpus().length;
  // Leave one core free, but never ask for fewer than one worker: on a
  // single-core host `cpuCores - 1` would otherwise be 0.
  const maxProcesses = Math.max(1, cpuCores - 1);

  // Sequential and small-batch strategies run for every PDF.
  const methods = [
    { name: 'sequential', fn: PDF, config: { parallelizePages: false } },
    { name: 'batch-5', fn: PDF, config: { parallelizePages: true, batchSize: 5 } },
    { name: 'batch-10', fn: PDF, config: { parallelizePages: true, batchSize: 10 } },
  ];
  // All heavier strategies share a single gate: more than 50 pages.
  if (pages > 50) {
    methods.push(
      { name: 'batch-20', fn: PDF, config: { parallelizePages: true, batchSize: 20 } },
      { name: 'stream', fn: PDFStream, config: { chunkSize: 500, batchSize: 10 } },
      { name: 'aggressive', fn: PDFAggressive, config: { chunkSize: 500, batchSize: 20 } },
      { name: 'workers', fn: PDFWorkers, config: { chunkSize: 500, batchSize: 10, maxProcesses } },
      { name: 'processes', fn: PDFProcesses, config: { chunkSize: 500, batchSize: 10, maxProcesses } },
    );
  }

  for (const method of methods) {
    try {
      console.log(`\n⚡ Testing ${method.name}...`);
      const start = performance.now();
      const result = await method.fn(dataBuffer, {
        verbosityLevel: 0,
        ...method.config,
      });
      const duration = performance.now() - start;
      console.log(`✓ ${method.name}: ${duration.toFixed(2)}ms (${result.text.length.toLocaleString()} chars)`);
      results.push({
        file: fileName,
        pages,
        size: dataBuffer.length,
        method: method.name,
        config: method.config,
        duration,
        characters: result.text.length,
        success: true,
        cpuCores, // important for normalising timings
        availableMemory: os.freemem(),
        timestamp: Date.now(),
      });
    } catch (error) {
      console.error(`✗ ${method.name}: ${error.message}`);
      results.push({
        file: fileName,
        pages,
        size: dataBuffer.length,
        method: method.name,
        config: method.config,
        duration: 0,
        success: false,
        error: error.message,
        timestamp: Date.now(),
      });
    }
  }
  return results;
}
/**
 * Benchmark every PDF listed in `./test-pdfs.json` and persist the results.
 *
 * The list is read from the `urls` array of that JSON file and processed in
 * reverse order. Local entries that do not exist are skipped. After each file
 * the accumulated results are converted with `convertToSmartParserFormatData`
 * and written to `./smart-parser-benchmarks.json`, so a crash loses at most
 * the file in flight. A summary is printed at the end and the raw results are
 * passed to `analyzeResults`.
 *
 * Both JSON paths are relative, so they resolve against the current working
 * directory.
 *
 * @returns {Promise<void>}
 * @throws If the input list cannot be read or parsed, or the final save fails.
 */
async function runBenchmarks() {
  const testFilesData = JSON.parse(fs.readFileSync('./test-pdfs.json', 'utf8'));
  const testFiles = Array.isArray(testFilesData?.urls) ? testFilesData.urls : [];
  // Copy before reversing so the parsed input is left untouched.
  const queue = [...testFiles].reverse();
  const allResults = [];
  const outputFile = './smart-parser-benchmarks.json';
  const totalFiles = queue.length;
  let processedFiles = 0;
  let skippedFiles = 0;

  // Write everything collected so far in the smart-parser format.
  const saveResults = () => {
    const converted = convertToSmartParserFormatData(allResults);
    fs.writeFileSync(outputFile, JSON.stringify(converted, null, 2));
  };

  // Skipped files count as handled, otherwise the percentage never reaches
  // 100% and "remaining" never reaches 0 when anything was skipped.
  const logProgress = () => {
    const handled = processedFiles + skippedFiles;
    const percentComplete = ((handled / totalFiles) * 100).toFixed(1);
    console.log(`📊 Files: ${processedFiles}/${totalFiles} processed, ${skippedFiles} skipped (${percentComplete}%) - ${totalFiles - handled} remaining`);
  };

  console.log(`📁 Total files to process: ${totalFiles}\n`);
  for (const [index, file] of queue.entries()) {
    if (!isURL(file) && !fs.existsSync(file)) {
      skippedFiles++;
      console.log(`\n⏭️ Skipping ${file} (not found)`);
      console.log(` Progress: ${processedFiles}/${totalFiles} processed, ${skippedFiles} skipped\n`);
      continue;
    }

    // Count the file exactly once, whether it succeeds or fails.
    processedFiles++;
    try {
      const results = await benchmarkAllMethods(file);
      allResults.push(...results);
      saveResults();
      console.log();
      console.log(`💾 Progress saved (${allResults.length} results so far)`);
    } catch (error) {
      console.error(`\n❌ Fatal error processing ${file}:`);
      console.error(` ${error.message}`);
      console.log(` Continuing with next file...\n`);
    }
    logProgress();

    // Small pause between files; pointless after the last one.
    if (index < queue.length - 1) {
      await new Promise((resolve) => setTimeout(resolve, 1000));
    }
  }

  // Final save, so the output file exists even if nothing was processed.
  saveResults();
  console.log(`\n\n${'='.repeat(80)}`);
  console.log('✅ BENCHMARK COLLECTION COMPLETED');
  console.log('='.repeat(80));
  console.log(`📁 Total files: ${totalFiles}`);
  console.log(`✓ Processed: ${processedFiles}`);
  console.log(`⏭️ Skipped: ${skippedFiles}`);
  console.log(`📊 Total benchmarks: ${allResults.length}`);
  console.log(`💾 Results saved to: ${outputFile}`);
  console.log('='.repeat(80));

  analyzeResults(allResults);
}
/**
 * Print a human-readable analysis of benchmark results to the console.
 *
 * Only successful results are considered. Two reports are printed:
 *  1. the fastest method per file, compared against that file's `sequential`
 *     run when one succeeded;
 *  2. average / min / max duration per method, fastest average first.
 *
 * @param {object[]} results - Records with at least `success`, `file`,
 *   `method`, `duration` and `pages`.
 * @returns {void}
 */
function analyzeResults(results) {
  const successful = results.filter((r) => r.success);

  console.log(`\n${'='.repeat(80)}`);
  console.log('📈 Benchmark Analysis');
  console.log('='.repeat(80));

  // Group by file, keeping first-seen order.
  const byFile = new Map();
  for (const r of successful) {
    if (!byFile.has(r.file)) {
      byFile.set(r.file, []);
    }
    byFile.get(r.file).push(r);
  }

  console.log('\n🏆 Best Method Per File:');
  console.log('-'.repeat(80));
  for (const [file, fileResults] of byFile) {
    // Every group holds at least one entry, so reduce needs no seed.
    const best = fileResults.reduce((a, b) => (a.duration < b.duration ? a : b));
    const baseline = fileResults.find((r) => r.method === 'sequential');

    console.log(`\n${file}:`);
    console.log(` Pages: ${best.pages}`);
    console.log(` Best: ${best.method} - ${best.duration.toFixed(2)}ms`);
    if (!baseline) {
      // The sequential run failed for this file: still report the winner
      // instead of silently dropping the file from the report.
      console.log(' Baseline: sequential - not available');
      continue;
    }
    const improvement = ((baseline.duration - best.duration) / baseline.duration * 100).toFixed(2);
    const speedup = (baseline.duration / best.duration).toFixed(2);
    console.log(` Improvement: ${improvement}% faster (${speedup}x speedup)`);
    console.log(` Baseline: sequential - ${baseline.duration.toFixed(2)}ms`);
  }

  console.log(`\n\n📊 Method Performance Comparison:`);
  console.log('-'.repeat(80));
  const byMethod = new Map();
  for (const r of successful) {
    if (!byMethod.has(r.method)) {
      byMethod.set(r.method, []);
    }
    byMethod.get(r.method).push(r.duration);
  }

  const methodStats = [];
  for (const [method, durations] of byMethod) {
    const avg = durations.reduce((a, b) => a + b, 0) / durations.length;
    const min = Math.min(...durations);
    const max = Math.max(...durations);
    methodStats.push({ method, avg, min, max, count: durations.length });
  }
  methodStats.sort((a, b) => a.avg - b.avg);

  for (const stat of methodStats) {
    console.log(`\n${stat.method}:`);
    console.log(` Average: ${stat.avg.toFixed(2)}ms`);
    console.log(` Range: ${stat.min.toFixed(2)}ms - ${stat.max.toFixed(2)}ms`);
    console.log(` Samples: ${stat.count}`);
  }
}
/**
 * Convert raw benchmark records to the SmartPDFParser benchmark format.
 *
 * Failed records are dropped. Each remaining record is reduced to the fields
 * below, plus a `complexity` label from `estimateComplexity`.
 *
 * @param {object[]} results - Raw records with `success`, `timestamp`,
 *   `pages`, `size`, `method`, `config`, `duration` and, when recorded,
 *   `cpuCores` / `availableMemory`.
 * @returns {object[]} Converted records, in input order.
 */
function convertToSmartParserFormatData(results) {
  return results
    .filter((r) => r.success)
    .map((r) => ({
      timestamp: r.timestamp,
      pages: r.pages,
      size: r.size,
      complexity: estimateComplexity(r),
      method: r.method,
      config: r.config,
      duration: r.duration,
      success: true,
      // Keep the values captured when the measurement was taken. This
      // function is re-run on every save, so sampling the machine here would
      // overwrite them with conversion-time readings. Fall back to a live
      // reading only for records that carry no value.
      cpuCores: r.cpuCores ?? os.cpus().length,
      availableMemory: r.availableMemory ?? os.freemem(),
    }));
}
/**
 * Classify a benchmark record by its average number of bytes per page.
 *
 * @param {{size: number, pages: number}} result - Record to classify.
 * @returns {'simple'|'medium'|'complex'} `'simple'` below 10000 bytes/page,
 *   `'complex'` above 100000 bytes/page, `'medium'` otherwise.
 */
function estimateComplexity(result) {
  const { size, pages } = result;
  const perPage = size / pages;

  if (perPage > 100000) {
    return 'complex';
  }
  if (perPage < 10000) {
    return 'simple';
  }
  return 'medium';
}
// Script entry point: run the whole collection and exit with a non-zero
// status if any error escapes runBenchmarks.
runBenchmarks().catch((error) => {
  console.error('\n💥 Benchmark failed:', error);
  process.exit(1);
});