pdf-parse-new
Version:
Pure JavaScript, cross-platform module to extract text from PDFs with AI-powered optimization and multi-core processing.
79 lines (59 loc) • 2.42 kB
JavaScript
/**
* Example: Child Processes
* Multi-processing for maximum performance on huge PDFs
*/
const fs = require('fs');
const os = require('os');
const path = require('path');
const PDFProcesses = require('../../lib/pdf-parse-processes');
/**
 * Builds a fixed-width text progress bar.
 *
 * The input is clamped to the 0-100 range (non-numeric values count as 0) so
 * that String.prototype.repeat is never called with a negative count, which
 * would throw a RangeError.
 *
 * @param {number} progress - Completion percentage.
 * @param {number} [width=50] - Total bar width in characters (non-negative integer).
 * @returns {string} `width` characters: filled cells followed by empty cells.
 */
function renderProgressBar(progress, width = 50) {
  const numeric = Number(progress);
  const percent = Number.isFinite(numeric) ? Math.min(100, Math.max(0, numeric)) : 0;
  const filled = Math.floor((percent * width) / 100);
  return `${'█'.repeat(filled)}${'░'.repeat(width - filled)}`;
}

/**
 * Example entry point: parses a large PDF with the child-process based parser
 * and prints timing and size statistics.
 *
 * When the large test file is missing, delegates to alternativeTest().
 * On a parse failure, logs the error message and exits with status 1.
 *
 * @returns {Promise<void>}
 */
async function processesParse() {
  console.log('=== Child Processes Example ===\n');

  // Resolve against this script's directory (like the require() calls above)
  // so the example does not depend on the current working directory.
  const testFile = path.resolve(__dirname, '../data/test_9000.pdf');

  if (!fs.existsSync(testFile)) {
    console.log('⚠️ Large test file not found:', testFile);
    console.log(' This example needs a large PDF (1000+ pages)');
    console.log(' Using smaller test file instead...\n');
    return alternativeTest();
  }

  const dataBuffer = fs.readFileSync(testFile);
  const cpuCores = os.cpus().length;

  console.log(`File: ${testFile}`);
  console.log(`Size: ${(dataBuffer.length / 1024 / 1024).toFixed(2)} MB`);
  console.log(`CPU Cores: ${cpuCores}\n`);

  try {
    console.log('Starting processes parse...');
    console.log('Progress:\n');

    const start = Date.now();
    const result = await PDFProcesses(dataBuffer, {
      chunkSize: 500,
      batchSize: 10,
      // Deliberately oversubscribe the CPU; never request fewer than one
      // process, even if the core count is reported as 0.
      maxProcesses: Math.max(1, Math.floor(cpuCores * 1.5)),
      onProgress: ({ completedChunks, totalChunks, progress }) => {
        // '\r' rewinds to the start of the line so the bar redraws in place.
        process.stdout.write(
          `\r${renderProgressBar(progress)} ${progress}% (${completedChunks}/${totalChunks} chunks)`
        );
      }
    });
    const duration = Date.now() - start;

    console.log('\n\n✅ Success!');
    console.log(`Duration: ${duration}ms`);
    console.log(`Pages: ${result.numpages}`);
    console.log(`Characters: ${result.text.length.toLocaleString()}`);
    console.log(`\n💡 Processes are often faster than workers for huge PDFs!\n`);
  } catch (error) {
    console.error('\n❌ Error:', error.message);
    process.exit(1);
  }
}
/**
 * Fallback example: parses a small bundled PDF with two child processes and
 * prints the elapsed time and page count.
 *
 * @returns {Promise<void>} Rejects if the file cannot be read or parsing fails.
 */
async function alternativeTest() {
  // Anchor the path to this script's directory rather than the current
  // working directory, so the example can be launched from anywhere.
  // NOTE(review): assumes the project root is two levels up (as the
  // '../../lib' require implies) and that it contains test/data/ — confirm.
  const testFile = path.resolve(__dirname, '../../test/data/01-valid.pdf');
  const dataBuffer = fs.readFileSync(testFile);
  const cpuCores = os.cpus().length;

  console.log(`File: ${testFile} (small test)`);
  console.log(`CPU Cores: ${cpuCores}\n`);

  const start = Date.now();
  const result = await PDFProcesses(dataBuffer, { maxProcesses: 2 });
  const duration = Date.now() - start;

  console.log(`✅ Completed in ${duration}ms`);
  console.log(`Pages: ${result.numpages}`);
  console.log(`\n💡 Processes work best on PDFs with 1000+ pages!\n`);
}
// Run the example. processesParse() returns the fallback's promise before
// entering its own try/catch, so a failure there would otherwise surface as
// an unhandled rejection; report it here and signal failure via the exit code.
processesParse().catch((error) => {
  console.error('\n❌ Error:', error instanceof Error ? error.message : error);
  process.exitCode = 1;
});