pdf-parse-new
Version:
Pure javascript cross-platform module to extract text from PDFs with AI-powered optimization and multi-core processing.
289 lines (246 loc) • 8.07 kB
JavaScript
const { fork } = require('child_process');
const os = require('os');
const path = require('path');
const PDFJS = require(`./pdf.js/v4.5.136/build/pdf.js`);
// Default render_page function
function render_page(pageData) {
let render_options = {
normalizeWhitespace: true,
disableCombineTextItems: false
}
return pageData.getTextContent(render_options)
.then(function (textContent) {
let lastY, text = '';
const Y_TOLERANCE = 1.0;
for (let item of textContent.items) {
const currentY = item.transform[5];
const isNewLine = lastY !== undefined && Math.abs(currentY - lastY) > Y_TOLERANCE;
if (isNewLine) {
text += '\n';
}
text += item.str;
lastY = currentY;
}
return text;
});
}
const DEFAULT_OPTIONS = {
pagerender: render_page,
pagerenderModule: null, // Path to custom render module for child processes
max: 0,
verbosityLevel: 0,
chunkSize: 500,
batchSize: 10,
maxProcesses: Math.max(1, os.cpus().length - 1),
processTimeout: 120000, // 2 minutes per process
onProgress: null
}
/**
* Parse PDF using child processes for true parallel processing
* Best for very large PDFs (1000+ pages)
* Each child process handles a chunk of pages independently
* More robust than worker threads - no PDF.js compatibility issues
*/
async function PDFProcesses(dataBuffer, options) {
let ret = {
numpages: 0,
numrender: 0,
info: null,
metadata: null,
text: "",
version: null
};
options = { ...DEFAULT_OPTIONS, ...options };
// Validate pagerender option
if (typeof options.pagerender != 'function') {
options.pagerender = DEFAULT_OPTIONS.pagerender;
}
ret.version = PDFJS.version;
// Get document info from main process
PDFJS.disableWorker = true;
let doc = await PDFJS.getDocument({
verbosity: options.verbosityLevel,
data: new Uint8Array(dataBuffer),
}).promise;
ret.numpages = doc.numPages;
let metaData = await doc.getMetadata().catch(function () {
return null;
});
ret.info = metaData ? metaData.info : null;
ret.metadata = metaData ? metaData.metadata : null;
let counter = options.max <= 0 ? doc.numPages : options.max;
counter = counter > doc.numPages ? doc.numPages : counter;
doc.destroy();
// Divide work into chunks
const chunks = [];
for (let i = 1; i <= counter; i += options.chunkSize) {
chunks.push({
start: i,
end: Math.min(i + options.chunkSize - 1, counter),
index: chunks.length
});
}
// Process chunks using child process pool
const results = await processChunksWithChildren(
dataBuffer,
chunks,
options
);
ret.text = results.join('\n\n');
ret.numrender = counter;
return ret;
}
/**
* Process chunks using a pool of child processes
*/
async function processChunksWithChildren(dataBuffer, chunks, options) {
const results = new Array(chunks.length);
const maxConcurrent = Math.min(options.maxProcesses, chunks.length);
let completedChunks = 0;
let activeProcesses = 0;
let chunkIndex = 0;
// Write buffer to temp file to avoid passing huge strings via IPC
// This fixes OOM issues with large PDFs and many processes
const tempDir = os.tmpdir();
const tempFile = path.join(tempDir, `pdf-parse-${Date.now()}-${Math.random().toString(36).substring(7)}.pdf`);
try {
require('fs').writeFileSync(tempFile, dataBuffer);
} catch (err) {
// Fallback to memory if file write fails (unlikely)
console.warn('[pdf-parse] Failed to write temp file, falling back to memory:', err.message);
}
// Helper to cleanup temp file
const cleanup = () => {
if (require('fs').existsSync(tempFile)) {
try {
require('fs').unlinkSync(tempFile);
} catch (e) { /* ignore */ }
}
};
return new Promise((resolve, reject) => {
function startChildProcess() {
if (chunkIndex >= chunks.length) {
if (activeProcesses === 0) {
cleanup();
resolve(results);
}
return;
}
const chunk = chunks[chunkIndex];
const currentIndex = chunkIndex;
chunkIndex++;
activeProcesses++;
// Fork child process with absolute path (npm-safe)
const childPath = path.join(__dirname, 'pdf-child.js');
const child = fork(childPath, [], {
stdio: ['inherit', 'inherit', 'inherit', 'ipc']
});
let messageReceived = false;
let timedOut = false;
// Set timeout for this process
const timeout = setTimeout(() => {
if (!messageReceived) {
timedOut = true;
child.kill('SIGKILL');
const errorMsg = `Child process timeout (chunk ${currentIndex + 1}/${chunks.length}) - exceeded ${options.processTimeout}ms`;
// Don't reject immediately, just fail this chunk?
// For now, consistent behavior with previous version: reject
cleanup();
reject(new Error(errorMsg));
}
}, options.processTimeout);
// Listen for response
child.on('message', (message) => {
clearTimeout(timeout);
messageReceived = true;
if (message.success) {
results[currentIndex] = message.text;
completedChunks++;
// Progress callback
if (options.onProgress) {
options.onProgress({
completedChunks,
totalChunks: chunks.length,
progress: (completedChunks / chunks.length * 100).toFixed(2)
});
}
// Give time for message to be fully processed before killing
setImmediate(() => {
if (child.connected) {
child.disconnect();
child.kill();
}
});
} else {
// Error case
child.disconnect();
child.kill();
const errorMsg = `Child process error (chunk ${currentIndex + 1}/${chunks.length}): ${message.error}`;
cleanup();
reject(new Error(errorMsg));
}
});
// Handle errors
child.on('error', (error) => {
clearTimeout(timeout);
if (!messageReceived && !timedOut) {
child.kill();
const errorMsg = `Child process runtime error (chunk ${currentIndex + 1}/${chunks.length}): ${error.message}`;
cleanup();
reject(new Error(errorMsg));
}
});
// Handle exit - this is where we decrement activeProcesses
child.on('exit', (code, signal) => {
clearTimeout(timeout);
// Always decrement when process exits
const wasActive = activeProcesses > 0;
if (wasActive) {
activeProcesses--;
}
// Handle different exit scenarios
if (messageReceived && (code === 0 || code === null)) {
// Success - start next if available, otherwise check if done
if (chunkIndex < chunks.length) {
startChildProcess();
} else if (activeProcesses === 0) {
// All done!
cleanup();
resolve(results);
}
} else if (code !== 0 && !messageReceived && !timedOut) {
// Error exit without message (and not timeout)
const errorMsg = `Child process exited with code ${code} (chunk ${currentIndex + 1}/${chunks.length})`;
cleanup();
reject(new Error(errorMsg));
} else if (activeProcesses === 0 && chunkIndex >= chunks.length) {
// All chunks assigned and all processes done
cleanup();
resolve(results);
}
});
// Send work to child
// Pass file path if available, otherwise fallback to base64 (legacy/fallback)
const payload = {
startPage: chunk.start,
endPage: chunk.end,
batchSize: options.batchSize,
verbosityLevel: options.verbosityLevel,
pagerenderModule: options.pagerenderModule // Path to custom render module
};
if (require('fs').existsSync(tempFile)) {
payload.pdfFilePath = tempFile;
} else {
// Fallback to base64 if temp file creation failed
payload.dataBuffer = dataBuffer.toString('base64');
}
child.send(payload);
}
// Start initial batch of child processes
for (let i = 0; i < maxConcurrent; i++) {
startChildProcess();
}
});
}
module.exports = PDFProcesses;
module.exports.DEFAULT_OPTIONS = DEFAULT_OPTIONS;