UNPKG

pdf-parse-new

Version:

Pure javascript cross-platform module to extract text from PDFs with AI-powered optimization and multi-core processing.

145 lines (120 loc) 3.78 kB
const PDFJS = require('./pdf.js/v4.5.136/build/pdf.js');
const fs = require('fs');

/**
 * Child process for parsing PDF pages
 * Communicates via IPC with parent process
 */

// Set once a failure report has been started, so overlapping failures
// (e.g. an uncaught exception followed by an unhandled rejection) do not
// send several error replies while the first one is still being delivered.
let failureReported = false;

/**
 * Default page renderer: concatenates the text items of one page and inserts
 * a line break whenever the Y coordinate of consecutive items differs by more
 * than a small tolerance.
 *
 * @param {object} pageData - page object exposing `getTextContent(options)`
 * @returns {Promise<string>} the extracted text of the page
 */
function render_page(pageData) {
  const render_options = {
    // Changed to true to fix issue #10 (ligature handling)
    normalizeWhitespace: true,
    disableCombineTextItems: false
  };

  return pageData.getTextContent(render_options).then(function (textContent) {
    // Y-coordinate tolerance for detecting line breaks (fixes issue #10)
    const Y_TOLERANCE = 1.0;

    let lastY;
    let text = '';

    for (const item of textContent.items) {
      const currentY = item.transform[5];
      const isNewLine = lastY !== undefined && Math.abs(currentY - lastY) > Y_TOLERANCE;

      if (isNewLine) {
        text += '\n';
      }

      text += item.str;
      lastY = currentY;
    }

    return text;
  });
}

/**
 * Picks the page render function: the one exported by `pagerenderModule`
 * (the module itself, its `default`, or its `render_page`), or the built-in
 * `render_page` when no module is given, loading fails, or the module does
 * not export a function.
 *
 * @param {string} [pagerenderModule] - module path handed to `require`
 * @returns {Function} render function taking a page and returning its text
 */
function resolvePageRenderer(pagerenderModule) {
  if (!pagerenderModule) {
    return render_page;
  }

  try {
    const customModule = require(pagerenderModule);
    const candidate = typeof customModule === 'function'
      ? customModule
      : customModule.default || customModule.render_page;

    if (typeof candidate === 'function') {
      return candidate;
    }

    // Without this check a non-function export would throw on every page and
    // be swallowed by the per-page catch, yielding an all-empty result.
    console.error('Failed to load custom pagerender module:', 'no render function exported');
  } catch (err) {
    console.error('Failed to load custom pagerender module:', err.message);
  }

  // Fall back to default render_page
  return render_page;
}

/**
 * Produces the PDF bytes either from a file on disk or from a base64 string.
 *
 * @param {string} [pdfFilePath] - file to read; takes precedence when set
 * @param {string} [dataBuffer] - base64-encoded PDF (legacy/fallback)
 * @returns {Uint8Array} raw PDF bytes
 * @throws {Error} when neither source is provided
 */
function loadPdfBytes(pdfFilePath, dataBuffer) {
  if (pdfFilePath) {
    // Read from temp file (memory efficient)
    return new Uint8Array(fs.readFileSync(pdfFilePath));
  }

  if (dataBuffer) {
    // Legacy/Fallback: Convert base64 back to buffer
    return new Uint8Array(Buffer.from(dataBuffer, 'base64'));
  }

  throw new Error('No PDF data provided (missing dataBuffer or pdfFilePath)');
}

/**
 * Sends `{ success: false, error, stack }` to the parent (when the IPC
 * channel is still connected) and exits with code 1.
 *
 * The exit is triggered from the `process.send` callback so the process does
 * not terminate before the message has been handed to the channel.
 *
 * @param {*} error - thrown value or rejection reason; need not be an Error
 * @param {string} [prefix] - text prepended to the reported error message
 */
function reportFailureAndExit(error, prefix = '') {
  const exit = () => process.exit(1);

  if (failureReported) {
    return;
  }
  failureReported = true;

  if (!process.connected || typeof process.send !== 'function') {
    exit();
    return;
  }

  // Rejection reasons and thrown values are not guaranteed to be Error
  // objects, so avoid dereferencing null/undefined.
  const hasMessage = error != null && typeof error.message === 'string';
  const failure = {
    success: false,
    error: `${prefix}${hasMessage ? error.message : String(error)}`,
    stack: error != null ? error.stack : undefined
  };

  try {
    // The callback runs whether or not the send succeeded.
    process.send(failure, exit);
  } catch (sendError) {
    exit();
  }
}

// Listen for messages from parent
process.on('message', async (message) => {
  try {
    const {
      dataBuffer,
      pdfFilePath,
      startPage,
      endPage,
      batchSize,
      verbosityLevel,
      pagerenderModule
    } = message;

    // Load custom render_page function from external module if provided
    const customRenderPage = resolvePageRenderer(pagerenderModule);
    const uint8Array = loadPdfBytes(pdfFilePath, dataBuffer);

    // Disable workers
    PDFJS.disableWorker = true;

    // Load PDF
    const doc = await PDFJS.getDocument({
      verbosity: verbosityLevel || 0,
      data: uint8Array,
      disableAutoFetch: true,
      disableStream: true,
      disableRange: true
    }).promise;

    // A zero or negative step would loop forever and a missing/NaN step would
    // skip every page, so anything but a positive integer becomes 1.
    const step = Number.isInteger(batchSize) && batchSize > 0 ? batchSize : 1;

    const pageTexts = [];

    // Process pages in batches (startPage..endPage, both inclusive)
    for (let i = startPage; i <= endPage; i += step) {
      const batchEnd = Math.min(i + step - 1, endPage);
      const batchPromises = [];

      for (let j = i; j <= batchEnd; j++) {
        batchPromises.push(
          doc.getPage(j)
            .then((pageData) => customRenderPage(pageData))
            // Best effort: a page that fails contributes an empty string
            // instead of failing the whole range.
            .catch(() => "")
        );
      }

      const batchResults = await Promise.all(batchPromises);
      pageTexts.push(...batchResults);
    }

    doc.destroy();

    // Send result back to parent
    process.send({
      success: true,
      text: pageTexts.join('\n\n'),
      pagesProcessed: pageTexts.length
    });

    // Let parent kill this process - don't exit manually
    // This prevents race conditions with message delivery
  } catch (error) {
    // Send error back to parent, then exit with error code
    reportFailureAndExit(error);
  }
});

// Handle unexpected errors
process.on('uncaughtException', (error) => {
  reportFailureAndExit(error, 'Uncaught exception: ');
});

process.on('unhandledRejection', (error) => {
  reportFailureAndExit(error, 'Unhandled rejection: ');
});