UNPKG

pdf-parse-new

Version:

Pure javascript cross-platform module to extract text from PDFs with AI-powered optimization and multi-core processing.

github.com/simonegosetto/pdf-parse-new

simonegosetto/pdf-parse-new

141 lines (112 loc) • 3.86 kB

JavaScript

const PDFJS = require(`./pdf.js/v4.5.136/build/pdf.js`);

/**
 * Default page renderer: joins the text items of a single page into one string.
 *
 * A newline is inserted whenever the Y coordinate (transform[5]) of an item
 * differs from the previous item's by more than Y_TOLERANCE; otherwise item
 * strings are concatenated directly with no separator.
 *
 * @param {object} pageData - Page object exposing `getTextContent(options)`.
 * @returns {Promise<string>} Resolves to the extracted text of the page.
 */
function render_page(pageData) {
  const render_options = {
    // Changed to true to fix issue #10 (ligature handling)
    normalizeWhitespace: true,
    disableCombineTextItems: false
  };

  return pageData.getTextContent(render_options)
    .then(function (textContent) {
      // Y-coordinate tolerance for detecting line breaks (fixes issue #10)
      const Y_TOLERANCE = 1.0;
      let lastY;
      let text = '';

      for (const item of textContent.items) {
        const currentY = item.transform[5];
        const isNewLine = lastY !== undefined && Math.abs(currentY - lastY) > Y_TOLERANCE;

        if (isNewLine) {
          text += '\n';
        }
        text += item.str;
        lastY = currentY;
      }

      return text;
    });
}

/**
 * Option defaults merged underneath whatever the caller passes to PDFAggressive.
 *
 * - pagerender:       function(pageData) -> Promise<string> used for every page
 * - max:              maximum number of pages to render; <= 0 means all pages
 * - verbosityLevel:   forwarded to PDFJS.getDocument as `verbosity`
 * - chunkSize:        pages per chunk; chunks are awaited one after another
 * - batchSize:        pages per batch; all batches of a chunk are started together
 * - concurrentChunks: kept for API compatibility; not read by PDFAggressive in this file
 * - onChunkComplete:  optional progress callback invoked after each chunk
 */
const DEFAULT_OPTIONS = {
  pagerender: render_page,
  max: 0,
  verbosityLevel: 0,
  chunkSize: 500,
  batchSize: 10,
  concurrentChunks: 1, // Process chunks one at a time but with aggressive batching
  onChunkComplete: null
};

/**
 * Coerce a size option to a positive integer.
 *
 * The chunk/batch loops advance by these values, so zero, negative, NaN or
 * non-numeric input would otherwise make them spin forever.
 *
 * @param {*} value - Caller-supplied option value.
 * @param {number} fallback - Value returned when `value` is not usable.
 * @returns {number} `Math.floor(Number(value))` when that is finite and >= 1, else `fallback`.
 */
function toPositiveInt(value, fallback) {
  const n = Math.floor(Number(value));
  return Number.isFinite(n) && n >= 1 ? n : fallback;
}

/**
 * Parse PDF with aggressive parallelization for huge files
 * Uses larger batches and more aggressive concurrency
 * Best for 1000+ page PDFs
 *
 * Pages are grouped into chunks of `chunkSize`; inside a chunk every page
 * request is issued up front (grouped into `batchSize`-sized Promise.all
 * batches) and the chunk is awaited as a whole before the next one starts.
 * A page whose loading or rendering rejects contributes an empty string
 * instead of failing the whole parse, and is still counted in `numrender`.
 *
 * @param {*} dataBuffer - PDF bytes; wrapped in `new Uint8Array(dataBuffer)`.
 * @param {object} [options] - Overrides for DEFAULT_OPTIONS.
 * @returns {Promise<{numpages: number, numrender: number, info: *, metadata: *, text: string, version: *}>}
 *   Page texts are joined with a blank line ('\n\n') between pages.
 * @throws Rejects if the document cannot be opened, or if `pagerender` /
 *   `onChunkComplete` throw synchronously; the document is released in either case
 *   once it has been opened.
 */
async function PDFAggressive(dataBuffer, options) {
  const ret = {
    numpages: 0,
    numrender: 0,
    info: null,
    metadata: null,
    text: "",
    version: null
  };

  // Merge options with defaults (into a fresh object; the caller's object is untouched)
  const settings = { ...DEFAULT_OPTIONS, ...options };

  if (typeof settings.pagerender !== 'function') {
    settings.pagerender = DEFAULT_OPTIONS.pagerender;
  }

  // Guard the loop strides: a non-positive stride would never terminate.
  const chunkSize = toPositiveInt(settings.chunkSize, DEFAULT_OPTIONS.chunkSize);
  const batchSize = toPositiveInt(settings.batchSize, DEFAULT_OPTIONS.batchSize);

  ret.version = PDFJS.version;

  PDFJS.disableWorker = true;

  const doc = await PDFJS.getDocument({
    verbosity: settings.verbosityLevel,
    data: new Uint8Array(dataBuffer),
  }).promise;

  try {
    ret.numpages = doc.numPages;

    // Metadata is optional: a failure here must not abort text extraction.
    const metaData = await doc.getMetadata().catch(function () {
      return null;
    });

    ret.info = metaData ? metaData.info : null;
    ret.metadata = metaData ? metaData.metadata : null;

    // Page limit: a non-finite or non-positive `max` means "all pages";
    // otherwise clamp to the document length.
    const requestedMax = Number(settings.max);
    const counter = Number.isFinite(requestedMax) && requestedMax > 0
      ? Math.min(Math.floor(requestedMax), doc.numPages)
      : doc.numPages;

    const textChunks = [];
    let processedPages = 0;

    // Process in large chunks with aggressive batching
    for (let chunkStart = 1; chunkStart <= counter; chunkStart += chunkSize) {
      const chunkEnd = Math.min(chunkStart + chunkSize - 1, counter);

      // Process entire chunk in parallel (more aggressive than stream)
      const chunkPromises = [];

      for (let pageNum = chunkStart; pageNum <= chunkEnd; pageNum += batchSize) {
        const batchEnd = Math.min(pageNum + batchSize - 1, chunkEnd);

        // Create a batch
        const batchPromises = [];
        for (let i = pageNum; i <= batchEnd; i++) {
          batchPromises.push(
            doc.getPage(i)
              .then(pageData => settings.pagerender(pageData))
              // Best effort: a page that fails yields empty text rather than rejecting the parse.
              .catch(() => "")
          );
        }

        // Add batch to chunk (all batches in chunk run in parallel)
        chunkPromises.push(Promise.all(batchPromises));
      }

      // Wait for all batches in this chunk
      const chunkBatchResults = await Promise.all(chunkPromises);
      const chunkTexts = chunkBatchResults.flat();

      textChunks.push(chunkTexts.join('\n\n'));
      processedPages += chunkTexts.length;

      // Progress callback (`progress` is a string with two decimals, e.g. "42.50")
      if (typeof settings.onChunkComplete === 'function') {
        settings.onChunkComplete({
          processedPages,
          totalPages: counter,
          progress: (processedPages / counter * 100).toFixed(2),
          currentChunk: Math.ceil(chunkStart / chunkSize),
          totalChunks: Math.ceil(counter / chunkSize)
        });
      }

      // Force garbage collection between chunks if available
      // (skipped after the final chunk, where nothing further is allocated)
      if (global.gc && chunkStart + chunkSize <= counter) {
        global.gc();
      }
    }

    ret.text = textChunks.join('\n\n');
    ret.numrender = processedPages;
  } finally {
    // Always release the document, including when a renderer or the progress
    // callback throws. The call's result is not awaited, so the parse result
    // is returned without waiting for teardown.
    doc.destroy();
  }

  return ret;
}

module.exports = PDFAggressive;
module.exports.DEFAULT_OPTIONS = DEFAULT_OPTIONS;