UNPKG

pdf-parse-new

Version:

Pure javascript cross-platform module to extract text from PDFs with AI-powered optimization and multi-core processing.

146 lines (118 loc) 4.36 kB
const PDFJS = require(`./pdf.js/v4.5.136/build/pdf.js`); function render_page(pageData) { //check documents https://mozilla.github.io/pdf.js/ //ret.text = ret.text ? ret.text : ""; let render_options = { //replaces all occurrences of whitespace with standard spaces (0x20). The default value is `false`. // Changed to true to fix issue #10 (ligature handling) normalizeWhitespace: true, //do not attempt to combine same line TextItem's. The default value is `false`. disableCombineTextItems: false } return pageData.getTextContent(render_options) .then(function (textContent) { let lastY, text = ''; // Y-coordinate tolerance for detecting line breaks (fixes sub-pixel differences) // See: https://github.com/simonegosetto/pdf-parse-new/issues/10 const Y_TOLERANCE = 1.0; //https://github.com/mozilla/pdf.js/issues/8963 //https://github.com/mozilla/pdf.js/issues/2140 //https://gist.github.com/hubgit/600ec0c224481e910d2a0f883a7b98e3 for (let item of textContent.items) { // Get current Y coordinate const currentY = item.transform[5]; // Detect line break with tolerance (fixes ligature issues) const isNewLine = lastY !== undefined && Math.abs(currentY - lastY) > Y_TOLERANCE; if (isNewLine) { text += '\n'; } // Add text content text += item.str; lastY = currentY; } //let strings = textContent.items.map(item => item.str); //let text = strings.join("\n"); //text = text.replace(/[ ]+/ig," "); //ret.text = `${ret.text} ${text} \n\n`; return text; }); } const DEFAULT_OPTIONS = { pagerender: render_page, max: 0, verbosityLevel: 5, // errors: 0, warnings: 1, infos: 5 parallelizePages: false, batchSize: 10 // Number of pages to process in parallel at once } async function PDF(dataBuffer, options) { let ret = { numpages: 0, numrender: 0, info: null, metadata: null, text: "", version: null }; if (typeof options == 'undefined') options = DEFAULT_OPTIONS; if (typeof options.pagerender != 'function') options.pagerender = DEFAULT_OPTIONS.pagerender; if (typeof options.max != 'number') options.max = DEFAULT_OPTIONS.max; if (typeof options.parallelizePages != 'boolean') options.parallelizePages = DEFAULT_OPTIONS.parallelizePages; if (typeof options.batchSize != 'number') options.batchSize = DEFAULT_OPTIONS.batchSize; /*if (typeof PDFJS === 'function') { PDFJS = await PDFJS(); }*/ // console.log(PDFJS); ret.version = PDFJS.version; // Disable workers to avoid yet another cross-origin issue (workers need // the URL of the script to be loaded, and dynamically loading a cross-origin // script does not work). PDFJS.disableWorker = true; let doc = await PDFJS.getDocument({ verbosity: options.verbosityLevel ?? DEFAULT_OPTIONS.verbosityLevel, data: new Uint8Array(dataBuffer), }).promise; ret.numpages = doc.numPages; let metaData = await doc.getMetadata().catch(function (err) { return null; }); ret.info = metaData ? metaData.info : null; ret.metadata = metaData ? metaData.metadata : null; let counter = options.max <= 0 ? doc.numPages : options.max; counter = counter > doc.numPages ? doc.numPages : counter; ret.text = ""; if (options.parallelizePages) { // Batch-parallelized page processing const pageTexts = []; const batchSize = options.batchSize; for (let i = 1; i <= counter; i += batchSize) { const batchEnd = Math.min(i + batchSize, counter + 1); const batchPromises = []; for (let j = i; j < batchEnd; j++) { batchPromises.push( doc.getPage(j) .then(pageData => options.pagerender(pageData)) .catch((err) => { return ""; }) ); } const batchResults = await Promise.all(batchPromises); pageTexts.push(...batchResults); } ret.text = pageTexts.join('\n\n'); } else { // Sequential page processing for (let i = 1; i <= counter; i++) { let pageText = await doc.getPage(i).then(pageData => options.pagerender(pageData)) .catch((err) => { return ""; }); ret.text = `${ret.text}\n\n${pageText}`; } } ret.numrender = counter; doc.destroy(); return ret; } module.exports = PDF; module.exports.DEFAULT_OPTIONS = DEFAULT_OPTIONS;