pdf-parse-new
Version:
Pure javascript cross-platform module to extract text from PDFs with AI-powered optimization and multi-core processing.
175 lines (146 loc) • 4.83 kB
JavaScript
const { parentPort, workerData } = require('worker_threads');
// Set up minimal environment BEFORE loading PDF.js
if (!globalThis.DOMParser) {
globalThis.DOMParser = class DOMParser {
parseFromString() { return { documentElement: { textContent: '' } }; }
};
}
if (!globalThis.navigator) {
globalThis.navigator = { userAgent: 'Node.js' };
}
// Load PDF.js
const PDFJS = require(`./pdf.js/v4.5.136/build/pdf.js`);
function render_page(pageData) {
let render_options = {
// Changed to true to fix issue #10 (ligature handling)
normalizeWhitespace: true,
disableCombineTextItems: false
}
return pageData.getTextContent(render_options)
.then(function (textContent) {
let lastY, text = '';
// Y-coordinate tolerance for detecting line breaks (fixes issue #10)
const Y_TOLERANCE = 1.0;
for (let item of textContent.items) {
const currentY = item.transform[5];
const isNewLine = lastY !== undefined && Math.abs(currentY - lastY) > Y_TOLERANCE;
if (isNewLine) {
text += '\n';
}
text += item.str;
lastY = currentY;
}
return text;
});
}
/**
* Worker thread for processing a chunk of PDF pages
*/
(async () => {
try {
// Validate worker data
if (!workerData) {
throw new Error('No workerData received');
}
const { dataBuffer, startPage, endPage, batchSize, verbosityLevel, pagerenderModule } = workerData;
if (!dataBuffer) {
throw new Error('No dataBuffer in workerData');
}
// Load custom render_page function from external module if provided
let customRenderPage = render_page; // Default
if (pagerenderModule) {
try {
// Load the custom render function from the provided module path
const customModule = require(pagerenderModule);
customRenderPage = typeof customModule === 'function' ? customModule : customModule.default || customModule.render_page;
} catch (err) {
console.error('Failed to load custom pagerender module:', err.message);
// Fall back to default render_page
}
}
// Convert buffer to Uint8Array (handle both Buffer and ArrayBuffer)
let uint8Array;
if (dataBuffer instanceof Uint8Array) {
uint8Array = dataBuffer;
} else if (dataBuffer instanceof ArrayBuffer) {
uint8Array = new Uint8Array(dataBuffer);
} else if (Buffer.isBuffer(dataBuffer)) {
uint8Array = new Uint8Array(dataBuffer);
} else if (Array.isArray(dataBuffer)) {
// Plain array from main thread - convert to Uint8Array
uint8Array = new Uint8Array(dataBuffer);
} else {
throw new Error(`Unsupported dataBuffer type: ${typeof dataBuffer}`);
}
if (!uint8Array || uint8Array.length === 0) {
throw new Error('Empty or invalid buffer data');
}
// Disable internal workers - CRITICAL for worker threads
PDFJS.disableWorker = true;
// Create document with explicit worker disabled
const doc = await PDFJS.getDocument({
verbosity: verbosityLevel || 0,
data: uint8Array,
disableAutoFetch: true,
disableStream: true,
disableRange: true,
// Don't use workers inside workers
worker: null
}).promise;
const pageTexts = [];
// Process pages in batches within this worker
for (let i = startPage; i <= endPage; i += batchSize) {
const batchEnd = Math.min(i + batchSize - 1, endPage);
const batchPromises = [];
for (let j = i; j <= batchEnd; j++) {
batchPromises.push(
doc.getPage(j)
.then(pageData => customRenderPage(pageData))
.catch((err) => {
console.error(`Error rendering page ${j}:`, err.message);
return "";
})
);
}
const batchResults = await Promise.all(batchPromises);
pageTexts.push(...batchResults);
}
doc.destroy();
// Send result back to main thread
parentPort.postMessage({
success: true,
text: pageTexts.join('\n\n'),
pagesProcessed: pageTexts.length
});
} catch (error) {
// Log error to console for debugging
console.error('Worker error:', error.message);
console.error('Stack:', error.stack);
try {
// Try to send error message
parentPort.postMessage({
success: false,
error: error.message,
stack: error.stack
});
} catch (postError) {
console.error('Failed to post error message:', postError.message);
}
// Exit cleanly - important: use 0 not 1
process.exit(0);
}
})().catch(error => {
// Catch any unhandled promise rejections
console.error('Unhandled worker error:', error.message);
console.error('Stack:', error.stack);
try {
parentPort.postMessage({
success: false,
error: error.message,
stack: error.stack
});
} catch (e) {
console.error('Failed to post error:', e.message);
}
process.exit(0);
});