UNPKG

afpp

Version:

Async Fast PDF Parser for Node.js — dependency-light, TypeScript-first, production-ready.

244 lines 10.3 kB
"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.PROCESSING_TYPE = void 0; exports.parsePdfFile = parsePdfFile; exports.streamPdfFile = streamPdfFile; const promises_1 = require("node:fs/promises"); const node_os_1 = require("node:os"); const canvas_1 = require("@napi-rs/canvas"); const p_limit_1 = __importDefault(require("p-limit")); const pdf_mjs_1 = require("pdfjs-dist/legacy/build/pdf.mjs"); var PROCESSING_TYPE; (function (PROCESSING_TYPE) { PROCESSING_TYPE["IMAGE"] = "IMAGE"; PROCESSING_TYPE["MIXED"] = "MIXED"; PROCESSING_TYPE["TEXT"] = "TEXT"; })(PROCESSING_TYPE || (exports.PROCESSING_TYPE = PROCESSING_TYPE = {})); class PooledCanvasFactory { pool = []; maxSize; constructor(maxSize) { this.maxSize = maxSize; } create(width, height) { const existing = this.pool.pop(); if (existing) { this.reset(existing, width, height); return existing; } const canvas = (0, canvas_1.createCanvas)(width, height); const context = canvas.getContext('2d'); return { canvas, context }; } reset(canvasAndContext, width, height) { canvasAndContext.canvas.width = width; canvasAndContext.canvas.height = height; // Resizing implicitly clears the canvas in @napi-rs/canvas } destroy(canvasAndContext) { if (this.pool.length < this.maxSize) { this.pool.push(canvasAndContext); } // Otherwise let it GC — pool is at capacity } } const extractText = (items) => { const parts = []; for (const item of items) { if (item.str) parts.push(item.str); } return parts.join(' '); }; const processPdfPageTypeMixed = async (page, canvasFactory, pageNumber, pageCount, scale, encoding, callback) => { const textContent = await page.getTextContent({ includeMarkedContent: false, }); const items = textContent.items; if (items.length === 0) { const viewport = page.getViewport({ scale }); const canvasAndContext = canvasFactory.create(viewport.width, viewport.height); try { await page.render({ canvas: canvasAndContext.canvas, canvasContext: canvasAndContext.context, viewport, }).promise; //@ts-expect-error this should be fixed in release const imageBuffer = await canvasAndContext.canvas.encode(encoding); return callback(imageBuffer, pageNumber, pageCount); } finally { canvasFactory.destroy(canvasAndContext); } } return callback(extractText(items), pageNumber, pageCount); }; const processPdfPageTypeText = async (page) => { const textContent = await page.getTextContent({ includeMarkedContent: false, }); const items = textContent.items; return extractText(items); }; const processPdfPageTypeImage = async (page, canvasFactory, pageNumber, pageCount, scale, encoding) => { const viewport = page.getViewport({ scale }); const canvasAndContext = canvasFactory.create(viewport.width, viewport.height); try { await page.render({ canvas: canvasAndContext.canvas, canvasContext: canvasAndContext.context, viewport, }).promise; //@ts-expect-error this should be fixed in release const imageBuffer = await canvasAndContext.canvas.encode(encoding); return imageBuffer; } finally { canvasFactory.destroy(canvasAndContext); } }; const validateParameters = async (input, options) => { const documentInitParameters = {}; switch (true) { case typeof input === 'string': documentInitParameters.data = new Uint8Array(await (0, promises_1.readFile)(input)); break; case Buffer.isBuffer(input): documentInitParameters.data = new Uint8Array(input); break; case input instanceof Uint8Array: documentInitParameters.data = input; break; case input instanceof URL: documentInitParameters.url = input; break; default: throw new Error(`Invalid source type: ${typeof input}`); } documentInitParameters.password = options?.password; documentInitParameters.verbosity = pdf_mjs_1.VerbosityLevel.ERRORS; // Performance optimizations for local file processing documentInitParameters.disableAutoFetch = true; // Don't prefetch - we have full data documentInitParameters.disableStream = true; // Don't stream - we have full data documentInitParameters.disableRange = true; // Don't use range requests - we have full data const scale = options?.scale ?? 1.0; if (Number.isNaN(scale) || scale < 0.1 || scale > 10) { throw new Error(`Invalid scale value: ${scale}. Must be a number between 0.1 and 10.`); } const concurrency = options?.concurrency === 'auto' ? Math.min((0, node_os_1.availableParallelism)(), 8) : (options?.concurrency ?? 1); if (typeof concurrency === 'number' && (!Number.isInteger(concurrency) || concurrency < 1)) { throw new Error(`Invalid concurrency value: ${concurrency}. Must be a positive integer or 'auto'.`); } const encoding = options?.imageEncoding ?? 'png'; if (!['avif', 'jpeg', 'png', 'webp'].includes(encoding)) { throw new Error(`Unsupported image encoding format: '${encoding}'`); } return { concurrency, documentInitParameters, encoding, scale }; }; async function parsePdfFile(type, input, options, callback) { const { concurrency, documentInitParameters, encoding, scale } = await validateParameters(input, options); const pooledFactory = type !== PROCESSING_TYPE.TEXT ? new PooledCanvasFactory(concurrency) : undefined; if (pooledFactory) { // @ts-expect-error - PooledCanvasFactory is structurally compatible with pdfjs BaseCanvasFactory documentInitParameters.canvasFactory = pooledFactory; } const limit = (0, p_limit_1.default)(concurrency); const loadingTask = (0, pdf_mjs_1.getDocument)(documentInitParameters); const pdfDocument = await loadingTask.promise; try { const { numPages } = pdfDocument; if (type === PROCESSING_TYPE.MIXED) { if (!callback || typeof callback !== 'function') { throw new Error(`Invalid callback type: ${typeof callback}`); } const results = Array.from({ length: numPages }); const pageTasks = Array.from({ length: numPages }, (_, i) => { const pageNum = i + 1; return limit(async () => { const page = await pdfDocument.getPage(pageNum); const canvasFactory = pooledFactory ?? pdfDocument.canvasFactory; const result = await processPdfPageTypeMixed(page, canvasFactory, pageNum, numPages, scale, encoding, callback); results[i] = result; page.cleanup(); }); }); await Promise.all(pageTasks); return results; } if (type === PROCESSING_TYPE.TEXT) { const results = Array.from({ length: numPages }); const pageTasks = Array.from({ length: numPages }, (_, i) => { const pageNum = i + 1; return limit(async () => { const page = await pdfDocument.getPage(pageNum); results[i] = await processPdfPageTypeText(page); page.cleanup(); }); }); await Promise.all(pageTasks); return results; } if (type === PROCESSING_TYPE.IMAGE) { const results = Array.from({ length: numPages }); const pageTasks = Array.from({ length: numPages }, (_, i) => { const pageNum = i + 1; return limit(async () => { const page = await pdfDocument.getPage(pageNum); const canvasFactory = pooledFactory ?? pdfDocument.canvasFactory; results[i] = await processPdfPageTypeImage(page, canvasFactory, pageNum, numPages, scale, encoding); page.cleanup(); }); }); await Promise.all(pageTasks); return results; } throw new Error('Invalid PROCESSING_TYPE'); } finally { // Clean up pdfjs resources to prevent memory leaks pdfDocument.cleanup(); await pdfDocument.destroy(); loadingTask.destroy(); } } async function* streamPdfFile(type, input, options) { const { documentInitParameters, encoding, scale } = await validateParameters(input, options); const pooledFactory = type === PROCESSING_TYPE.IMAGE ? new PooledCanvasFactory(1) : undefined; if (pooledFactory) { // @ts-expect-error - PooledCanvasFactory is structurally compatible with pdfjs BaseCanvasFactory documentInitParameters.canvasFactory = pooledFactory; } const loadingTask = (0, pdf_mjs_1.getDocument)(documentInitParameters); const pdfDocument = await loadingTask.promise; try { const { numPages } = pdfDocument; for (let pageNum = 1; pageNum <= numPages; pageNum++) { const page = await pdfDocument.getPage(pageNum); if (type === PROCESSING_TYPE.IMAGE) { const canvasFactory = pooledFactory ?? pdfDocument.canvasFactory; const data = await processPdfPageTypeImage(page, canvasFactory, pageNum, numPages, scale, encoding); yield { data, pageCount: numPages, pageNumber: pageNum }; } else { const data = await processPdfPageTypeText(page); yield { data, pageCount: numPages, pageNumber: pageNum }; } page.cleanup(); } } finally { pdfDocument.cleanup(); await pdfDocument.destroy(); loadingTask.destroy(); } } //# sourceMappingURL=core.js.map