UNPKG

afpp

Version:

another f*cking pdf parser

141 lines 6.03 kB
"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.PROCESSING_TYPE = void 0; exports.parsePdfFile = parsePdfFile; var PROCESSING_TYPE; (function (PROCESSING_TYPE) { PROCESSING_TYPE["IMAGE"] = "IMAGE"; PROCESSING_TYPE["MIXED"] = "MIXED"; PROCESSING_TYPE["TEXT"] = "TEXT"; })(PROCESSING_TYPE || (exports.PROCESSING_TYPE = PROCESSING_TYPE = {})); const promises_1 = require("node:fs/promises"); const p_limit_1 = __importDefault(require("p-limit")); const pdf_mjs_1 = require("pdfjs-dist/legacy/build/pdf.mjs"); const processPdfPageTypeMixed = async (page, canvasFactory, pageNumber, pageCount, scale, encoding, callback) => { const textContent = await page.getTextContent({ includeMarkedContent: false, }); const items = textContent.items; if (items.length === 0) { const viewport = page.getViewport({ scale }); const canvasAndContext = canvasFactory.create(viewport.width, viewport.height); await page.render({ canvas: canvasAndContext.canvas, canvasContext: canvasAndContext.context, viewport, }).promise; //@ts-expect-error this should be fixed in release const imageBuffer = await canvasAndContext.canvas.encode(encoding); canvasFactory.destroy(canvasAndContext); return callback(imageBuffer, pageNumber, pageCount); } const pageText = items.map((item) => item.str || '').join(' '); return callback(pageText, pageNumber, pageCount); }; const processPdfPageTypeText = async (page) => { const textContent = await page.getTextContent({ includeMarkedContent: false, }); const items = textContent.items; if (items.length === 0) { return ''; } else { return items.map((item) => item.str || '').join(' '); } }; const processPdfPageTypeImage = async (page, canvasFactory, pageNumber, pageCount, scale, encoding) => { const viewport = page.getViewport({ scale }); const canvasAndContext = canvasFactory.create(viewport.width, viewport.height); await page.render({ canvas: canvasAndContext.canvas, canvasContext: canvasAndContext.context, viewport, }).promise; //@ts-expect-error this should be fixed in release const imageBuffer = await canvasAndContext.canvas.encode(encoding); canvasFactory.destroy(canvasAndContext); return imageBuffer; }; const validateParameters = async (input, options) => { const documentInitParameters = {}; switch (true) { case typeof input === 'string': documentInitParameters.data = new Uint8Array(await (0, promises_1.readFile)(input)); break; case Buffer.isBuffer(input): documentInitParameters.data = new Uint8Array(input); break; case input instanceof Uint8Array: documentInitParameters.data = input; break; case input instanceof URL: documentInitParameters.url = input; break; default: throw new Error(`Invalid source type: ${typeof input}`); } documentInitParameters.password = options?.password; documentInitParameters.verbosity = pdf_mjs_1.VerbosityLevel.ERRORS; const scale = options?.scale ?? 2.0; const concurrency = options?.concurrency ?? 1; const encoding = options?.imageEncoding ?? 'png'; if (!['avif', 'jpeg', 'png', 'webp'].includes(encoding)) { throw new Error(`Unsupported image encoding format: '${encoding}'`); } return { concurrency, documentInitParameters, encoding, scale }; }; async function parsePdfFile(type, input, options, callback) { const { concurrency, documentInitParameters, encoding, scale } = await validateParameters(input, options); const limit = (0, p_limit_1.default)(concurrency); const loadingTask = (0, pdf_mjs_1.getDocument)(documentInitParameters); const pdfDocument = await loadingTask.promise; const { numPages } = pdfDocument; if (type === PROCESSING_TYPE.MIXED) { if (!callback || typeof callback !== 'function') { throw new Error(`Invalid callback type: ${typeof callback}`); } const results = new Array(numPages); const pageTasks = Array.from({ length: numPages }, (_, i) => { const pageNum = i + 1; return limit(async () => { const page = await pdfDocument.getPage(pageNum); const canvasFactory = pdfDocument.canvasFactory; const result = await processPdfPageTypeMixed(page, canvasFactory, pageNum, numPages, scale, encoding, callback); results[i] = result; }); }); await Promise.all(pageTasks); return results; } if (type === PROCESSING_TYPE.TEXT) { const results = new Array(numPages); const pageTasks = Array.from({ length: numPages }, (_, i) => { const pageNum = i + 1; return limit(async () => { const page = await pdfDocument.getPage(pageNum); results[i] = await processPdfPageTypeText(page); }); }); await Promise.all(pageTasks); return results; } if (type === PROCESSING_TYPE.IMAGE) { const results = new Array(numPages); const pageTasks = Array.from({ length: numPages }, (_, i) => { const pageNum = i + 1; return limit(async () => { const page = await pdfDocument.getPage(pageNum); const canvasFactory = pdfDocument.canvasFactory; results[i] = await processPdfPageTypeImage(page, canvasFactory, pageNum, numPages, scale, encoding); }); }); await Promise.all(pageTasks); return results; } throw new Error('Invalid PROCESSING_TYPE'); } //# sourceMappingURL=core.js.map