UNPKG

unpdf

Version:

PDF extraction and rendering across all JavaScript runtimes

255 lines (247 loc) 8.3 kB
/** Cached PDF.js module; assigned by `resolvePDFJSImport`. */
let resolvedModule;

const isNode = globalThis.process?.release?.name === "node";
const isBrowser = typeof window !== "undefined";

/**
 * Opens a PDF through the resolved PDF.js module and returns the loaded document.
 *
 * @param data - Value forwarded to PDF.js `getDocument` as its `data` parameter.
 * @param {object} [options] - Additional `getDocument` parameters. They are spread
 *   last, so they override the defaults set here.
 * @returns {Promise<object>} The value the loading task's `promise` resolves to.
 */
async function getDocumentProxy(data, options = {}) {
  const { getDocument } = await getResolvedPDFJS();
  const pdf = await getDocument({
    data,
    isEvalSupported: false,
    // See: https://github.com/mozilla/pdf.js/issues/4244#issuecomment-1479534301
    useSystemFonts: true,
    ...options
  }).promise;
  return pdf;
}

/**
 * Returns the cached PDF.js module, resolving the default import first if
 * nothing has been cached yet.
 *
 * @returns {Promise<object>} The resolved PDF.js module.
 */
async function getResolvedPDFJS() {
  if (!resolvedModule) {
    await resolvePDFJSImport();
  }
  return resolvedModule;
}

/**
 * Resolves and caches the PDF.js module.
 *
 * @param {Function} [pdfjsResolver] - Called with no arguments; its (possibly
 *   promised) result is unwrapped with `interopDefault` and cached. When omitted,
 *   `unpdf/pdfjs` is imported dynamically instead.
 * @param {object} [options]
 * @param {boolean} [options.reload=false] - Resolve again even if a module is
 *   already cached.
 * @returns {Promise<void>}
 * @throws {Error} When the resolver or the dynamic import fails. The original
 *   failure is attached as `cause`.
 */
async function resolvePDFJSImport(pdfjsResolver, { reload = false } = {}) {
  if (resolvedModule && !reload) {
    return;
  }
  if (pdfjsResolver) {
    try {
      resolvedModule = await interopDefault(pdfjsResolver());
      return;
    } catch (error) {
      throw new Error(`PDF.js could not be resolved: ${error}`, { cause: error });
    }
  }
  try {
    resolvedModule = await import('unpdf/pdfjs');
  } catch (error) {
    throw new Error(`Serverless PDF.js bundle could not be resolved: ${error}`, { cause: error });
  }
}

/**
 * Duck-type check used to tell an already opened document from raw PDF data.
 *
 * @param data - Any value.
 * @returns {boolean} `true` when `data` is a non-null object with a `_pdfInfo` property.
 */
function isPDFDocumentProxy(data) {
  return typeof data === "object" && data !== null && "_pdfInfo" in data;
}

/**
 * Awaits a module (or module promise) and unwraps its `default` export if it has a truthy one.
 *
 * @param m - Module namespace object or a promise for one.
 * @returns {Promise<object>} `default` when truthy, otherwise the awaited value itself.
 */
async function interopDefault(m) {
  const resolved = await m;
  return resolved.default || resolved;
}

/** Cached canvas module; assigned by `resolveCanvasModule`. */
let resolvedCanvasModule;

/**
 * Derived from the PDF.js project by the Mozilla Foundation.
 * @see https://github.com/mozilla/pdf.js/blob/b8de9a372f9bbf7e33adb362eeae5ef1919dba73/src/display/canvas_factory.js#L18
 * @license Apache-2.0
 */
class BaseCanvasFactory {
  #enableHWA = false;

  /**
   * @param {object} [options]
   * @param {boolean} [options.enableHWA=false] - When `false`, 2D contexts are
   *   requested with `willReadFrequently: true`.
   */
  constructor({ enableHWA = false } = {}) {
    this.#enableHWA = enableHWA;
  }

  /**
   * Creates a canvas via `_createCanvas` and pairs it with its 2D context.
   *
   * @param {number} width
   * @param {number} height
   * @returns {{ canvas: object, context: object }}
   */
  create(width, height) {
    const canvas = this._createCanvas(width, height);
    return {
      canvas,
      context: canvas.getContext("2d", {
        willReadFrequently: !this.#enableHWA
      })
    };
  }

  /**
   * Resizes the canvas of an existing canvas/context pair.
   *
   * @param {{ canvas: object }} canvasAndContext
   * @param {number} width
   * @param {number} height
   * @throws {Error} When the pair has no canvas.
   */
  reset({ canvas }, width, height) {
    if (!canvas) {
      throw new Error("Canvas is not specified");
    }
    canvas.width = width;
    canvas.height = height;
  }

  /**
   * Zeroes the canvas dimensions and clears both references on the pair.
   *
   * @param {{ canvas: object, context: object }} context - The canvas/context pair.
   * @throws {Error} When the pair has no canvas.
   */
  destroy(context) {
    if (!context.canvas) {
      throw new Error("Canvas is not specified");
    }
    context.canvas.width = 0;
    context.canvas.height = 0;
    context.canvas = void 0;
    context.context = void 0;
  }

  /**
   * Hook that subclasses override to create the actual canvas.
   *
   * @throws {Error} Always, in the base class.
   */
  // eslint-disable-next-line unused-imports/no-unused-vars
  _createCanvas(width, height) {
    throw new Error("Not implemented");
  }
}

/**
 * Derived from the PDF.js project by the Mozilla Foundation.
 * @see https://github.com/mozilla/pdf.js/blob/b8de9a372f9bbf7e33adb362eeae5ef1919dba73/src/display/canvas_factory.js#L18
 * @license Apache-2.0
 */
class DOMCanvasFactory extends BaseCanvasFactory {
  _document;

  /**
   * @param {object} [options]
   * @param {object} [options.ownerDocument=globalThis.document] - Document used to
   *   create `<canvas>` elements.
   * @param {boolean} [options.enableHWA=false] - Forwarded to `BaseCanvasFactory`.
   */
  constructor({ ownerDocument = globalThis.document, enableHWA = false } = {}) {
    super({ enableHWA });
    this._document = ownerDocument;
  }

  /** Creates a `<canvas>` element of the given size in the owner document. */
  _createCanvas(width, height) {
    const canvas = this._document.createElement("canvas");
    canvas.width = width;
    canvas.height = height;
    return canvas;
  }
}

/** Canvas factory backed by the module cached through `resolveCanvasModule`. */
class NodeCanvasFactory extends BaseCanvasFactory {
  /**
   * @param {object} [options]
   * @param {boolean} [options.enableHWA=false] - Forwarded to `BaseCanvasFactory`.
   */
  constructor({ enableHWA = false } = {}) {
    super({ enableHWA });
  }

  /**
   * Delegates to the cached canvas module's `createCanvas`.
   *
   * @throws {Error} When no canvas module has been resolved yet.
   */
  _createCanvas(width, height) {
    if (!resolvedCanvasModule) {
      throw new Error("@napi-rs/canvas module is not resolved");
    }
    return resolvedCanvasModule.createCanvas(width, height);
  }
}

/**
 * Resolves and caches the canvas module. Does nothing (and does not call
 * `canvasImport`) when a module is already cached.
 *
 * @param {Function} canvasImport - Called with no arguments; returns the canvas
 *   module or a promise for it.
 * @returns {Promise<void>}
 */
async function resolveCanvasModule(canvasImport) {
  resolvedCanvasModule ??= await interopDefault(canvasImport());
}

/**
 * Copies `DOMMatrix`, `ImageData` and `Path2D` from the cached canvas module onto
 * `globalThis`, each only where the global is currently undefined. No-op when no
 * canvas module is cached.
 */
function injectCanvasConstructors() {
  if (!resolvedCanvasModule) {
    return;
  }
  if (typeof globalThis.DOMMatrix === "undefined") {
    globalThis.DOMMatrix = resolvedCanvasModule.DOMMatrix;
  }
  if (typeof globalThis.ImageData === "undefined") {
    globalThis.ImageData = resolvedCanvasModule.ImageData;
  }
  if (typeof globalThis.Path2D === "undefined") {
    globalThis.Path2D = resolvedCanvasModule.Path2D;
  }
}

/**
 * Collects the image objects painted by `paintImageXObject` operators on one page.
 *
 * @param data - An opened document (see `isPDFDocumentProxy`) or data accepted by
 *   `getDocumentProxy`.
 * @param {number} pageNumber - 1-based page number.
 * @returns {Promise<Array<{ data: object, width: number, height: number, channels: number, key: string }>>}
 *   One entry per image. Images without `data`/`width`/`height`, or whose
 *   `data.length / (width * height)` is not 1, 3 or 4, are skipped.
 * @throws {Error} When `pageNumber` is outside `1..numPages`.
 */
async function extractImages$1(data, pageNumber) {
  const pdf = isPDFDocumentProxy(data) ? data : await getDocumentProxy(data);
  if (pageNumber < 1 || pageNumber > pdf.numPages) {
    throw new Error(`Invalid page number. Must be between 1 and ${pdf.numPages}.`);
  }
  const page = await pdf.getPage(pageNumber);
  const operatorList = await page.getOperatorList();
  const { OPS } = await getResolvedPDFJS();
  const images = [];
  for (let i = 0; i < operatorList.fnArray.length; i++) {
    const op = operatorList.fnArray[i];
    if (op !== OPS.paintImageXObject) {
      continue;
    }
    const imageKey = operatorList.argsArray[i][0];
    // Keys prefixed with "g_" are looked up in `commonObjs`, all others in `objs`.
    const image = imageKey.startsWith("g_")
      ? await page.commonObjs.get(imageKey)
      : await page.objs.get(imageKey);
    if (!image || !image.data || !image.width || !image.height) {
      continue;
    }
    const { width, height, data: data2 } = image;
    const calculatedChannels = data2.length / (width * height);
    if (![1, 3, 4].includes(calculatedChannels)) {
      continue;
    }
    const channels = calculatedChannels;
    images.push({
      data: data2,
      width,
      height,
      channels,
      key: imageKey
    });
  }
  return images;
}

/**
 * Renders one page to a canvas and returns the bytes behind the canvas data URL.
 *
 * @param data - An opened document (see `isPDFDocumentProxy`) or data accepted by
 *   `getDocumentProxy`. The canvas factory is passed to `getDocumentProxy` only in
 *   the latter case.
 * @param {number} pageNumber - 1-based page number.
 * @param {object} [options]
 * @param {Function} [options.canvasImport] - Canvas module loader; required by
 *   `createIsomorphicCanvasFactory` when running in Node.js.
 * @param {number} [options.scale] - Render scale; falsy values fall back to 1.
 * @param {number} [options.width] - Target width; when truthy it determines the scale.
 * @param {number} [options.height] - Target height; used only when `width` is falsy.
 * @returns {Promise<ArrayBuffer>} Body of the fetched `canvas.toDataURL()` result.
 * @throws {Error} When `pageNumber` is outside `1..numPages`, or when no canvas
 *   factory is available for the environment.
 */
async function renderPageAsImage$1(data, pageNumber, options = {}) {
  const CanvasFactory = await createIsomorphicCanvasFactory(options.canvasImport);
  const pdf = isPDFDocumentProxy(data) ? data : await getDocumentProxy(data, { CanvasFactory });
  // Validate before asking for the page, so an out-of-range request fails with
  // this module's error (as in extractImages$1) rather than inside `getPage`.
  if (pageNumber < 1 || pageNumber > pdf.numPages) {
    throw new Error(`Invalid page number. Must be between 1 and ${pdf.numPages}.`);
  }
  const page = await pdf.getPage(pageNumber);
  const defaultViewport = page.getViewport({ scale: 1 });
  let scale = options.scale || 1;
  if (options.width) {
    scale = options.width / defaultViewport.width;
  } else if (options.height) {
    scale = options.height / defaultViewport.height;
  }
  // Negative scales are clamped to 0.
  const viewport = page.getViewport({ scale: Math.max(0, scale) });
  const drawingContext = new CanvasFactory().create(viewport.width, viewport.height);
  await page.render({
    canvasContext: drawingContext.context,
    viewport
  }).promise;
  const dataUrl = drawingContext.canvas.toDataURL();
  const response = await fetch(dataUrl);
  return await response.arrayBuffer();
}

/**
 * Picks the canvas factory class for the current environment.
 *
 * @param {Function} [canvasImport] - Canvas module loader; required in Node.js.
 * @returns {Promise<Function>} `DOMCanvasFactory` when `window` is defined,
 *   otherwise `NodeCanvasFactory` in Node.js (after resolving the canvas module
 *   and injecting its constructors into `globalThis`).
 * @throws {Error} In Node.js without `canvasImport`, or in any other environment.
 */
async function createIsomorphicCanvasFactory(canvasImport) {
  if (isBrowser) {
    return DOMCanvasFactory;
  }
  if (isNode) {
    if (!canvasImport) {
      throw new Error('Parameter "canvasImport" is required in Node.js environment.');
    }
    await resolveCanvasModule(canvasImport);
    injectCanvasConstructors();
    return NodeCanvasFactory;
  }
  throw new Error("Unsupported environment for canvas creation.");
}

/**
 * Reads a document's metadata.
 *
 * @param data - An opened document (see `isPDFDocumentProxy`) or data accepted by
 *   `getDocumentProxy`.
 * @returns {Promise<{ info: object, metadata: object }>} `info` defaults to `{}`;
 *   `metadata` is a shallow copy of the own enumerable properties of the reported
 *   metadata (empty when there is none).
 */
async function getMeta$1(data) {
  const pdf = isPDFDocumentProxy(data) ? data : await getDocumentProxy(data);
  const meta = await pdf.getMetadata();
  return {
    info: meta?.info ?? {},
    metadata: { ...meta?.metadata }
  };
}

/**
 * Extracts the text of every page.
 *
 * @param data - An opened document (see `isPDFDocumentProxy`) or data accepted by
 *   `getDocumentProxy`.
 * @param {object} [options]
 * @param {boolean} [options.mergePages=false] - When `true`, page texts are joined
 *   and every whitespace run is collapsed to a single space.
 * @returns {Promise<{ totalPages: number, text: string | string[] }>} `text` is a
 *   single string when `mergePages` is set, otherwise one string per page.
 */
async function extractText$1(data, options = {}) {
  const { mergePages = false } = options;
  const pdf = isPDFDocumentProxy(data) ? data : await getDocumentProxy(data);
  // All pages are requested concurrently; Promise.all keeps page order.
  const texts = await Promise.all(
    Array.from({ length: pdf.numPages }, (_, i) => getPageText(pdf, i + 1))
  );
  return {
    totalPages: pdf.numPages,
    text: mergePages ? texts.join("\n").replace(/\s+/g, " ") : texts
  };
}

/**
 * Concatenates the text items of one page.
 *
 * @param {object} document - Opened document exposing `getPage`.
 * @param {number} pageNumber - 1-based page number.
 * @returns {Promise<string>} Every item's `str` (items whose `str` is null or
 *   undefined are dropped), with "\n" appended after items flagged `hasEOL`.
 */
async function getPageText(document, pageNumber) {
  const page = await document.getPage(pageNumber);
  const content = await page.getTextContent();
  return content.items
    .filter((item) => item.str != null)
    .map((item) => item.str + (item.hasEOL ? "\n" : ""))
    .join("");
}

/**
 * Replaces the cached PDF.js module with the one produced by `pdfjs`.
 *
 * @param {Function} pdfjs - Resolver passed to `resolvePDFJSImport` with `reload: true`.
 * @returns {Promise<void>}
 */
async function definePDFJSModule(pdfjs) {
  await resolvePDFJSImport(pdfjs, { reload: true });
}

/**
 * Replaces the cached PDF.js module using `options.pdfjs` as the resolver.
 *
 * @param {object} options
 * @param {Function} [options.pdfjs] - Resolver passed to `resolvePDFJSImport` with
 *   `reload: true`.
 * @returns {Promise<void>}
 */
async function configureUnPDF(options) {
  await resolvePDFJSImport(options.pdfjs, { reload: true });
}

// Public entry points: each makes sure a PDF.js module is resolved, then
// forwards all arguments to the matching `$1` implementation.

/** @see getMeta$1 */
const getMeta = async (...args) => {
  await resolvePDFJSImport();
  return await getMeta$1(...args);
};

/** @see extractText$1 */
const extractText = async (...args) => {
  await resolvePDFJSImport();
  return await extractText$1(...args);
};

/** @see extractImages$1 */
const extractImages = async (...args) => {
  await resolvePDFJSImport();
  return await extractImages$1(...args);
};

/** @see renderPageAsImage$1 */
const renderPageAsImage = async (...args) => {
  await resolvePDFJSImport();
  return await renderPageAsImage$1(...args);
};

export { configureUnPDF, definePDFJSModule, extractImages, extractText, getDocumentProxy, getMeta, getResolvedPDFJS, renderPageAsImage, resolvePDFJSImport };