UNPKG

unpdf

Version:

PDF extraction and rendering across all JavaScript runtimes

329 lines (320 loc) 10.4 kB
let resolvedModule;
const isNode = globalThis.process?.release?.name === "node";
const isBrowser = typeof window !== "undefined";

/**
 * Opens a PDF through the resolved PDF.js module's `getDocument`.
 *
 * @param {*} data - PDF data, passed to `getDocument` as its `data` option.
 * @param {object} [options] - Extra `getDocument` options; they are spread
 *   last and therefore override the defaults set here.
 * @returns {Promise<object>} The value `getDocument(...).promise` resolves to.
 */
async function getDocumentProxy(data, options = {}) {
  const { getDocument } = await getResolvedPDFJS();
  const pdf = await getDocument({
    data,
    isEvalSupported: false,
    // See: https://github.com/mozilla/pdf.js/issues/4244#issuecomment-1479534301
    useSystemFonts: true,
    ...options
  }).promise;
  return pdf;
}

/**
 * Returns the PDF.js module currently in use, resolving the default bundle
 * first if nothing has been resolved yet.
 *
 * @returns {Promise<object>} The resolved PDF.js module.
 */
async function getResolvedPDFJS() {
  if (!resolvedModule) {
    await resolvePDFJSImport();
  }
  return resolvedModule;
}

/**
 * Resolves and caches the PDF.js module.
 *
 * @param {Function} [pdfjsResolver] - Function returning the PDF.js module or
 *   a promise of it. When omitted, `unpdf/pdfjs` is imported dynamically.
 * @param {object} [options]
 * @param {boolean} [options.reload=false] - Resolve again even if a module is
 *   already cached.
 * @returns {Promise<void>}
 * @throws {Error} When the module cannot be resolved. The underlying failure
 *   is attached as `cause`.
 */
async function resolvePDFJSImport(pdfjsResolver, { reload = false } = {}) {
  if (resolvedModule && !reload) {
    return;
  }
  if (pdfjsResolver) {
    try {
      resolvedModule = await interopDefault(pdfjsResolver());
      return;
    } catch (error) {
      throw new Error(`PDF.js could not be resolved: ${error}`, { cause: error });
    }
  }
  try {
    resolvedModule = await import("unpdf/pdfjs");
  } catch (error) {
    throw new Error(`Serverless PDF.js bundle could not be resolved: ${error}`, { cause: error });
  }
}

/**
 * Duck-type check: treats any non-null object carrying a `_pdfInfo` property
 * as an already opened document.
 *
 * @param {*} data
 * @returns {boolean}
 */
function isPDFDocumentProxy(data) {
  return typeof data === "object" && data !== null && "_pdfInfo" in data;
}

/**
 * Awaits a module (or promise of one) and unwraps its `default` export when
 * that export is truthy.
 *
 * @param {*} m - Module namespace or a promise of one.
 * @returns {Promise<*>} `default` if truthy, otherwise the module itself.
 */
async function interopDefault(m) {
  const resolved = await m;
  return resolved.default || resolved;
}

let resolvedCanvasModule;

/**
 * Derived from the PDF.js project by the Mozilla Foundation.
 * @see https://github.com/mozilla/pdf.js/blob/b8de9a372f9bbf7e33adb362eeae5ef1919dba73/src/display/canvas_factory.js#L18
 * @license Apache-2.0
 */
class BaseCanvasFactory {
  #enableHWA = false;

  constructor({ enableHWA = false } = {}) {
    this.#enableHWA = enableHWA;
  }

  /**
   * Creates a canvas and its 2D context.
   *
   * @param {number} width
   * @param {number} height
   * @returns {{ canvas: object, context: object }}
   */
  create(width, height) {
    const canvas = this._createCanvas(width, height);
    return {
      canvas,
      context: canvas.getContext("2d", {
        willReadFrequently: !this.#enableHWA
      })
    };
  }

  /**
   * Resizes an existing canvas.
   *
   * @throws {Error} When the entry has no canvas.
   */
  reset({ canvas }, width, height) {
    if (!canvas) {
      throw new Error("Canvas is not specified");
    }
    canvas.width = width;
    canvas.height = height;
  }

  /**
   * Shrinks the canvas to 0x0 and clears both references on the entry.
   *
   * @param {{ canvas: object, context: object }} context - Entry returned by
   *   `create`.
   * @throws {Error} When the entry has no canvas.
   */
  destroy(context) {
    if (!context.canvas) {
      throw new Error("Canvas is not specified");
    }
    context.canvas.width = 0;
    context.canvas.height = 0;
    context.canvas = void 0;
    context.context = void 0;
  }

  // eslint-disable-next-line unused-imports/no-unused-vars
  _createCanvas(width, height) {
    throw new Error("Not implemented");
  }
}

/**
 * Derived from the PDF.js project by the Mozilla Foundation.
 * @see https://github.com/mozilla/pdf.js/blob/b8de9a372f9bbf7e33adb362eeae5ef1919dba73/src/display/canvas_factory.js#L18
 * @license Apache-2.0
 */
class DOMCanvasFactory extends BaseCanvasFactory {
  _document;

  constructor({ ownerDocument = globalThis.document, enableHWA = false } = {}) {
    super({ enableHWA });
    this._document = ownerDocument;
  }

  _createCanvas(width, height) {
    const canvas = this._document.createElement("canvas");
    canvas.width = width;
    canvas.height = height;
    return canvas;
  }
}

/**
 * Canvas factory backed by the module stored via `resolveCanvasModule`.
 */
class NodeCanvasFactory extends BaseCanvasFactory {
  constructor({ enableHWA = false } = {}) {
    super({ enableHWA });
  }

  _createCanvas(width, height) {
    if (!resolvedCanvasModule) {
      throw new Error("@napi-rs/canvas module is not resolved");
    }
    return resolvedCanvasModule.createCanvas(width, height);
  }
}

/**
 * Resolves the canvas module once; later calls keep the first result.
 *
 * @param {Function} canvasImport - Function returning the canvas module or a
 *   promise of it.
 * @returns {Promise<void>}
 */
async function resolveCanvasModule(canvasImport) {
  resolvedCanvasModule ??= await interopDefault(canvasImport());
}

/**
 * Copies `DOMMatrix`, `ImageData` and `Path2D` from the resolved canvas
 * module onto `globalThis` for each one that is not already defined there.
 * Does nothing when no canvas module has been resolved.
 */
function injectCanvasConstructors() {
  if (!resolvedCanvasModule) {
    return;
  }
  if (typeof globalThis.DOMMatrix === "undefined") {
    globalThis.DOMMatrix = resolvedCanvasModule.DOMMatrix;
  }
  if (typeof globalThis.ImageData === "undefined") {
    globalThis.ImageData = resolvedCanvasModule.ImageData;
  }
  if (typeof globalThis.Path2D === "undefined") {
    globalThis.Path2D = resolvedCanvasModule.Path2D;
  }
}

/**
 * Collects the images painted by `paintImageXObject` operators on one page.
 *
 * @param {*} data - PDF data or an already opened document.
 * @param {number} pageNumber - 1-based page number.
 * @returns {Promise<Array<{ data: *, width: number, height: number, channels: number, key: string }>>}
 * @throws {Error} When `pageNumber` is outside `1..numPages`.
 */
async function extractImages$1(data, pageNumber) {
  const pdf = isPDFDocumentProxy(data) ? data : await getDocumentProxy(data);
  if (pageNumber < 1 || pageNumber > pdf.numPages) {
    throw new Error(`Invalid page number. Must be between 1 and ${pdf.numPages}.`);
  }
  const page = await pdf.getPage(pageNumber);
  const operatorList = await page.getOperatorList();
  const { OPS } = await getResolvedPDFJS();
  const images = [];
  for (let i = 0; i < operatorList.fnArray.length; i++) {
    const op = operatorList.fnArray[i];
    if (op !== OPS.paintImageXObject) {
      continue;
    }
    const imageKey = operatorList.argsArray[i][0];
    // Keys starting with "g_" are looked up in `commonObjs`, all others in
    // the page's own `objs`; `get` reports the object through a callback.
    const image = await new Promise(
      (resolve) => (imageKey.startsWith("g_") ? page.commonObjs : page.objs).get(imageKey, resolve)
    );
    if (!image || !image.data || !image.width || !image.height) {
      continue;
    }
    const { width, height, data: data2 } = image;
    // Channel count is inferred from the buffer size; anything other than
    // 1, 3 or 4 values per pixel is skipped.
    const calculatedChannels = data2.length / (width * height);
    if (![1, 3, 4].includes(calculatedChannels)) {
      continue;
    }
    const channels = calculatedChannels;
    images.push({
      data: data2,
      width,
      height,
      channels,
      key: imageKey
    });
  }
  return images;
}

/**
 * Renders one page to a canvas and returns it as an image.
 *
 * @param {*} data - PDF data or an already opened document.
 * @param {number} pageNumber - 1-based page number.
 * @param {object} [options]
 * @param {Function} [options.canvasImport] - Canvas module loader; required
 *   when running in Node.js.
 * @param {number} [options.scale] - Render scale; falsy values fall back to 1.
 * @param {number} [options.width] - Target width; takes precedence over
 *   `scale` and `height`.
 * @param {number} [options.height] - Target height; used when `width` is not
 *   set.
 * @param {boolean} [options.toDataURL] - Return the data URL string instead
 *   of an `ArrayBuffer`.
 * @returns {Promise<string|ArrayBuffer>}
 * @throws {Error} When `pageNumber` is outside `1..numPages`.
 */
async function renderPageAsImage$1(data, pageNumber, options = {}) {
  const CanvasFactory = await createIsomorphicCanvasFactory(options.canvasImport);
  const pdf = isPDFDocumentProxy(data) ? data : await getDocumentProxy(data, { CanvasFactory });
  // Validate before requesting the page so an out-of-range number yields
  // this module's own error, matching `extractImages$1`.
  if (pageNumber < 1 || pageNumber > pdf.numPages) {
    throw new Error(`Invalid page number. Must be between 1 and ${pdf.numPages}.`);
  }
  const page = await pdf.getPage(pageNumber);
  const defaultViewport = page.getViewport({ scale: 1 });
  let scale = options.scale || 1;
  if (options.width) {
    scale = options.width / defaultViewport.width;
  } else if (options.height) {
    scale = options.height / defaultViewport.height;
  }
  const viewport = page.getViewport({ scale: Math.max(0, scale) });
  const canvasFactory = new CanvasFactory();
  const drawingContext = canvasFactory.create(viewport.width, viewport.height);
  let dataUrl;
  try {
    await page.render({
      canvas: drawingContext.canvas,
      canvasContext: drawingContext.context,
      viewport
    }).promise;
    dataUrl = drawingContext.canvas.toDataURL();
  } finally {
    // The data URL is a plain string, so the canvas is no longer needed
    // once it has been produced (or once rendering has failed).
    canvasFactory.destroy(drawingContext);
  }
  if (options.toDataURL) {
    return dataUrl;
  }
  const response = await fetch(dataUrl);
  return await response.arrayBuffer();
}

/**
 * Picks the canvas factory class for the current runtime.
 *
 * @param {Function} [canvasImport] - Canvas module loader; required in
 *   Node.js, ignored in browsers.
 * @returns {Promise<Function>} `DOMCanvasFactory` or `NodeCanvasFactory`.
 * @throws {Error} In Node.js without `canvasImport`, or in a runtime that is
 *   neither a browser nor Node.js.
 */
async function createIsomorphicCanvasFactory(canvasImport) {
  if (isBrowser) {
    return DOMCanvasFactory;
  }
  if (isNode) {
    if (!canvasImport) {
      throw new Error('Parameter "canvasImport" is required in Node.js environment.');
    }
    await resolveCanvasModule(canvasImport);
    injectCanvasConstructors();
    return NodeCanvasFactory;
  }
  throw new Error("Unsupported environment for canvas creation.");
}

/**
 * Collects the URLs of all link annotations across every page.
 *
 * @param {*} data - PDF data or an already opened document.
 * @returns {Promise<{ totalPages: number, links: string[] }>}
 */
async function extractLinks$1(data) {
  const pdf = isPDFDocumentProxy(data) ? data : await getDocumentProxy(data);
  const pageLinks = await Promise.all(
    Array.from({ length: pdf.numPages }, (_, i) => getPageLinks(pdf, i + 1))
  );
  return {
    totalPages: pdf.numPages,
    links: pageLinks.flat()
  };
}

/**
 * Returns the `url` of every annotation on a page whose subtype is "Link"
 * and whose `url` is truthy.
 *
 * @param {object} document - Opened document.
 * @param {number} pageNumber - 1-based page number.
 * @returns {Promise<string[]>}
 */
async function getPageLinks(document, pageNumber) {
  const page = await document.getPage(pageNumber);
  const annotations = await page.getAnnotations();
  const links = [];
  for (const annotation of annotations) {
    if (annotation.subtype === "Link" && annotation.url) {
      links.push(annotation.url);
    }
  }
  return links;
}

const XMP_DATE_PROPERTIES = [
  "xmp:createdate",
  "xmp:modifydate",
  "xmp:metadatadate",
  "xap:createdate",
  "xap:modifydate",
  "xap:metadatadate"
];

/**
 * Wraps a metadata object so that `get(name)` returns a `Date` for the
 * properties listed in `XMP_DATE_PROPERTIES`.
 *
 * Every other function-valued property is returned bound to the wrapped
 * object, so methods that depend on `this` keep working when called through
 * the proxy.
 *
 * @param {object} metadata - Object exposing a `get(name)` method.
 * @returns {object} Proxy around `metadata`.
 */
function createDateParsingMetadataProxy(metadata) {
  return new Proxy(metadata, {
    get(target, prop) {
      if (prop === "get") {
        return (name) => {
          const value = target.get(name);
          if (XMP_DATE_PROPERTIES.includes(name) && value) {
            return parseISODateString(value);
          }
          return value;
        };
      }
      const value = target[prop];
      return typeof value === "function" ? value.bind(target) : value;
    }
  });
}

/**
 * Reads the document's info dictionary and metadata.
 *
 * The object returned by `pdf.getMetadata()` is left untouched: `info` is a
 * shallow copy and date parsing is applied to that copy only.
 *
 * @param {*} data - PDF data or an already opened document.
 * @param {object} [options]
 * @param {boolean} [options.parseDates] - Convert `CreationDate`/`ModDate`
 *   via `PDFDateString.toDateObject` and parse the XMP date properties
 *   returned by `metadata.get`.
 * @returns {Promise<{ info: object, metadata: object }>}
 */
async function getMeta$1(data, options = {}) {
  const pdf = isPDFDocumentProxy(data) ? data : await getDocumentProxy(data);
  const meta = await pdf.getMetadata();
  const info = { ...meta?.info };
  let metadata = meta?.metadata;
  if (options.parseDates) {
    const { PDFDateString } = await getResolvedPDFJS();
    if (info.CreationDate) {
      info.CreationDate = PDFDateString.toDateObject(info.CreationDate);
    }
    if (info.ModDate) {
      info.ModDate = PDFDateString.toDateObject(info.ModDate);
    }
    if (metadata) {
      metadata = createDateParsingMetadataProxy(metadata);
    }
  }
  return {
    info,
    metadata: metadata || {}
  };
}

/**
 * Parses a date string with `Date.parse`.
 *
 * @param {string} isoDateString
 * @returns {Date|undefined} `undefined` for empty or unparseable input.
 */
function parseISODateString(isoDateString) {
  if (!isoDateString) {
    return;
  }
  const parsedDate = Date.parse(isoDateString);
  if (!Number.isNaN(parsedDate)) {
    return new Date(parsedDate);
  }
}

/**
 * Extracts the text of every page.
 *
 * @param {*} data - PDF data or an already opened document.
 * @param {object} [options]
 * @param {boolean} [options.mergePages=false] - Join all pages into one
 *   string with every whitespace run collapsed to a single space.
 * @returns {Promise<{ totalPages: number, text: string|string[] }>}
 */
async function extractText$1(data, options = {}) {
  const { mergePages = false } = options;
  const pdf = isPDFDocumentProxy(data) ? data : await getDocumentProxy(data);
  const texts = await Promise.all(
    Array.from({ length: pdf.numPages }, (_, i) => getPageText(pdf, i + 1))
  );
  return {
    totalPages: pdf.numPages,
    text: mergePages ? texts.join("\n").replace(/\s+/g, " ") : texts
  };
}

/**
 * Concatenates the `str` of every text item on a page, appending a newline
 * after items flagged `hasEOL`. Items without `str` are skipped.
 *
 * @param {object} document - Opened document.
 * @param {number} pageNumber - 1-based page number.
 * @returns {Promise<string>}
 */
async function getPageText(document, pageNumber) {
  const page = await document.getPage(pageNumber);
  const content = await page.getTextContent();
  return content.items
    .filter((item) => item.str != null)
    .map((item) => item.str + (item.hasEOL ? "\n" : ""))
    .join("");
}

/**
 * Replaces the PDF.js module in use, re-resolving even if one is cached.
 *
 * @param {Function} pdfjs - Function returning the PDF.js module or a promise
 *   of it.
 * @returns {Promise<void>}
 */
async function definePDFJSModule(pdfjs) {
  await resolvePDFJSImport(pdfjs, { reload: true });
}

/**
 * Re-resolves the PDF.js module from `options.pdfjs`.
 *
 * @param {object} [options]
 * @param {Function} [options.pdfjs] - Function returning the PDF.js module or
 *   a promise of it. When omitted, the default bundle is resolved.
 * @returns {Promise<void>}
 */
async function configureUnPDF(options = {}) {
  await resolvePDFJSImport(options.pdfjs, { reload: true });
}

// Public entry points: each makes sure a PDF.js module is resolved before
// delegating to the implementation above.
const getMeta = async (...args) => {
  await resolvePDFJSImport();
  return await getMeta$1(...args);
};
const extractText = async (...args) => {
  await resolvePDFJSImport();
  return await extractText$1(...args);
};
const extractImages = async (...args) => {
  await resolvePDFJSImport();
  return await extractImages$1(...args);
};
const renderPageAsImage = async (...args) => {
  await resolvePDFJSImport();
  return await renderPageAsImage$1(...args);
};
const extractLinks = async (...args) => {
  await resolvePDFJSImport();
  return await extractLinks$1(...args);
};

export { configureUnPDF, createIsomorphicCanvasFactory, definePDFJSModule, extractImages, extractLinks, extractText, getDocumentProxy, getMeta, getResolvedPDFJS, renderPageAsImage, resolvePDFJSImport };