unpdf
Version:
PDF extraction and rendering across all JavaScript runtimes
255 lines (247 loc) • 8.3 kB
JavaScript
// Cached PDF.js module. Stays undefined until resolvePDFJSImport() assigns it.
let resolvedModule;
// True when `process.release.name` is "node".
const isNode = globalThis.process?.release?.name === "node";
// True when a global `window` binding exists.
const isBrowser = typeof window !== "undefined";
/**
 * Loads a PDF through the resolved PDF.js module's `getDocument`.
 *
 * @param {*} data - PDF source, forwarded to `getDocument` as the `data` parameter.
 * @param {object} [options] - Additional `getDocument` parameters. They are spread
 *   last, so they override the defaults set here.
 * @returns {Promise<*>} Whatever the loading task's `promise` resolves to.
 */
async function getDocumentProxy(data, options = {}) {
  const { getDocument: openDocument } = await getResolvedPDFJS();
  const documentParams = {
    data,
    isEvalSupported: false,
    // See: https://github.com/mozilla/pdf.js/issues/4244#issuecomment-1479534301
    useSystemFonts: true,
    ...options
  };
  const loadingTask = openDocument(documentParams);
  return await loadingTask.promise;
}
/**
 * Returns the cached PDF.js module, resolving it first when nothing is cached yet.
 *
 * @returns {Promise<*>} The value of `resolvedModule` after resolution.
 */
async function getResolvedPDFJS() {
  if (resolvedModule) {
    return resolvedModule;
  }
  await resolvePDFJSImport();
  return resolvedModule;
}
/**
 * Resolves the PDF.js module and caches it in `resolvedModule`.
 *
 * Does nothing when a module is already cached, unless `reload` is set.
 * With a `pdfjsResolver`, the module comes from calling it (a `default`
 * export is unwrapped by `interopDefault`). Without one, the bundled
 * `unpdf/pdfjs` build is imported.
 *
 * @param {Function} [pdfjsResolver] - Function returning the PDF.js module or a promise of it.
 * @param {{ reload?: boolean }} [options] - Set `reload` to replace an already cached module.
 * @returns {Promise<void>}
 * @throws {Error} When resolution fails. The message embeds the original error
 *   and the original error object is available as `cause`.
 */
async function resolvePDFJSImport(pdfjsResolver, { reload = false } = {}) {
  if (resolvedModule && !reload) {
    return;
  }
  if (pdfjsResolver) {
    try {
      resolvedModule = await interopDefault(pdfjsResolver());
      return;
    } catch (error) {
      // Keep the original error (and its stack) reachable for callers.
      throw new Error(`PDF.js could not be resolved: ${error}`, { cause: error });
    }
  }
  try {
    resolvedModule = await import('unpdf/pdfjs');
  } catch (error) {
    throw new Error(`Serverless PDF.js bundle could not be resolved: ${error}`, { cause: error });
  }
}
/**
 * Checks whether a value is a non-null object exposing a `_pdfInfo` property
 * (own or inherited), which this module treats as an already loaded PDF document.
 *
 * @param {*} data - Value to inspect.
 * @returns {boolean}
 */
function isPDFDocumentProxy(data) {
  if (data === null || typeof data !== "object") {
    return false;
  }
  return "_pdfInfo" in data;
}
/**
 * Awaits a module (or a promise of one) and unwraps its `default` export.
 *
 * @param {*} m - Module object or a promise resolving to one.
 * @returns {Promise<*>} The `default` property when it is truthy, otherwise the
 *   awaited value itself.
 */
async function interopDefault(m) {
  const loaded = await m;
  const defaultExport = loaded.default;
  return defaultExport ? defaultExport : loaded;
}
// Cached canvas module. Assigned by resolveCanvasModule(); read by
// NodeCanvasFactory and injectCanvasConstructors().
let resolvedCanvasModule;
/**
 * Derived from the PDF.js project by the Mozilla Foundation.
 * @see https://github.com/mozilla/pdf.js/blob/b8de9a372f9bbf7e33adb362eeae5ef1919dba73/src/display/canvas_factory.js#L18
 * @license Apache-2.0
 *
 * Abstract canvas factory. Subclasses provide `_createCanvas`; this class
 * handles creating the 2D context, resizing, and tearing down the
 * `{ canvas, context }` pair.
 */
class BaseCanvasFactory {
  // When true, contexts are created with `willReadFrequently: false`.
  #enableHWA = false;

  /**
   * @param {{ enableHWA?: boolean }} [options]
   */
  constructor({ enableHWA = false } = {}) {
    this.#enableHWA = enableHWA;
  }

  /**
   * Creates a canvas of the given size together with its "2d" context.
   *
   * @param {number} width
   * @param {number} height
   * @returns {{ canvas: *, context: * }}
   */
  create(width, height) {
    const canvas = this._createCanvas(width, height);
    const contextSettings = { willReadFrequently: !this.#enableHWA };
    const context = canvas.getContext("2d", contextSettings);
    return { canvas, context };
  }

  /**
   * Resizes the canvas of an existing `{ canvas, context }` pair.
   *
   * @throws {Error} When the pair has no canvas.
   */
  reset(canvasAndContext, width, height) {
    const target = canvasAndContext.canvas;
    if (!target) {
      throw new Error("Canvas is not specified");
    }
    target.width = width;
    target.height = height;
  }

  /**
   * Zeroes the canvas dimensions and clears both references on the pair.
   *
   * @throws {Error} When the pair has no canvas.
   */
  destroy(context) {
    const target = context.canvas;
    if (!target) {
      throw new Error("Canvas is not specified");
    }
    target.width = 0;
    target.height = 0;
    context.canvas = undefined;
    context.context = undefined;
  }

  /**
   * Hook for subclasses: return a canvas object of the requested size.
   */
  // eslint-disable-next-line unused-imports/no-unused-vars
  _createCanvas(width, height) {
    throw new Error("Not implemented");
  }
}
/**
 * Derived from the PDF.js project by the Mozilla Foundation.
 * @see https://github.com/mozilla/pdf.js/blob/b8de9a372f9bbf7e33adb362eeae5ef1919dba73/src/display/canvas_factory.js#L18
 * @license Apache-2.0
 *
 * Canvas factory that creates `<canvas>` elements through a document object.
 */
class DOMCanvasFactory extends BaseCanvasFactory {
  // Document used to create canvas elements.
  _document;

  /**
   * @param {{ ownerDocument?: *, enableHWA?: boolean }} [options] -
   *   `ownerDocument` defaults to `globalThis.document`.
   */
  constructor({ ownerDocument = globalThis.document, enableHWA = false } = {}) {
    super({ enableHWA });
    this._document = ownerDocument;
  }

  /**
   * Creates a canvas element and sets its width, then its height.
   */
  _createCanvas(width, height) {
    const element = this._document.createElement("canvas");
    Object.assign(element, { width, height });
    return element;
  }
}
/**
 * Canvas factory backed by the module stored in `resolvedCanvasModule`.
 */
class NodeCanvasFactory extends BaseCanvasFactory {
  /**
   * @param {{ enableHWA?: boolean }} [options]
   */
  constructor({ enableHWA = false } = {}) {
    super({ enableHWA });
  }

  /**
   * Creates a canvas through the resolved module's `createCanvas`.
   *
   * @throws {Error} When no canvas module has been resolved yet.
   */
  _createCanvas(width, height) {
    if (resolvedCanvasModule) {
      return resolvedCanvasModule.createCanvas(width, height);
    }
    throw new Error("@napi-rs/canvas module is not resolved");
  }
}
/**
 * Resolves the canvas module once and caches it in `resolvedCanvasModule`.
 * `canvasImport` is not called when a module is already cached.
 *
 * @param {Function} canvasImport - Function returning the canvas module or a promise of it.
 * @returns {Promise<void>}
 */
async function resolveCanvasModule(canvasImport) {
  if (resolvedCanvasModule !== undefined && resolvedCanvasModule !== null) {
    return;
  }
  resolvedCanvasModule = await interopDefault(canvasImport());
}
/**
 * Copies `DOMMatrix`, `ImageData` and `Path2D` from the resolved canvas module
 * onto `globalThis`, but only for names that are currently undefined there.
 * Does nothing when no canvas module has been resolved.
 */
function injectCanvasConstructors() {
  if (!resolvedCanvasModule) {
    return;
  }
  for (const name of ["DOMMatrix", "ImageData", "Path2D"]) {
    if (typeof globalThis[name] === "undefined") {
      globalThis[name] = resolvedCanvasModule[name];
    }
  }
}
/**
 * Collects the raw images painted on one page of a PDF.
 *
 * Walks the page's operator list, and for every `paintImageXObject` operator
 * looks up the referenced image object. Keys starting with "g_" are read from
 * `page.commonObjs`, all others from `page.objs`. Images without data or
 * dimensions, or whose byte count does not work out to 1, 3 or 4 channels per
 * pixel, are skipped.
 *
 * @param {*} data - PDF source, or an already loaded document (see `isPDFDocumentProxy`).
 * @param {number} pageNumber - 1-based page number.
 * @returns {Promise<Array<{ data: *, width: number, height: number, channels: number, key: string }>>}
 * @throws {Error} When `pageNumber` is not an integer between 1 and the page count.
 */
async function extractImages$1(data, pageNumber) {
  const pdf = isPDFDocumentProxy(data) ? data : await getDocumentProxy(data);
  // NaN and fractional numbers pass both range comparisons, so reject non-integers explicitly.
  if (!Number.isInteger(pageNumber) || pageNumber < 1 || pageNumber > pdf.numPages) {
    throw new Error(`Invalid page number. Must be between 1 and ${pdf.numPages}.`);
  }
  const page = await pdf.getPage(pageNumber);
  const operatorList = await page.getOperatorList();
  const { OPS } = await getResolvedPDFJS();
  const images = [];
  for (let i = 0; i < operatorList.fnArray.length; i++) {
    const op = operatorList.fnArray[i];
    if (op !== OPS.paintImageXObject) {
      continue;
    }
    // The first operator argument is the key of the image object.
    const imageKey = operatorList.argsArray[i][0];
    const image = imageKey.startsWith("g_") ? await page.commonObjs.get(imageKey) : await page.objs.get(imageKey);
    if (!image || !image.data || !image.width || !image.height) {
      continue;
    }
    const { width, height, data: pixels } = image;
    // Bytes per pixel. Anything other than 1, 3 or 4 is skipped.
    const channels = pixels.length / (width * height);
    if (![1, 3, 4].includes(channels)) {
      continue;
    }
    images.push({
      data: pixels,
      width,
      height,
      channels,
      key: imageKey
    });
  }
  return images;
}
/**
 * Renders one page of a PDF to a canvas and returns the encoded image bytes.
 *
 * The scale comes from, in order of precedence: `options.width` (relative to
 * the page's width at scale 1), `options.height` (relative to its height at
 * scale 1), `options.scale`, or 1.
 *
 * @param {*} data - PDF source, or an already loaded document (see `isPDFDocumentProxy`).
 * @param {number} pageNumber - 1-based page number.
 * @param {{ canvasImport?: Function, scale?: number, width?: number, height?: number }} [options]
 * @returns {Promise<ArrayBuffer>} Bytes obtained by fetching the canvas' data URL.
 * @throws {Error} When `pageNumber` is not an integer between 1 and the page count.
 */
async function renderPageAsImage$1(data, pageNumber, options = {}) {
  const CanvasFactory = await createIsomorphicCanvasFactory(options.canvasImport);
  const pdf = isPDFDocumentProxy(data) ? data : await getDocumentProxy(data, { CanvasFactory });
  // Validate before loading the page, otherwise `getPage` fails first and this error is never reached.
  if (!Number.isInteger(pageNumber) || pageNumber < 1 || pageNumber > pdf.numPages) {
    throw new Error(`Invalid page number. Must be between 1 and ${pdf.numPages}.`);
  }
  const page = await pdf.getPage(pageNumber);
  const defaultViewport = page.getViewport({ scale: 1 });
  let scale = options.scale || 1;
  if (options.width) {
    scale = options.width / defaultViewport.width;
  } else if (options.height) {
    scale = options.height / defaultViewport.height;
  }
  // Clamp negative scales to 0.
  const viewport = page.getViewport({ scale: Math.max(0, scale) });
  const drawingContext = new CanvasFactory().create(viewport.width, viewport.height);
  await page.render({
    canvasContext: drawingContext.context,
    viewport
  }).promise;
  // Fetching the data URL decodes it into the raw image bytes.
  const dataUrl = drawingContext.canvas.toDataURL();
  const response = await fetch(dataUrl);
  return await response.arrayBuffer();
}
/**
 * Picks the canvas factory class for the current runtime.
 *
 * When `isBrowser` is set, returns `DOMCanvasFactory`. Otherwise, when
 * `isNode` is set, resolves the canvas module, injects its constructors onto
 * `globalThis`, and returns `NodeCanvasFactory`.
 *
 * @param {Function} [canvasImport] - Function returning the canvas module; required on the Node.js path.
 * @returns {Promise<Function>} The factory class (not an instance).
 * @throws {Error} When `canvasImport` is missing on the Node.js path, or when neither runtime flag is set.
 */
async function createIsomorphicCanvasFactory(canvasImport) {
  if (isBrowser) {
    return DOMCanvasFactory;
  }
  if (!isNode) {
    throw new Error("Unsupported environment for canvas creation.");
  }
  if (!canvasImport) {
    throw new Error('Parameter "canvasImport" is required in Node.js environment.');
  }
  await resolveCanvasModule(canvasImport);
  injectCanvasConstructors();
  return NodeCanvasFactory;
}
/**
 * Reads a PDF's metadata.
 *
 * @param {*} data - PDF source, or an already loaded document (see `isPDFDocumentProxy`).
 * @returns {Promise<{ info: object, metadata: object }>} `info` falls back to an
 *   empty object. `metadata` is a shallow copy of the own enumerable properties
 *   of the document's metadata, and is empty when there is none.
 */
async function getMeta$1(data) {
  let pdf = data;
  if (!isPDFDocumentProxy(data)) {
    pdf = await getDocumentProxy(data);
  }
  const meta = await pdf.getMetadata();
  const info = meta?.info ?? {};
  const metadata = { ...meta?.metadata };
  return { info, metadata };
}
/**
 * Extracts the text of every page of a PDF.
 *
 * @param {*} data - PDF source, or an already loaded document (see `isPDFDocumentProxy`).
 * @param {{ mergePages?: boolean }} [options] - With `mergePages`, the page texts
 *   are joined with newlines and every whitespace run is collapsed to one space.
 * @returns {Promise<{ totalPages: number, text: string | string[] }>} `text` is one
 *   string per page, or a single string when `mergePages` is set.
 */
async function extractText$1(data, options = {}) {
  const { mergePages = false } = options;
  const pdf = isPDFDocumentProxy(data) ? data : await getDocumentProxy(data);
  const totalPages = pdf.numPages;
  // Start every page extraction before awaiting any of them.
  const pending = [];
  for (let pageNumber = 1; pageNumber <= totalPages; pageNumber++) {
    pending.push(getPageText(pdf, pageNumber));
  }
  const texts = await Promise.all(pending);
  if (mergePages) {
    return { totalPages, text: texts.join("\n").replace(/\s+/g, " ") };
  }
  return { totalPages, text: texts };
}
/**
 * Concatenates the text items of one page.
 *
 * Items whose `str` is null or undefined are skipped. A newline is appended
 * after every item flagged with `hasEOL`.
 *
 * @param {*} document - Object exposing `getPage(pageNumber)`.
 * @param {number} pageNumber - Page to read, passed straight to `getPage`.
 * @returns {Promise<string>}
 */
async function getPageText(document, pageNumber) {
  const page = await document.getPage(pageNumber);
  const { items } = await page.getTextContent();
  let text = "";
  for (const item of items) {
    if (item.str == null) {
      continue;
    }
    const lineBreak = item.hasEOL ? "\n" : "";
    text += item.str + lineBreak;
  }
  return text;
}
/**
 * Replaces the cached PDF.js module with the one produced by `pdfjs`,
 * forcing a reload even when a module is already cached.
 *
 * @param {Function} pdfjs - Resolver passed to `resolvePDFJSImport`.
 * @returns {Promise<void>}
 */
async function definePDFJSModule(pdfjs) {
  const forceReload = { reload: true };
  await resolvePDFJSImport(pdfjs, forceReload);
}
/**
 * Applies configuration by re-resolving PDF.js with `options.pdfjs`,
 * forcing a reload even when a module is already cached.
 *
 * @param {{ pdfjs?: Function }} options - Configuration object; `pdfjs` is the resolver.
 * @returns {Promise<void>}
 */
async function configureUnPDF(options) {
  const resolver = options.pdfjs;
  const forceReload = { reload: true };
  await resolvePDFJSImport(resolver, forceReload);
}
/**
 * Public entry point: makes sure PDF.js is resolved, then forwards all
 * arguments to `getMeta$1` and returns its result.
 */
const getMeta = async (...args) => {
  await resolvePDFJSImport();
  const meta = await getMeta$1(...args);
  return meta;
};
/**
 * Public entry point: makes sure PDF.js is resolved, then forwards all
 * arguments to `extractText$1` and returns its result.
 */
const extractText = async (...args) => {
  await resolvePDFJSImport();
  const extracted = await extractText$1(...args);
  return extracted;
};
/**
 * Public entry point: makes sure PDF.js is resolved, then forwards all
 * arguments to `extractImages$1` and returns its result.
 */
const extractImages = async (...args) => {
  await resolvePDFJSImport();
  const images = await extractImages$1(...args);
  return images;
};
/**
 * Public entry point: makes sure PDF.js is resolved, then forwards all
 * arguments to `renderPageAsImage$1` and returns its result.
 */
const renderPageAsImage = async (...args) => {
  await resolvePDFJSImport();
  const imageBytes = await renderPageAsImage$1(...args);
  return imageBytes;
};
// Public API of this module. The `$1`-suffixed implementations stay internal;
// the exported wrappers resolve PDF.js before delegating to them.
export { configureUnPDF, definePDFJSModule, extractImages, extractText, getDocumentProxy, getMeta, getResolvedPDFJS, renderPageAsImage, resolvePDFJSImport };