pdf-to-img
Version:
📃📸 Converts PDFs to images in nodejs
84 lines (83 loc) • 3.08 kB
JavaScript
import { createRequire } from "node:module";
import path from "node:path/posix";
import * as pdfjs from "pdfjs-dist/legacy/build/pdf.mjs";
import { parseInput } from "./parseInput.js";
// Directory that contains the package.json of the pdfjs-dist package, as resolved from this module.
const requireFromHere = createRequire(import.meta.url);
const pdfjsPackageJson = requireFromHere.resolve("pdfjs-dist/package.json");
const pdfjsPath = path.dirname(pdfjsPackageJson);
/**
 * Returns a plain, JSON-safe deep copy of `x` with every string value cleaned up.
 *
 * Required since k-yle/pdf-to-img#58: the objects from pdfjs are weirdly
 * structured, so they are round-tripped through JSON to get plain data.
 *
 * Every string in the copy, at any nesting depth (including array items and
 * nested objects), has a leading UTF16 BOM ("þÿ") and all 0x0 characters
 * removed, see k-yle/pdf-to-img#138 and k-yle/pdf-to-img#184.
 *
 * @param x The value to copy; it must be serialisable with `JSON.stringify`.
 * @returns The cleaned copy. `x` itself is never mutated.
 */
const sanitize = (x) => {
  // eslint-disable-next-line unicorn/prefer-structured-clone -- TODO: wait for min nodejs version to be bumped
  const object = JSON.parse(JSON.stringify(x));

  // The JSON round-trip above guarantees a cycle-free tree of plain values,
  // so a simple recursive walk is safe and mutating the copy in place is fine.
  const clean = (value) => {
    if (typeof value === "string") {
      // eslint-disable-next-line no-control-regex -- this is deliberate
      return value.replaceAll(/(^þÿ|\u0000)/g, "");
    }
    if (value !== null && typeof value === "object") {
      for (const key of Object.keys(value)) {
        value[key] = clean(value[key]);
      }
    }
    return value;
  };

  return clean(object);
};
/**
 * Converts a PDF to a series of PNG images.
 *
 * The returned promise resolves to an object that can be consumed with
 * `for await` (one image per page, starting at page 1) and that also exposes
 * `length` (the number of pages), `metadata` (the sanitized document info)
 * and `getPage(pageNumber)` to render a single page on demand.
 *
 * @param input Either (a) the path to a pdf file, (b) a data url, (c) a buffer, or (d) a ReadableStream.
 * @param options Optional settings: `password`, `scale` (defaults to 1),
 *   `docInitParams` (spread into the pdfjs `getDocument` parameters) and
 *   `renderParams` (spread into the pdfjs `render` parameters).
 *
 * @example
 * ```js
 * import pdf from "pdf-to-img";
 *
 * for await (const page of await pdf("example.pdf")) {
 *   expect(page).toMatchImageSnapshot();
 * }
 *
 * // or if you want access to more details:
 *
 * const doc = await pdf("example.pdf");
 * expect(doc.length).toBe(1);
 * expect(doc.metadata).toEqual({ ... });
 *
 * for await (const page of doc) {
 *   expect(page).toMatchImageSnapshot();
 * }
 * ```
 */
export async function pdf(input, options = {}) {
  const data = await parseInput(input);

  const loadingTask = pdfjs.getDocument({
    password: options.password,
    standardFontDataUrl: path.join(pdfjsPath, `standard_fonts${path.sep}`),
    cMapUrl: path.join(pdfjsPath, `cmaps${path.sep}`),
    cMapPacked: true,
    // caller-supplied parameters may override anything listed above...
    ...options.docInitParams,
    // ...but the two entries below always win because they come last
    isEvalSupported: false,
    data,
  });
  const doc = await loadingTask.promise;
  const { info } = await doc.getMetadata();

  // Renders one page into a fresh canvas and returns it encoded as PNG.
  const getPage = async (pageNumber) => {
    const page = await doc.getPage(pageNumber);
    const scale = options.scale ?? 1;
    const viewport = page.getViewport({ scale });
    const { canvas } = doc.canvasFactory.create(
      viewport.width,
      viewport.height,
      Boolean(options.renderParams?.background),
    );
    const renderTask = page.render({ canvas, viewport, ...options.renderParams });
    await renderTask.promise;
    return canvas.toBuffer("image/png");
  };

  return {
    length: doc.numPages,
    metadata: sanitize(info),
    getPage,
    [Symbol.asyncIterator]() {
      return {
        // number of the page handed out most recently (0 = none yet)
        pg: 0,
        async next() {
          if (!(this.pg < doc.numPages)) {
            return { done: true, value: undefined };
          }
          this.pg += 1;
          return { done: false, value: await getPage(this.pg) };
        },
      };
    },
  };
}