UNPKG

node-poppler

Version:

Asynchronous node.js wrapper for the Poppler PDF rendering library

1,173 lines (1,078 loc) 70.1 kB
"use strict"; const { execFile, spawn, spawnSync } = require("node:child_process"); const { promisify } = require("node:util"); const camelCase = require("camelcase"); const { lt } = require("semver"); const { normalize, resolve: pathResolve } = require("node:path"); const execFileAsync = promisify(execFile); const errorMessages = { 0: "No Error", 1: "Error opening a PDF file", 2: "Error opening an output file", 3: "Error related to PDF permissions", 4: "Error related to ICC profile", 99: "Other error", 3221226505: "Internal process error", }; // Cache immutable regex as they are expensive to create and garbage collect const popplerVersionRegex = /(\d{1,2}\.\d{1,2}\.\d{1,2})/u; const pdfInfoFileSizesRegex = /(File\s+size:\s+)0(\s+)bytes/u; /** * @author Frazer Smith * @description Checks each option provided is valid, of the correct type, and can be used by specified * version of binary. * @ignore * @param {object} acceptedOptions - Object containing accepted options. * @param {Record<string, any>} options - Object containing options to pass to binary. * @param {string} [version] - Version of binary. * @returns {string[]} Array of CLI arguments. * @throws If invalid arguments provided. */ function parseOptions(acceptedOptions, options, version) { /** @type {string[]} */ const args = []; /** @type {string[]} */ const invalidArgs = []; for (const key of Object.keys(options)) { if (Object.hasOwn(acceptedOptions, key)) { const option = options[key]; const acceptedOption = acceptedOptions[key]; // eslint-disable-next-line valid-typeof -- `type` is a string if (acceptedOption.type === typeof option) { // Skip boolean options if false if (acceptedOption.type !== "boolean" || option) { // Arg will be empty for some non-standard options if (acceptedOption.arg !== "") { args.push(acceptedOption.arg); } if (typeof option !== "boolean") { args.push(option); } } } else { invalidArgs.push( `Invalid value type provided for option '${key}', expected ${ acceptedOption.type } but received ${typeof option}` ); } if ( acceptedOption.minVersion && version && // @ts-ignore: type checking is done above lt(version, acceptedOption.minVersion, { loose: true }) ) { invalidArgs.push( `Invalid option provided for the current version of the binary used. '${key}' was introduced in v${acceptedOption.minVersion}, but received v${version}` ); } } else { invalidArgs.push(`Invalid option provided '${key}'`); } } if (invalidArgs.length === 0) { return args; } throw new Error(invalidArgs.join("; ")); } class Poppler { /** * @param {string} [binPath] - Path of poppler-utils binaries. * If not provided, the constructor will attempt to find the Poppler `pdfinfo` binary * in the PATH environment variable and use that as the path for all binaries. * For `win32` the binaries are bundled with the package and will be used * if a local installation is not found. */ constructor(binPath) { this.popplerPath = ""; /* istanbul ignore else: requires specific OS */ if (binPath) { /** @type {string|undefined} */ this.popplerPath = binPath; } else { const { platform } = process; const which = spawnSync(platform === "win32" ? "where" : "which", [ "pdfinfo", ]).stdout.toString(); const popplerPath = /(.+)pdfinfo/u.exec(which)?.[1]; if (popplerPath) { this.popplerPath = popplerPath; } if (platform === "win32" && !popplerPath) { this.popplerPath = pathResolve( __dirname, "lib", "win32", "poppler-24.07.0", "Library", "bin" ); } } /* istanbul ignore next: unable to test due to https://github.com/jestjs/jest/pull/14297 */ if (!this.popplerPath) { throw new Error( `Unable to find ${process.platform} Poppler binaries, please pass the installation directory as a parameter to the Poppler instance.` ); } this.popplerPath = normalize(this.popplerPath); } /** * @author Frazer Smith * @description Embeds files (attachments) into a PDF file. * @param {string} file - Filepath of the PDF file to read. * @param {string} fileToAttach - Filepath of the attachment to be embedded into the PDF file. * @param {string} outputFile - Filepath of the file to output the results to. * @param {object} [options] - Object containing options to pass to binary. * @param {boolean} [options.printVersionInfo] - Print copyright and version info. * @param {boolean} [options.replace] - Replace embedded file with same name (if it exists). * @returns {Promise<string>} A promise that resolves with a stdout string, or rejects with an `Error` object. */ async pdfAttach(file, fileToAttach, outputFile, options = {}) { const acceptedOptions = { printVersionInfo: { arg: "-v", type: "boolean" }, replace: { arg: "-replace", type: "boolean" }, }; try { const args = parseOptions(acceptedOptions, options); args.push(file, fileToAttach, outputFile); const { stdout } = await execFileAsync( pathResolve(this.popplerPath, "pdfattach"), args ); return Promise.resolve(stdout); } catch (err) { return Promise.reject(err); } } /** * @author Frazer Smith * @description Lists or extracts embedded files (attachments) from a PDF file. * @param {string} file - Filepath of the PDF file to read. * @param {object} [options] - Object containing options to pass to binary. * @param {boolean} [options.listEmbedded] - List all of the embedded files in the PDF file. * File names are converted to the text encoding specified by `options.outputEncoding`. * @param {string} [options.outputEncoding] - Sets the encoding to use for text output. * This defaults to `UTF-8`. * @param {string} [options.ownerPassword] - Owner password (for encrypted files). * @param {string} [options.outputPath] - Set the file name used when saving an embedded file with * the save option enabled, or the directory if `options.saveall` is used. * @param {boolean} [options.printVersionInfo] - Print copyright and version info. * @param {boolean} [options.saveAllFiles] - Save all of the embedded files. This uses the file * names associated with the embedded files (as printed by `options.listEmbedded`). * By default, the files are saved in the current directory; this can be changed * with `options.outputPath`. * @param {string} [options.saveFile] - Save the specified embedded file. * By default, this uses the file name associated with the embedded file (as printed by * `options.listEmbedded`); the file name can be changed with `options.outputPath`. * @param {number} [options.saveSpecificFile] - Save the specified embedded file. * By default, this uses the file name associated with the embedded file (as printed by * `options.listEmbedded`); the file name can be changed with `options.outputPath`. * @param {string} [options.userPassword] - User password (for encrypted files). * @returns {Promise<string>} A promise that resolves with a stdout string, or rejects with an `Error` object. */ async pdfDetach(file, options = {}) { const acceptedOptions = { listEmbedded: { arg: "-list", type: "boolean" }, outputEncoding: { arg: "-enc", type: "string" }, outputPath: { arg: "-o", type: "string" }, ownerPassword: { arg: "-opw", type: "string" }, printVersionInfo: { arg: "-v", type: "boolean" }, saveAllFiles: { arg: "-saveall", type: "boolean" }, saveFile: { arg: "-savefile", type: "string", minVersion: "0.86.0", }, saveSpecificFile: { arg: "-save", type: "number" }, userPassword: { arg: "-upw", type: "string" }, }; try { const args = parseOptions(acceptedOptions, options); args.push(file); const { stdout } = await execFileAsync( pathResolve(this.popplerPath, "pdfdetach"), args ); return Promise.resolve(stdout); } catch (err) { return Promise.reject(err); } } /** * @author Frazer Smith * @description Lists the fonts used in a PDF file along with various information for each font. * @param {Buffer|string} file - PDF file as Buffer, or filepath of the PDF file to read. * @param {object} [options] - Object containing options to pass to binary. * @param {number} [options.firstPageToExamine] - Specifies the first page to examine. * @param {number} [options.lastPageToExamine] - Specifies the last page to examine. * @param {boolean} [options.listSubstitutes] - List the substitute fonts that poppler * will use for non-embedded fonts. * @param {string} [options.ownerPassword] - Owner password (for encrypted files). * @param {boolean} [options.printVersionInfo] - Print copyright and version info. * @param {string} [options.userPassword] - User password (for encrypted files). * @returns {Promise<string>} A promise that resolves with a stdout string, or rejects with an `Error` object. */ async pdfFonts(file, options = {}) { const acceptedOptions = { firstPageToExamine: { arg: "-f", type: "number" }, lastPageToExamine: { arg: "-l", type: "number" }, listSubstitutes: { arg: "-subst", type: "boolean" }, ownerPassword: { arg: "-opw", type: "string" }, printVersionInfo: { arg: "-v", type: "boolean" }, userPassword: { arg: "-upw", type: "string" }, }; try { const { stderr } = await execFileAsync( pathResolve(this.popplerPath, "pdffonts"), ["-v"] ); // @ts-ignore: parseOptions checks if falsy const versionInfo = popplerVersionRegex.exec(stderr)[1]; const args = parseOptions(acceptedOptions, options, versionInfo); return new Promise((resolve, reject) => { args.push(Buffer.isBuffer(file) ? "-" : file); const child = spawn( pathResolve(this.popplerPath, "pdffonts"), args ); if (Buffer.isBuffer(file)) { child.stdin.write(file); child.stdin.end(); } let stdOut = ""; let stdErr = ""; child.stdout.on("data", (data) => { stdOut += data; }); child.stderr.on("data", (data) => { stdErr += data; }); child.on("close", (code) => { /* istanbul ignore else */ if (stdOut !== "") { resolve(stdOut.trim()); } else if (code === 0) { resolve(errorMessages[code]); } else if (stdErr !== "") { reject(new Error(stdErr.trim())); } else { reject( new Error( errorMessages[code] || `pdffonts ${args.join( " " )} exited with code ${code}` ) ); } }); }); } catch (err) { return Promise.reject(err); } } /** * @author Frazer Smith * @description Saves images from a PDF file as PPM, PBM, PNG, TIFF, JPEG, JPEG2000, or JBIG2 files. * @param {Buffer|string} file - PDF file as Buffer, or filepath of the PDF file to read. * @param {string} [outputPrefix] - Filename prefix of output files. * @param {object} [options] - Object containing options to pass to binary. * @param {boolean} [options.allFiles] - Write JPEG, JPEG2000, JBIG2, and CCITT images in their native format. * CMYK files are written as TIFF files. All other images are written as PNG files. * @param {boolean} [options.ccittFile] - Generate CCITT images as CCITT files. * @param {number} [options.firstPageToConvert] - Specifies the first page to convert. * @param {number} [options.lastPageToConvert] - Specifies the last page to convert. * @param {boolean} [options.jbig2File] - Generate JBIG2 images as JBIG2 files. * @param {boolean} [options.jpeg2000File] - Generate JPEG2000 images at JP2 files. * @param {boolean} [options.jpegFile] - Generate JPEG images as JPEG files. * @param {boolean} [options.list] - Instead of writing the images, list the * images along with various information for each image. * NOTE: Do not specify the outputPrefix with this option. * @param {string} [options.ownerPassword] - Owner password (for encrypted files). * @param {boolean} [options.pngFile] - Change the default output format to PNG. * @param {boolean} [options.printVersionInfo] - Print copyright and version info. * @param {boolean} [options.tiffFile] - Change the default output format to TIFF. * @param {string} [options.userPassword] - Specify the user password for the PDF file. * @returns {Promise<string>} A promise that resolves with a stdout string, or rejects with an `Error` object. */ async pdfImages(file, outputPrefix, options = {}) { const acceptedOptions = { allFiles: { arg: "-all", type: "boolean" }, ccittFile: { arg: "-ccitt", type: "boolean" }, firstPageToConvert: { arg: "-f", type: "number" }, lastPageToConvert: { arg: "-l", type: "number" }, jbig2File: { arg: "-jbig2", type: "boolean" }, jpeg2000File: { arg: "-jp2", type: "boolean" }, jpegFile: { arg: "-j", type: "boolean" }, list: { arg: "-list", type: "boolean" }, ownerPassword: { arg: "-opw", type: "string" }, pngFile: { arg: "-png", type: "boolean" }, printVersionInfo: { arg: "-v", type: "boolean" }, tiffFile: { arg: "-tiff", type: "boolean" }, userPassword: { arg: "-upw", type: "string" }, }; try { const { stderr } = await execFileAsync( pathResolve(this.popplerPath, "pdfimages"), ["-v"] ); // @ts-ignore: parseOptions checks if falsy const versionInfo = popplerVersionRegex.exec(stderr)[1]; const args = parseOptions(acceptedOptions, options, versionInfo); return new Promise((resolve, reject) => { args.push(Buffer.isBuffer(file) ? "-" : file); if (outputPrefix) { args.push(outputPrefix); } const child = spawn( pathResolve(this.popplerPath, "pdfimages"), args ); if (Buffer.isBuffer(file)) { child.stdin.write(file); child.stdin.end(); } let stdOut = ""; let stdErr = ""; child.stdout.on("data", (data) => { stdOut += data; }); child.stderr.on("data", (data) => { stdErr += data; }); child.on("close", (code) => { /* istanbul ignore else */ if (stdOut !== "") { resolve(stdOut.trim()); } else if (code === 0) { resolve(errorMessages[code]); } else if (stdErr !== "") { reject(new Error(stdErr.trim())); } else { reject( new Error( errorMessages[code] || `pdfimages ${args.join( " " )} exited with code ${code}` ) ); } }); }); } catch (err) { return Promise.reject(err); } } /** * @author Frazer Smith * @description Prints the contents of the `Info` dictionary from a PDF file. * @param {Buffer|string} file - PDF file as Buffer, or filepath of the PDF file to read. * @param {object} [options] - Object containing options to pass to binary. * @param {number} [options.firstPageToConvert] - First page to print. * @param {number} [options.lastPageToConvert] - Last page to print. * @param {boolean} [options.listEncodingOptions] - List the available encodings. * @param {string} [options.outputEncoding] - Sets the encoding to use for text output. * This defaults to `UTF-8`. * @param {string} [options.ownerPassword] - Owner password (for encrypted files). * @param {boolean} [options.printAsJson] - Print result as a JSON object. * @param {boolean} [options.printBoundingBoxes] - Prints the page box bounding boxes: * MediaBox, CropBox, BleedBox, TrimBox, and ArtBox. * @param {boolean} [options.printDocStruct] - Prints the logical document structure * of a Tagged-PDF file. * @param {boolean} [options.printDocStructText] - Print the textual content along with the * document structure of a Tagged-PDF file. Note that extracting text this way might be slow * for big PDF files. * @param {boolean} [options.printIsoDates] - Prints dates in ISO-8601 format (including the time zone). * @param {boolean} [options.printJS] - Prints all JavaScript in the PDF file. * @param {boolean} [options.printMetadata] - Prints document-level metadata. (This is the `Metadata` * stream from the PDF file's Catalog object). * @param {boolean} [options.printNamedDests] - Print a list of all named destinations. If a page range * is specified using the `options.firstPageToConvert` and `options.lastPageToConvert` options, only destinations * in the page range are listed. * @param {boolean} [options.printRawDates] - Prints the raw (undecoded) date strings, directly from the PDF file. * @param {boolean} [options.printUrls] - Print all URLs in the PDF; only URLs referenced by PDF objects * such as Link Annotations are listed, not URL strings in the text content. * @param {boolean} [options.printVersionInfo] - Print copyright and version info. * @param {string} [options.userPassword] - User password (for encrypted files). * @returns {Promise<object|string>} A promise that resolves with a stdout string or JSON object if * `options.printAsJson` is `true`, or rejects with an `Error` object. */ async pdfInfo(file, options = {}) { const acceptedOptions = { firstPageToConvert: { arg: "-f", type: "number" }, lastPageToConvert: { arg: "-l", type: "number" }, listEncodingOptions: { arg: "-listenc", type: "boolean" }, outputEncoding: { arg: "-enc", type: "string" }, ownerPassword: { arg: "-opw", type: "string" }, printAsJson: { arg: "", type: "boolean" }, printBoundingBoxes: { arg: "-box", type: "boolean" }, printDocStruct: { arg: "-struct", type: "boolean" }, printDocStructText: { arg: "-struct-text", type: "boolean" }, printIsoDates: { arg: "-isodates", type: "boolean" }, printJS: { arg: "-js", type: "boolean" }, printMetadata: { arg: "-meta", type: "boolean" }, printNamedDests: { arg: "-dests", type: "boolean" }, printRawDates: { arg: "-rawdates", type: "boolean" }, printUrls: { arg: "-url", type: "boolean", minVersion: "21.11.0" }, printVersionInfo: { arg: "-v", type: "boolean" }, userPassword: { arg: "-upw", type: "string" }, }; try { const { stderr } = await execFileAsync( pathResolve(this.popplerPath, "pdfinfo"), ["-v"] ); // @ts-ignore: parseOptions checks if falsy const versionInfo = popplerVersionRegex.exec(stderr)[1]; const args = parseOptions(acceptedOptions, options, versionInfo); /** * Poppler does not set the "File size" metadata value if passed * a Buffer via stdin, so need to retrieve it from the Buffer. */ /** @type {number} */ let fileSize; return new Promise((resolve, reject) => { if (Buffer.isBuffer(file)) { args.push("-"); fileSize = file.length; } else { args.push(file); } const child = spawn( pathResolve(this.popplerPath, "pdfinfo"), args ); if (Buffer.isBuffer(file)) { child.stdin.write(file); child.stdin.end(); } let stdOut = ""; let stdErr = ""; child.stdout.on("data", (data) => { stdOut += data; }); child.stderr.on("data", (data) => { stdErr += data; }); child.on("close", (code) => { /* istanbul ignore else */ if (stdOut !== "") { if (fileSize) { stdOut = stdOut.replace( pdfInfoFileSizesRegex, `$1${fileSize}$2bytes` ); } /** * Convert output to JSON. * @see {@link https://github.com/Fdawgs/node-poppler/issues/248#issuecomment-845948080 | Node-Poppler Issue #248} */ if (options.printAsJson === true) { const info = {}; stdOut.split("\n").forEach((line) => { const lines = line.split(": "); if (lines.length > 1) { // @ts-ignore: creating dynamic object keys info[camelCase(lines[0])] = lines[1].trim(); } }); resolve(info); } else { resolve(stdOut.trim()); } } else if (code === 0) { resolve(errorMessages[code]); } else if (stdErr !== "") { reject(new Error(stdErr.trim())); } else { reject( new Error( errorMessages[code] || `pdfinfo ${args.join( " " )} exited with code ${code}` ) ); } }); }); } catch (err) { return Promise.reject(err); } } /** * @author Frazer Smith * @description Extracts single pages from a PDF file, * and writes one PDF file for each page to outputPattern. * This will not work if the file is encrypted. * @param {string} file - Filepath of the PDF file to read. * @param {string} outputPattern - Should contain %d (or any variant respecting printf format), * since %d is replaced by the page number. * As an example, `sample-%d.pdf` will produce `sample-1.pdf` for a single page document. * @param {object} [options] - Object containing options to pass to binary. * @param {number} [options.firstPageToExtract] - Specifies the first page to extract. * This defaults to page 1. * @param {number} [options.lastPageToExtract] - Specifies the last page to extract. * This defaults to the last page of the PDF file. * @param {boolean} [options.printVersionInfo] - Print copyright and version info. * @returns {Promise<string>} A promise that resolves with a stdout string, or rejects with an `Error` object. */ async pdfSeparate(file, outputPattern, options = {}) { const acceptedOptions = { firstPageToExtract: { arg: "-f", type: "number" }, lastPageToExtract: { arg: "-l", type: "number" }, printVersionInfo: { arg: "-v", type: "boolean" }, }; try { const { stderr } = await execFileAsync( pathResolve(this.popplerPath, "pdfseparate"), ["-v"] ); // @ts-ignore: parseOptions checks if falsy const versionInfo = popplerVersionRegex.exec(stderr)[1]; const args = parseOptions(acceptedOptions, options, versionInfo); args.push(file, outputPattern); const { stdout } = await execFileAsync( pathResolve(this.popplerPath, "pdfseparate"), args ); return Promise.resolve(stdout); } catch (err) { return Promise.reject(err); } } /** * @author Frazer Smith * @description Converts a PDF file to EPS/JPEG/PDF/PNG/PS/SVG/TIFF. * @param {Buffer|string} file - PDF file as Buffer, or filepath of the PDF file to read. * @param {string} [outputFile] - Filepath of the file to output the results to. * * If `undefined` then will write output to stdout. Using stdout is not valid with image formats * (jpeg, png, and tiff) unless `options.singleFile` is set to `true`. * Encoding is set to `binary` if used with `options.singleFile` or `options.pdfFile`. * * If not set then the output filename will be derived from the PDF file name. * @param {object} [options] - Object containing options to pass to binary. * @param {('best'|'default'|'fast'|'good'|'gray'|'none'|'subpixel')} [options.antialias] - Set the cairo * antialias option used for text and drawing in image files (or rasterized regions in vector output). * @param {boolean} [options.cropBox] - Uses the crop box rather than media box when * generating the files (PNG/JPEG/TIFF only). * @param {number} [options.cropHeight] - Specifies the height of crop area in pixels * (image output) or points (vector output). * @param {number} [options.cropSize] - Specifies the size of crop square in pixels * (image output) or points (vector output). * @param {number} [options.cropWidth] - Specifies the width of crop area in pixels * (image output) or points (vector output). * @param {number} [options.cropXAxis] - Specifies the x-coordinate of the crop area top left * corner in pixels (image output) or points (vector output). * @param {number} [options.cropYAxis] - Specifies the y-coordinate of the crop area top left * corner in pixels (image output) or points (vector output). * @param {boolean} [options.duplex] - Adds the %%IncludeFeature: *Duplex DuplexNoTumble DSC * comment to the PostScript file (PS only). This tells the print manager to enable duplexing. * @param {boolean} [options.epsFile] - Generate an EPS file. An EPS file contains a single image, * so if you use this option with a multi-page PDF file, you must use `options.firstPageToConvert` and * `options.lastPageToConvert` to specify a single page. * The page size options (originalPageSizes, paperSize, paperWidth, paperHeight) can not be used * with this option. * @param {boolean} [options.evenPagesOnly] - Generates only the even numbered pages. * @param {boolean} [options.fillPage] - Expand PDF pages smaller than the paper to fill the * paper (PS,PDF,SVG only). By default, these pages are not scaled. * @param {number} [options.firstPageToConvert] - Specifies the first page to convert. * @param {boolean} [options.grayscaleFile] - Generate grayscale file (PNG, JPEG, and TIFF only). * @param {string} [options.iccFile] - Use the specified ICC file as the output profile * (PNG only). The profile will be embedded in the PNG file. * @param {boolean} [options.jpegFile] - Generate JPEG file(s). * @param {string} [options.jpegOptions] - When used with `options.jpegFile`, this option can * be used to control the JPEG compression parameters. It takes a string of the form * `"<opt>=<val>[,<opt>=<val>]"`. Currently available options are: * - `quality` Selects the JPEG quality value. The value must be an integer between 0 and 100. * - `progressive` Select progressive JPEG output. The possible values are "y", "n", indicating * progressive (yes) or non-progressive (no), respectively. * - `optimize` Sets whether to compute optimal Huffman coding tables for the JPEG output, which * will create smaller files but make an extra pass over the data. The value must be "y" or "n", * with "y" performing optimization, otherwise the default Huffman tables are used. * * Example: `"quality=95,optimize=y"`. * @param {number} [options.lastPageToConvert] - Specifies the last page to convert. * @param {boolean} [options.monochromeFile] - Generate monochrome file (PNG and TIFF only). * @param {boolean} [options.noCenter] - By default, PDF pages smaller than the paper * (after any scaling) are centered on the paper. This option causes them to be aligned to * the lower-left corner of the paper instead (PS,PDF,SVG only). * @param {boolean} [options.noCrop] - By default, printing output is cropped to the CropBox * specified in the PDF file. This option disables cropping (PS, PDF, SVG only). * @param {boolean} [options.noShrink] - Do not scale PDF pages which are larger than the paper * (PS,PDF,SVG only). By default, pages larger than the paper are shrunk to fit. * @param {boolean} [options.oddPagesOnly] - Generates only the odd numbered pages. * @param {boolean} [options.originalPageSizes] - Set the paper size of each page to match * the size specified in the PDF file. * @param {string} [options.ownerPassword] - Specify the owner password for the PDF file. * Providing this will bypass all security restrictions. * @param {number} [options.paperHeight] - Set the paper height, in points (PS, PDF, SVG only). * @param {('A3'|'A4'|'legal'|'letter'|'match')} [options.paperSize] - Set the paper size to one of `A3`, `A4`, * `legal`, or `letter` (PS,PDF,SVG only). This can also be set to `match`, which will set the paper size * of each page to match the size specified in the PDF file. If none of the paperSize, * paperWidth, or paperHeight options are specified the default is to match the paper size. * @param {number} [options.paperWidth] - Set the paper width, in points (PS,PDF,SVG only). * @param {boolean} [options.pdfFile] - Generate PDF file. * @param {boolean} [options.pngFile] - Generate PNG file(s). * @param {boolean} [options.printVersionInfo] - Print copyright and version information. * @param {boolean} [options.printDocStruct] - If the input file contains structural information * about the document's content, write this information to the output file (PDF only). * @param {boolean} [options.psFile] - Generate PS file. * @param {boolean} [options.psLevel2] - Generate Level 2 PostScript (PS only). * @param {boolean} [options.psLevel3] - Generate Level 3 PostScript (PS only). This enables all * Level 2 features plus shading patterns and masked images. This is the default setting. * @param {boolean} [options.quiet] - Do not print any messages or errors. * @param {number} [options.resolutionXAxis] - Specifies the X resolution, in pixels per inch of * image files (or rasterized regions in vector output). The default is 150 PPI. * @param {number} [options.resolutionXYAxis] - Specifies the X and Y resolution, in pixels per * inch of image files (or rasterized regions in vector output). The default is 150 PPI. * @param {number} [options.resolutionYAxis] - Specifies the Y resolution, in pixels per inch of * image files (or rasterized regions in vector output). The default is 150 PPI. * @param {number} [options.scalePageTo] - Scales the long side of each page (width for landscape * pages, height for portrait pages) to fit in scale-to pixels. The size of the short side will * be determined by the aspect ratio of the page (PNG/JPEG/TIFF only). * @param {number} [options.scalePageToXAxis] - Scales each page horizontally to fit in scale-to-x * pixels. If scale-to-y is set to -1, the vertical size will determined by the aspect ratio of * the page (PNG/JPEG/TIFF only). * @param {number} [options.scalePageToYAxis] - Scales each page vertically to fit in scale-to-y * pixels. If scale-to-x is set to -1, the horizontal size will determined by the aspect ratio of * the page (PNG/JPEG/TIFF only). * @param {boolean} [options.singleFile] - Writes only the first page and does not add digits. * Can only be used with `options.jpegFile`, `options.pngFile`, and `options.tiffFile`. * @param {boolean} [options.svgFile] - Generate SVG (Scalable Vector Graphics) file. * @param {('deflate'|'jpeg'|'lzw'|'none'|'packbits')} [options.tiffCompression] - Set TIFF compression. * @param {boolean} [options.tiffFile] - Generate TIFF file(s). * @param {boolean} [options.transparentPageColor] - Use a transparent page color * instead of white (PNG and TIFF only). * @param {string} [options.userPassword] - Specify the user password for the PDF file. * @returns {Promise<string>} A promise that resolves with a stdout string, or rejects with an `Error` object. */ async pdfToCairo(file, outputFile, options = {}) { const acceptedOptions = { antialias: { arg: "-antialias", type: "string" }, cropBox: { arg: "-cropbox", type: "boolean" }, cropHeight: { arg: "-H", type: "number" }, cropSize: { arg: "-sz", type: "number" }, cropWidth: { arg: "-W", type: "number" }, cropXAxis: { arg: "-x", type: "number" }, cropYAxis: { arg: "-y", type: "number" }, duplex: { arg: "-duplex", type: "boolean" }, epsFile: { arg: "-eps", type: "boolean" }, evenPagesOnly: { arg: "-e", type: "boolean" }, fillPage: { arg: "-expand", type: "boolean" }, firstPageToConvert: { arg: "-f", type: "number" }, grayscaleFile: { arg: "-gray", type: "boolean" }, iccFile: { arg: "-icc", type: "string" }, jpegFile: { arg: "-jpeg", type: "boolean" }, jpegOptions: { arg: "-jpegopt", type: "string" }, lastPageToConvert: { arg: "-l", type: "number" }, monochromeFile: { arg: "-mono", type: "boolean" }, noCenter: { arg: "-nocenter", type: "boolean" }, noCrop: { arg: "-nocrop", type: "boolean" }, noShrink: { arg: "-noshrink", type: "boolean" }, oddPagesOnly: { arg: "-o", type: "boolean" }, originalPageSizes: { arg: "-origpagesizes", type: "boolean" }, ownerPassword: { arg: "-opw", type: "string" }, paperHeight: { arg: "-paperh", type: "number" }, paperSize: { arg: "-paper", type: "string" }, paperWidth: { arg: "-paperw", type: "number" }, pdfFile: { arg: "-pdf", type: "boolean" }, pngFile: { arg: "-png", type: "boolean" }, printDocStruct: { arg: "-struct", type: "boolean", minVersion: "23.11.0", }, printVersionInfo: { arg: "-v", type: "boolean" }, psFile: { arg: "-ps", type: "boolean" }, psLevel2: { arg: "-level2", type: "boolean" }, psLevel3: { arg: "-level3", type: "boolean" }, quiet: { arg: "-q", type: "boolean" }, resolutionXAxis: { arg: "-rx", type: "number" }, resolutionXYAxis: { arg: "-r", type: "number" }, resolutionYAxis: { arg: "-ry", type: "number" }, scalePageTo: { arg: "-scale-to", type: "number" }, scalePageToXAxis: { arg: "-scale-to-x", type: "number" }, scalePageToYAxis: { arg: "-scale-to-y", type: "number" }, singleFile: { arg: "-singlefile", type: "boolean" }, svgFile: { arg: "-svg", type: "boolean" }, tiffCompression: { arg: "-tiffcompression", type: "string" }, tiffFile: { arg: "-tiff", type: "boolean" }, transparentPageColor: { arg: "-transp", type: "boolean" }, userPassword: { arg: "-upw", type: "string" }, }; try { const { stderr } = await execFileAsync( pathResolve(this.popplerPath, "pdftocairo"), ["-v"] ); // @ts-ignore: parseOptions checks if falsy const versionInfo = popplerVersionRegex.exec(stderr)[1]; const args = parseOptions(acceptedOptions, options, versionInfo); return new Promise((resolve, reject) => { args.push( Buffer.isBuffer(file) ? "-" : file, outputFile || "-" ); const child = spawn( pathResolve(this.popplerPath, "pdftocairo"), args ); if ( outputFile === undefined && args.some((arg) => ["-singlefile", "-pdf"].includes(arg)) ) { child.stdout.setEncoding("binary"); } if (Buffer.isBuffer(file)) { child.stdin.write(file); child.stdin.end(); } let stdOut = ""; let stdErr = ""; child.stdout.on("data", (data) => { stdOut += data; }); child.stderr.on("data", (data) => { stdErr += data; }); child.on("close", (code) => { /* istanbul ignore else */ if (stdOut !== "") { resolve(stdOut.trim()); } else if (code === 0) { resolve(errorMessages[code]); } else if (stdErr !== "") { reject(new Error(stdErr.trim())); } else { reject( new Error( errorMessages[code] || `pdftocairo ${args.join( " " )} exited with code ${code}` ) ); } }); }); } catch (err) { return Promise.reject(err); } } /** * @author Frazer Smith * @description Converts a PDF file to HTML. * @param {Buffer|string} file - PDF file as Buffer, or filepath of the PDF file to read. * @param {string} [outputFile] - Filepath of the file to output the results to. * If `undefined` then Poppler will use the directory and name of the original file * and create a new file, with `-html` appended to the end of the filename. * * Required if `file` is a Buffer. * @param {object} [options] - Object containing options to pass to binary. * @param {boolean} [options.complexOutput] - Generate complex output. * @param {boolean} [options.dataUrls] - Use data URLs instead of external images in HTML. * @param {boolean} [options.exchangePdfLinks] - Exchange .pdf links with .html. * @param {boolean} [options.extractHidden] - Force hidden text extraction. * @param {number} [options.firstPageToConvert] - First page to print. * @param {boolean} [options.fontFullName] - Outputs the font name without any substitutions. * @param {boolean} [options.ignoreImages] - Ignore images. * @param {('JPG'|'PNG')} [options.imageFormat] - Image file format for Splash output (JPG or PNG). * If complexOutput is selected, but imageFormat is not specified, PNG will be assumed. * @param {number} [options.lastPageToConvert] - Last page to print. * @param {boolean} [options.noDrm] - Override document DRM settings. * @param {boolean} [options.noFrames] - Generate no frames. Not supported in complex output mode. * @param {boolean} [options.noMergeParagraph] - Do not merge paragraphs. * @param {boolean} [options.noRoundedCoordinates] - Do not round coordinates * (with XML output only). * @param {string} [options.outputEncoding] - Sets the encoding to use for text output. * This defaults to `UTF-8`. * @param {string} [options.ownerPassword] - Owner password (for encrypted files). * @param {boolean} [options.printVersionInfo] - Print copyright and version info. * @param {boolean} [options.quiet] - Do not print any messages or errors. * @param {boolean} [options.singlePage] - Generate single HTML that includes all pages. * @param {boolean} [options.stdout] - Use standard output. * @param {string} [options.userPassword] - User password (for encrypted files). * @param {number} [options.wordBreakThreshold] - Adjust the word break threshold percent. * Default is 10. Word break occurs when distance between two adjacent characters is greater * than this percent of character height. * @param {boolean} [options.xmlOutput] - Output for XML post-processing. * @param {number} [options.zoom] - Zoom the PDF document (default 1.5). * @returns {Promise<string>} A promise that resolves with a stdout string, or rejects with an `Error` object. */ async pdfToHtml(file, outputFile, options = {}) { const acceptedOptions = { complexOutput: { arg: "-c", type: "boolean" }, dataUrls: { arg: "-dataurls", type: "boolean", minVersion: "0.75.0", }, exchangePdfLinks: { arg: "-p", type: "boolean" }, extractHidden: { arg: "-hidden", type: "boolean" }, firstPageToConvert: { arg: "-f", type: "number" }, fontFullName: { arg: "-fontfullname", type: "boolean" }, ignoreImages: { arg: "-i", type: "boolean" }, imageFormat: { arg: "-fmt", type: "string" }, lastPageToConvert: { arg: "-l", type: "number" }, noDrm: { arg: "-nodrm", type: "boolean" }, noFrames: { arg: "-noframes", type: "boolean" }, noMergeParagraph: { arg: "-nomerge", type: "boolean" }, noRoundedCoordinates: { arg: "-noroundcoord", type: "boolean" }, outputEncoding: { arg: "-enc", type: "string" }, ownerPassword: { arg: "-opw", type: "string" }, printVersionInfo: { arg: "-v", type: "boolean" }, quiet: { arg: "-q", type: "boolean" }, singlePage: { arg: "-s", type: "boolean" }, stdout: { arg: "-stdout", type: "boolean" }, userPassword: { arg: "-upw", type: "string" }, wordBreakThreshold: { arg: "-wbt", type: "number" }, xmlOutput: { arg: "-xml", type: "boolean" }, zoom: { arg: "-zoom", type: "number" }, }; try { const { stderr } = await execFileAsync( pathResolve(this.popplerPath, "pdftohtml"), ["-v"] ); // @ts-ignore: parseOptions checks if falsy const versionInfo = popplerVersionRegex.exec(stderr)[1]; const args = parseOptions(acceptedOptions, options, versionInfo); return new Promise((resolve, reject) => { args.push(Buffer.isBuffer(file) ? "-" : file); if (outputFile) { args.push(outputFile); } const child = spawn( pathResolve(this.popplerPath, "pdftohtml"), args ); if (Buffer.isBuffer(file)) { child.stdin.write(file); child.stdin.end(); } let stdOut = ""; let stdErr = ""; child.stdout.on("data", (data) => { stdOut += data; }); child.stderr.on("data", (data) => { stdErr += data; }); /** * PdfToHtml does not return an exit code so check output to see if it was successful. * @see {@link https://gitlab.freedesktop.org/poppler/poppler/-/blob/master/utils/pdftohtml.1 | Poppler pdftohtml man} */ child.on("close", () => { if (stdOut !== "") { resolve(stdOut.trim()); } else { reject(new Error(stdErr ? stdErr.trim() : undefined)); } }); }); } catch (err) { return Promise.reject(err); } } /** * @author Frazer Smith * @description Converts a PDF file to colour image files in Portable Pixmap (PPM) format, * grayscale image files in Portable Graymap (PGM) format, or monochrome image files * in Portable Bitmap (PBM) format. * @param {Buffer|string} file - PDF file as Buffer, or filepath of the PDF file to read. * @param {string} outputPath - Filepath to output the results to. * @param {object} [options] - Object containing options to pass to binary. * @param {('no'|'yes')} [options.antialiasFonts] - Enable or disable font anti-aliasing. * This defaults to `yes`. * @param {('no'|'yes')} [options.antialiasVectors] - Enable or disable vector anti-aliasing. * This defaults to `yes`. * @param {boolean} [options.cropBox] - Uses the crop box rather than media box when * generating the files (PNG/JPEG/TIFF only). * @param {number} [options.cropHeight] - Specifies the height of crop area in pixels * (image output) or points (vector output). * @param {number} [options.cropSize] - Specifies the size of crop square in pixels * (image output) or points (vector output). * @param {number} [options.cropWidth] - Specifies the width of crop area in pixels * (image output) or points (vector output). * @param {number} [options.cropXAxis] - Specifies the x-coordinate of the crop area top left * corner in pixels (image output) or points (vector output). * @param {number} [options.cropYAxis] - Specifies the y-coordinate of the crop area top left * corner in pixels (image output) or points (vector output). * @param {string} [options.defaultCmykProfile] - If Poppler is compiled with colour management support, this option * sets the DefaultCMYK color space to the ICC profile stored in the display profile file passed. * @param {string} [options.defaultGrayProfile] - If Poppler is compiled with colour management support, this option * sets the DefaultGray color space to the ICC profile stored in the display profile file passed. * @param {string} [options.defaultRgbProfile] - If Poppler is compiled with colour management support, this option * sets the DefaultRGB color space to the ICC profile stored in the display profile file passed. * @param {string} [options.displayProfile] - If Poppler is compiled with colour management support, this option * sets the display profile to the ICC profile stored in the display profile file passed. * @param {boolean} [options.evenPagesOnly] - Generates only the even numbered pages. * @param {number} [options.firstPageToConvert] - Specifies the first page to convert. * @param {('no'|'yes')} [options.freetype] - Enable or disable FreeType (a TrueType / Type 1 font rasterizer). * This defaults to `yes`. * @param {boolean} [options.forcePageNumber] - Force page number even if there is only one page. * @param {boolean} [options.grayscaleFile] - Generate grayscale PGM file (instead of a color PPM file). * @param {boolean} [options.hideAnnotations] - Hide annotations. * @param {boolean} [options.jpegFile] - Generate JPEG file instead a PPM file. * @param {number} [options.lastPageToConvert] - Specifies the last page to convert. * @param {boolean} [options.monochromeFile] - Generate monochrome PBM file (instead of a color PPM file). * @param {boolean} [options.oddPagesOnly] - Generates only the odd numbered pages. * @param {string} [options.ownerPassword] - Specify the owner password for the PDF file. * Providing this will bypass all security restrictions. * @param {boolean} [options.pngFile] - Generate PNG file instead a PPM file. * @param {boolean} [options.printProgress] - Print progress info as each page is generated. * Three space-separated fields are printed to STDERR: the number of the current page, the number * of the last page that will be generated, and the path to the file written to. * @param {boolean} [options.printVersionInfo] - Print copyright and version information. * @param {boolean} [options.quiet] - Do not print any messages or errors. * @param {number} [options.resolutionXAxis] - Specifies the X resolution, in pixels per inch of * image files (or rasterized regions in vector output). The default is 150 PPI. * @param {number} [options.resolutionXYAxis] - Specifies the X and Y resolution, in pixels per * inch of image files (or rasterized regions in vector output). The default is 150 PPI. * @param {number} [options.resolutionYAxis] - Specifies the Y resolution, in pixels per inch of * image files (or rasterized regions in vector output). The default is 150 PPI. * @param {number} [options.scalePageTo] - Scales the long side of each page (width for landscape * pages, height for portrait pages) to fit in scale-to pixels. The size of the short side will * be determined by the aspect ratio of the page. * @param {number} [options.scalePageToXAxis] - Scales each page horizontally to fit in scale-to-x * pixels. If scale-to-y is set to -1, the vertical size will determined by the aspect ratio of * the page. * @param {number} [options.scalePageToYAxis] - Scales each page vertically to fit in scale-to-y * pixels. If scale-to-x is set to -1, the horizontal size will determined by the aspect ratio of * the page. * @param {string} [options.separator] - Specify single character separator between name and page number. * @param {boolean} [options.singleFile] - Writes only the first page and does not add digits. * @param {('none'|'shape'|'solid')} [options.thinLineMode] - Specifies the thin line mode. This defaults to `none`. * @param {('deflate'|'jpeg'|'lzw'|'none'|'packbits')} [options.tiffCompression] - Set TIFF compression. * @param {boolean} [options.tiffFile] - Generate TIFF file instead a PPM file. * @param {string} [options.userPassword] - Specify the user password for the PDF file. * @returns {Promise<string>} A promise that resolves with a stdout string, or rejects with an `Error` object. */ async pdfToPpm(file, outputPath, options = {}) { const acceptedOptions = { antialiasFonts: { arg: "-aa", type: "string" }, antialiasVectors: { arg: "-aaVector", type: "string" }, cropBox: { arg: "-cropbox", type: "boolean" }, cropHeight: { arg: "-H", type: "number" }, cropSize: { arg: "-sz", type: "number" }, cropWidth: { arg: "-W", type: "number" }, cropXAxis: { arg: "-x", type: "number" }, cropYAxis: { arg: "-y", type: "number" }, defaultCmykProfile: { arg: "-defaultcmykprofile", type: "string", minVersion: "21.01.0", }, defaultGrayProfile: { arg: "-defaultgrayprofile", type: "string", minVersion: "21.01.0", }, defaultRgbProfile: { arg: "-defaultrgbprofile", type: "string", minVersion: "21.01.0", }, displayProfile: { arg: "-displayprofile", type: "string", minVersion: "0.90.0", }, evenPagesOnly: { arg: "-e", type: "boolean" }, firstPageToConvert: { arg: "-f", type: "number" }, forcePageNumber: { arg: "-forcenum", type: "boolean", minVersion: "0.75.0", }, freetype: { arg: "-freetype", type: "string" }, grayscaleFile: { arg: "-gray", type: "boolean" }, hideAnnotations: { arg: "-hide-annotations", type: "boolean", minVersion: "0.84.0", }, jpegFile: { arg: "-jpeg", type: "boolean" }, lastPageToConvert: { arg: "-l", type: "number" }, monochromeFile: { arg: "-mono", type: "boolean" }, oddPagesOnly: { arg: "-o", type: "boolean" }, ownerPassword: { arg: "-opw", type: "string" }, pngFile: { arg: "-png", type: "boolean" }, printProgress: { arg: "-progress", type: "boolean", minVersion: "21.03.0", }, printVersionInfo: { arg: "-v", type: "boolean" }, quiet: { arg: "-q", type: "boolean" }, resolutionXAxis: { arg: "-rx", type: "number" }, resolutionXYAxis: { arg: "-r", type: "number" }, resolutionYAxis: { arg: "-ry", type: "number" }, scalePageTo: { arg: "-scale-to", type: "number" }, scalePageToXAxis: { arg: "-scale-to-x", type: "number" }, scalePageToYAxis: { arg: "-scale-to-y", type: "number" }, separator: { arg: "-sep", type: "string", minVersion: "0.75.0" }, singleFile: { arg: "-singlefile", type: "boolean" }, thinLineMode: { arg: "-thinlinemode", type: "string" }, tiffCompression: { arg: "-tiffcompression", type: "string" }, tiffFile: { arg: "-tiff", type: "boolean" }, userPassword: { arg: "-upw", type: "string" }, }; try { const { stderr } = await execFileAsync( pathResolve(this.popplerPath, "pdftoppm"), ["-v"] ); // @ts-ignore: parseOptions checks if falsy const versionInfo = popplerVersionRegex.exec(stderr)[1]; const args = parseOptions(acceptedOptions, options, versionInfo); return new Promise((resolve, reject) => { args.push(Buffer.isBuffer(file) ? "-" : file, outputPath); const child = spawn( pathResolve(this.popplerPath, "pdftoppm"), args ); if (Buffer.isBuffer(file)) { child.stdin.write(file); child.stdin.end(); } let stdErr = ""; child.stderr.on("data", (data) => { stdErr += data; }); child.on("close", (code) => { /* istanbul ignore else */ if (stdErr !== "") { reject(new Error(stdErr.trim())); } else if (code === 0) { resolve(errorMessages[code]); } else { reject( new Error( errorMessages[code] || `pdftoppm ${args.join( " " )} exited with code ${code}` ) ); } }); }); } catch (err) { return Promise.reject(err); } } /** * @author Frazer Smith * @description Converts a PDF file to PostScript (PS). * @param {Buffer|string} file - PDF file as Buffer, or filepath of the PDF file to read. * @param {string} [outputFile] - Filepath of the file to output the results to. * If `undefined` then will write output to stdout. * @param {object} [options] - Object containing options to pass to binary. * @param {('no'|'yes')} [options.antialias] - Enable anti-aliasing on rasterization, accepts `no` or `yes`. * @param {boolean} [options.binary] - Write binary data in Level 1 PostScript. By default, * pdftops wr