UNPKG

node-poppler

Version:

Asynchronous Node.js wrapper for the Poppler PDF rendering library

1,017 lines (970 loc) 71.1 kB
"use strict"; const { execFile, spawn, spawnSync } = require("node:child_process"); const { basename, normalize, resolve: pathResolve } = require("node:path"); const { platform } = require("node:process"); const { promisify } = require("node:util"); const camelCase = require("camelcase"); const freeze = require("ice-barrage"); const { lt } = require("semver"); const execFileAsync = promisify(execFile); /** * @type {Readonly<Record<string, string>>} * @ignore */ const ERROR_MSGS = Object.freeze({ 0: "No Error", 1: "Error opening a PDF file", 2: "Error opening an output file", 3: "Error related to PDF permissions", 4: "Error related to ICC profile", 99: "Other error", 3221226505: "Internal process error", }); // Cache immutable regex as they are expensive to create and garbage collect const POPPLER_VERSION_REG = /(\d{1,2}\.\d{1,2}\.\d{1,2})/u; const PDF_INFO_FILE_SIZES_REG = /(File\s+size:\s+)0(\s+)bytes/u; const PDF_INFO_PATH_REG = /(.+)pdfinfo/u; /** * @typedef {object} OptionDetails * @property {string} arg The argument to pass to the binary. * @property {('boolean'|'number'|'string')} type The type of the option. * @property {string} [minVersion] The minimum version of the binary that supports this option. * @property {string} [maxVersion] The maximum version of the binary that supports this option (optional). */ /** * @typedef {Record<string, OptionDetails>} PopplerAcceptedOptions */ /** * @typedef PdfAttachOptions * @property {boolean} [printVersionInfo] Print copyright and version info. * @property {boolean} [replace] Replace embedded file with same name (if it exists). */ /** * @typedef PdfDetachOptions * @property {boolean} [listEmbedded] List all of the embedded files in the PDF file. * File names are converted to the text encoding specified by `options.outputEncoding`. * @property {string} [outputEncoding] Sets the encoding to use for text output. * This defaults to `UTF-8`. * @property {string} [ownerPassword] Owner password (for encrypted files). 
* @property {string} [outputPath] Set the file name used when saving an embedded file with * the save option enabled, or the directory if `options.saveAllFiles` is used. * @property {boolean} [printVersionInfo] Print copyright and version info. * @property {boolean} [saveAllFiles] Save all of the embedded files. This uses the file * names associated with the embedded files (as printed by `options.listEmbedded`). * By default, the files are saved in the current directory; this can be changed * with `options.outputPath`. * @property {string} [saveFile] Save the specified embedded file. * By default, this uses the file name associated with the embedded file (as printed by * `options.listEmbedded`); the file name can be changed with `options.outputPath`. * @property {number} [saveSpecificFile] Save the specified embedded file. * By default, this uses the file name associated with the embedded file (as printed by * `options.listEmbedded`); the file name can be changed with `options.outputPath`. * @property {string} [userPassword] User password (for encrypted files). */ /** * @typedef PdfFontsOptions * @property {number} [firstPageToExamine] Specifies the first page to examine. * @property {number} [lastPageToExamine] Specifies the last page to examine. * @property {boolean} [listSubstitutes] List the substitute fonts that poppler * will use for non-embedded fonts. * @property {string} [ownerPassword] Owner password (for encrypted files). * @property {boolean} [printVersionInfo] Print copyright and version info. * @property {string} [userPassword] User password (for encrypted files). */ /** * @typedef PdfImagesOptions * @property {boolean} [allFiles] Write JPEG, JPEG2000, JBIG2, and CCITT images in their native format. * CMYK files are written as TIFF files. All other images are written as PNG files. * @property {boolean} [ccittFile] Generate CCITT images as CCITT files. * @property {number} [firstPageToConvert] Specifies the first page to convert. 
* @property {number} [lastPageToConvert] Specifies the last page to convert. * @property {boolean} [jbig2File] Generate JBIG2 images as JBIG2 files. * @property {boolean} [jpeg2000File] Generate JPEG2000 images as JP2 files. * @property {boolean} [jpegFile] Generate JPEG images as JPEG files. * @property {boolean} [list] Instead of writing the images, list the * images along with various information for each image. * NOTE: Do not specify the outputPrefix with this option. * @property {string} [ownerPassword] Owner password (for encrypted files). * @property {boolean} [pngFile] Change the default output format to PNG. * @property {boolean} [printVersionInfo] Print copyright and version info. * @property {boolean} [tiffFile] Change the default output format to TIFF. * @property {string} [userPassword] Specify the user password for the PDF file. */ /** * @typedef PdfInfoOptions * @property {number} [firstPageToConvert] First page to print. * @property {number} [lastPageToConvert] Last page to print. * @property {boolean} [listEncodingOptions] List the available encodings. * @property {string} [outputEncoding] Sets the encoding to use for text output. * This defaults to `UTF-8`. * @property {string} [ownerPassword] Owner password (for encrypted files). * @property {boolean} [printAsJson] Print result as a JSON object. * @property {boolean} [printBoundingBoxes] Prints the page box bounding boxes: * MediaBox, CropBox, BleedBox, TrimBox, and ArtBox. * @property {boolean} [printDocStruct] Prints the logical document structure * of a Tagged-PDF file. * @property {boolean} [printDocStructText] Print the textual content along with the * document structure of a Tagged-PDF file. Note that extracting text this way might be slow * for big PDF files. * @property {boolean} [printIsoDates] Prints dates in ISO-8601 format (including the time zone). * @property {boolean} [printJS] Prints all JavaScript in the PDF file. 
* @property {boolean} [printMetadata] Prints document-level metadata. (This is the `Metadata` * stream from the PDF file's Catalog object). * @property {boolean} [printNamedDests] Print a list of all named destinations. If a page range * is specified using the `options.firstPageToConvert` and `options.lastPageToConvert` options, only destinations * in the page range are listed. * @property {boolean} [printRawDates] Prints the raw (undecoded) date strings, directly from the PDF file. * @property {boolean} [printUrls] Print all URLs in the PDF; only URLs referenced by PDF objects * such as Link Annotations are listed, not URL strings in the text content. * @property {boolean} [printVersionInfo] Print copyright and version info. * @property {string} [userPassword] User password (for encrypted files). */ /** * @typedef PdfSeparateOptions * @property {number} [firstPageToExtract] Specifies the first page to extract. * This defaults to page 1. * @property {number} [lastPageToExtract] Specifies the last page to extract. * This defaults to the last page of the PDF file. * @property {boolean} [printVersionInfo] Print copyright and version info. */ /** * @typedef PdfToCairoOptions * @property {('best'|'default'|'fast'|'good'|'gray'|'none'|'subpixel')} [antialias] Set the cairo * antialias option used for text and drawing in image files (or rasterized regions in vector output). * @property {boolean} [cropBox] Uses the crop box rather than media box when * generating the files (PNG/JPEG/TIFF only). * @property {number} [cropHeight] Specifies the height of crop area in pixels * (image output) or points (vector output). * @property {number} [cropSize] Specifies the size of crop square in pixels * (image output) or points (vector output). * @property {number} [cropWidth] Specifies the width of crop area in pixels * (image output) or points (vector output). 
* @property {number} [cropXAxis] Specifies the x-coordinate of the crop area top left * corner in pixels (image output) or points (vector output). * @property {number} [cropYAxis] Specifies the y-coordinate of the crop area top left * corner in pixels (image output) or points (vector output). * @property {boolean} [duplex] Adds the %%IncludeFeature: *Duplex DuplexNoTumble DSC * comment to the PostScript file (PS only). This tells the print manager to enable duplexing. * @property {boolean} [epsFile] Generate an EPS file. An EPS file contains a single image, * so if you use this option with a multi-page PDF file, you must use `options.firstPageToConvert` and * `options.lastPageToConvert` to specify a single page. * The page size options (originalPageSizes, paperSize, paperWidth, paperHeight) can not be used * with this option. * @property {boolean} [evenPagesOnly] Generates only the even numbered pages. * @property {boolean} [fillPage] Expand PDF pages smaller than the paper to fill the * paper (PS,PDF,SVG only). By default, these pages are not scaled. * @property {number} [firstPageToConvert] Specifies the first page to convert. * @property {boolean} [grayscaleFile] Generate grayscale file (PNG, JPEG, and TIFF only). * @property {string} [iccFile] Use the specified ICC file as the output profile * (PNG only). The profile will be embedded in the PNG file. * @property {boolean} [jpegFile] Generate JPEG file(s). * @property {string} [jpegOptions] When used with `options.jpegFile`, this option can * be used to control the JPEG compression parameters. It takes a string of the form * `"<opt>=<val>[,<opt>=<val>]"`. Currently available options are: * - `quality` Selects the JPEG quality value. The value must be an integer between 0 and 100. * - `progressive` Select progressive JPEG output. The possible values are "y", "n", indicating * progressive (yes) or non-progressive (no), respectively. 
* - `optimize` Sets whether to compute optimal Huffman coding tables for the JPEG output, which * will create smaller files but make an extra pass over the data. The value must be "y" or "n", * with "y" performing optimization, otherwise the default Huffman tables are used. * * Example: `"quality=95,optimize=y"`. * @property {number} [lastPageToConvert] Specifies the last page to convert. * @property {boolean} [monochromeFile] Generate monochrome file (PNG and TIFF only). * @property {boolean} [noCenter] By default, PDF pages smaller than the paper * (after any scaling) are centered on the paper. This option causes them to be aligned to * the lower-left corner of the paper instead (PS,PDF,SVG only). * @property {boolean} [noCrop] By default, printing output is cropped to the CropBox * specified in the PDF file. This option disables cropping (PS, PDF, SVG only). * @property {boolean} [noShrink] Do not scale PDF pages which are larger than the paper * (PS,PDF,SVG only). By default, pages larger than the paper are shrunk to fit. * @property {boolean} [oddPagesOnly] Generates only the odd numbered pages. * @property {boolean} [originalPageSizes] Set the paper size of each page to match * the size specified in the PDF file. * @property {string} [ownerPassword] Specify the owner password for the PDF file. * Providing this will bypass all security restrictions. * @property {number} [paperHeight] Set the paper height, in points (PS, PDF, SVG only). * @property {('A3'|'A4'|'legal'|'letter'|'match')} [paperSize] Set the paper size to one of `A3`, `A4`, * `legal`, or `letter` (PS,PDF,SVG only). This can also be set to `match`, which will set the paper size * of each page to match the size specified in the PDF file. If none of the paperSize, * paperWidth, or paperHeight options are specified the default is to match the paper size. * @property {number} [paperWidth] Set the paper width, in points (PS,PDF,SVG only). * @property {boolean} [pdfFile] Generate PDF file. 
* @property {boolean} [pngFile] Generate PNG file(s). * @property {boolean} [printVersionInfo] Print copyright and version information. * @property {boolean} [printDocStruct] If the input file contains structural information * about the document's content, write this information to the output file (PDF only). * @property {boolean} [psFile] Generate PS file. * @property {boolean} [psLevel2] Generate Level 2 PostScript (PS only). * @property {boolean} [psLevel3] Generate Level 3 PostScript (PS only). This enables all * Level 2 features plus shading patterns and masked images. This is the default setting. * @property {boolean} [quiet] Do not print any messages or errors. * @property {number} [resolutionXAxis] Specifies the X resolution, in pixels per inch of * image files (or rasterized regions in vector output). The default is 150 PPI. * @property {number} [resolutionXYAxis] Specifies the X and Y resolution, in pixels per * inch of image files (or rasterized regions in vector output). The default is 150 PPI. * @property {number} [resolutionYAxis] Specifies the Y resolution, in pixels per inch of * image files (or rasterized regions in vector output). The default is 150 PPI. * @property {number} [scalePageTo] Scales the long side of each page (width for landscape * pages, height for portrait pages) to fit in scale-to pixels. The size of the short side will * be determined by the aspect ratio of the page (PNG/JPEG/TIFF only). * @property {number} [scalePageToXAxis] Scales each page horizontally to fit in scale-to-x * pixels. If scale-to-y is set to -1, the vertical size will be determined by the aspect ratio of * the page (PNG/JPEG/TIFF only). * @property {number} [scalePageToYAxis] Scales each page vertically to fit in scale-to-y * pixels. If scale-to-x is set to -1, the horizontal size will be determined by the aspect ratio of * the page (PNG/JPEG/TIFF only). * @property {boolean} [singleFile] Writes only the first page and does not add digits. 
* Can only be used with `options.jpegFile`, `options.pngFile`, and `options.tiffFile`. * @property {boolean} [svgFile] Generate SVG (Scalable Vector Graphics) file. * @property {('deflate'|'jpeg'|'lzw'|'none'|'packbits')} [tiffCompression] Set TIFF compression. * @property {boolean} [tiffFile] Generate TIFF file(s). * @property {boolean} [transparentPageColor] Use a transparent page color * instead of white (PNG and TIFF only). * @property {string} [userPassword] Specify the user password for the PDF file. */ /** * @typedef PdfToHtmlOptions * @property {boolean} [complexOutput] Generate complex output. * @property {boolean} [dataUrls] Use data URLs instead of external images in HTML. * @property {boolean} [exchangePdfLinks] Exchange .pdf links with .html. * @property {boolean} [extractHidden] Force hidden text extraction. * @property {number} [firstPageToConvert] First page to print. * @property {boolean} [fontFullName] Outputs the font name without any substitutions. * @property {boolean} [ignoreImages] Ignore images. * @property {('JPG'|'PNG')} [imageFormat] Image file format for Splash output (JPG or PNG). * If complexOutput is selected, but imageFormat is not specified, PNG will be assumed. * @property {number} [lastPageToConvert] Last page to print. * @property {boolean} [noDrm] Override document DRM settings. * @property {boolean} [noFrames] Generate no frames. Not supported in complex output mode. * @property {boolean} [noMergeParagraph] Do not merge paragraphs. * @property {boolean} [noRoundedCoordinates] Do not round coordinates * (with XML output only). * @property {string} [outputEncoding] Sets the encoding to use for text output. * This defaults to `UTF-8`. * @property {string} [ownerPassword] Owner password (for encrypted files). * @property {boolean} [printVersionInfo] Print copyright and version info. * @property {boolean} [quiet] Do not print any messages or errors. * @property {boolean} [singlePage] Generate single HTML that includes all pages. 
* @property {boolean} [stdout] Use standard output. * @property {string} [userPassword] User password (for encrypted files). * @property {number} [wordBreakThreshold] Adjust the word break threshold percent. * Default is 10. Word break occurs when distance between two adjacent characters is greater * than this percent of character height. * @property {boolean} [xmlOutput] Output for XML post-processing. * @property {number} [zoom] Zoom the PDF document (default 1.5). */ /** * @typedef PdfToPpmOptions * @property {('no'|'yes')} [antialiasFonts] Enable or disable font anti-aliasing. * This defaults to `yes`. * @property {('no'|'yes')} [antialiasVectors] Enable or disable vector anti-aliasing. * This defaults to `yes`. * @property {boolean} [cropBox] Uses the crop box rather than media box when * generating the files (PNG/JPEG/TIFF only). * @property {number} [cropHeight] Specifies the height of crop area in pixels * (image output) or points (vector output). * @property {number} [cropSize] Specifies the size of crop square in pixels * (image output) or points (vector output). * @property {number} [cropWidth] Specifies the width of crop area in pixels * (image output) or points (vector output). * @property {number} [cropXAxis] Specifies the x-coordinate of the crop area top left * corner in pixels (image output) or points (vector output). * @property {number} [cropYAxis] Specifies the y-coordinate of the crop area top left * corner in pixels (image output) or points (vector output). * @property {string} [defaultCmykProfile] If Poppler is compiled with colour management support, this option * sets the DefaultCMYK color space to the ICC profile stored in the display profile file passed. * @property {string} [defaultGrayProfile] If Poppler is compiled with colour management support, this option * sets the DefaultGray color space to the ICC profile stored in the display profile file passed. 
* @property {string} [defaultRgbProfile] If Poppler is compiled with colour management support, this option * sets the DefaultRGB color space to the ICC profile stored in the display profile file passed. * @property {string} [displayProfile] If Poppler is compiled with colour management support, this option * sets the display profile to the ICC profile stored in the display profile file passed. * @property {boolean} [evenPagesOnly] Generates only the even numbered pages. * @property {number} [firstPageToConvert] Specifies the first page to convert. * @property {('no'|'yes')} [freetype] Enable or disable FreeType (a TrueType / Type 1 font rasterizer). * This defaults to `yes`. * @property {boolean} [forcePageNumber] Force page number even if there is only one page. * @property {boolean} [grayscaleFile] Generate grayscale PGM file (instead of a color PPM file). * @property {boolean} [hideAnnotations] Hide annotations. * @property {boolean} [jpegFile] Generate JPEG file instead of a PPM file. * @property {number} [lastPageToConvert] Specifies the last page to convert. * @property {boolean} [monochromeFile] Generate monochrome PBM file (instead of a color PPM file). * @property {boolean} [oddPagesOnly] Generates only the odd numbered pages. * @property {string} [ownerPassword] Specify the owner password for the PDF file. * Providing this will bypass all security restrictions. * @property {boolean} [pngFile] Generate PNG file instead of a PPM file. * @property {boolean} [printProgress] Print progress info as each page is generated. * Three space-separated fields are printed to STDERR: the number of the current page, the number * of the last page that will be generated, and the path to the file written to. * @property {boolean} [printVersionInfo] Print copyright and version information. * @property {boolean} [quiet] Do not print any messages or errors. 
* @property {number} [resolutionXAxis] Specifies the X resolution, in pixels per inch of * image files (or rasterized regions in vector output). The default is 150 PPI. * @property {number} [resolutionXYAxis] Specifies the X and Y resolution, in pixels per * inch of image files (or rasterized regions in vector output). The default is 150 PPI. * @property {number} [resolutionYAxis] Specifies the Y resolution, in pixels per inch of * image files (or rasterized regions in vector output). The default is 150 PPI. * @property {number} [scalePageTo] Scales the long side of each page (width for landscape * pages, height for portrait pages) to fit in scale-to pixels. The size of the short side will * be determined by the aspect ratio of the page. * @property {number} [scalePageToXAxis] Scales each page horizontally to fit in scale-to-x * pixels. If scale-to-y is set to -1, the vertical size will be determined by the aspect ratio of * the page. * @property {number} [scalePageToYAxis] Scales each page vertically to fit in scale-to-y * pixels. If scale-to-x is set to -1, the horizontal size will be determined by the aspect ratio of * the page. * @property {string} [separator] Specify single character separator between name and page number. * @property {boolean} [singleFile] Writes only the first page and does not add digits. * @property {('none'|'shape'|'solid')} [thinLineMode] Specifies the thin line mode. This defaults to `none`. * @property {('deflate'|'jpeg'|'lzw'|'none'|'packbits')} [tiffCompression] Set TIFF compression. * @property {boolean} [tiffFile] Generate TIFF file instead of a PPM file. * @property {string} [userPassword] Specify the user password for the PDF file. */ /** * @typedef PdfToPsOptions * @property {('no'|'yes')} [antialias] Enable anti-aliasing on rasterization, accepts `no` or `yes`. * @property {boolean} [binary] Write binary data in Level 1 PostScript. By default, * pdftops writes hex-encoded data in Level 1 PostScript. 
Binary data is non-standard in Level 1 * PostScript but reduces the file size and can be useful when Level 1 PostScript is required * only for its restricted use of PostScript operators. * @property {string} [defaultCmykProfile] If Poppler is compiled with colour management support, this option * sets the DefaultCMYK color space to the ICC profile stored in the display profile file passed. * @property {string} [defaultGrayProfile] If Poppler is compiled with colour management support, this option * sets the DefaultGray color space to the ICC profile stored in the display profile file passed. * @property {string} [defaultRgbProfile] If Poppler is compiled with colour management support, this option * sets the DefaultRGB color space to the ICC profile stored in the display profile file passed. * @property {boolean} [duplex] Set the Duplex pagedevice entry in the PostScript file. * This tells duplex-capable printers to enable duplexing. * @property {boolean} [epsFile] Generate an EPS file. An EPS file contains a single image, * so if you use this option with a multi-page PDF file, you must use `options.firstPageToConvert` and * `options.lastPageToConvert` to specify a single page. * The page size options (originalPageSizes, paperSize, paperWidth, paperHeight) can not be used * with this option. * @property {boolean} [fillPage] Expand PDF pages smaller than the paper to fill the * paper. By default, these pages are not scaled. * @property {number} [firstPageToConvert] Specifies the first page to convert. * @property {number} [form] Generate PostScript form which can be imported by software * that understands forms. * A form contains a single page, so if you use this option with a multi-page PDF file, * you must use `options.firstPageToConvert` and `options.lastPageToConvert` to specify a single page. * The `options.level1` option cannot be used with `options.form`. * No more than one of the mode options (`options.epsFile`, `options.form`) may be given. 
* @property {number} [lastPageToConvert] Specifies the last page to convert. * @property {boolean} [level1] Generate Level 1 PostScript. The resulting PostScript * files will be significantly larger (if they contain images), but will print on Level 1 printers. * This also converts all images to black and white. * @property {boolean} [level1Sep] Generate Level 1 separable PostScript. * All colors are converted to CMYK. Images are written with separate stream data for the four components. * @property {boolean} [level2] Generate Level 2 PostScript. * Level 2 supports color images and image compression. This is the default setting. * @property {boolean} [level2Sep] Generate Level 2 separable PostScript. All colors are * converted to CMYK. The PostScript separation convention operators are used to handle custom (spot) colors. * @property {boolean} [level3] Generate Level 3 PostScript. * This enables all Level 2 features plus CID font embedding. * @property {boolean} [level3Sep] Generate Level 3 separable PostScript. * The separation handling is the same as for `options.level2Sep`. * @property {boolean} [noCenter] By default, PDF pages smaller than the paper * (after any scaling) are centered on the paper. This option causes them to be aligned to * the lower-left corner of the paper instead. * @property {boolean} [noCrop] By default, printing output is cropped to the CropBox * specified in the PDF file. This option disables cropping. * @property {boolean} [noEmbedCIDFonts] By default, any CID PostScript fonts which are * embedded in the PDF file are copied into the PostScript file. This option disables that embedding. * No attempt is made to substitute for non-embedded CID PostScript fonts. * @property {boolean} [noEmbedCIDTrueTypeFonts] By default, any CID TrueType fonts which are * embedded in the PDF file are copied into the PostScript file. This option disables that embedding. * No attempt is made to substitute for non-embedded CID TrueType fonts. 
* @property {boolean} [noEmbedTrueTypeFonts] By default, any TrueType fonts which are embedded * in the PDF file are copied into the PostScript file. This option causes pdfToPs to substitute base fonts instead. * Embedded fonts make PostScript files larger, but may be necessary for readable output. * Also, some PostScript interpreters do not have TrueType rasterizers. * @property {boolean} [noEmbedType1Fonts] By default, any Type 1 fonts which are embedded in the PDF file * are copied into the PostScript file. This option causes pdfToPs to substitute base fonts instead. * Embedded fonts make PostScript files larger, but may be necessary for readable output. * @property {boolean} [noShrink] Do not scale PDF pages which are larger than the paper. * By default, pages larger than the paper are shrunk to fit. * @property {boolean} [opi] Generate OPI comments for all images and forms which have OPI information. * @property {boolean} [optimizecolorspace] By default, bitmap images in the PDF pass through to the * output PostScript in their original color space, which produces predictable results. * This option converts RGB and CMYK images into Gray images if every pixel of the image has equal components. * This can fix problems when doing color separations of PDFs that contain embedded black and * white images encoded as RGB. * @property {boolean} [originalPageSizes] Set the paper size of each page to match * the size specified in the PDF file. * @property {boolean} [overprint] Enable overprinting. * @property {string} [ownerPassword] Owner password (for encrypted files). * @property {number} [paperHeight] Set the paper height, in points. * @property {('A3'|'A4'|'legal'|'letter'|'match')} [paperSize] Set the paper size to one of `A3`, `A4`, * `legal`, or `letter`. This can also be set to `match`, which will set the paper size * of each page to match the size specified in the PDF file. 
If none of the paperSize, * paperWidth, or paperHeight options are specified the default is to match the paper size. * @property {number} [paperWidth] Set the paper width, in points. * @property {boolean} [passfonts] By default, references to non-embedded 8-bit fonts * in the PDF file are substituted with the closest `Helvetica`, `Times-Roman`, or `Courier` font. * This option passes references to non-embedded fonts through to the PostScript file. * @property {boolean} [preload] Preload images and forms. * @property {boolean} [printVersionInfo] Print copyright and version information. * @property {('CMYK8'|'MONO8'|'RGB8')} [processColorFormat] Sets the process color format as it is used * during rasterization and transparency reduction. * * The default depends on the other settings: For `options.level1` the default is MONO8; for `options.level1Sep`, * `options.level2Sep`, `options.level3Sep`, or `options.overprint` the default is CMYK8; in all other * cases RGB8 is the default. * If `options.processColorProfile` is set then `options.processColorFormat` is inferred from the specified ICC profile. * @property {string} [processColorProfile] Sets the ICC profile that is assumed during * rasterization and transparency reduction. * @property {boolean} [quiet] Do not print any messages or errors. * @property {('always'|'never'|'whenneeded')} [rasterize] By default, pdfToPs rasterizes pages as needed, * for example, if they contain transparencies. To force rasterization, set `rasterize` to `always`. * Use this to eliminate fonts. * To prevent rasterization, set `rasterize` to `never`. * This may produce files that display incorrectly. * @property {number} [resolutionXYAxis] Specifies the X and Y resolution, in pixels per * inch of image files (or rasterized regions in vector output). The default is 300 PPI. * @property {string} [userPassword] User password (for encrypted files). 
*/ /** * @typedef PdfToTextOptions * @property {boolean} [boundingBoxXhtml] Generate an XHTML file containing bounding * box information for each word in the file. * @property {boolean} [boundingBoxXhtmlLayout] Generate an XHTML file containing * bounding box information for each block, line, and word in the file. * @property {boolean} [cropBox] Use the crop box rather than the media box with * `options.boundingBoxXhtml` and `options.boundingBoxXhtmlLayout`. * @property {number} [cropHeight] Specifies the height of crop area in pixels * (image output) or points (vector output). * @property {number} [cropWidth] Specifies the width of crop area in pixels * (image output) or points (vector output). * @property {number} [cropXAxis] Specifies the x-coordinate of the crop area top left * corner in pixels (image output) or points (vector output). * @property {number} [cropYAxis] Specifies the y-coordinate of the crop area top left * corner in pixels (image output) or points (vector output). * @property {('dos'|'mac'|'unix')} [eolConvention] Sets the end-of-line convention to use for * text output: dos; mac; unix. * @property {number} [firstPageToConvert] Specifies the first page to convert. * @property {number} [fixedWidthLayout] Assume fixed-pitch (or tabular) text, with the * specified character width (in points). This forces physical layout mode. * @property {boolean} [generateHtmlMetaFile] Generate simple HTML file, including the * meta information. This simply wraps the text in `<pre>` and `</pre>` and prepends the meta headers. * @property {boolean} [generateTsvFile] Generate a TSV file containing the bounding box * information for each block, line, and word in the file. * @property {number} [lastPageToConvert] Specifies the last page to convert. * @property {boolean} [listEncodingOptions] List the available encodings. * @property {boolean} [maintainLayout] Maintain (as best as possible) the original physical * layout of the text. 
The default is to undo physical layout (columns, hyphenation, etc.) and * output the text in reading order. * @property {boolean} [noDiagonalText] Discard diagonal text. * @property {boolean} [noPageBreaks] Do not insert page breaks (form feed characters) * between pages. * @property {string} [outputEncoding] Sets the encoding to use for text output. * This defaults to `UTF-8`. * @property {string} [ownerPassword] Owner password (for encrypted files). * @property {boolean} [printVersionInfo] Print copyright and version information. * @property {boolean} [quiet] Do not print any messages or errors. * @property {boolean} [rawLayout] Keep the text in content stream order. This is a * hack which often undoes column formatting, etc. Use of raw mode is no longer recommended. * @property {string} [userPassword] User password (for encrypted files). */ /** * @typedef PdfUniteOptions * @property {boolean} [printVersionInfo] Print copyright and version information. */ /** * @typedef {object} PopplerExtraOptions * @property {AbortSignal} [signal] An `AbortSignal` that can be used to cancel the operation. */ /** * @author Frazer Smith * @description Executes a Poppler binary with the provided arguments and file input. * @ignore * @param {string} binary - Path to the binary to execute. * @param {string[]} args - Array of CLI arguments to pass to the binary. * @param {Buffer|string} [file] - File input (Buffer or path). * @param {object} [options] - Object containing execution options. * @param {boolean} [options.binaryOutput] - Set binary encoding for stdout. * @param {boolean} [options.ignoreExitCode] - If true, resolve based on stdout presence regardless of exit code. * @param {boolean} [options.preserveWhitespace] - If true, preserves leading and trailing whitespace in the output. * @param {AbortSignal} [options.signal] - An `AbortSignal` that can be used to cancel the operation. * @returns {Promise<string>} A promise that resolves with stdout, or rejects with an Error. 
*/ function execBinary(binary, args, file, options = {}) { return new Promise((resolve, reject) => { const child = spawn(binary, args, { signal: options.signal }); if (options.binaryOutput) { child.stdout.setEncoding("binary"); } if (Buffer.isBuffer(file)) { child.stdin.write(file); child.stdin.end(); } let stdOut = ""; let stdErr = ""; let errorHandled = false; child.stdout.on("data", (data) => { stdOut += data; }); child.stderr.on("data", (data) => { stdErr += data; }); child.on("error", (err) => { errorHandled = true; reject(err); }); child.on("close", (code) => { // If an error was already emitted, don't process the close event if (errorHandled) { return; } // For binaries without reliable exit codes, resolve based on stdout presence if (options.ignoreExitCode) { if (stdOut !== "") { resolve( options.preserveWhitespace ? stdOut : stdOut.trim() ); } else { reject(new Error(stdErr.trim())); } return; } if (stdOut !== "") { resolve(options.preserveWhitespace ? stdOut : stdOut.trim()); } else if (code === 0) { resolve(ERROR_MSGS[code]); } else if (stdErr !== "") { reject(new Error(stdErr.trim())); } else { reject( new Error( ERROR_MSGS[code ?? -1] || `${basename(binary)} ${args.join(" ")} exited with code ${code}` ) ); } }); }); } /** * @author Frazer Smith * @description Checks each option provided is valid, of the correct type, and can be used by specified * version of binary. * @ignore * @param {PopplerAcceptedOptions} acceptedOptions - Object containing accepted options. * @param {Record<string, any>} options - Object containing options to pass to binary. * @param {string} [version] - Version of binary. * @returns {string[]} Array of CLI arguments. * @throws {Error} If invalid arguments provided. 
*/ function parseOptions(acceptedOptions, options, version) { /** @type {string[]} */ const args = []; /** @type {string[]} */ const invalidArgs = []; // Imperative loops are faster than functional loops, see https://romgrk.com/posts/optimizing-javascript const keys = Object.keys(options); const keysLength = keys.length; for (let i = 0; i < keysLength; i += 1) { const key = keys[i]; if (!Object.hasOwn(acceptedOptions, key)) { invalidArgs.push(`Invalid option provided '${key}'`); continue; } // @ts-ignore: Keys are from options, TS cannot infer this const option = options[key]; const acceptedOption = acceptedOptions[key]; if (acceptedOption.type === typeof option) { // Skip boolean options if false if (acceptedOption.type !== "boolean" || option) { // Arg will be empty for some non-standard options if (acceptedOption.arg !== "") { args.push(acceptedOption.arg); } if (typeof option !== "boolean") { args.push(option); } } } else { invalidArgs.push( `Invalid value type provided for option '${key}', expected ${ acceptedOption.type } but received ${typeof option}` ); } if ( acceptedOption.minVersion && version && lt(version, acceptedOption.minVersion, { loose: true }) ) { invalidArgs.push( `Invalid option provided for the current version of the binary used. '${key}' was introduced in v${acceptedOption.minVersion}, but received v${version}` ); } } if (invalidArgs.length === 0) { return args; } throw new Error(invalidArgs.join("; ")); } class Poppler { #popplerPath; #pdfAttachBin; #pdfDetachBin; #pdfFontsBin; #pdfImagesBin; #pdfInfoBin; #pdfSeparateBin; #pdfToCairoBin; #pdfToHtmlBin; #pdfToPpmBin; #pdfToPsBin; #pdfToTextBin; #pdfUniteBin; #binVersions = new Map(); #acceptedOptions = new Map(); /** * @param {string} [binPath] - Path of poppler-utils binaries. * If not provided, the constructor will attempt to find the Poppler `pdfinfo` binary * in the PATH environment variable and use that as the path for all binaries. 
* For `win32` the binaries are bundled with the package and will be used * if a local installation is not found. * @throws {Error} If the Poppler binaries cannot be found. */ constructor(binPath) { this.#popplerPath = ""; /* istanbul ignore else: requires specific OS */ if (binPath) { /** @type {string|undefined} */ this.#popplerPath = binPath; } else { const which = spawnSync(platform === "win32" ? "where" : "which", [ "pdfinfo", ]).stdout.toString(); const popplerPath = PDF_INFO_PATH_REG.exec(which)?.[1]; if (popplerPath) { this.#popplerPath = popplerPath; } if (platform === "win32" && !popplerPath) { try { // @ts-ignore: Optional dependency // eslint-disable-next-line n/global-require -- Conditional require this.#popplerPath = require("node-poppler-win32"); } catch { // Leave #popplerPath empty; the generic "Unable to find ... binaries" error below will fire } } } if (!this.#popplerPath) { throw new Error( `Unable to find ${platform} Poppler binaries, please pass the installation directory as a parameter to the Poppler instance.` ); } this.#popplerPath = normalize(this.#popplerPath); this.#pdfAttachBin = pathResolve(this.#popplerPath, "pdfattach"); this.#pdfDetachBin = pathResolve(this.#popplerPath, "pdfdetach"); this.#pdfFontsBin = pathResolve(this.#popplerPath, "pdffonts"); this.#pdfImagesBin = pathResolve(this.#popplerPath, "pdfimages"); this.#pdfInfoBin = pathResolve(this.#popplerPath, "pdfinfo"); this.#pdfSeparateBin = pathResolve(this.#popplerPath, "pdfseparate"); this.#pdfToCairoBin = pathResolve(this.#popplerPath, "pdftocairo"); this.#pdfToHtmlBin = pathResolve(this.#popplerPath, "pdftohtml"); this.#pdfToPpmBin = pathResolve(this.#popplerPath, "pdftoppm"); this.#pdfToPsBin = pathResolve(this.#popplerPath, "pdftops"); this.#pdfToTextBin = pathResolve(this.#popplerPath, "pdftotext"); this.#pdfUniteBin = pathResolve(this.#popplerPath, "pdfunite"); } /** * @description Returns the path of the Poppler binaries. * @returns {string} Path of Poppler binaries. 
*/ get path() { return this.#popplerPath; } /** * @author Frazer Smith * @description Returns the version of the specified Poppler binary. * @param {string} binary - The Poppler binary to get the version of. * @returns {Promise<string>} A promise that resolves with the version of the binary, or rejects with an `Error` object. */ async #getVersion(binary) { if (!this.#binVersions.has(binary)) { const { stderr } = await execFileAsync(binary, ["-v"]); // @ts-ignore: parseOptions checks if falsy const version = POPPLER_VERSION_REG.exec(stderr)[1]; this.#binVersions.set(binary, version); } return this.#binVersions.get(binary); } /** * @author Frazer Smith * @description Returns the accepted options for the specified Poppler binary function. * @param {string} functionName - The name of the Poppler binary function. * @returns {PopplerAcceptedOptions} An object containing the accepted options of the specified function. */ #getAcceptedOptions(functionName) { if (!this.#acceptedOptions.has(functionName)) { switch (functionName) { case "pdfAttach": this.#acceptedOptions.set( "pdfAttach", freeze({ printVersionInfo: { arg: "-v", type: "boolean" }, replace: { arg: "-replace", type: "boolean" }, }) ); break; case "pdfDetach": this.#acceptedOptions.set( "pdfDetach", freeze({ listEmbedded: { arg: "-list", type: "boolean" }, outputEncoding: { arg: "-enc", type: "string" }, outputPath: { arg: "-o", type: "string" }, ownerPassword: { arg: "-opw", type: "string" }, printVersionInfo: { arg: "-v", type: "boolean" }, saveAllFiles: { arg: "-saveall", type: "boolean" }, saveFile: { arg: "-savefile", type: "string", minVersion: "0.86.0", }, saveSpecificFile: { arg: "-save", type: "number" }, userPassword: { arg: "-upw", type: "string" }, }) ); break; case "pdfFonts": this.#acceptedOptions.set( "pdfFonts", freeze({ firstPageToExamine: { arg: "-f", type: "number" }, lastPageToExamine: { arg: "-l", type: "number" }, listSubstitutes: { arg: "-subst", type: "boolean" }, ownerPassword: { arg: 
"-opw", type: "string" }, printVersionInfo: { arg: "-v", type: "boolean" }, userPassword: { arg: "-upw", type: "string" }, }) ); break; case "pdfImages": this.#acceptedOptions.set( "pdfImages", freeze({ allFiles: { arg: "-all", type: "boolean" }, ccittFile: { arg: "-ccitt", type: "boolean" }, firstPageToConvert: { arg: "-f", type: "number" }, lastPageToConvert: { arg: "-l", type: "number" }, jbig2File: { arg: "-jbig2", type: "boolean" }, jpeg2000File: { arg: "-jp2", type: "boolean" }, jpegFile: { arg: "-j", type: "boolean" }, list: { arg: "-list", type: "boolean" }, ownerPassword: { arg: "-opw", type: "string" }, pngFile: { arg: "-png", type: "boolean" }, printVersionInfo: { arg: "-v", type: "boolean" }, tiffFile: { arg: "-tiff", type: "boolean" }, userPassword: { arg: "-upw", type: "string" }, }) ); break; case "pdfInfo": this.#acceptedOptions.set( "pdfInfo", freeze({ firstPageToConvert: { arg: "-f", type: "number" }, lastPageToConvert: { arg: "-l", type: "number" }, listEncodingOptions: { arg: "-listenc", type: "boolean", }, outputEncoding: { arg: "-enc", type: "string" }, ownerPassword: { arg: "-opw", type: "string" }, printAsJson: { arg: "", type: "boolean" }, printBoundingBoxes: { arg: "-box", type: "boolean", }, printDocStruct: { arg: "-struct", type: "boolean" }, printDocStructText: { arg: "-struct-text", type: "boolean", }, printIsoDates: { arg: "-isodates", type: "boolean", }, printJS: { arg: "-js", type: "boolean" }, printMetadata: { arg: "-meta", type: "boolean" }, printNamedDests: { arg: "-dests", type: "boolean" }, printRawDates: { arg: "-rawdates", type: "boolean", }, printUrls: { arg: "-url", type: "boolean", minVersion: "21.11.0", }, printVersionInfo: { arg: "-v", type: "boolean" }, userPassword: { arg: "-upw", type: "string" }, }) ); break; case "pdfSeparate": this.#acceptedOptions.set( "pdfSeparate", freeze({ firstPageToExtract: { arg: "-f", type: "number" }, lastPageToExtract: { arg: "-l", type: "number" }, printVersionInfo: { arg: "-v", type: 
"boolean" }, }) ); break; case "pdfToCairo": this.#acceptedOptions.set( "pdfToCairo", freeze({ antialias: { arg: "-antialias", type: "string" }, cropBox: { arg: "-cropbox", type: "boolean" }, cropHeight: { arg: "-H", type: "number" }, cropSize: { arg: "-sz", type: "number" }, cropWidth: { arg: "-W", type: "number" }, cropXAxis: { arg: "-x", type: "number" }, cropYAxis: { arg: "-y", type: "number" }, duplex: { arg: "-duplex", type: "boolean" }, epsFile: { arg: "-eps", type: "boolean" }, evenPagesOnly: { arg: "-e", type: "boolean" }, fillPage: { arg: "-expand", type: "boolean" }, firstPageToConvert: { arg: "-f", type: "number" }, grayscaleFile: { arg: "-gray", type: "boolean" }, iccFile: { arg: "-icc", type: "string" }, jpegFile: { arg: "-jpeg", type: "boolean" }, jpegOptions: { arg: "-jpegopt", type: "string" }, lastPageToConvert: { arg: "-l", type: "number" }, monochromeFile: { arg: "-mono", type: "boolean" }, noCenter: { arg: "-nocenter", type: "boolean" }, noCrop: { arg: "-nocrop", type: "boolean" }, noShrink: { arg: "-noshrink", type: "boolean" }, oddPagesOnly: { arg: "-o", type: "boolean" }, originalPageSizes: { arg: "-origpagesizes", type: "boolean", }, ownerPassword: { arg: "-opw", type: "string" }, paperHeight: { arg: "-paperh", type: "number" }, paperSize: { arg: "-paper", type: "string" }, paperWidth: { arg: "-paperw", type: "number" }, pdfFile: { arg: "-pdf", type: "boolean" }, pngFile: { arg: "-png", type: "boolean" }, printDocStruct: { arg: "-struct", type: "boolean", minVersion: "23.11.0", }, printVersionInfo: { arg: "-v", type: "boolean" }, psFile: { arg: "-ps", type: "boolean" }, psLevel2: { arg: "-level2", type: "boolean" }, psLevel3: { arg: "-level3", type: "boolean" }, quiet: { arg: "-q", type: "boolean" }, resolutionXAxis: { arg: "-rx", type: "number" }, resolutionXYAxis: { arg: "-r", type: "number" }, resolutionYAxis: { arg: "-ry", type: "number" }, scalePageTo: { arg: "-scale-to", type: "number" }, scalePageToXAxis: { arg: "-scale-to-x", type: 
"number", }, scalePageToYAxis: { arg: "-scale-to-y", type: "number", }, singleFile: { arg: "-singlefile", type: "boolean" }, svgFile: { arg: "-svg", type: "boolean" }, tiffCompression: { arg: "-tiffcompression", type: "string", }, tiffFile: { arg: "-tiff", type: "boolean" }, transparentPageColor: { arg: "-transp", type: "boolean", }, userPassword: { arg: "-upw", type: "string" }, }) ); break; case "pdfToHtml": this.#acceptedOptions.set( "pdfToHtml", freeze({ complexOutput: { arg: "-c", type: "boolean" }, dataUrls: { arg: "-dataurls", type: "boolean", minVersion: "0.75.0", }, exchangePdfLinks: { arg: "-p", type: "boolean" }, extractHidden: { arg: "-hidden", type: "boolean" }, firstPageToConvert: { arg: "-f", type: "number" }, fontFullName: { arg: "-fontfullname", type: "boolean", }, ignoreImages: { arg: "-i", type: "boolean" }, imageFormat: { arg: "-fmt", type: "string" }, lastPageToConvert: { arg: "-l", type: "number" }, noDrm: { arg: "-nodrm", type: "boolean" }, noFrames: { arg: "-noframes", type: "boolean" }, noMergeParagraph: { arg: "-nomerge", type: "boolean", }, noRoundedCoordinates: { arg: "-noroundcoord", type: "boolean", }, outputEncoding: { arg: "-enc", type: "string" }, ownerPassword: { arg: "-opw", type: "string" }, printVersionInfo: { arg: "-v", type: "boolean" }, quiet: { arg: "-q", type: "boolean" }, singlePage: { arg: "-s", type: "boolean" }, stdout: { arg: "-stdout", type: "boolean" }, userPassword: { arg: "-upw", type: "string" }, wordBreakThreshold: { arg: "-wbt", type: "number" }, xmlOutput: { arg: "-xml", type: "boolean" }, zoom: { arg: "-zoom", type: "number" }, }) ); break; case "pdfToPpm": this.#acceptedOptions.set( "pdfToPpm", freeze({ antialiasFonts: { arg: "-aa", type: "string" }, antialiasVectors: { arg: "-aaVector", type: "string", }, cropBox: { arg: "-cropbox", type: "boolean" }, cropHeight: { arg: "-H", type: "number" }, cropSize: { arg: "-sz", type: "number" }, cropWidth: { arg: "-W", type: "number" }, cropXAxis: { arg: "-x", type: 
"number" }, cropYAxis: { arg: "-y", type: "number" }, defaultCmykProfile: { arg: "-defaultcmykprofile", type: "string", minVersion: "21.01.0", }, defaultGrayProfile: { arg: "-defaultgrayprofile", type: "string", minVersion: "21.01.0", }, defaultRgbProfile: { arg: "-defaultrgbprofile", type: "string", minVersion: "21.01.0", }, displayProfile: { arg: "-displayprofile", type: "string", minVersio