UNPKG

pdf-officegen

Version:

Converts one or more PDFs into a powerpoint or word document with one pdf page per slide/page

418 lines (382 loc) 14.3 kB
const _ = require('lodash') const Promise = require('bluebird') const EventEmitter2 = require('eventemitter2').EventEmitter2 const fs = require('fs') const execFile = require('child_process').execFile const spawn = require('child_process').spawn const os = require('os') const path = require('path') const { info } = require('./logger') const { nowInMillis, elapsed, sortPages } = require('./util') /** allows the caller to provide a path to the Inkscape Executable */ const INKSCAPE_PATH = process.env.PDF_PPT_INKSCAPE_PATH /** allows the caller to provide a path to the MuPdf Executable */ const MUPDF_PATH = process.env.PDF_PPT_MUPDF_PATH /** allows the caller to provide a path to the GhostScript Executable */ const GS_PATH = process.env.PDF_PPT_GSPATH /** allows the caller to provide a path to the ImageMagick Executable */ const IM_PATH = process.env.PDF_PPT_IMPATH let imageMagickConvert let gsExecutable let mupdfExecutable let inkScapeExecutable /** Going above 300 has a significant impact on performance * without much noticeable quality improvement */ const DPI_DENSITY_MAX = 300 /** 72 is the default (because of long-time standard) */ const DPI_DENSITY_DEFAULT = 72 const isWindowsPlatform = p => p === 'win32' /** * Responsible for turning a PDF into a series of images. */ class Engine extends EventEmitter2 { /** * @param {String} [options.engine=ghostscript] options: mupdf, inkscape, ghostscript * @param {Boolean} [options.cropLastImage=false] * @param {Object} [options.convertOptions] ImageMagick conversion options * @param {Number} [options.convertOptions.density] must be < 300 */ constructor (options = {}) { super({ wildcard: true, // Allow clients to listen based on wildcard expressions maxListeners: 10 // Node Default, set to 0 for Infinity }) this._engine = options.engine this._cropLastImage = options.cropLastImage this._convertOptions = options.convertOptions || {} this._imgDir = options.imgDir this._pdfDir = options.pdfDir this._setExecutables() } /** * Inspects any env property that may have been set, and changes the default * based on operating system if not set. * @private */ _setExecutables () { const p = os.platform() // Linux gsExecutable = GS_PATH || 'gs' mupdfExecutable = MUPDF_PATH || 'mutool' inkScapeExecutable = INKSCAPE_PATH || 'inkscape' imageMagickConvert = IM_PATH || 'convert' // Windows if (isWindowsPlatform(p)) { gsExecutable = GS_PATH || 'gswin32c.exe' mupdfExecutable = MUPDF_PATH || 'mutool.exe' inkScapeExecutable = INKSCAPE_PATH || 'inkscape' imageMagickConvert = IM_PATH || 'convert.exe' } } /** * Start the conversion from PDF into an array of images. * @return {Promise} */ convert (outputDir, files) { let convertPromise if (this._engine === 'inkscape') { convertPromise = this._convertWithInkscape(files) } else if (this._engine === 'mupdf') { convertPromise = this._convertWithMuPDF(files) } else { convertPromise = this._convertWithGhostScript(files) } return convertPromise } /** * GhostScript can be invoked directly, since ImageMagick just delegates to it * * @param pdfFiles * @param options * @private */ _convertWithGhostScript (pdfFiles) { const start = nowInMillis() const co = this._getConvertOptions() const gsOpts = ['-q', '-dQUIET', '-dSAFER', '-sDEVICE=pngalpha', '-dMaxBitmap=500000000', `-r${co.density}`] // Get the image files for each PDF const gsErr = [] const tasks = pdfFiles.map((pdfPath, pdfIndex) => { return new Promise((resolve) => { const imgPrefix = `img-${pdfIndex}-` const args = [...gsOpts, `-o "${imgPrefix}%d.png"`, pdfPath] const gsStart = nowInMillis() execFile(gsExecutable, args, { cwd: this._imgDir }, (err, stdout, stderr) => { this.emit('done.gs.convert', { output: `${gsExecutable} ${args.join(' ')}`, time: elapsed(gsStart), error: err }) if (err) { gsErr.push(err) } resolve() }) }) }) return this._processImgConversionTasks(tasks, gsErr, start) } _convertWithInkscape (pdfFiles) { // Split the PDFs const splitTasks = this._getSplitTasks(pdfFiles) return Promise.all(splitTasks).then(() => { return this._readPdfDirectory().then(singlePagePdfFiles => { const sortedSinglePDFs = sortPages(singlePagePdfFiles) return this._executeInkscape(sortedSinglePDFs) }) }) } /** * Reads the contents of the staging directory that contains the PDF * files after they are split using pdfseparate. This is because we have * no idea how many files there will be, and they need to be sorted properly. * * @returns {Promise} fulfilled with list of filenames */ _readPdfDirectory () { return new Promise((resolve, reject) => { fs.readdir(this._pdfDir, (err, files) => { if (err) { reject(err) } resolve(files.map(f => `${this._pdfDir}/${f}`)) }) }) } _getSplitTasks (pdfFiles) { return pdfFiles.map((pdfPath, pdfIndex) => { return new Promise((resolve, reject) => { const splitStart = nowInMillis() const args = [pdfPath, `${this._pdfDir}/pdf-${pdfIndex}-%d.pdf`] execFile('pdfseparate', args, (err, stdout, stderr) => { this.emit('done.pdf.separate', { output: `pdfseparate ${args.join(' ')}`, time: elapsed(splitStart), error: err }) if (err) { reject(err) } resolve() }) }) }) } _executeInkscape (sortedSinglePDFs) { const sortedImages = [] const co = this._getConvertOptions() const commands = _.map(sortedSinglePDFs, (pdfFile) => { const pngFile = path.join(this._imgDir, `${path.basename(pdfFile, '.pdf')}.png`) sortedImages.push(pngFile) return [`-d ${co.density}`, `--export-png=${pngFile}`, pdfFile] }) const inkTasks = this._getInkscapeExportTasks(commands) const start = nowInMillis() return Promise.all(inkTasks).then(() => { this.emit('done.inkscape.export.all', { time: elapsed(start) }) return Promise.resolve(sortedImages) }) // return this._spawnInkscapeShell(sortedImages, commands).then(() => sortedImages) } /** * @param {Array<String[]>>} commands List of Inkscape args per invocation, to be passed to execFile */ _getInkscapeExportTasks (commands) { return commands.map(args => { return new Promise((resolve, reject) => { // Including timings here isn't useful because these promises run concurrently execFile(inkScapeExecutable, args, (err, stdout, stderr) => { this.emit('done.inkscape.export', { output: `${inkScapeExecutable} ${args.join(' ')}`, error: err }) if (err) { reject(err) } resolve() }) }) }) } /** * Executes inkscape export commands in a single reusable shell. * In theory this should be more efficient, but it runs serially and ends * up taking a lot longer. * * @param {Array} commands inkscape export commands * @returns {Promise} * @private */ _spawnInkscapeShell (commands) { return new Promise((resolve, reject) => { const inkProc = spawn(inkScapeExecutable, ['--shell']) inkProc.stdout.on('data', d => { // Each export shell command writes 3 lines to stdout, this is all we have to // signal that a single conversion was completed. The `Bitmap saved` line is the // most contextual (and last) line, so that is what is logged const msg = d.toString() if (_.startsWith(msg, 'Bitmap saved as:')) { this.emit('done.inkscape.export', { output: msg, time: elapsed(inkCmdStart) }) inkCmdStart = nowInMillis() } }) inkProc.on('error', e => { info('Inkscape conversion failed:', e) reject(e) }) inkProc.on('exit', () => { this.emit('done.inkscape.export.all', { time: elapsed(inkExecStart) }) resolve() }) // Run all the conversions in the shell const inkShellCmd = _.join(commands, ' \n') + ' \nquit\n' info('Inkscape shell commands:', inkShellCmd) const inkExecStart = nowInMillis() let inkCmdStart = inkExecStart inkProc.stdin.write(inkShellCmd) }) } /** * Converts an array of input files (PNG or PDF) into an array of image files * that will be placed on powerpoint slides. * * @param files An array of PDF or PNG files * @returns {Promise<String[]>} A list of all the PNG files needed for slides */ _convertWithMuPDF (files) { const start = nowInMillis() const co = this._getConvertOptions() // Get the image files for each PDF const errors = [] const tasks = files.map((filePath, fileIndex) => { return new Promise((resolve) => { const imgPrefix = `img-${fileIndex}-` if (filePath.endsWith('.png')) { // Already have a PNG, just copy it fs.copyFile(filePath, `${this._imgDir}/${imgPrefix}0.png`, (err) => { if (err) { errors.push(err) } resolve() }) } else { // Need to convert the PDF to PNG(s) const args = ['draw', `-r ${co.density}`, `-o ${imgPrefix}%d.png`, filePath] const muStart = nowInMillis() execFile(mupdfExecutable, args, { cwd: this._imgDir }, (err, stdout, stderr) => { this.emit('done.mupdf.convert', { output: `${mupdfExecutable} ${args.join(' ')}`, time: elapsed(muStart), error: err }) if (err) { errors.push(err) } resolve() }) } }) }) return this._processImgConversionTasks(tasks, errors, start) } /** * @param {Array} tasks list of promises * @param {Array} errors * @param {Number} startedAt timestamp of when conversion started * @returns {Promise.<String[]>} sorted images ready for pptx slides */ _processImgConversionTasks (tasks, errors, startedAt) { return Promise.all(tasks).then(() => { if (!_.isEmpty(errors)) { this.emit('err.png.all', { error: errors, time: elapsed(startedAt) }) return Promise.reject(errors) } else { const imagesFiles = fs.readdirSync(this._imgDir).map(f => `${this._imgDir}/${f}`) const sortedImages = sortPages(imagesFiles) this.emit('done.png.all', { output: sortedImages, time: elapsed(startedAt) }) return this._cropLastImages(sortedImages).then(() => sortedImages) } }) } _getConvertOptions () { const co = this._convertOptions const o = {} /* Note: if the density is too low and there is a slide with a transparent background, The image may show a horizontal line on the slide when it is rendered in the PPTX. (was visible at 72, but not visible at 150) */ o.density = co.density ? Math.min(co.density, DPI_DENSITY_MAX) : DPI_DENSITY_DEFAULT return o } /** * Acts on image files in place. * * If the last page of a PDF did not represent a full page in the DOM it is * possible for there to be a boundary that has a transparency level that * results in a horizontal line being visible when the image is added to the * powerpoint slide. * * This function crops this transparency and removes the line, by invoking * ImageMagick * * @param sortedImages * * @returns {Promise} when all conversions have been done, and events emitted * Any errors will be logged, but a rejection will not occur. It is better * to get an output with a line than no output at all. * * @private */ _cropLastImages (sortedImages) { if (!this._cropLastImage) { return Promise.resolve() } const start = nowInMillis() const convertErrors = [] const conversions = this._getLastImageFiles(sortedImages).map((img) => { return new Promise((resolve) => { /* * -gravity south is needed for the chop instruction so we get the bottom edge * * border options are used to add a single pixel of transparency before trim autocrops the image. * Without this the top is trimmed and we don't want that * * -trim leaves the partially transparent pixel at the bottom so we have * to chop it off, but we know there is only one pixel after the trim. */ const args = [`-gravity South`, `-bordercolor none`, `-border 1`, `-trim`, `+repage`, `-chop 0x1`, img] var imStart = nowInMillis() execFile(imageMagickConvert, args, (err, stdout, stderr) => { this.emit('done.im.convert', { output: `${imageMagickConvert} ${args.join(' ')}`, time: elapsed(imStart) }) if (err) { convertErrors.push(err) } if (stderr) { convertErrors.push(stderr) } resolve() }) }) }) return Promise.all(conversions) .then(() => { return new Promise((resolve) => { if (!_.isEmpty(convertErrors)) { this.emit('err.im.convert', { error: convertErrors, time: elapsed(start) }) } else { this.emit('done.im.convert.all', { time: elapsed(start) }) } resolve() }) }) } /** * @param sortedImages sorted by file and page/image number * @returns {Array} of files from sortedImages that are the last image for * each file * @private */ _getLastImageFiles (sortedImages) { let lastFile return sortedImages.reduce((acc, val, i, arr) => { const fileAndPage = /.*[pdf|img]-(\d*)-(\d*).*/.exec(val) const file = fileAndPage[1] const nextFile = lastFile !== undefined && file !== lastFile const lastImage = i === arr.length - 1 const lastImages = [...acc] if (nextFile) { lastImages.push(arr[i - 1]) } if (lastImage) { lastImages.push(val) } lastFile = file return lastImages }, []) } } module.exports = Engine