pdf-powerpoint
Version:
Converts one or more PDFs into a powerpoint with one pdf page per slide
574 lines (520 loc) • 19.3 kB
JavaScript
'use strict'
var EventEmitter2 = require('eventemitter2').EventEmitter2
var exec = require('child_process').exec
var spawn = require('child_process').spawn
var fs = require('fs')
var os = require('os')
var path = require('path')
const _ = require('lodash')
var officegen = require('officegen')
var Promise = require('bluebird')
var rmdir = require('rimraf')
// Loggers
var debug = require('debug')
var pdfLogger = debug('pdfppt:app')
var debugLogger = debug('pdfppt:debug')
pdfLogger.log = console.log.bind(console)
debugLogger.log = console.log.bind(console)
/** allows the caller to provide a path to the Inkscape Executable */
const INKSCAPE_PATH = process.env.PDF_PPT_INKSCAPE_PATH
/** allows the caller to provide a path to the MuPdf Executable */
const MUPDF_PATH = process.env.PDF_PPT_MUPDF_PATH
/** allows the caller to provide a path to the GhostScript Executable */
const GS_PATH = process.env.PDF_PPT_GSPATH
/** allows the caller to provide a path to the ImageMagick Executable */
const IM_PATH = process.env.PDF_PPT_IMPATH
let gsExecutable
let mupdfExecutable
let inkScapeExecutable
let imageMagickConvert
/** Going above 300 has a significant impact on performance
* without much noticeable quality improvement */
const DPI_DENSITY_MAX = 300
/** 72 is the default (because of long-time standard) */
const DPI_DENSITY_DEFAULT = 72
const defaultOptions = {
clean: true,
cropLastImage: false,
dimensions: { width: 800, height: 600, type: 'screen4x3' },
jobId: ''
}
class Powerpoint extends EventEmitter2 {
/**
*
* @param options
* @param options.clean {boolean=true} set to false if intermediate image
* files should be left on the filesystem.
* @param options.jobId {String} if provided, this will be included in any
* logging output
* @param {boolean} [options.cropLastImage=false] requires ImageMagick
* `convert` to be on the path. Will crop the last pdf image before
* placing on slide.
* @param {Number} [options.dimensions.width=800] of slides in pixels
* @param {Number} [options.dimensions.height=600] of slides in pixels
* @param {String} [options.dimensions.type=screen4x3] '35mm' 'A3' 'A4'
* 'B4ISO' 'B4JIS' 'B5ISO' 'B5JIS' 'banner' 'custom' 'hagakiCard' 'ledger'
* 'letter' 'overhead' 'screen16x10' 'screen16x9' 'screen4x3'
*
*/
constructor (options) {
super({
wildcard: true, // Allow clients to listen based on wildcard expressions
maxListeners: 10 // Node Default, set to 0 for Infinity
})
this._setExecutables()
this.options = _.merge({}, defaultOptions, options)
process.nextTick(() => {
this.emit('options', { output: JSON.stringify(this.options) })
})
}
/**
* Inspects any env property that may have been set, and changes the default
* based on operating system if not set.
* @private
*/
_setExecutables () {
const p = os.platform()
// Linux
gsExecutable = GS_PATH || 'gs'
mupdfExecutable = MUPDF_PATH || 'mudraw'
inkScapeExecutable = INKSCAPE_PATH || 'inkscape'
imageMagickConvert = IM_PATH || 'convert'
// Windows
if (p === 'win32') {
gsExecutable = GS_PATH || 'gswin32c.exe'
mupdfExecutable = MUPDF_PATH || 'mutool.exe draw'
imageMagickConvert = IM_PATH || 'convert.exe'
inkScapeExecutable = INKSCAPE_PATH || 'inkscape'
}
// Mac OS
if (p === 'darwin') {
mupdfExecutable = MUPDF_PATH || 'mutool draw'
}
}
/**
*
* @param pdfFiles {array|string} An array of PDF files that should be
* converted
* @param options
* @param options.stagingDir - A directory where intermediate PNG images will
* be placed when converting into slides. A different folder should be
* used for each conversion. If undefined, a random directory will be
* created under the systems temp directory. It will be deleted once the
* job has completed.
*
* @param options.convertOptions - ImageMagick conversion options (minus the
* -) Currently supported: density(<300)
* @param done
*/
convertPDFToPowerpoint (pdfFiles, options, done) {
let callback
let opts = {}
// TODO: Test this
if (typeof arguments[1] === 'object') {
opts = options
callback = done
} else {
callback = arguments[1]
}
// pdfLogger('options:', opts)
this._getStagingDirectory(opts.stagingDir)
.then((outputDir) => {
this.imgDir = path.resolve(outputDir, 'img')
this.pdfDir = path.resolve(outputDir, 'pdf')
let convertPromise
if (this.options.engine === 'inkscape') {
convertPromise = this._convertWithInkscape(outputDir, pdfFiles, options)
} else if (this.options.engine === 'mupdf') {
convertPromise = this._convertWithMuPDF(outputDir, pdfFiles, options)
} else {
convertPromise = this._convertWithGhostScript(outputDir, pdfFiles, options)
}
convertPromise.then(sortedImages => {
this._createPowerpoint(outputDir, sortedImages, callback)
})
}, (err) => callback(err))
.catch(err => callback(err))
}
_convertWithInkscape (outputDir, pdfFiles, options) {
// Split the PDFs
const splitTasks = this._getSplitTasks(pdfFiles)
return Promise.all(splitTasks).then(() => {
return this._readPdfDirectory().then(singlePagePdfFiles => {
const sortedSinglePDFs = this._sortPages(singlePagePdfFiles)
return this._executeInkscape(sortedSinglePDFs, options)
})
})
}
_executeInkscape (sortedSinglePDFs, options) {
const sortedImages = []
const co = this._getConvertOptions(options)
const commands = _.map(sortedSinglePDFs, (pdfFile) => {
const pngFile = `${path.basename(pdfFile, '.pdf')}.png`
const pngPath = path.join(this.imgDir, pngFile)
sortedImages.push(pngPath)
return `-d ${co.density} --export-png=${pngPath} ${pdfFile}`
})
const inkTasks = this._getInkscapeExportTasks(commands)
const start = this.nowInMillis()
return Promise.all(inkTasks).then(() => {
this.emit('done.inkscape.export.all', {time: this.elapsed(start)})
return Promise.resolve(sortedImages)
})
// return this._spawnInkscapeShell(sortedImages, commands).then(() => sortedImages)
}
_getInkscapeExportTasks (commands) {
return commands.map((cmd) => {
return new Promise((resolve, reject) => {
const fullCmd = `${inkScapeExecutable} ${cmd}`
// Including timings here isn't useful because these promises run concurrently
exec(fullCmd, (err, stdout, stderr) => {
this.emit('done.inkscape.export', { output: fullCmd, error: err })
if (err) {
reject(err)
}
resolve()
})
})
})
}
/**
* Executes inkscape export commands in a single reusable shell.
* In theory this should be more efficient, but it runs serially and ends
* up taking a lot longer.
*
* @param {Array} commands inkscape export commands
* @returns {Promise}
* @private
*/
_spawnInkscapeShell (commands) {
return new Promise((resolve, reject) => {
const inkProc = spawn(inkScapeExecutable, ['--shell'])
inkProc.stdout.on('data', d => {
// Each export shell command writes 3 lines to stdout, this is all we have to
// signal that a single conversion was completed. The `Bitmap saved` line is the
// most contextual (and last) line, so that is what is logged
const msg = d.toString()
if (_.startsWith(msg, 'Bitmap saved as:')) {
this.emit('done.inkscape.export',
{output: msg, time: this.elapsed(inkCmdStart)})
inkCmdStart = this.nowInMillis()
}
})
inkProc.on('error', e => {
pdfLogger('Inkscape conversion failed:', e)
reject(e)
})
inkProc.on('exit', () => {
this.emit('done.inkscape.export.all', {time: this.elapsed(inkExecStart)})
resolve()
})
// Run all the conversions in the shell
const inkShellCmd = _.join(commands, ' \n') + ' \nquit\n'
pdfLogger('Inkscape shell commands:', inkShellCmd)
let inkExecStart = this.nowInMillis()
let inkCmdStart = inkExecStart
inkProc.stdin.write(inkShellCmd)
})
}
/**
* Reads the contents of the staging directory that contains the PDF
* files after they are split using pdfseparate. This is because we have
* no idea how many files there will be, and they need to be sorted properly.
*
* @returns {Promise} fulfilled with list of filenames
*/
_readPdfDirectory () {
return new Promise((resolve, reject) => {
fs.readdir(this.pdfDir, (err, files) => {
if (err) {
reject(err)
}
resolve(files.map(f => `${this.pdfDir}/${f}`))
})
})
}
_getSplitTasks (pdfFiles) {
return pdfFiles.map((pdfPath, pdfIndex) => {
return new Promise((resolve, reject) => {
const splitCmd = `pdfseparate ${pdfPath} ${this.pdfDir}/pdf-${pdfIndex}-%d.pdf`
const splitStart = this.nowInMillis()
exec(splitCmd, (err, stdout, stderr) => {
this.emit('done.pdf.separate', { output: splitCmd, time: this.elapsed(splitStart), error: err })
if (err) {
reject(err)
}
resolve()
})
})
})
}
/**
* GhostScript can be invoked directly, since ImageMagick just delegates to it
*
* @param outputDir
* @param pdfFiles
* @param options
* @private
*/
_convertWithGhostScript (outputDir, pdfFiles, options) {
const start = this.nowInMillis()
const co = this._getConvertOptions(options)
const gsCmdRoot = `"${gsExecutable}" -q -dQUIET -dSAFER -sDEVICE=pngalpha -dMaxBitmap=500000000 -r${co.density}`
// Get the image files for each PDF
let gsErr = []
let tasks = pdfFiles.map((pdfPath, pdfIndex) => {
return new Promise((resolve) => {
const imgPrefix = `img-${pdfIndex}-`
const gsCmd = gsCmdRoot + ` -o "${this.imgDir}/${imgPrefix}%d.png" "${pdfPath}"`
const gsStart = this.nowInMillis()
exec(gsCmd, (err, stdout, stderr) => {
this.emit('done.gs.convert', { output: gsCmd, time: this.elapsed(gsStart), error: err })
if (err) {
gsErr.push(err)
}
resolve()
})
})
})
return this.processImgConversionTasks(tasks, gsErr, start)
}
/**
* @param {Array} tasks list of promises
* @param {Array} errors
* @param {Number} startedAt timestamp of when conversion started
* @returns {Promise.<String[]>} sorted images ready for pptx slides
*/
processImgConversionTasks (tasks, errors, startedAt) {
return Promise.all(tasks).then(() => {
if (!_.isEmpty(errors)) {
this.emit('err.png.all', {error: errors, time: this.elapsed(startedAt)})
return Promise.reject(errors)
} else {
const imagesFiles = fs.readdirSync(this.imgDir).map(f => `${this.imgDir}/${f}`)
const sortedImages = this._sortPages(imagesFiles)
this.emit('done.png.all', {output: sortedImages, time: this.elapsed(startedAt)})
return this._cropLastImages(sortedImages).then(() => sortedImages)
}
})
}
_convertWithMuPDF (outputDir, pdfFiles, options) {
const start = this.nowInMillis()
const co = this._getConvertOptions(options)
const cmdRoot = `${mupdfExecutable} -r ${co.density}`
// Get the image files for each PDF
let errors = []
let tasks = pdfFiles.map((pdfPath, pdfIndex) => {
return new Promise((resolve) => {
const imgPrefix = `img-${pdfIndex}-`
const cmd = cmdRoot + ` -o ${this.imgDir}/${imgPrefix}%d.png ${pdfPath}`
const muStart = this.nowInMillis()
exec(cmd, (err, stdout, stderr) => {
this.emit('done.mupdf.convert', { output: cmd, time: this.elapsed(muStart), error: err })
if (err) {
errors.push(err)
}
resolve()
})
})
})
return this.processImgConversionTasks(tasks, errors, start)
}
_createPowerpoint (outputDir, sortedImages, callback) {
var pptxOutput = path.resolve(outputDir, `output_${process.hrtime()[1]}.pptx`)
this._aggregateSlides(sortedImages, pptxOutput, this.imgDir, callback)
}
_getConvertOptions (options) {
const co = options.convertOptions || {}
const o = {}
/* Note: if the density is too low and there is a slide with a transparent background,
The image may show a horizontal line on the slide when it is rendered in the PPTX.
(was visible at 72, but not visible at 150)
*/
o.density = co.density ? Math.min(co.density, DPI_DENSITY_MAX) : DPI_DENSITY_DEFAULT
return o
}
/**
* Acts on image files in place.
*
* If the last page of a PDF did not represent a full page in the DOM it is
* possible for there to be a boundary that has a transparency level that
* results in a horizontal line being visible when the image is added to the
* powerpoint slide.
*
* This function crops this transparency and removes the line, by invoking
* ImageMagick
*
* @param sortedImages
*
* @returns {Promise} when all conversions have been done, and events emitted
* Any errors will be logged, but a rejection will not occur. It is better
* to get an output with a line than no output at all.
*
* @private
*/
_cropLastImages (sortedImages) {
if (!this.options.cropLastImage) {
return Promise.resolve()
}
const start = this.nowInMillis()
let convertErrors = []
const conversions = this._getLastImageFiles(sortedImages).map((img) => {
return new Promise((resolve) => {
/*
* -gravity south is needed for the chop instruction so we get the bottom edge
* border options are used to add a single pixel of transparency before trim autocrops the image. Without this the top is trimmed and we don't want that
* -trim leaves the partially transparent pixel at the bottom so we have to chop it off, but we know there is only one pixel after the trim.
*/
const command = `${imageMagickConvert} ${img} -gravity South -bordercolor none -border 1 -trim +repage -chop 0x1 ${img}`
var imStart = this.nowInMillis()
exec(command, (err, stdout, stderr) => {
this.emit('done.im.convert', { output: command, time: this.elapsed(imStart) })
if (err) {
convertErrors.push(err)
}
if (stderr) {
convertErrors.push(stderr)
}
resolve()
})
})
})
return Promise.all(conversions)
.then(() => {
return new Promise((resolve) => {
if (!_.isEmpty(convertErrors)) {
this.emit('err.im.convert', {error: convertErrors, time: this.elapsed(start)})
} else {
this.emit('done.im.convert.all', {time: this.elapsed(start)})
}
resolve()
})
})
}
/**
* @param sortedImages sorted by file and page/image number
* @returns {Array} of files from sortedImages that are the last image for
* each file
* @private
*/
_getLastImageFiles (sortedImages) {
let lastFile
return sortedImages.reduce((acc, val, i, arr) => {
const fileAndPage = /.*[pdf|img]-(\d*)-(\d*).*/.exec(val)
const file = fileAndPage[1]
const nextFile = lastFile !== undefined && file !== lastFile
const lastImage = i === arr.length - 1
const lastImages = [...acc]
if (nextFile) {
lastImages.push(arr[i - 1])
}
if (lastImage) {
lastImages.push(val)
}
lastFile = file
return lastImages
}, [])
}
/**
*
* @param {Array} images
* @param pptxOutput pptx file path
* @param imgDir the directory where png files are generated
* @param done callback
* @private
*/
_aggregateSlides (images, pptxOutput, imgDir, done) {
this._createSlides(images, pptxOutput, (slideErr, output) => {
done(slideErr, output)
if (this.options.clean) {
var start = this.nowInMillis()
rmdir(imgDir, (err) => {
if (err) {
this.emit('done.png.clean', {output: imgDir, time: this.elapsed(start), error: err})
pdfLogger(this.options.jobId, 'Could not delete working directory:', imgDir, err)
}
})
}
})
}
_createSlides (imageFiles, pptFile, done) {
var start = this.nowInMillis()
var pptx = officegen('pptx')
var d = this.options.dimensions
// https://github.com/Ziv-Barber/officegen/issues/112
pptx.setSlideSize(d.width, d.height, d.type)
this._addSlidesToPresentation(imageFiles, pptx)
this.emit('done.pptx.creation', {time: this.elapsed(start)})
this._savePresentationFile(pptFile, done, pptx)
}
_addSlidesToPresentation (imageFiles, pptx) {
// TODO: Need a callback here if this blocks too long
imageFiles.forEach(i => { pptx.makeNewSlide().addImage(i) })
}
_sortPages (imageFiles) {
// Example: /var/folders/dr/f1q4znd96xv8wp82y4cfgg700000gn/T/pdf_ppt_5tz0dw/img/img-5-10.png
// File = 5, Page = 10
const rex = /.*(img|pdf)-(\d*)-(\d*).*/
return imageFiles.sort((a, b) => {
let aGrps = rex.exec(a)
let bGrps = rex.exec(b)
// PDF File Sequence + Page Sequence Comparison
const fileGrp = 2
const pageGrp = 3
let fileComp = aGrps[fileGrp] - bGrps[fileGrp]
if (fileComp === 0) {
return aGrps[pageGrp] - bGrps[pageGrp]
}
return fileComp
})
}
_savePresentationFile (pptFile, done, pptx) {
var start = this.nowInMillis()
var out = fs.createWriteStream(pptFile)
out.on('close', () => {
this.emit('done.pptx.saved', {output: pptFile, time: this.elapsed(start)})
done(null, pptFile)
})
pptx.generate(out)
}
_getStagingDirectory (stagingDir) {
return new Promise((resolve, reject) => {
if (stagingDir) {
fs.stat(stagingDir, (err, s) => {
if (err || !s.isDirectory()) {
pdfLogger(this.options.jobId, 'staging directory:', stagingDir, 'does not exist, creating a new one')
return this._createTempStagingDirectory()
} else {
this._createImageDirectory(stagingDir, reject, resolve)
}
})
} else {
resolve(this._createTempStagingDirectory())
}
})
}
_createTempStagingDirectory () {
return new Promise((resolve, reject) => {
fs.mkdtemp(path.join(os.tmpdir(), 'pdf_ppt_'), (err, folder) => {
if (err) reject(err)
this._createImageDirectory(folder, reject, resolve)
})
})
}
_createImageDirectory (folder, reject, resolve) {
fs.mkdir(path.resolve(folder, 'img'), (err) => {
if (err) reject(err)
fs.mkdir(path.resolve(folder, 'pdf'), (err) => {
if (err) reject(err)
resolve(folder)
})
})
}
nowInMillis () {
return Date.now() // process.hrtime()[1] / 1000000
}
elapsed (start) {
return this.nowInMillis() - start
}
}
module.exports = Powerpoint