pdf2html
Version:
PDF to HTML or text conversion using Apache Tika. Also generates PDF thumbnails using Apache PDFBox.
40 lines (33 loc) • 1.39 kB
JavaScript
// lib/TikaWrapper.js
const debug = require('debug')('pdf2html');
const path = require('path');
const CommandExecutor = require('./CommandExecutor');
const { DEFAULT_OPTIONS } = require('./config');
const constants = require('../constants');
const FileManager = require('./FileManager');
/**
 * Apache Tika wrapper for content extraction.
 *
 * Each public method hands the input file to FileManager.withTempFile and,
 * inside that callback, runs the bundled Tika jar through `java -jar` using
 * CommandExecutor.execute.
 */
class TikaWrapper {
  /**
   * Run Tika on a file with the requested output format.
   *
   * @param {string} filepath - Input file, passed to FileManager.withTempFile.
   * @param {string} format - Tika output switch without the leading dashes
   *   (this class uses 'html', 'text' and 'json').
   * @param {object} [options] - Optional settings; `null` is treated the same
   *   as an omitted argument.
   * @param {number} [options.maxBuffer] - Passed to CommandExecutor.execute;
   *   falls back to DEFAULT_OPTIONS.command.maxBuffer when missing or falsy.
   * @returns {Promise<*>} Whatever FileManager.withTempFile resolves with
   *   (presumably the command output as a string, since extractMetadata feeds
   *   it to JSON.parse; confirm against CommandExecutor).
   */
  static async extract(filepath, format, options = {}) {
    // The default parameter only applies to `undefined`; an explicit `null`
    // forwarded by a wrapper would otherwise throw on property access below.
    const settings = options || {};

    return FileManager.withTempFile(filepath, constants.DIRECTORY.PDF, async (tempFilePath) => {
      const args = ['-jar', path.join(constants.DIRECTORY.VENDOR, constants.VENDOR_TIKA_JAR), `--${format}`, tempFilePath];
      const maxBuffer = settings.maxBuffer || DEFAULT_OPTIONS.command.maxBuffer;
      return CommandExecutor.execute('java', args, { maxBuffer });
    });
  }

  /**
   * Convert a PDF to HTML (Tika `--html`).
   *
   * @param {string} filepath - Input file.
   * @param {object} [options] - See {@link TikaWrapper.extract}.
   * @returns {Promise<*>} Result of {@link TikaWrapper.extract}.
   */
  static async extractHTML(filepath, options) {
    debug('Converting PDF to HTML');
    return this.extract(filepath, 'html', options);
  }

  /**
   * Convert a PDF to plain text (Tika `--text`).
   *
   * @param {string} filepath - Input file.
   * @param {object} [options] - See {@link TikaWrapper.extract}.
   * @returns {Promise<*>} Result of {@link TikaWrapper.extract}.
   */
  static async extractText(filepath, options) {
    debug('Converting PDF to Text');
    return this.extract(filepath, 'text', options);
  }

  /**
   * Extract metadata (Tika `--json`) and parse it.
   *
   * @param {string} filepath - Input file.
   * @param {object} [options] - See {@link TikaWrapper.extract}.
   * @returns {Promise<*>} The value produced by JSON.parse on Tika's output.
   * @throws {SyntaxError} When the output is not valid JSON.
   */
  static async extractMetadata(filepath, options) {
    debug('Extracting metadata from PDF');
    const jsonString = await this.extract(filepath, 'json', options);
    return JSON.parse(jsonString);
  }
}
// Export the class itself as the module's value; all of its methods are static.
module.exports = TikaWrapper;