/*
 * pdf2json
 * Version:
 * PDF file parser that converts PDF binaries to JSON and text, powered by a fork of PDF.js ported to Node.js.
 * 401 lines (400 loc) • 18.3 kB
 * JavaScript
 */
"use strict";
/**
 * Compiler-emitted helper that runs a generator as an async function.
 *
 * Each value the generator yields is turned into a promise; its fulfilment is
 * fed back through next() and its rejection through throw(). The returned
 * promise resolves with the generator's return value and rejects with any
 * error the generator lets escape.
 *
 * @param thisArg     `this` used when invoking the generator function
 * @param _arguments  argument list for the generator function (may be omitted)
 * @param P           promise constructor to use; the global Promise when falsy
 * @param generator   generator function to drive
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    var PromiseCtor = P || Promise;
    // Wrap non-promise values so every yielded result can be chained with then().
    function toPromise(candidate) {
        if (candidate instanceof PromiseCtor) {
            return candidate;
        }
        return new PromiseCtor(function (done) { done(candidate); });
    }
    return new PromiseCtor(function (resolve, reject) {
        var iterator;
        // Settle the outer promise when the generator is done, otherwise wait
        // for the yielded value and resume.
        function advance(result) {
            if (result.done) {
                resolve(result.value);
                return;
            }
            toPromise(result.value).then(onFulfilled, onRejected);
        }
        function onFulfilled(value) {
            try {
                advance(iterator.next(value));
            }
            catch (err) {
                reject(err);
            }
        }
        function onRejected(reason) {
            try {
                advance(iterator["throw"](reason));
            }
            catch (err) {
                reject(err);
            }
        }
        iterator = generator.apply(thisArg, _arguments || []);
        advance(iterator.next());
    });
};
// Compiler-emitted helper that emulates a generator with a state machine.
// `body` is called repeatedly with the state object `_` and returns an
// [opcode, value] pair; step() interprets that pair. The object returned to the
// caller exposes next / throw / return, each of which feeds an opcode
// (0, 1, 2 respectively) into step().
// Opcode meanings that can be read off this file: 0 = next, 1 = throw,
// 2 = return, 3 = break (jump to label), 4 = yield, 6 = error caught by step(),
// 7 = endfinally. Opcode 5 hands control to an inner iterator `y`
// (presumably `yield*` delegation - confirm against the TypeScript emitter).
var __generator = (this && this.__generator) || function (thisArg, body) {
// _.label : index of the `case` at which body resumes
// _.sent(): returns the value passed in by the caller, or throws it when the resuming opcode has bit 0 set
// _.trys  : stack of active try regions (entries look like [start, catch, finally, end] labels - inferred from the comparisons below)
// _.ops   : opcodes parked while a finally region runs
// f = "currently executing" flag, y = delegated iterator, t = scratch, g = the returned iterator object
var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g;
return g = { next: verb(0), "throw": verb(1), "return": verb(2) }, typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
// Builds the iterator method that injects opcode n with the caller's value.
function verb(n) { return function (v) { return step([n, v]); }; }
function step(op) {
// Re-entrancy guard: body must not call back into the iterator while running.
if (f) throw new TypeError("Generator is already executing.");
// Loop until the state object is cleared (`_ = 0`) or a branch returns.
while (g && (g = 0, op[0] && (_ = 0)), _) try {
// While a delegated iterator is active, forward the operation to it first.
if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
// Delegation finished: continue with its final value.
if (y = 0, t) op = [op[0] & 2, t.value];
switch (op[0]) {
// next / throw: remember the op so _.sent() can deliver (or throw) its value.
case 0: case 1: t = op; break;
// yield: advance the label and hand the value to the caller.
case 4: _.label++; return { value: op[1], done: false };
// start delegating to op[1].
case 5: _.label++; y = op[1]; op = [0]; continue;
// endfinally: resume the opcode that was parked before the finally region.
case 7: op = _.ops.pop(); _.trys.pop(); continue;
default:
// No enclosing try region and the op is an error or return: finish the generator.
if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
// break to a label that stays inside the current try region (or there is none).
if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
// error raised before the catch label: jump to the catch handler.
if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
// otherwise run the finally region first and park the current op.
if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
if (t[2]) _.ops.pop();
_.trys.pop(); continue;
}
// Run the next slice of the compiled function body.
op = body.call(thisArg, _);
// Anything thrown by body (or by the delegate) becomes opcode 6.
} catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
// Bits 0/2 set (throw / delegate-error) rethrow; otherwise report completion.
if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
}
};
Object.defineProperty(exports, "__esModule", { value: true });
var nodeUtil = require("util");
var fs = require("fs");
var path_1 = require("path");
var p2jcliarg_js_1 = require("./p2jcliarg.js");
var pdfparser_js_1 = require("../../dist/pdfparser.js");
// Members of the parser module used by the CLI.
var ParserStream = pdfparser_js_1.default.ParserStream;
var StringifyStream = pdfparser_js_1.default.StringifyStream;
var pkInfo = pdfparser_js_1.default.pkInfo;
// Label passed to console.time / console.timeEnd and echoed at start-up.
var _PRO_TIMER = pdfparser_js_1.default._PARSER_SIG;

// Parsed command line; every flag below is "set" when its key is present.
var argv = p2jcliarg_js_1.yargs.argv;

// -v / -h: print the version or the help text and do nothing else.
var ONLY_SHOW_VERSION = "v" in argv;
var ONLY_SHOW_HELP = "h" in argv;

// -s lowers the verbosity level from 5 to 0.
var VERBOSITY_LEVEL = "s" in argv ? 0 : 5;

// -f: input directory or file.
var HAS_INPUT_DIR_OR_FILE = "f" in argv;
var INPUT_DIR_OR_FILE = argv.f;

// Optional extra outputs: -c => .content.txt, -t => .fields.json, -m => .merged.json.
var PROCESS_RAW_TEXT_CONTENT = "c" in argv;
var PROCESS_FIELDS_CONTENT = "t" in argv;
var PROCESS_MERGE_BROKEN_TEXT_BLOCKS = "m" in argv;

// -r: parse through the stream pipeline instead of loadPDF().
var PROCESS_WITH_STREAM = "r" in argv;
/**
 * Converts one PDF file to `<name>.json` and, depending on the CLI flags, to
 * the additional `.fields.json`, `.content.txt` and `.merged.json` outputs.
 *
 * Results are reported to the owning CLI object through `addResultCount()`.
 */
var PDFProcessor = /** @class */ (function () {
    /**
     * @param inputDir  directory that contains the PDF
     * @param inputFile file name of the PDF inside inputDir
     * @param curCLI    owning CLI; only addResultCount() is called on it here
     */
    function PDFProcessor(inputDir, inputFile, curCLI) {
        this.inputDir = path_1.default.normalize(inputDir);
        this.inputFile = inputFile;
        this.inputPath = path_1.default.join(this.inputDir, this.inputFile);
        // -o overrides the output directory; default is next to the input.
        this.outputDir = path_1.default.normalize(argv.o || inputDir);
        // Filled in by validateParams() once the input has been checked.
        this.outputFile = '';
        this.outputPath = '';
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        this.pdfParser = null;
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        this.curCLI = curCLI;
        this.getOutputFile = function () {
            return path_1.default.join(this.outputDir, this.outputFile);
        };
    }
    //private methods
    /**
     * Returns outputPath with its trailing ".json" swapped for newSuffix.
     * Only the extension at the end of the path is replaced, so a ".json"
     * appearing in a directory name is left alone.
     */
    PDFProcessor.prototype.getDerivedOutputPath = function (newSuffix) {
        var jsonExt = ".json";
        if (this.outputPath.slice(-jsonExt.length) === jsonExt) {
            return this.outputPath.slice(0, -jsonExt.length) + newSuffix;
        }
        // Unexpected shape: keep the historical first-match replacement.
        return this.outputPath.replace(jsonExt, newSuffix);
    };
    /**
     * Pipes one parser-provided stream into a derived output file.
     *
     * @param suffix          extension of the derived file (e.g. ".fields.json")
     * @param getSourceStream receives the parser and returns the stream to write
     * @param stringify       when true the stream goes through StringifyStream first
     * @returns promise settled by ParserStream.createOutputStream via the
     *          resolve/reject callbacks it is given
     */
    PDFProcessor.prototype.writeDerivedStream = function (suffix, getSourceStream, stringify) {
        var _this = this;
        return new Promise(function (resolve, reject) {
            if (!_this.pdfParser) {
                reject("PDFParser instance is not available.");
                return;
            }
            var outputStream = ParserStream.createOutputStream(_this.getDerivedOutputPath(suffix), resolve, reject);
            var source = getSourceStream(_this.pdfParser);
            if (stringify) {
                source = source.pipe(new StringifyStream());
            }
            source.pipe(outputStream);
        });
    };
    /** Writes `<name>.merged.json` from the parser's merged-text-blocks stream. */
    PDFProcessor.prototype.generateMergedTextBlocksStream = function () {
        return this.writeDerivedStream(".merged.json", function (parser) { return parser.getMergedTextBlocksStream(); }, true);
    };
    /** Writes `<name>.content.txt` from the parser's raw-text stream (not stringified). */
    PDFProcessor.prototype.generateRawTextContentStream = function () {
        return this.writeDerivedStream(".content.txt", function (parser) { return parser.getRawTextContentStream(); }, false);
    };
    /** Writes `<name>.fields.json` from the parser's field-types stream. */
    PDFProcessor.prototype.generateFieldsTypesStream = function () {
        return this.writeDerivedStream(".fields.json", function (parser) { return parser.getAllFieldsTypesStream(); }, true);
    };
    /**
     * Starts every optional output requested on the command line.
     * @returns Promise.allSettled result, one entry per requested output
     */
    PDFProcessor.prototype.processAdditionalStreams = function () {
        var outputTasks = [];
        if (PROCESS_FIELDS_CONTENT) {
            //needs to generate fields.json file
            outputTasks.push(this.generateFieldsTypesStream());
        }
        if (PROCESS_RAW_TEXT_CONTENT) {
            //needs to generate content.txt file
            outputTasks.push(this.generateRawTextContentStream());
        }
        if (PROCESS_MERGE_BROKEN_TEXT_BLOCKS) {
            //needs to generate json file with merged broken text blocks
            outputTasks.push(this.generateMergedTextBlocksStream());
        }
        return Promise.allSettled(outputTasks);
    };
    /** Counts the primary JSON as a success, then resolves with the optional outputs' results. */
    PDFProcessor.prototype.onPrimarySuccess = function (resolve, reject) {
        this.curCLI.addResultCount();
        this.processAdditionalStreams()
            .then(function (retVal) { return resolve(retVal); })
            .catch(function (err) { return reject(err); });
    };
    /** Counts the file as failed and rejects with the given error. */
    PDFProcessor.prototype.onPrimaryError = function (err, reject) {
        this.curCLI.addResultCount(err);
        reject(err);
    };
    /** Parses the input through the read-stream -> parser -> stringify -> file pipeline. */
    PDFProcessor.prototype.parseOnePDFStream = function () {
        var _this = this;
        return new Promise(function (resolve, reject) {
            _this.pdfParser = new pdfparser_js_1.default(null, PROCESS_RAW_TEXT_CONTENT);
            _this.pdfParser.on("pdfParser_dataError", function (evtData) {
                return _this.onPrimaryError(evtData.parserError, reject);
            });
            var outputStream = fs.createWriteStream(_this.outputPath);
            outputStream.on("finish", function () { return _this.onPrimarySuccess(resolve, reject); });
            outputStream.on("error", function (err) { return _this.onPrimaryError(err, reject); });
            console.info("Transcoding Stream ".concat(_this.inputFile, " to - ").concat(_this.outputPath));
            var inputStream = fs.createReadStream(_this.inputPath);
            // pipe() does not forward errors; without this handler a read
            // failure would surface as an unhandled 'error' event.
            inputStream.on("error", function (err) { return _this.onPrimaryError(err, reject); });
            inputStream
                .pipe(_this.pdfParser.createParserStream())
                .pipe(new StringifyStream())
                .pipe(outputStream);
        });
    };
    /** Parses the input with loadPDF() and writes the resulting data with fs.writeFile. */
    PDFProcessor.prototype.parseOnePDF = function () {
        var _this = this;
        return new Promise(function (resolve, reject) {
            _this.pdfParser = new pdfparser_js_1.default(null, PROCESS_RAW_TEXT_CONTENT);
            _this.pdfParser.on("pdfParser_dataError", function (evtData) {
                return _this.onPrimaryError(evtData.parserError, reject);
            });
            _this.pdfParser.on("pdfParser_dataReady", function (evtData) {
                fs.writeFile(_this.outputPath, JSON.stringify(evtData), function (err) {
                    if (err) {
                        _this.onPrimaryError(err, reject);
                    }
                    else {
                        _this.onPrimarySuccess(resolve, reject);
                    }
                });
            });
            console.info("Transcoding File ".concat(_this.inputFile, " to - ").concat(_this.outputPath));
            _this.pdfParser.loadPDF(_this.inputPath, VERBOSITY_LEVEL);
        });
    };
    /**
     * Checks that outputPath can be created by opening it exclusively ("wx")
     * and removing the probe file again.
     * @returns false when the file cannot be created
     */
    PDFProcessor.prototype.canCreateOutputFile = function () {
        var fod;
        try {
            // openSync throws on failure; it never returns a falsy value.
            fod = fs.openSync(this.outputPath, "wx");
        }
        catch (e) {
            return false;
        }
        fs.closeSync(fod);
        fs.unlinkSync(this.outputPath);
        return true;
    };
    //public methods
    /**
     * Validates the input/output locations and derives outputFile/outputPath.
     *
     * @returns '' when everything is usable, otherwise a human-readable
     *          "Input error: ..." message. Every failure is also reported to
     *          the CLI through addResultCount().
     */
    PDFProcessor.prototype.validateParams = function () {
        var retVal = '';
        if (!fs.existsSync(this.inputDir)) {
            retVal = "Input error: input directory doesn't exist - ".concat(this.inputDir, ".");
        }
        else if (!fs.existsSync(this.inputPath)) {
            retVal = "Input error: input file doesn't exist - ".concat(this.inputPath, ".");
        }
        else if (!fs.existsSync(this.outputDir)) {
            retVal = "Input error: output directory doesn't exist - ".concat(this.outputDir, ".");
        }
        // retVal starts as '', so "no error" must be tested against '' (a
        // comparison with null is always true and skipped everything below).
        if (retVal === '') {
            var rawExtName = path_1.default.extname(this.inputFile);
            if (rawExtName.toLowerCase() !== ".pdf") {
                retVal = "Input error: input file name doesn't have pdf extention - ".concat(this.inputFile, ".");
            }
            else {
                // basename() matches the suffix case-sensitively, so strip the
                // extension exactly as written (".PDF" as well as ".pdf").
                this.outputFile = "".concat(path_1.default.basename(this.inputPath, rawExtName), ".json");
                this.outputPath = path_1.default.normalize("".concat(this.outputDir, "/").concat(this.outputFile));
                if (fs.existsSync(this.outputPath)) {
                    console.warn("Output file will be replaced - ".concat(this.outputPath));
                }
                else if (!this.canCreateOutputFile()) {
                    retVal = "Input error: can not write to ".concat(this.outputPath);
                }
            }
        }
        if (retVal !== '') {
            this.curCLI.addResultCount(retVal);
        }
        return retVal;
    };
    /** Releases the parser and clears every path held by this instance. */
    PDFProcessor.prototype.destroy = function () {
        this.inputDir = '';
        this.inputFile = '';
        this.inputPath = '';
        this.outputDir = '';
        this.outputFile = '';
        this.outputPath = '';
        if (this.pdfParser) {
            this.pdfParser.destroy();
        }
        this.pdfParser = null;
        this.curCLI = null;
    };
    /**
     * Validates the parameters and runs the parser selected by the -r flag.
     * @returns promise resolving with the Promise.allSettled results of the
     *          optional outputs; rejects with the validation message or the
     *          parser/write error
     */
    PDFProcessor.prototype.processFile = function () {
        var _this = this;
        return new Promise(function (resolve, reject) {
            var validateMsg = _this.validateParams();
            if (validateMsg) {
                reject(validateMsg);
            }
            else {
                var parserFunc = PROCESS_WITH_STREAM
                    ? _this.parseOnePDFStream
                    : _this.parseOnePDF;
                parserFunc
                    .call(_this)
                    .then(function (value) { return resolve(value); })
                    .catch(function (err) { return reject(err); });
            }
        });
    };
    return PDFProcessor;
}());
/**
 * Command-line driver: interprets the parsed flags, runs a PDFProcessor for a
 * single file or for every PDF in a directory, and prints a summary.
 */
var PDFCLI = /** @class */ (function () {
    // constructor
    function PDFCLI() {
        this.inputCount = 0;
        this.successCount = 0;
        this.failedCount = 0;
        this.warningCount = 0;
        // Human-readable per-file result lines, printed by complete().
        this.statusMsgs = [];
    }
    /**
     * Applies the verbosity level and handles the flags that end the run
     * early (-v, -h, missing -f).
     * @returns true when processing should continue
     */
    PDFCLI.prototype.initialize = function () {
        // eslint-disable-next-line @typescript-eslint/ban-ts-comment
        // @ts-ignore
        nodeUtil.verbosity(VERBOSITY_LEVEL);
        var retVal = true;
        try {
            if (ONLY_SHOW_VERSION) {
                console.log(pkInfo.version);
                retVal = false;
            }
            else if (ONLY_SHOW_HELP) {
                p2jcliarg_js_1.yargs.showHelp();
                retVal = false;
            }
            else if (!HAS_INPUT_DIR_OR_FILE) {
                p2jcliarg_js_1.yargs.showHelp();
                console.error("-f is required to specify input directory or file.");
                retVal = false;
            }
        }
        catch (e) {
            console.error("Exception: ".concat(e.message));
            retVal = false;
        }
        return retVal;
    };
    /**
     * Entry point (compiled async function). Processes the -f target as a file
     * or a directory; any exception is logged, and complete() always runs once
     * processing has started.
     * @returns promise that resolves with undefined
     */
    PDFCLI.prototype.start = function () {
        return __awaiter(this, void 0, void 0, function () {
            var inputStatus, e_1;
            return __generator(this, function (_a) {
                switch (_a.label) {
                    case 0:
                        if (!this.initialize() || !INPUT_DIR_OR_FILE) {
                            console.error("Invalid input parameters.");
                            return [2 /*return*/];
                        }
                        console.log(_PRO_TIMER);
                        console.time(_PRO_TIMER);
                        _a.label = 1;
                    case 1:
                        // try region: catch at label 6, finally at label 7, end at label 8
                        _a.trys.push([1, 6, 7, 8]);
                        inputStatus = fs.statSync(INPUT_DIR_OR_FILE);
                        if (!inputStatus.isFile()) return [3 /*break*/, 3];
                        this.inputCount = 1;
                        return [4 /*yield*/, this.processOneFile(path_1.default.dirname(INPUT_DIR_OR_FILE), path_1.default.basename(INPUT_DIR_OR_FILE))];
                    case 2:
                        _a.sent();
                        return [3 /*break*/, 5];
                    case 3:
                        if (!inputStatus.isDirectory()) return [3 /*break*/, 5];
                        return [4 /*yield*/, this.processOneDirectory(path_1.default.normalize(INPUT_DIR_OR_FILE))];
                    case 4:
                        _a.sent();
                        _a.label = 5;
                    case 5: return [3 /*break*/, 8];
                    case 6:
                        e_1 = _a.sent();
                        console.error("Exception: ", e_1);
                        return [3 /*break*/, 8];
                    case 7:
                        this.complete();
                        return [7 /*endfinally*/];
                    case 8: return [2 /*return*/];
                }
            });
        });
    };
    /** Prints the collected status lines, the counters, and ends the timer on the next tick. */
    PDFCLI.prototype.complete = function () {
        if (this.statusMsgs.length > 0)
            console.log(this.statusMsgs);
        console.log("".concat(this.inputCount, " input files\t").concat(this.successCount, " success\t").concat(this.failedCount, " fail\t").concat(this.warningCount, " warning"));
        // The process is left to exit on its own; no explicit exit code is set here.
        process.nextTick(function () {
            console.timeEnd(_PRO_TIMER);
        });
    };
    /**
     * Runs one PDFProcessor and records its outcome in statusMsgs.
     *
     * @param inputDir  directory that contains the PDF
     * @param inputFile PDF file name inside inputDir
     * @returns promise resolving with the Promise.allSettled results of the
     *          optional outputs, or rejecting with the processing error
     */
    PDFCLI.prototype.processOneFile = function (inputDir, inputFile) {
        var _this = this;
        // The processor is created inside the executor so that a constructor
        // failure becomes a rejection instead of a synchronous throw.
        return new Promise(function (resolve, reject) {
            var p2j = new PDFProcessor(inputDir, inputFile, _this);
            p2j
                .processFile()
                // eslint-disable-next-line @typescript-eslint/no-explicit-any
                .then(function (retVal) {
                    _this.addStatusMsg(null, "".concat(path_1.default.join(inputDir, inputFile), " => ").concat(p2j.getOutputFile()));
                    retVal.forEach(function (ret) {
                        // allSettled entries carry `value` only when fulfilled;
                        // rejected ones carry `reason` and must not be
                        // reported as "Success : + undefined".
                        if (ret.status === "rejected") {
                            _this.addStatusMsg(true, "+ ".concat(ret.reason));
                        }
                        else {
                            _this.addStatusMsg(null, "+ ".concat(ret.value));
                        }
                    });
                    resolve(retVal);
                })
                .catch(function (error) {
                    _this.addStatusMsg(error, "".concat(path_1.default.join(inputDir, inputFile), " => ").concat(error));
                    reject(error);
                })
                .finally(function () { return p2j.destroy(); });
        });
    };
    /**
     * Processes all given files concurrently.
     * @returns Promise.allSettled result, one entry per file
     */
    PDFCLI.prototype.processFiles = function (inputDir, files) {
        var _this = this;
        var allPromises = files.map(function (file) {
            return _this.processOneFile(inputDir, file);
        });
        return Promise.allSettled(allPromises);
    };
    /**
     * Processes every "*.pdf" entry of inputDir whose name does not start
     * with a punctuation/space character.
     * @returns promise resolving with the per-file results, or with the string
     *          'no pdf files found'; rejects when the directory cannot be read
     */
    PDFCLI.prototype.processOneDirectory = function (inputDir) {
        var _this = this;
        return new Promise(function (resolve, reject) {
            fs.readdir(inputDir, function (err, files) {
                if (err) {
                    _this.addStatusMsg(true, "[".concat(inputDir, "] - ").concat(err.toString()));
                    reject(err);
                }
                else {
                    // First characters that exclude a file from processing.
                    var _iChars_1 = "!@#$%^&*()+=[]\\';,/{}|\":<>?~`.-_ ";
                    var pdfFiles = files.filter(function (file) {
                        return file.slice(-4).toLowerCase() === ".pdf" &&
                            _iChars_1.indexOf(file.substring(0, 1)) < 0;
                    });
                    _this.inputCount = pdfFiles.length;
                    if (_this.inputCount > 0) {
                        _this.processFiles(inputDir, pdfFiles)
                            .then(function (value) { return resolve(value); })
                            .catch(function (err) { return reject(err); });
                    }
                    else {
                        _this.addStatusMsg(true, "[".concat(inputDir, "] - No PDF files found"));
                        resolve('no pdf files found');
                    }
                }
            });
        });
    };
    /** Appends a status line, prefixed as an error when `error` is truthy and as a success otherwise. */
    PDFCLI.prototype.addStatusMsg = function (error, oneMsg) {
        this.statusMsgs.push(error ? "\u2717 Error : ".concat(oneMsg) : "\u2713 Success : ".concat(oneMsg));
    };
    /** Increments failedCount when `error` is truthy, successCount otherwise. */
    PDFCLI.prototype.addResultCount = function (error) {
        if (error) {
            this.failedCount++;
        }
        else {
            this.successCount++;
        }
    };
    return PDFCLI;
}());
exports.default = PDFCLI;