UNPKG

pdf3json

Version:

A PDF file parser that converts PDF binaries to text-based JSON, powered by a fork of PDF.JS ported to Node.js

github.com/modesty/pdf2json

modesty/pdf2json

149 lines (117 loc) • 4.56 kB

JavaScript

var PDFJS = require("./lib/pdf.js"),
    nodeUtil = require("util"),
    nodeEvents = require("events"),
    _ = require("underscore"),
    fs = require('fs'),
    async = require("async");

/**
 * PDFParser: event-emitting front end that loads a PDF (from a file path or
 * directly from a buffer), hands the bytes to the PDFJS wrapper and re-emits
 * the outcome.
 *
 * Events emitted (the parser instance itself is the event argument):
 *   - "pdfParser_dataReady": parsing finished; the result is in `parser.data`.
 *   - "pdfParser_dataError": reading or parsing failed; `parser.data` holds
 *     the error object.
 *
 * NOTE(review): `nodeUtil.p2jinfo` and `nodeUtil.verbosity` are not part of
 * Node's stock `util` module; presumably they are attached by "./lib/pdf.js"
 * or one of its dependencies. Confirm before relocating the require calls.
 */
var PDFParser = (function () {
    'use strict';

    // private static
    var _nextId = 1;
    var _name = 'PDFParser';
    // File contents already read from disk, keyed by file path and shared by
    // every parser instance.
    var _binBuffer = {};
    // Upper bound on the number of entries kept in _binBuffer.
    var _maxBinBufferCount = 10;

    /**
     * @constructor
     * @param {Object} [context] optional owner object; when set, its
     *        destroy() method is invoked from PDFParser#destroy.
     */
    var cls = function (context) {
        // call constructor for super class
        nodeEvents.EventEmitter.call(this);

        // private
        var _id = _nextId++;

        // public (every instance will have their own copy of these methods, needs to be lightweight)
        this.get_id = function () { return _id; };
        this.get_name = function () { return _name + _id; };

        this.context = context;
        this.pdfFilePath = null; // current PDF file to load and parse, null means loading/parsing not started
        this.data = null; // if file read success, data is PDF content; if failed, data is "err" object
        this.PDFJS = new PDFJS();
        this.parsePropCount = 0;
        this.processFieldInfoXML = false; // disable additional _fieldInfo.xml parsing and merging

        // Tracks whether the PDFJS listeners were already registered, so that
        // re-using the parser does not register them a second time.
        this._pdfjsListenersAttached = false;
    };

    // inherit from event emitter
    nodeUtil.inherits(cls, nodeEvents.EventEmitter);

    // public static
    cls.get_nextId = function () {
        return _name + _nextId;
    };

    // private methods, need to be invoked by [funcName].call(this, ...)

    // Merges one partial result into this.data; once two partial results have
    // arrived the parse is considered complete and "pdfParser_dataReady" fires.
    var _onPDFJSParseDataReady = function (data) {
        _.extend(this.data, data);
        this.parsePropCount++;
        if (this.parsePropCount >= 2) {
            this.emit("pdfParser_dataReady", this);
            nodeUtil.p2jinfo("PDF parsing completed.");
        }
    };

    // Stores the error reported by PDFJS and forwards it to our listeners.
    var _onPDFJSParserDataError = function (data) {
        this.data = data;
        this.emit("pdfParser_dataError", this);
    };

    // Registers the PDFJS listeners exactly once per parser instance.
    // Registering them on every parse (as the code used to) stacked duplicate
    // handlers, so a second parse on the same instance bumped parsePropCount
    // twice per event and emitted "pdfParser_dataReady" early and repeatedly.
    var _attachPDFJSListeners = function () {
        if (this._pdfjsListenersAttached) {
            return;
        }
        this.PDFJS.on("pdfjs_parseDataReady", _.bind(_onPDFJSParseDataReady, this));
        this.PDFJS.on("pdfjs_parseDataError", _.bind(_onPDFJSParserDataError, this));
        this._pdfjsListenersAttached = true;
    };

    // Resets the per-parse state and starts parsing either the given buffer
    // or, when none is given, the cached content of this.pdfFilePath.
    var startParsingPDF = function (buffer) {
        this.data = {};
        this.parsePropCount = 0;

        _attachPDFJSListeners.call(this);

        this.PDFJS.parsePDFData(buffer || _binBuffer[this.pdfFilePath]);
    };

    // Returns true (and starts parsing) when the file content is already
    // cached. Otherwise makes room in the cache if it is full and returns
    // false so the caller reads the file from disk.
    var processBinaryCache = function () {
        if (_.has(_binBuffer, this.pdfFilePath)) {
            startParsingPDF.call(this);
            return true;
        }

        var allKeys = _.keys(_binBuffer);
        // ">=" (not ">") so that, after the pending file is added, the cache
        // holds at most _maxBinBufferCount entries.
        if (allKeys.length >= _maxBinBufferCount) {
            // idx is always a valid index: it is < _maxBinBufferCount <= allKeys.length
            var idx = this.get_id() % _maxBinBufferCount;
            var key = allKeys[idx];
            _binBuffer[key] = null;
            delete _binBuffer[key];

            nodeUtil.p2jinfo("re-cycled cache for " + key);
        }

        return false;
    };

    // Completion callback for the queued file read.
    var processPDFContent = function (err, data) {
        nodeUtil.p2jinfo("Load PDF file status:" + (!!err ? "Error!" : "Success!"));

        // destroy() may have run while the read was still pending; in that
        // case there is nothing left to parse with, so drop the late result
        // instead of throwing on the nulled PDFJS reference.
        if (!this.PDFJS) {
            return;
        }

        if (err) {
            this.data = err;
            this.emit("pdfParser_dataError", this);
        }
        else {
            _binBuffer[this.pdfFilePath] = data;
            startParsingPDF.call(this);
        }
    };

    // Shared read queue: at most 250 fs.readFile calls are in flight at once.
    var fq = async.queue(function (task, callback) {
        fs.readFile(task.path, callback);
    }, 250);

    // public (every instance will share the same method, but has no access to private fields defined in constructor)

    /**
     * Loads a PDF from disk (or from the shared cache) and starts parsing it.
     * The outcome is reported through the "pdfParser_dataReady" /
     * "pdfParser_dataError" events.
     *
     * @param {string} pdfFilePath path of the PDF file to load.
     * @param {*} [verbosity] forwarded as-is to nodeUtil.verbosity().
     */
    cls.prototype.loadPDF = function (pdfFilePath, verbosity) {
        nodeUtil.verbosity(verbosity);
        nodeUtil.p2jinfo("about to load PDF file " + pdfFilePath);

        this.pdfFilePath = pdfFilePath;
        if (this.processFieldInfoXML) {
            this.PDFJS.tryLoadFieldInfoXML(pdfFilePath);
        }

        if (processBinaryCache.call(this)) {
            return;
        }

        fq.push({path: pdfFilePath}, _.bind(processPDFContent, this));
    };

    /**
     * Parses PDF content directly from a buffer, without the need to write it
     * to a temporary file first.
     *
     * @param {Buffer} pdfBuffer the PDF content.
     */
    cls.prototype.parseBuffer = function (pdfBuffer) {
        startParsingPDF.call(this, pdfBuffer);
    };

    /**
     * Releases everything the parser holds on to. Safe to call more than once.
     */
    cls.prototype.destroy = function () {
        this.removeAllListeners();

        // context object will be set in Web Service project, but not in command line utility
        if (this.context) {
            this.context.destroy();
            this.context = null;
        }

        this.pdfFilePath = null;
        this.data = null;

        // Guarded so that a repeated destroy() does not dereference null.
        if (this.PDFJS) {
            this.PDFJS.destroy();
            this.PDFJS = null;
        }
        this._pdfjsListenersAttached = false;

        this.parsePropCount = 0;
    };

    return cls;
})();

module.exports = PDFParser;