pdf2json
Version:
PDF file parser that converts PDF binaries to JSON and text, powered by porting a fork of PDF.JS to Node.js
321 lines (278 loc) • 8.81 kB
JavaScript
import fs from "fs";
import nodeUtil from "util";
import { readFile } from "fs/promises";
import { EventEmitter } from "events";
import { Buffer } from "buffer";
import PDFJS from "./lib/pdf.js";
import { ParserStream, StringifyStream } from "./lib/parserstream.js";
import { kColors, kFontFaces, kFontStyles } from "./lib/pdfconst.js";
import { pkInfo, _PARSER_SIG } from "./lib/pkinfo.js";
import PDFUnit from "./lib/pdfunit.js";
/**
* Class representing a PDF Parser.
* @class PDFParser
* @extends EventEmitter
*/
export default class PDFParser extends EventEmitter {
/**
* Static method to retrieve color dictionary.
* @returns {object} Color dictionary
*/
static get colorDict() {
return kColors;
}
/**
* Static method to retrieve font face dictionary.
* @returns {object} Font face dictionary
*/
static get fontFaceDict() {
return kFontFaces;
}
/**
* Static method to retrieve font style dictionary.
* @returns {object} Font style dictionary
*/
static get fontStyleDict() {
return kFontStyles;
}
/**
* static property to expose PDFUnit class
* @returns {PDFUnit} PDFUnit class
*/
static get PDFUnit() {
return PDFUnit;
}
/**
* static property to expose ParserStream class
*/
static get ParserStream() {
return ParserStream;
}
/**
* static property to expose StringifyStream class
*/
static get StringifyStream() {
return StringifyStream;
}
/**
* static property to expose pkInfo function
*/
static get pkInfo() {
return pkInfo;
}
/**
* static property to expose _PARSER_SIG string
*/
// eslint-disable-next-line @typescript-eslint/naming-convention
static get _PARSER_SIG() {
return _PARSER_SIG;
}
static #maxBinBufferCount = 10;
static #binBuffer = {};
#password = "";
#context = null; // service context object, only used in Web Service project; null in command line #pdfFilePath = null;
#pdfFilePath = null; //current PDF file to load and parse, null means loading/parsing not started #data = null;
#pdfFileMTime = null; // last time the current pdf was modified, used to recognize changes and ignore cache #PDFJS = null;
#data = null; //if file read success, data is PDF content; if failed, data is "err" object #processFieldInfoXML = false;
#PDFJS = null; //will be initialized in constructor
#processFieldInfoXML = false; //disable additional _fieldInfo.xml parsing and merging (do NOT set to true)
/**
* PDFParser constructor.
* @constructor PDFParser class.
* @param {object} context - The context object (only used in Web Service project); null in command line
* @param {boolean} needRawText - Whether raw text is needed or not
* @param {string} password - The password for PDF file
* @info Private methods accessible using the [funcName].call(this, ...) syntax
*/
constructor(context, needRawText, password) {
super();
this.#context = context;
this.#pdfFilePath = null; //current PDF file to load and parse, null means loading/parsing not started this.#pdfFileMTime = null;
this.#pdfFileMTime = null; // last time the current pdf was modified, used to recognize changes and ignore cache this.#data = null;
this.#data = null; //if file read success, data is PDF content; if failed, data is "err" object this.#processFieldInfoXML = false;
this.#processFieldInfoXML = false; //disable additional _fieldInfo.xml parsing and merging (do NOT set to true)
this.#PDFJS = new PDFJS(needRawText);
this.#password = password;
}
/**
* @private
* @param {object} data - The parsed data
*/
#onPDFJSParseDataReady(data) {
if (!data) {
nodeUtil.p2jinfo("PDF parsing completed.");
this.emit("pdfParser_dataReady", this.#data);
} else {
this.#data = { ...this.#data, ...data };
}
}
/**
* @private
* @param {Error} err - The error object
*/
#onPDFJSParserDataError(err) {
this.#data = null;
this.emit("pdfParser_dataError", { parserError: err });
}
/**
* @private
* @param {Buffer} buffer - The PDF buffer
*/
#startParsingPDF(buffer) {
this.#data = {};
this.#PDFJS.on("pdfjs_parseDataReady", (data) =>
this.#onPDFJSParseDataReady(data)
);
this.#PDFJS.on("pdfjs_parseDataError", (err) =>
this.#onPDFJSParserDataError(err)
);
//v1.3.0 the following Readable Stream-like events are replacement for the top two custom events
this.#PDFJS.on("readable", (meta) => this.emit("readable", meta));
this.#PDFJS.on("data", (data) => this.emit("data", data));
this.#PDFJS.on("error", (err) => this.#onPDFJSParserDataError(err));
this.#PDFJS.parsePDFData(
buffer || PDFParser.#binBuffer[this.binBufferKey],
this.#password
);
}
/**
* @private
* @returns {boolean}
*/
#processBinaryCache() {
if (this.binBufferKey in PDFParser.#binBuffer) {
this.#startParsingPDF();
return true;
}
const allKeys = Object.keys(PDFParser.#binBuffer);
if (allKeys.length > PDFParser.#maxBinBufferCount) {
const idx = this.id % PDFParser.#maxBinBufferCount;
const key = allKeys[idx];
PDFParser.#binBuffer[key] = null;
delete PDFParser.#binBuffer[key];
nodeUtil.p2jinfo(`re-cycled cache for ${key}`);
}
return false;
}
/**
* Getter for #data
* @returns {object|null} Data
*/
get data() {
return this.#data;
}
/**
* Getter for binBufferKey
* @returns {string} The binBufferKey
*/
get binBufferKey() {
return this.#pdfFilePath + this.#pdfFileMTime;
}
/**
* Creates a parser stream
* @returns {ParserStream} A new parser stream
*/
createParserStream() {
return new ParserStream(this, { objectMode: true, bufferSize: 64 * 1024 });
}
/**
* Asynchronously load a PDF from a file path.
* @param {string} pdfFilePath - Path of the PDF file
* @param {number} verbosity - Verbosity level
*/
async loadPDF(pdfFilePath, verbosity) {
nodeUtil.verbosity(verbosity || 0);
nodeUtil.p2jinfo(`about to load PDF file ${pdfFilePath}`);
this.#pdfFilePath = pdfFilePath;
try {
this.#pdfFileMTime = fs.statSync(pdfFilePath).mtimeMs;
if (this.#processFieldInfoXML) {
this.#PDFJS.tryLoadFieldInfoXML(pdfFilePath);
}
if (this.#processBinaryCache()) return;
PDFParser.#binBuffer[this.binBufferKey] = await readFile(pdfFilePath);
nodeUtil.p2jinfo(`Load OK: ${pdfFilePath}`);
this.#startParsingPDF();
} catch (err) {
nodeUtil.p2jerror(`Load Failed: ${pdfFilePath} - ${err}`);
this.emit("pdfParser_dataError", err);
}
}
/**
* Parse PDF buffer. Introduce a way to directly process buffers without the need to write it to a temporary file
* @param {Buffer} pdfBuffer - PDF buffer
* @param {number} verbosity - Verbosity level, ERRORS = 0, WARNINGS = 1, INFOS = 5;
*/
parseBuffer(pdfBuffer, verbosity) {
nodeUtil.verbosity(verbosity); // validated in util.js
if ((!pdfBuffer?.length) || (!pdfBuffer.buffer)) {
nodeUtil.p2jerror("Error: empty PDF buffer, nothing to parse.");
return;
}
let pdfBufferParse = pdfBuffer;
if (pdfBufferParse.buffer.byteLength !== pdfBufferParse.length) {
pdfBufferParse = Buffer.from(pdfBufferParse.buffer, 0, pdfBufferParse.byteLength);
}
this.#startParsingPDF(pdfBufferParse);
}
/**
* Retrieve raw text content from PDF.
* @returns {string} Raw text content
*/
getRawTextContent() {
return this.#PDFJS.getRawTextContent();
}
/**
* Retrieve raw text content stream.
* @returns {Stream} Raw text content stream
*/
getRawTextContentStream() {
return ParserStream.createContentStream(this.getRawTextContent());
}
/**
* Retrieve all field types.
* @returns {object[]} All field types
*/
getAllFieldsTypes() {
return this.#PDFJS.getAllFieldsTypes();
}
/**
* Retrieve all field types stream.
* @returns {Stream} All field types stream
*/
getAllFieldsTypesStream() {
return ParserStream.createContentStream(this.getAllFieldsTypes());
}
/**
* Retrieve merged text blocks if needed.
* @returns {object} Merged text blocks
*/
getMergedTextBlocksIfNeeded() {
return this.#PDFJS.getMergedTextBlocksIfNeeded();
}
/**
* Retrieve merged text blocks stream.
* @returns {Stream} Merged text blocks stream
*/
getMergedTextBlocksStream() {
return ParserStream.createContentStream(this.getMergedTextBlocksIfNeeded());
}
/**
* Destroy the PDFParser instance.
*/
destroy() {
// invoked with stream transform process
super.removeAllListeners();
//context object will be set in Web Service project, but not in command line utility
if (this.#context) {
this.#context.destroy();
this.#context = null;
}
this.#pdfFilePath = null;
this.#pdfFileMTime = null;
this.#data = null;
this.#processFieldInfoXML = false; //disable additional _fieldInfo.xml parsing and merging (do NOT set to true)
this.#PDFJS.destroy();
this.#PDFJS = null;
}
}