pdf-data-parser
Version:
Parse, search and stream PDF tabular data using Node.js with Mozilla's PDF.js library.
525 lines (437 loc) • 15.8 kB
JavaScript
/**
* lib/PdfDataParser
*
* Builds upon the PDF.js Node.js example getContent.js
* and this project's test app getCells.js.
*
* Gets content items and groups the text into cells and rows.
* Order items by using marked content and/or comparing x, y coordinates.
*
* Output is an array of arrays.
*/
import EventEmitter from 'node:events';
import Cell from "./cell.js";
import modulesPath from "./modulesPath.js";
/**
 * Parses tabular data out of a PDF document using PDF.js.
 *
 * Text items of each page are grouped into cells, kept in top-to-bottom /
 * left-to-right order, assembled into rows and then filtered by the
 * configured heading and cell-count options.
 *
 * Events:
 *  - "data"  (row: Array) emitted for every output row when at least one
 *            "data" listener is attached; without a listener the rows are
 *            accumulated and returned from parse().
 *  - "end"   emitted after the last page has been processed.
 *  - "error" emitted when loading or parsing fails.
 */
export default class PdfDataParser extends EventEmitter {

  /**
   * Apply the `yellow` style when the `colors` String.prototype extension is
   * loaded; otherwise return the text unchanged so warnings never print
   * "undefined".
   *
   * @param {String} text message (or message prefix) to highlight
   * @returns {String} highlighted text, or the input if no styling is available
   */
  static #yellow(text) {
    return text?.yellow ?? text;
  }

  /**
   *
   * @param {Object} options
   * @param {String|URL} [options.url] the URL or local file name of the .pdf
   * @param {String|ArrayBuffer} [options.data] pdf file data as an array, instead of using url
   * @param {String} [options.password] password for decrypting the pdf document, optional
   * @param {Number[]} [options.pages] array of page numbers to process, if undefined defaults to all pages
   * @param {String|RegExp} [options.heading] PDF section heading where data is located, default: none
   * @param {String|RegExp} [options.stopHeading] PDF section heading after data table, default: none
   * @param {Number|String} [options.cells] minimum number cells in a row for output, or "min-max" e.g. "7-9"
   * @param {Boolean} [options.newlines] preserve new lines in cell data, default: false
   * @param {Number} [options.pageHeader] height of page header area in points, default: 0
   * @param {Number} [options.pageFooter] height of page footer area in points, default: 0
   * @param {Boolean} [options.hasHeader] indicates if the table has a header row, default: true
   * @param {Boolean} [options.repeatingHeaders] indicates if table header is repeated on each page, default: true
   * @param {Boolean|Number} [options.trim] trim whitespace, false (0) = none, true (1) = both, 2 = starting only, 3 = trailing only, default: true
   * @param {Boolean} [options.artifacts] parse artifacts content, default: false
   * @param {Number} [options.lineHeight] approximate line height ratio based on font size; default 1.67
   * @param {Boolean} [options.orderXY] order cells by XY coordinates on page; default true
   * @param {Boolean} [options.missingValues] check for blank cells by comparing XY coordinates against table header cells, default: false
   */
  constructor(options = {}) {
    super({ captureRejections: true });

    this.options = Object.assign({
      hasHeader: true,
      repeatingHeaders: true,
      trim: true,
      orderXY: true
    }, options);

    this.cellsRange = {
      min: 1,
      max: 256,
      heading: 0 // RepeatHeading
    };

    if (options.cells) {
      if (typeof options.cells === "number") {
        this.cellsRange.min = options.cells;
      }
      else if (typeof options.cells === "string") {
        // accepts "min" or "min-max"; parts that are not numeric keep their defaults
        const [ minText, maxText ] = options.cells.split("-");
        const min = Number.parseInt(minText, 10);
        const max = Number.parseInt(maxText, 10);
        if (!Number.isNaN(min))
          this.cellsRange.min = min;
        if (!Number.isNaN(max))
          this.cellsRange.max = max;
      }
    }

    if (options.RepeatHeading?.header || options[ "RepeatHeading.header" ] || options.header)
      this.cellsRange.heading = 1;

    // parsing properties
    this.doc = undefined;   // PDF.js document, assigned in parse()
    this.page = undefined;  // PDF.js page currently being processed
    // without a heading to search for, the table is considered found from the start
    this.headingFound = options.heading === undefined || options.heading === null;
    this.tableFound = this.headingFound;
    this.tableDone = false;

    // parse() walks pages in ascending order, so the first page processed is
    // the smallest requested page number, not necessarily pages[0]
    const { pages } = options;
    if (Array.isArray(pages) && pages.length > 0)
      this.firstPageNumber = Math.min(...pages);
    else
      this.firstPageNumber = pages ? pages[ 0 ] : 1;

    this._cells = [];     // all cells parsed from the current page in x,y order
    this._headerRow = []; // header row, i.e. column headers found on first page
    this._rows = [];      // array of output rows; rows contain just values
    this.rowNum = 1;      // row counter, reset per page in processCells()

    // some default settings
    this.headerY = 9999;
    this.footerY = 0;

    // parser state
    this.started = false;
    this.paused = false;
    this.cancelled = false;
  }

  /**
   * Load and parse the PDF document.
   *
   * Failures are logged and emitted as an "error" event; in that case the
   * method does not return the rows array.
   *
   * @returns an array of row arrays.
   * If using an event listener the return value will be an empty array.
   */
  async parse() {
    try {
      const { getDocument } = await import("pdfjs-dist/legacy/build/pdf.mjs");

      const loadingTask = getDocument({
        url: this.options.url,
        data: this.options.data,
        password: this.options.password,
        fontExtraProperties: true,
        standardFontDataUrl: await modulesPath("./pdfjs-dist/standard_fonts/")
      });
      this.doc = await loadingTask.promise;

      const markInfo = await this.doc.getMarkInfo();
      const isMarked = Boolean(markInfo?.Marked);
      if (!isMarked)
        console.warn(PdfDataParser.#yellow("Warning: PDF document does not contain Marked Content"));

      const { pages } = this.options;
      const numPages = this.doc.numPages;

      for (let pn = 1; pn <= numPages; pn++) {
        // honour cancel() between pages
        if (this.cancelled)
          break;
        if (pages && !pages.includes(pn))
          continue;

        this.page = await this.doc.getPage(pn);

        // page header/footer bands, measured from the top and bottom of the viewport
        const viewport = this.page.getViewport({ scale: 1.0 });
        this.headerY = viewport.height - (this.options.pageHeader || 0);
        this.footerY = this.options.pageFooter || 0;

        this._cells = [];
        if (isMarked)
          await this.parseMarkedPage();
        else
          await this.parseLinedPage();

        await this.processCells();

        // release page resources.
        await this.page.cleanup();

        if (this.tableDone)
          break;
      }

      this.emit("end");
      return this._rows;
    }
    catch (err) {
      console.error(err);
      this.emit("error", err);
    }
  }

  /**
   * Set the paused flag.
   */
  pause() {
    this.paused = true;
  }

  /**
   * Clear the paused flag, unless the parser has been cancelled.
   */
  resume() {
    if (this.paused && !this.cancelled)
      this.paused = false;
  }

  /**
   * Request cancellation; parse() stops before starting the next page.
   */
  cancel() {
    this.cancelled = true;
  }

  /**
   * Parse the content items returned by PDF.js.
   * Use PDF.js marked content to collect multiple items into cells.
   * Result is cells array contains cells in sorted x.y order.
   */
  async parseMarkedPage() {
    let cell = null;
    let markedContent = ""; // assumes NO nesting of markedContent tags
    let paragraph = false;
    let span = false;

    const content = await this.page.getTextContent({ includeMarkedContent: true, disableNormalization: false, disableCombineTextItems: false });

    for (const item of content.items) {
      if (item.type === "beginMarkedContent") {
        if (item.tag === "Artifact") {
          markedContent = "Artifact";
          // flush the working cell and start fresh,
          // because headers and footers could be in artifacts
          this.insertCell(cell);
          cell = null;
        }
        else {
          console.warn(PdfDataParser.#yellow("unknown content tag: ") + item.tag);
        }
      }
      else if (item.type === "beginMarkedContentProps") {
        if (item.tag === "P") {
          markedContent = "P";
          paragraph = true;
        }
        else if (item.tag === "Span") {
          markedContent = "Span";
          span = true;
        }
      }
      else if (item.type === "endMarkedContent") {
        if (markedContent === "Artifact") {
          // text in artifacts like headers and footers is dropped unless requested
          if (this.options.artifacts)
            this.insertCell(cell);
          cell = null;
        }
        markedContent = "";
      }
      else if (item.type) {
        // unknown type
        console.warn(PdfDataParser.#yellow("Warning: unknown content type: ") + item.type);
      }
      else {
        // a string item
        if (item.dir !== 'ltr') // expect direction left-to-right
          console.warn(PdfDataParser.#yellow("Warning: text direction is: ") + item.dir);

        if (paragraph || span) {
          // ignore EOL
          if (item.str === "" && item.width === 0 && (paragraph && item.hasEOL))
            continue;
          // ignore spacing between cells;
          // for span and less than one character width assume we need it
          if (item.str === " " && (paragraph || (item.width > cell?.fontWidth)))
            continue;

          // check to save and start a new cell
          if (cell && cell.count > 0) {
            cell.hasSpan = cell.hasSpan || span;
            if (!cell.isAdjacent(item)) {
              this.insertCell(cell);
              cell = null;
            }
          }
        }

        if (!cell)
          cell = new Cell(this.options);

        // append text to cell
        cell.addItem(item);
        paragraph = false;
        span = false;
      }
    }

    // push last cell
    if (cell)
      this.insertCell(cell);
  }

  /**
   * Parse the content items of a page that has no marked content.
   * Items are grouped into cells using the alignment reported by the working
   * cell and the end-of-line flags of the text items.
   */
  async parseLinedPage() {
    let cell = new Cell(this.options);
    let wasEOL = false;

    const content = await this.page.getTextContent({ disableNormalization: false, disableCombineTextItems: false });

    for (const item of content.items) {
      if (item.dir !== 'ltr') // expect direction left-to-right
        console.warn(PdfDataParser.#yellow(item.dir));

      const aligns = cell.alignment(item);

      if (!aligns.adjacent && cell.count > 0) {
        this.insertCell(cell);
        cell = new Cell(this.options);
      }

      if (wasEOL && (aligns.top || ((aligns.left || aligns.right) && aligns.adjacent))) {
        // ignore newline in the middle of a line, e.g. a split heading
        // may be sensitive to normal line spacing and heading line spacing
        wasEOL = false;
      }

      if (wasEOL && cell.count > 0) {
        this.insertCell(cell);
        cell = new Cell(this.options);
      }

      // characters have a height, ignore more than one space between cells
      if (item.height > 0 || (item.str === " " && item.width < cell.fontWidth))
        cell.addItem(item);

      wasEOL = item.hasEOL;
    }

    // process last cell
    if (cell.count > 0)
      this.insertCell(cell);
  }

  /**
   * Add item to cells array in x,y order.
   *
   * Order of cells is top of page (max) to bottom of page (0).
   * Within a row order is left (0) to right (max).
   * Usually cells flow in order from pdf.js, but sometimes not.
   *
   * Filters out cells in page header and page footer areas.
   * Empty, missing or already inserted cells are ignored.
   *
   * @param {*} cell
   */
  insertCell(cell) {
    if (!cell || cell.count <= 0 || cell.inserted)
      return;

    // filter out cells in page header and footer areas
    if (cell.y1 >= this.headerY || cell.y1 <= this.footerY)
      return;

    if (!this.options.orderXY) {
      this._cells.push(cell);
      cell.inserted = true;
      return;
    }

    // scan backwards from the end for the insert position
    let index = this._cells.length - 1;
    let other = this._cells[ index ];

    // while cell should be above other
    while (other && other.isSameLine(cell) > 0)
      other = this._cells[ --index ];

    // while same row and cell is left of other,
    // find position in row based on left edge
    while (other && other.isSameLine(cell) === 0 && cell.x1 < other.x1)
      other = this._cells[ --index ];

    this._cells.splice(index + 1, 0, cell);
    cell.inserted = true;
  }

  /**
   * Iterate the cells and determine rows.
   * Rows that pass filters() are handed to output().
   */
  async processCells() {
    this.rowNum = 1; // incremented in output()
    let row = [];
    let prevCell = new Cell(this.options);

    for (const cell of this._cells) {
      // check if end of row
      const rowEnded = row.length > 0 && (cell.isSameLine(prevCell) !== 0 || (prevCell.x1 > cell.x1));
      if (rowEnded) {
        // output current row; awaited so listener errors reach parse()'s catch
        if (this.filters(row))
          await this.output(row);
        // start new row
        row = [];
        prevCell = new Cell(this.options);
      }

      if (this.tableDone)
        break;

      // append cell to row
      if (cell.count) {
        row.push(cell);
        prevCell = cell; // prevCell must have text
      }
    }

    // push last row; the range check runs first so an out-of-range trailing
    // row is never passed to filters()
    if (this.inCellRange(row.length) && this.filters(row))
      await this.output(row);
  }

  /**
   * Check a row length against the configured cell range.
   *
   * @param {Number} rowlen number of cells in the row
   * @returns {Boolean} true if rowlen is within min..max or equals the heading length
   */
  inCellRange(rowlen) {
    const { min, max, heading } = this.cellsRange;
    return (rowlen >= min && rowlen <= max) || (rowlen === heading);
  }

  /**
   * Performs row filtering.
   *
   * Advances the heading / table state and decides whether the row is output.
   * The first output row on the first page is remembered as the header row;
   * on later pages a first row equal to it is suppressed when
   * options.repeatingHeaders is set.
   *
   * @param {*} row is an array of cells
   * @returns {Boolean} true if the row should be output
   */
  filters(row) {
    if (!this.headingFound) {
      this.headingFound = this.compareHeading(row, this.options.heading);
    }
    else if (!this.tableFound) {
      this.tableFound = this.inCellRange(row.length);
    }
    else if (this.options.heading && !this.tableDone) {
      this.tableDone = !this.inCellRange(row.length) || this.compareHeading(row, this.options.stopHeading);
    }

    let output = this.headingFound && this.tableFound && !this.tableDone && this.inCellRange(row.length);

    if (output && this.rowNum === 1) {
      if (this.page.pageNumber === this.firstPageNumber) {
        // first page
        this._headerRow = row;
      }
      else if (this.options.repeatingHeaders && this.rowsEqual(this._headerRow, row)) {
        // subsequent pages
        output = false;
      }
    }

    return output;
  }

  /**
   * Compare the text of the first cell of a row against a heading.
   *
   * @param {Object} row - the row to check
   * @param {String|RegExp} heading - text or pattern to compare against
   * @returns {Boolean} true if the first cell matches; false for an empty row
   */
  compareHeading(row, heading) {
    if (row == null || row.length === 0)
      return false;

    const text = row[ 0 ].text;

    if (Object.prototype.toString.call(heading) === "[object RegExp]") {
      // global/sticky patterns keep state between calls; always test from the start
      heading.lastIndex = 0;
      return heading.test(text);
    }

    return text === heading;
  }

  /**
   * Compare two rows cell by cell on their text.
   *
   * @param {Array} row1 array of cells
   * @param {Array} row2 array of cells
   * @returns {Boolean} true if both rows have the same length and cell texts
   */
  rowsEqual(row1, row2) {
    if (!row1 || !row2 || row1.length !== row2.length)
      return false;

    for (let i = 0; i < row1.length; i++) {
      if (row1[ i ].text !== row2[ i ].text)
        return false;
    }
    return true;
  }

  /**
   * Emits or appends data to output.
   *
   * With options.missingValues, a row shorter than the header row is padded
   * with null where a cell starts beyond the matching header cell.
   *
   * @param {*} row is an array of cells
   */
  async output(row) {
    const header = this._headerRow;
    const { trim, missingValues } = this.options;
    const fillMissing = missingValues && row.length < header.length && row.length >= this.cellsRange.min;

    const rowValues = [];
    let col = 0;

    for (const cell of row) {
      // check for missing cells
      if (fillMissing) {
        while ((col < header.length && cell.x1 > header[ col ].x2)
          || (col + 1 < header.length && cell.x1 > header[ col + 1 ].x1)) {
          rowValues.push(null);
          col++;
        }
      }

      let text = cell.text; // used as-is when trimming is disabled
      if (trim === 2)
        text = cell.text.trimStart();
      else if (trim === 3)
        text = cell.text.trimEnd();
      else if (trim)
        text = cell.text.trim();

      rowValues.push(text);
      col++;
    }

    // check for missing cells at end
    if (fillMissing) {
      while (rowValues.length < header.length)
        rowValues.push(null);
    }

    if (this.listenerCount("data") > 0)
      this.emit("data", rowValues);
    else
      this._rows.push(rowValues);

    this.rowNum++;
  }
}