pdfreader
Version:
Read text and parse tables from PDF files. Supports tabular data with automatic column detection, and rule-based parsing.
759 lines (672 loc) • 21.4 kB
JavaScript
var util = require('util');
var PDFParser = require('pdf2json');
/**
* Minimal logger
* @author Adrien Joly, http://github.com/adrienjoly
* This content is released under the MIT License.
**/
/** No-op logger, active while debug output is disabled. */
var nullLog = function LOG() {};

/**
 * Debug logger: prints all its arguments on one line, prefixed with "[DEBUG] ".
 * Arguments that are objects (arrays and functions included) are rendered with util.inspect().
 */
var realLog = function LOG() {
  var parts = Array.prototype.map.call(arguments, function (arg) {
    return arg instanceof Object ? util.inspect(arg) : arg;
  });
  console.log("[DEBUG] " + parts.join(" "));
};

// logger currently in use; swapped by toggle()
var LOG = nullLog;

/** Forwards its arguments to the logger currently in use. */
function log() {
  LOG.apply(null, arguments);
}

/**
 * Switches debug logging on or off.
 * @param {*} enabled truthy to print debug messages, falsy to silence them
 */
function toggle(enabled) {
  LOG = enabled ? realLog : nullLog;
}

// frozen, prototype-less namespace object (exported as `LOG` at the end of the file)
var LOG$1 = /*#__PURE__*/Object.freeze({
  __proto__: null,
  log,
  toggle,
});
/**
* PdfReader: class that reads a PDF file, and calls a function on each item found while parsing that file.
* @author Adrien Joly, http://github.com/adrienjoly
* This content is released under the MIT License.
*
* An item object can match one of the following objects:
* - a missing item (the handler is called without an item argument), when the parsing is over, or an error occurred.
* - {file:{path:string}}, when a PDF file is being opened.
* - {page:integer}, when a new page is being parsed, provides the page number, starting at 1.
* - {text:string, x:float, y:float, w:float, h:float...}, represents each text with its position.
*
**/
/**
 * Walks the pages of a parsed PDF and reports every item to `handler`.
 * For each page: calls handler(null, {page, width, height}), then handler(null, item) for each
 * entry of page.Texts, after storing the URI-decoded text of its first run (item.R[0].T) in item.text.
 * Once all pages were visited, calls handler() without any argument.
 * @param {Object} pdf parsed data; pages are read from pdf.Pages, or else from pdf.formImage.Pages
 * @param {Function} handler function(error, item)
 */
function forEachItem(pdf, handler) {
  // pdf.formImage was removed in pdf2json@2, but we keep backward compatibility too
  var legacy = pdf.formImage;
  var pages = pdf.Pages || legacy.Pages;
  var pageCount = 0;

  // page dimensions fall back to the legacy formImage values, or to 0 when there is none
  function describePage(page, number) {
    return {
      page: number,
      width: page.Width || (legacy ? legacy.Width : 0),
      height: page.Height || (legacy ? legacy.Pages[number - 1].Height : 0),
    };
  }

  for (var key in pages) {
    var page = pages[key];
    pageCount += 1; // page numbers start at 1
    handler(null, describePage(page, pageCount));
    var texts = page.Texts;
    for (var t in texts) {
      var textItem = texts[t];
      textItem.text = decodeURIComponent(textItem.R[0].T);
      handler(null, textItem);
    }
  }
  handler();
}
/**
 * PdfReader constructor.
 * @param {Object} [options] stored as this.options; the parse methods read its
 *   `password`, `debug` and `signal` properties.
 */
function PdfReader(options) {
  log("PdfReader"); // only displayed if LOG.js was first loaded with `true` as init parameter
  this.options = options || {};
}

// Creates a PDFParser (with the password, if any) whose error and data events feed itemHandler.
function makePdfParser(options, itemHandler) {
  var pdfParser = options.password
    ? new PDFParser(null, null, options.password)
    : new PDFParser();
  // the error event payload is passed to itemHandler as its first (error) argument
  pdfParser.on("pdfParser_dataError", itemHandler);
  pdfParser.on("pdfParser_dataReady", function (pdfData) {
    forEachItem(pdfData, itemHandler);
  });
  return pdfParser;
}

// Destroys the parser when the (optional) abort signal fires.
function destroyOnAbort(signal, pdfParser) {
  signal?.addEventListener("abort", function () {
    pdfParser.destroy();
  });
}

/**
 * parseFileItems: calls itemHandler(error, item) on each item parsed from the pdf file.
 * The first item reported is {file: {path: pdfFilePath}}.
 **/
PdfReader.prototype.parseFileItems = function (pdfFilePath, itemHandler) {
  itemHandler(null, { file: { path: pdfFilePath } });
  var pdfParser = makePdfParser(this.options, itemHandler);
  pdfParser.loadPDF(pdfFilePath, this.options.debug ? 1 : 0);
  destroyOnAbort(this.options.signal, pdfParser);
};

/**
 * parseBuffer: calls itemHandler(error, item) on each item parsed from the pdf file received as a buffer.
 * The first item reported is {file: {buffer: pdfBuffer}}.
 */
PdfReader.prototype.parseBuffer = function (pdfBuffer, itemHandler) {
  itemHandler(null, { file: { buffer: pdfBuffer } });
  var pdfParser = makePdfParser(this.options, itemHandler);
  pdfParser.parseBuffer(pdfBuffer, this.options.debug ? 1 : 0);
  destroyOnAbort(this.options.signal, pdfParser);
};
/**
* parseColumns, for pdfreader, used by the Rule class.
* accumulates values below each column header (on 1st row, given their name), without detecting empty rows.
* TODO: use ColumnsParser
* @author Adrien Joly, http://github.com/adrienjoly
* This content is released under the MIT License.
**/
/**
 * parseColumns accumulator builder, to be invoked with a Rule as `this`.
 * Arguments: the column names, i.e. the texts of the header items.
 * Sets this.cols (the column names) and this.output: an array holding one object per line
 * found below the header, mapping a column index to the text of the last item classified
 * into that column. A new line starts whenever the y-coordinate of an item differs from
 * the previous item's.
 * @returns {Function} processItem(item), to be called on each following item.
 */
const parseColumns$1 = function (/* columns */) {
  this.output = [];
  this.cols = Array.prototype.slice.apply(arguments);
  var colNames = this.cols,
    colX = [], // x-position of each column header, indexed like colNames
    nbColsFound = 0, // number of distinct indexes already set in colX
    rows = this.output,
    line = -1, // -1 = still parsing the header; then index of the current line in rows
    lineY = null;
  function processItem(item) {
    if (line === -1) {
      // parse x-position of column headers
      var i = colNames.indexOf(item.text);
      if (i > -1) {
        if (!(i in colX)) ++nbColsFound;
        colX[i] = item.x;
      }
      // colX may be sparse when headers are met out of order, so its length cannot tell
      // whether every header was found: count the detected headers instead.
      if (nbColsFound === colNames.length) {
        log("table header:", colNames, colX);
        line++;
      }
      return;
    }
    if (lineY === null) {
      lineY = item.y;
    } else if (lineY != item.y) {
      lineY = item.y;
      line++;
    }
    // parsing values for each column: pick the right-most column whose header
    // starts strictly before the item, or the first column if there is none
    var col = 0;
    for (var c = colX.length - 1; c >= 0; --c)
      if (item.x > colX[c]) {
        col = c;
        break;
      }
    // rows are indexed by line number (not by y-coordinate), so that output is a dense array
    rows[line] = rows[line] || {};
    rows[line][col] = item.text;
  }
  processItem(this.currentItem); // apply on header's first item
  return processItem; // then the same function will be run on all following items, until another rule is triggered
};

// frozen, prototype-less namespace object (merged into the exported parseColumns function)
var parseColumnsExports = /*#__PURE__*/Object.freeze({
  __proto__: null,
  parseColumns: parseColumns$1
});
/**
* parseTable accumulator, for pdfreader, used by the Rule class.
* items are classified into columns and rows, based on their left and top coordinates,
* and left position of column headers.
* TODO: use TableParser
* @author Adrien Joly, http://github.com/adrienjoly
* This content is released under the MIT License.
**/
/** @returns the top (y) coordinate of a PDF item */
function getTopPos({ y }) {
  return y;
}

/** @returns the left (x) coordinate of a PDF item */
function getLeftPos({ x }) {
  return x;
}

/** @returns the text of a PDF item */
function getText$1({ text }) {
  return text;
}
/**
 * makeFloorClassifier(): makes a classifier, based on an array of numbers and an expected number of clusters.
 * nbClusters: expected number of clusters
 * arr: array of numbers
 * => returns a function that takes a number, and returns the index of its corresponding cluster.
 * The range of arr is extended downwards by half a cluster width before being divided,
 * so the smallest value of arr maps to 0 and the largest one maps to nbClusters.
 * When all the values of arr are equal, every value is classified into cluster 0.
 **/
function makeFloorClassifier(nbClusters, arr) {
  var min = Math.min.apply(Math, arr);
  var delta = Math.max.apply(Math, arr) - min;
  if (delta === 0) {
    // a single distinct value: dividing by delta would yield NaN instead of a cluster index
    return function classify() {
      return 0;
    };
  }
  min -= delta / nbClusters / 2;
  return function classify(value) {
    return Math.floor((nbClusters * (value - min)) / delta);
  };
}
/**
 * Makes a column classifier from the items of a header row.
 * The column boundaries are 0 and the left position of each header item, sorted in ascending order.
 * @param {Array} header items of the header row
 * @returns {Function} classify(item): index of the right-most boundary located at or before
 *   the item's left position, or undefined if the item is located before all boundaries.
 */
function makeColumnClassifier(header) {
  var boundaries = [0].concat(header.map(getLeftPos));
  boundaries.sort(function (left, right) {
    return left - right;
  });
  return function classify(item) {
    var index = boundaries.length;
    while (index-- > 0) {
      if (getLeftPos(item) >= boundaries[index]) return index;
    }
  };
}
/**
 * Groups items by row.
 * @param {Array} items PDF items
 * @param {Function} classifyRow takes the top position of an item, returns its row index
 * @returns {Array} array (possibly sparse) of arrays of items, indexed by row index
 */
function buildRowList(items, classifyRow) {
  var rows = [];
  for (var key in items) {
    var item = items[key];
    var rowIndex = classifyRow(getTopPos(item));
    if (!rows[rowIndex]) rows[rowIndex] = [];
    rows[rowIndex].push(item);
  }
  return rows;
}
/**
 * Makes a cell renderer for debugging purposes.
 * @param {String} separ separator inserted between the texts of the items of a same cell
 * @returns {Function} takes a cell (array of items, or a falsy value for an empty cell)
 *   and returns the joined texts, truncated to 7 characters.
 */
function joinCellCollisions$1(separ) {
  return function (cell) {
    var texts = (cell || []).map(getText$1);
    return texts.join(separ).slice(0, 7);
  };
}
/** Truncates a string to its first 7 characters, so that it fits in a tab-separated column. */
function fillTab(str) {
  return str.slice(0, 7);
}
/**
 * Renders a table of strings for debugging purposes: one line per row,
 * cells truncated by fillTab() and separated by tabs.
 * @param {Array} table array of rows, each being an array of strings (falsy rows render as empty lines)
 * @returns {String}
 */
function renderTable(table) {
  const renderRow = (row) => (row || []).map(fillTab).join("\t");
  return (table || []).map(renderRow).join("\n");
}
/**
 * Renders a matrix (row -> column -> items) for debugging purposes: one line per row,
 * cells separated by tabs, texts of colliding items joined with "+".
 * @param {Array} matrix
 * @returns {String}
 */
function renderMatrix$1(matrix) {
  const renderRow = (row) => (row || []).map(joinCellCollisions$1("+")).join("\t");
  return (matrix || []).map(renderRow).join("\n");
}
/**
 * Renders rows of items for debugging purposes: one line per row, starting with "<row index>:",
 * followed by one tab-separated "<floored x>:<text>" cell per item, truncated to 7 characters.
 * @param {Array} rows array of rows, each being a collection of items
 * @returns {String}
 */
function renderRows$1(rows) {
  const renderRow = (row, rowId) => {
    const cells = [`${rowId}:`];
    for (const key in row) {
      const item = row[key];
      cells.push((Math.floor(item.x) + ":" + item.text).slice(0, 7));
    }
    return cells.join("\t");
  };
  return (rows || []).map(renderRow).join("\n");
}
/**
 * Renders a list of items for debugging purposes: one line per item,
 * made of its y, x and text properties separated by tabs.
 * @param {Array} items
 * @returns {String}
 */
function renderItems(items) {
  const renderItem = ({ y, x, text }) => [y, x, text].join("\t");
  return items.map(renderItem).join("\n");
}
/**
 * Classifies the items of each row into columns.
 * @param {Array} rows array of rows, each being a collection of items
 * @param {Function} classifyColumn takes an item, returns its column index
 * @returns {Array} matrix: row -> column -> array of the items classified into that cell.
 *   Rows are appended in enumeration order, so indexes missing from `rows` leave no gap in the matrix.
 */
function buildMatrix(rows, classifyColumn) {
  var matrix = [];
  for (var y in rows) {
    var rowItems = rows[y];
    var cells = [];
    for (var x in rowItems) {
      var item = rowItems[x];
      var colN = classifyColumn(item);
      if (!cells[colN]) cells[colN] = [];
      cells[colN].push(item);
    }
    matrix.push(cells);
  }
  return matrix;
}
/**
 * Lists the cells of a matrix that hold more than one item.
 * @param {Array} matrix row -> column -> array of items
 * @returns {Array} one {row, col, items} object per colliding cell
 */
function detectCollisions(matrix) {
  var collisions = [];
  (matrix || []).forEach(function (row, rowN) {
    (row || []).forEach(function (cellItems, colN) {
      if (cellItems.length > 1) {
        collisions.push({ row: rowN, col: colN, items: cellItems });
      }
    });
  });
  return collisions;
}
/**
 * parseTable accumulator builder, to be invoked with a Rule as `this`.
 * Collects all following items; when the rule is done (see whenDone), classifies them into
 * rows then into columns, and stores the results in this.output.rows and this.output.matrix.
 * @param {Number} nbRows expected number of rows (stored as this.nbRows, defaults to 0)
 * @param {Number} headerRow index of the row whose items define the columns (defaults to 0)
 * @returns {Function} accumulate(item), to be called on each following item.
 */
const parseTable$1 = function makeAccumulator(nbRows, headerRow) {
  var rule = this;
  var items = [];
  rule.nbRows = nbRows || 0;
  rule.output = { items, rows: null, matrix: null };

  // when parsing is done: generate a clean table, from items.
  function buildTable() {
    var output = this.output;
    // classify items into rows
    var classifyRow = makeFloorClassifier(rule.nbRows, items.map(getTopPos));
    output.rows = buildRowList(items, classifyRow);
    // classify row items into columns
    var classifyColumn = makeColumnClassifier(output.rows[headerRow || 0]);
    output.matrix = buildMatrix(output.rows, classifyColumn);
  }
  rule.whenDone(buildTable);

  // then the same function will be run on all following items, until another rule is triggered
  return function accumulate(item) {
    items.push(item);
  };
};
// Frozen, prototype-less namespace object gathering the parseTable accumulator builder
// and the table helpers defined above (collision detection and text renderers).
// It is merged into the exported parseTable function at the end of the file.
var parseTableExports = /*#__PURE__*/Object.freeze({
__proto__: null,
detectCollisions: detectCollisions,
parseTable: parseTable$1,
renderItems: renderItems,
renderMatrix: renderMatrix$1,
renderRows: renderRows$1,
renderTable: renderTable
});
/**
* Rule: class that can be used to define and process data extraction rules, while parsing a PDF document.
* @author Adrien Joly, http://github.com/adrienjoly
* This content is released under the MIT License.
**/
/**
 * Rule constructor.
 * regexp: a regular expression which a PDF item's text must match in order to execute that rule.
 * => a Rule object exposes "accumulators": methods that define the data extraction strategy of a rule.
 *    Each of these methods records the selected accumulator and its parameters, then returns the Rule.
 **/
function Rule(regexp) {
  this.regexp = regexp;
  var self = this;
  // proxy accumulators methods
  Object.keys(Rule.accumulators).forEach(function (name) {
    self[name] = function () {
      log("building rule:", regexp, "->", name);
      self.methodName = name;
      self.accumulatorParams = arguments;
      self.accumulatorBuilder = Rule.accumulators[name];
      return self;
    };
  });
}

// shortcut for defining Rule objects in a more concise manner
Rule.on = function (regexp) {
  return new Rule(regexp);
};

// same as Rule.on(), but the item following the matching one is not tested against the rules:
// it is passed directly to the accumulator (see makeItemProcessor).
Rule.after = function (regexp) {
  var rule = new Rule(regexp);
  rule.skipCurrentItem = true;
  return rule;
};

/**
 * then(): defines a function to be called after a Rule's accumulator has finished processing items.
 * fct: the function to be called after a Rule's accumulator has finished processing items.
 *      It is called with the Rule as `this`, and the output of the accumulator as first parameter.
 * => returns the Rule, for chaining.
 **/
Rule.prototype.then = function (fct) {
  var self = this;
  this.terminate = function () {
    fct.call(self, self.output);
  };
  return this;
};

// private function that checks a PDF item against the Rule's regexp, and returns the corresponding accumulator.
// returns undefined when the item does not match, or when no accumulator was selected for that rule.
Rule.prototype.test = function (item) {
  // a regexp with the `g` or `y` flag resumes matching from lastIndex:
  // reset it, so that each item is tested from its first character.
  this.regexp.lastIndex = 0;
  if (this.regexp.test(item.text)) {
    // lazy init of accumulators: build and init the accumulator on first match
    this.currentItem = item;
    if (!this.accumulatorImpl && this.accumulatorBuilder) {
      this.accumulatorImpl = this.accumulatorBuilder.apply(
        this,
        this.accumulatorParams
      );
      this.accumulatorImpl.methodName = this.methodName;
      this.accumulatorImpl.terminate = this.terminate;
    }
    return this.accumulatorImpl;
  }
};

// intended to be run from accumulator, in order to process output before calling termination then() handler.
Rule.prototype.whenDone = function (fct) {
  var self = this;
  var then = this.terminate; // undefined if then() was not called on that rule
  this.terminate = function () {
    fct.call(self);
    if (then) then();
  };
};

/**
 * rules: array of Rule objects that will be executed one-by-one, whenever a PDF item matches a rule.
 *        each rule can only be executed once: it is deleted from the `rules` array after its first match.
 * => returns a function to be called for each item by the PdfReader.
 *    A falsy item terminates the current accumulator; items without text are ignored.
 **/
Rule.makeItemProcessor = function (rules) {
  var currentAccumulator = null;
  function terminateAccumulator() {
    var terminatePreviousAcc = (currentAccumulator || {}).terminate;
    if (terminatePreviousAcc) {
      log("terminating accumulator:", currentAccumulator.methodName);
      terminatePreviousAcc(currentAccumulator); // TODO: remove currentAccumulator parameter
    }
  }
  var applyRulesOnNextItem = true;
  return function (item) {
    if (!item)
      // last item of the file => flush buffers
      return terminateAccumulator();
    else if (!item.text) return;
    if (applyRulesOnNextItem) {
      for (var r in rules) {
        var accumulator = rules[r].test(item);
        if (accumulator) {
          terminateAccumulator();
          log("current accumulator:", accumulator.methodName);
          if (rules[r].skipCurrentItem) applyRulesOnNextItem = false;
          currentAccumulator = accumulator;
          delete rules[r];
          return;
        }
      }
    } else {
      applyRulesOnNextItem = true;
    }
    // if reaching this point, the current item matches none of the rules => accumulating data on current accumulator
    // (an accumulator returning a truthy value prevents the rules from being applied on the next item)
    if (currentAccumulator) applyRulesOnNextItem = !currentAccumulator(item);
  };
};

/**
 * Rule.accumulators: map (name -> builder) of accumulators that can be used for defining Rule objects.
 * An accumulator builder is a function that may (or may not) accept parameters, to be provided by the developer of a parser.
 * It is called with the Rule as `this`, on the first item matching the rule (available as this.currentItem).
 * It returns another function that will be run on every following PDF item, in order to accumulate data.
 * The output of an accumulator is stored in this.output (field of its parent Rule object).
 **/
Rule.accumulators = {
  stopAccumulating: function () {
    return function () {};
  },
};

// method for adding accumulators
// note: Rule objects only expose the accumulators that were added before their creation.
Rule.addAccumulator = function (methodName, methodBuilder) {
  Rule.accumulators[methodName] = methodBuilder;
};

/**
 * This accumulator will store the group values extracted by the regexp of the Rule object,
 * on the current matching PDF item, into an array.
 *
 * E.g. with regex: /hello ([a-z]+)/, the text "hello world" will yield "world".
 **/
Rule.addAccumulator("extractRegexpValues", function () {
  // Rule.prototype.test() has just run the same regexp: with the `g` or `y` flag, exec() would
  // resume after that match and return null, so matching is restarted from the beginning.
  this.regexp.lastIndex = 0;
  var matches = this.regexp.exec(this.currentItem.text);
  this.output = matches ? matches.slice(1) : [];
  return function () {}; // following lines are not to be processed by this accumulator
});

/**
 * This accumulator will store the value of the next PDF item.
 **/
Rule.addAccumulator("parseNextItemValue", function () {
  var self = this,
    done = false;
  return function (item) {
    if (done) return;
    done = true;
    self.output = item.text;
  };
});

/**
 * This accumulator will store the text of all following PDF items into an array.
 **/
Rule.addAccumulator("accumulateAfterHeading", function () {
  var output = (this.output = []);
  return function accumulate(item) {
    output.push(item.text);
  };
});

/**
 * This accumulator will store the text of all following PDF items with equal x-coordinates.
 * The reference x-coordinate is the one of the first item received by the accumulator.
 **/
Rule.addAccumulator("accumulateFromSameX", function () {
  var output = (this.output = []),
    x = null;
  return function accumulate(item) {
    if (x === null) x = item.x;
    if (x == item.x) output.push(item.text);
  };
});
/**
* Registers the parseColumns accumulator: it stores a table by detecting its columns, given their names.
**/
Rule.addAccumulator("parseColumns", parseColumns$1);
/**
* Registers the parseTable accumulator: it stores a table by classifying items into rows, given the
* expected number of rows, then into columns, given the left positions of the items of a header row.
**/
Rule.addAccumulator("parseTable", parseTable$1);
/**
 * Applies a list of simple actions, one item at a time, in order to accumulate field values.
 * Each entry of `accumulators` handles the incoming item in one of these ways:
 * - {field: name}: stores the item in fields[name], then moves to the next action;
 * - {accumulator: fct}: calls fct(item, parser), and moves to the next action once fct returns a truthy value;
 * - anything else: skips the item, then moves to the next action.
 * Provides a list of parsed `fields`.
 * Calls `callback(null, this)` when all accumulators were processed, or when processing a falsy item.
 * Items received after all accumulators were processed are skipped with a warning.
 **/
function SequentialParser(accumulators, callback) {
  let cursor = 0; // index of the action that will handle the next item
  return {
    fields: {},
    addField(key, value) {
      this.fields[key] = value;
    },
    parseItem(item) {
      if (cursor >= accumulators.length) {
        return console.warn(
          "warning: skipping item, because SequentialParser is done."
        );
      }
      const action = accumulators[cursor];
      if (action.field) {
        this.addField(action.field, item);
        cursor += 1;
      } else if (action.accumulator) {
        if (action.accumulator(item, this)) cursor += 1;
      } else {
        // no action => skip item
        cursor += 1;
      }
      const finished = !item || cursor >= accumulators.length;
      if (finished && callback) callback(null, this);
    },
  };
}
/**
* TableParser
* Classifies items into columns and rows, based on their left and top coordinates,
* and left position of column headers.
* @author Adrien Joly, http://github.com/adrienjoly
* This content is released under the MIT License.
**/
/**
 * TableParser constructor.
 * this.rows maps the y-coordinate of each row (as a string) to an object
 * mapping each column to the array of items added to that cell.
 */
function TableParser() {
  this.rows = {};
}

/**
 * Adds an item to the cell of the row located at item.y, in the provided column.
 * @param {Object} item PDF item
 * @param {*} col key of the column
 */
TableParser.prototype.processItem = function (item, col) {
  var rowKey = "" + item.y;
  if (!this.rows[rowKey]) this.rows[rowKey] = {};
  var row = this.rows[rowKey];
  if (!row[col]) row[col] = [];
  row[col].push(item);
};

/**
 * Adds a copy of a heading item (x and text only) to the row located at y = 0.
 */
TableParser.prototype.processHeadingItem = function (item, col) {
  var heading = { y: 0, x: item.x, text: item.text };
  this.processItem(heading, col);
};

// Rows

// returns a copy of values, sorted in ascending order of their value parsed as a float
function sortAsFloatValues(values) {
  var sorted = values.slice();
  sorted.sort(function (a, b) {
    return parseFloat(a) - parseFloat(b);
  });
  return sorted;
}

/** @returns an array of rows (column -> items), sorted by ascending y-coordinate */
TableParser.prototype.getRows = function () {
  var rowsByY = this.rows;
  return sortAsFloatValues(Object.keys(rowsByY)).map(function (yKey) {
    return rowsByY[yKey];
  });
};

// renders one line per row: "<row index>:\t" followed by the "<x>: <text>" of each item
function renderRows(rows) {
  var lines = (rows || []).map(function (row, rowId) {
    var cells = [];
    for (var col in row) {
      var cell = row[col];
      for (var j in cell) cells.push(cell[j].x + ": " + cell[j].text);
    }
    return rowId + ":\t" + cells.join(", ");
  });
  return lines.join("\n");
}

/** @returns a textual rendition of the rows, for debugging purposes */
TableParser.prototype.renderRows = function () {
  return renderRows(this.getRows());
};

// Matrix

// returns the column keys found in any of the rows, sorted as float values
function getSortedXValues(rows) {
  var xSet = {};
  for (var y in rows) {
    for (var x in rows[y]) xSet[x] = true;
  }
  return sortAsFloatValues(Object.keys(xSet));
}

/** @returns a 3-dimension matrix: row -> column -> items colliding in that column -> item */
TableParser.prototype.getMatrix = function () {
  var rows = this.getRows();
  var xValues = getSortedXValues(rows);
  return rows.map(function (row) {
    var cells = [];
    for (var x in row) {
      var colN = xValues.indexOf(x);
      cells[colN] = (cells[colN] || []).concat(row[x]);
    }
    return cells;
  });
};

/**
 * For use with console.table().
 * @param {String} collisionSeparator separator to use when there are multiple values to join for a given column
 * @returns a 2-dimension matrix: row -> column -> value
 */
TableParser.prototype.getCleanMatrix = function ({ collisionSeparator } = {}) {
  var separator = collisionSeparator || "";
  return this.getMatrix().map(function (rowColumns) {
    return rowColumns.map(function (items) {
      return items
        .map(function (item) {
          return item.text;
        })
        .join(separator);
    });
  });
};

function getText(item) {
  return item.text;
}

// makes a function that joins the texts of the items of a cell, truncated to 7 characters
function joinCellCollisions(separ) {
  return function (cell) {
    return (cell || []).map(getText).join(separ).slice(0, 7);
  };
}

// renders one line per row: tab-separated cells, colliding texts joined with "+"
function renderMatrix(matrix) {
  var lines = (matrix || []).map(function (row) {
    return (row || []).map(joinCellCollisions("+")).join("\t");
  });
  return lines.join("\n");
}

/** @returns a textual rendition of the matrix, for debugging purposes */
TableParser.prototype.renderMatrix = function () {
  return renderMatrix(this.getMatrix());
};
/**
* ColumnsParser
* Classifies items into columns, nearest to the left position of their corresponding header.
* @author Adrien Joly, http://github.com/adrienjoly
* This content is released under the MIT License.
**/
/**
 * Finds the column whose header is the nearest to a horizontal position.
 * Columns are scanned in array order, and the scan stops as soon as the distance increases.
 * NOTE(review): this presumably expects cols to be sorted by ascending x — confirm with callers.
 * @param {Array} cols columns, each having a `x` property
 * @param {Number} x horizontal position to classify
 * @returns {Number} index of the selected column (-1 if cols is empty)
 */
function getColumnIndex(cols, x) {
  var bestDist = null;
  for (var i = 0; i < cols.length; ++i) {
    var dist = Math.abs(x - cols[i].x);
    if (bestDist !== null && dist > bestDist) break;
    bestDist = dist;
  }
  return i - 1;
}

/**
 * ColumnsParser constructor.
 * The first items passed to processItem() are expected to be the column headers: an item
 * whose text equals a column name that was not met yet defines the position of that column.
 * Once every column was found, each following item is appended to the `items` of the nearest column.
 * @param {Array} colNames texts of the column headers (not modified); a same name may appear several times
 * this.cols: array of {name, x, items} objects, indexed like colNames.
 */
function ColumnsParser(colNames) {
  this.cols = [];
  var cols = this.cols,
    names = colNames.slice(), // clone (for parameter immutability)
    nbColsFound = 0,
    parsingHeader = true;

  // index of the first column named `text` that was not found yet, or -1
  // (so that a column name can be associated to more than 1 index)
  function findPendingColumn(text) {
    for (var n = 0; n < names.length; ++n)
      if (names[n] === text && !cols[n]) return n;
    return -1;
  }

  this.processItem = function (item) {
    if (!parsingHeader) {
      cols[getColumnIndex(cols, item.x)].items.push(item);
      return;
    }
    // parse x-position of column headers
    var i = findPendingColumn(item.text);
    if (i > -1) {
      log("ColumnsParser header", i, item.text, "=> x:", item.x);
      cols[i] = {
        name: item.text,
        x: item.x,
        items: [],
      };
      ++nbColsFound;
    }
    // cols may be sparse when headers are met out of order, so its length cannot tell
    // whether every header was found: count the detected headers instead.
    if (nbColsFound === names.length) {
      // done parsing header
      parsingHeader = false;
    }
  };
}
// The accumulator builders are exported as functions that also carry the members of their
// namespace object as properties (e.g. parseTable.renderMatrix, parseColumns.parseColumns).
// Object.assign() copies those properties onto parseTable$1 / parseColumns$1 themselves
// and returns the same function objects.
const parseTable = Object.assign(
parseTable$1,
parseTableExports
);
const parseColumns = Object.assign(
parseColumns$1,
parseColumnsExports
);
// public API of the package (CommonJS exports)
exports.ColumnsParser = ColumnsParser;
exports.LOG = LOG$1;
exports.PdfReader = PdfReader;
exports.Rule = Rule;
exports.SequentialParser = SequentialParser;
exports.TableParser = TableParser;
exports.parseColumns = parseColumns;
exports.parseTable = parseTable;
;