UNPKG

pdfreader

Version:

Read text and parse tables from PDF files. Supports tabular data with automatic column detection, and rule-based parsing.

117 lines (100 loc) 2.89 kB
/**
 * TableParser
 * Classifies items into columns and rows, based on their left and top coordinates,
 * and left position of column headers.
 * @author Adrien Joly, http://github.com/adrienjoly
 * This content is released under the MIT License.
 **/

/**
 * Creates an empty parser.
 * `rows` maps a stringified `y` value to a row object; each row object maps a
 * column key to the array of items stored in that cell.
 */
export function TableParser() {
  this.rows = {};
}

/**
 * Stores an item in the row keyed by `item.y`, under the given column key.
 * Items landing in the same row and column are accumulated in insertion order.
 * @param {Object} item - item to store; its `y` property selects the row
 * @param {*} col - column key (used as an object property name)
 */
TableParser.prototype.processItem = function (item, col) {
  const rowKey = "" + item.y;
  const row = (this.rows[rowKey] = this.rows[rowKey] || {});
  (row[col] = row[col] || []).push(item);
};

/**
 * Stores a heading item in the row keyed by `y = 0`, whatever its own `y` is.
 * Only the `x` and `text` properties of the item are kept.
 * @param {Object} item - heading item providing `x` and `text`
 * @param {*} col - column key (used as an object property name)
 */
TableParser.prototype.processHeadingItem = function (item, col) {
  this.processItem({ y: 0, x: item.x, text: item.text }, col);
};

// Rows

/**
 * Returns a copy of `values` sorted by ascending `parseFloat` value.
 * @param {string[]} values
 * @returns {string[]} a new sorted array; the input is left untouched
 */
function sortAsFloatValues(values) {
  return [...values].sort((a, b) => parseFloat(a) - parseFloat(b));
}

/**
 * @returns {Object[]} the row objects, ordered by ascending numeric `y`
 */
TableParser.prototype.getRows = function () {
  const { rows } = this;
  return sortAsFloatValues(Object.keys(rows)).map((y) => rows[y]);
};

/**
 * Renders rows as text, one line per row: `<rowIndex>:\t<x>: <text>, ...`.
 * @param {Object[]} rows - row objects as returned by `getRows()`
 * @returns {string}
 */
function renderRows(rows) {
  return (rows || [])
    .map((row, rowId) => {
      const cells = [];
      // Iterate own values only: for...in would also walk enumerable
      // properties inherited from a patched Array/Object prototype.
      for (const items of Object.values(row)) {
        for (const item of items) {
          cells.push(`${item.x}: ${item.text}`);
        }
      }
      return `${rowId}:\t${cells.join(", ")}`;
    })
    .join("\n");
}

/**
 * @returns {string} a textual dump of all rows, see `renderRows()`
 */
TableParser.prototype.renderRows = function () {
  return renderRows(this.getRows());
};

// Matrix

/**
 * Collects every column key used by any row.
 * @param {Object[]} rows - row objects as returned by `getRows()`
 * @returns {string[]} unique column keys, sorted by ascending `parseFloat` value
 */
function getSortedXValues(rows) {
  const xSet = {};
  for (const row of rows) {
    for (const x of Object.keys(row)) {
      xSet[x] = true;
    }
  }
  return sortAsFloatValues(Object.keys(xSet));
}

/**
 * Rows that lack a column get a hole (sparse array) at that column's index.
 * @returns an 3-dimension matrix: row -> column -> items_collisionning_in_column -> item
 */
TableParser.prototype.getMatrix = function () {
  const rows = this.getRows();
  const xValues = getSortedXValues(rows);
  // Column key -> column index, built once instead of an indexOf() per cell.
  const columnIndexByX = new Map(xValues.map((x, index) => [x, index]));
  return rows.map((row) => {
    const rowNew = [];
    for (const [x, items] of Object.entries(row)) {
      const colN = columnIndexByX.get(x);
      rowNew[colN] = (rowNew[colN] || []).concat(items);
    }
    return rowNew;
  });
};

/**
 * For use with console.table().
 * @param {String} collisionSeparator separator to use when there are multiple values to join for a given column
 * @returns a 2-dimension matrix: row -> column -> value
 */
TableParser.prototype.getCleanMatrix = function ({ collisionSeparator } = {}) {
  const separator = collisionSeparator || "";
  return this.getMatrix().map((rowColumns) =>
    rowColumns.map((items) => items.map((item) => item.text).join(separator))
  );
};

/**
 * @param {Object} item
 * @returns the item's `text` property
 */
function getText(item) {
  return item.text;
}

/**
 * Builds a cell renderer that joins the texts of the items of a cell with
 * `separ` and keeps only the first 7 characters of the result.
 * @param {string} separ - separator placed between colliding items
 * @returns {function(Object[]): string}
 */
function joinCellCollisions(separ) {
  return (cell) => (cell || []).map(getText).join(separ).slice(0, 7);
}

/**
 * Renders a matrix as text: columns separated by tabs, rows by newlines,
 * colliding items joined with "+".
 * @param {Array} matrix - matrix as returned by `getMatrix()`
 * @returns {string}
 */
function renderMatrix(matrix) {
  const renderCell = joinCellCollisions("+");
  return (matrix || [])
    .map((row) => (row || []).map(renderCell).join("\t"))
    .join("\n");
}

/**
 * @returns {string} a textual dump of the matrix, see `renderMatrix()`
 */
TableParser.prototype.renderMatrix = function () {
  return renderMatrix(this.getMatrix());
};