pdfreader
Version:
Read text and parse tables from PDF files. Supports tabular data with automatic column detection, and rule-based parsing.
49 lines (46 loc) • 1.48 kB
JavaScript
/**
* parseColumns, for pdfreader, used by the Rule class.
* accumulates values below each column header (on 1st row, given their name), without detecting empty rows.
* TODO: use ColumnsParser
* @author Adrien Joly, http://github.com/adrienjoly
* This content is released under the MIT License.
**/
import { log as LOG } from "./LOG.js";
/**
 * Builds an item processor that reads a table laid out below named column headers.
 *
 * Must be called with a `this` context exposing `currentItem` (the first item
 * of the header row). It sets `this.cols` to the column names and `this.output`
 * to the array that receives the parsed rows.
 *
 * Header phase: every item whose `text` equals a column name records its `x`
 * as that column's position. The phase ends only once every named column has
 * been located, in whatever order the header items arrive.
 *
 * Data phase: every item is stored as `output[item.y][columnIndex] = item.text`,
 * where `columnIndex` is the right-most column whose header `x` is strictly
 * smaller than the item's `x` (column 0 when there is none).
 *
 * @param {...string} columns - header texts of the columns, in column order.
 * @returns {function(Object): void} processor to call with each following item
 *   (objects carrying `text`, `x` and `y`).
 */
export const parseColumns = function (...columns) {
  this.output = [];
  this.cols = columns;

  const colNames = this.cols;
  const rows = this.output;
  const colX = []; // x-position of each header, same index as in colNames
  let foundCount = 0; // number of distinct columns whose header was located
  let inHeader = true;

  // Records the x-position of a header item and leaves the header phase once
  // all columns are known.
  function processHeaderItem(item) {
    const index = colNames.indexOf(item.text);
    if (index > -1) {
      // colX is filled by index and may be sparse, so its `length` cannot tell
      // whether every column was found: count first-time assignments instead.
      if (!(index in colX)) foundCount += 1;
      colX[index] = item.x;
    }
    if (foundCount === colNames.length) {
      LOG("table header:", colNames, colX);
      inHeader = false;
    }
  }

  // Returns the index of the right-most column starting strictly left of `x`.
  // NOTE: the comparison is strict, so an item whose x equals a header's x is
  // assigned to the column on its left (behavior kept from the original code).
  function findColumn(x) {
    for (let i = colX.length - 1; i >= 0; --i) {
      if (x > colX[i]) return i;
    }
    return 0;
  }

  function processItem(item) {
    if (inHeader) {
      processHeaderItem(item);
      return;
    }
    // rows are keyed by the item's y coordinate, not by a sequential index
    const rowKey = item.y;
    rows[rowKey] = rows[rowKey] || {};
    rows[rowKey][findColumn(item.x)] = item.text;
  }

  processItem(this.currentItem); // apply on header's first item
  return processItem; // the same function handles all following items, until another rule is triggered
};