// pdfreader
// Version:
// Read text and parse tables from PDF files. Supports tabular data with automatic column detection, and rule-based parsing.
// 193 lines (176 loc) • 6.39 kB
// JavaScript
/**
* Rule: class that can be used to define and process data extraction rules, while parsing a PDF document.
* @author Adrien Joly, http://github.com/adrienjoly
* This content is released under the MIT License.
**/
import { log as LOG } from "./lib/LOG.js";
import { parseColumns } from "./lib/parseColumns.js";
import { parseTable } from "./lib/parseTable.js";
/**
 * Rule: couples a regular expression with a data extraction strategy (an "accumulator").
 *
 * regexp: a regular expression which a PDF item's text must match in order to execute that rule.
 *
 * => every accumulator registered in Rule.accumulators at construction time is exposed on the
 *    new Rule object as a chainable method. Calling such a method only records which accumulator
 *    to use (methodName, accumulatorBuilder) and with which parameters (accumulatorParams);
 *    the accumulator itself is built later, by Rule.prototype.test(), on the first matching item.
 *    Accumulators registered after a Rule object was created are not proxied on that object.
 **/
export function Rule(regexp) {
  const rule = this;
  rule.regexp = regexp;
  // proxy accumulator methods
  for (const accumulatorName of Object.keys(Rule.accumulators)) {
    rule[accumulatorName] = function () {
      LOG("building rule:", regexp, "->", accumulatorName);
      rule.methodName = accumulatorName;
      // kept as-is: forwarded to the accumulator builder through apply()
      rule.accumulatorParams = arguments;
      rule.accumulatorBuilder = Rule.accumulators[accumulatorName];
      return rule;
    };
  }
}
/**
 * Rule.on(): shortcut for creating a Rule object in a more concise manner.
 * regexp: the regular expression that a PDF item's text must match to trigger the rule.
 * => returns a new Rule object.
 **/
Rule.on = (regexp) => new Rule(regexp);
/**
 * Rule.after(): same as Rule.on(), but the returned Rule object is flagged with skipCurrentItem = true.
 * Rule.makeItemProcessor() reads that flag: the item that follows the matching one is then
 * handed to the accumulator without being tested against the remaining rules.
 * regexp: the regular expression that a PDF item's text must match to trigger the rule.
 **/
Rule.after = function (regexp) {
  return Object.assign(new Rule(regexp), { skipCurrentItem: true });
};
/**
 * then(): registers the function to be called after this Rule's accumulator has finished processing items.
 * fct: the callback; it is invoked with the Rule object as `this` and with the accumulator's
 *      output (this.output, read at termination time) as its first parameter.
 * => returns the Rule object, so that calls can be chained.
 **/
Rule.prototype.then = function (fct) {
  // arrow function: keeps the Rule as `this` even though terminate() is later called detached
  this.terminate = () => {
    fct.call(this, this.output);
  };
  return this;
};
// private function that checks a PDF item against the Rule's regexp, and returns the corresponding accumulator.
Rule.prototype.test = function (item) {
if (this.regexp.test(item.text)) {
// lazy init of accumulators: build and init the accumulator on first match
this.currentItem = item;
if (!this.accumulatorImpl && this.accumulatorBuilder) {
this.accumulatorImpl = this.accumulatorBuilder.apply(
this,
this.accumulatorParams
);
this.accumulatorImpl.methodName = this.methodName;
this.accumulatorImpl.terminate = this.terminate;
}
return this.accumulatorImpl;
}
};
/**
 * whenDone(): intended to be run from an accumulator, in order to process its output
 * before the termination handler registered with then() is called.
 * fct: function called (with the Rule object as `this`) when the accumulator terminates,
 *      right before the then() handler, if any.
 *
 * The handler that was in place when whenDone() is called gets wrapped; a handler
 * registered afterwards with then() would replace the wrapper.
 **/
Rule.prototype.whenDone = function (fct) {
  const self = this;
  const then = this.terminate;
  this.terminate = function () {
    fct.call(self);
    // then() may never have been called on this rule: there is nothing more to run in that case
    if (typeof then === "function") then();
  };
};
/**
 * rules: array of Rule objects that will be executed one-by-one, whenever a PDF item matches a rule.
 * each rule can only be executed once: a rule is removed from `rules` as soon as it matches.
 * => returns a function to be called for each item by the PdfReader:
 *    - called without item (end of file): terminates the current accumulator, if any;
 *    - called with an item that has no text: the item is ignored;
 *    - otherwise: the first matching rule becomes the current accumulator, or, if no rule
 *      matches, the item is passed to the current accumulator.
 **/
Rule.makeItemProcessor = function (rules) {
  let currentAccumulator = null;
  let applyRulesOnNextItem = true;

  // Calls the termination handler of the current accumulator (if any), exactly once.
  function terminateAccumulator() {
    const previousAccumulator = currentAccumulator;
    // forget the accumulator first, so that flushing twice cannot terminate it twice
    currentAccumulator = null;
    const terminatePreviousAcc = (previousAccumulator || {}).terminate;
    if (terminatePreviousAcc) {
      LOG("terminating accumulator:", previousAccumulator.methodName);
      terminatePreviousAcc(previousAccumulator); // TODO: remove currentAccumulator parameter
    }
  }

  return function (item) {
    if (!item) {
      // last item of the file => flush buffers
      terminateAccumulator();
      return;
    }
    if (!item.text) return;
    //LOG("ITEM:", item.text, "=> apply rules:", applyRulesOnNextItem);
    if (applyRulesOnNextItem) {
      // own keys only: skips the holes left by executed rules and any inherited property
      for (const key of Object.keys(rules || {})) {
        const rule = rules[key];
        if (!rule) continue;
        const accumulator = rule.test(item);
        if (accumulator) {
          terminateAccumulator();
          LOG("current accumulator:", accumulator.methodName);
          if (rule.skipCurrentItem) applyRulesOnNextItem = false;
          currentAccumulator = accumulator;
          delete rules[key]; // each rule can only be executed once
          return;
        }
      }
    } else {
      applyRulesOnNextItem = true;
    }
    // if reaching this point, the current item matches none of the rules => accumulating data on current accumulator
    // (an accumulator returning a truthy value prevents rules from being applied on the next item)
    if (currentAccumulator) applyRulesOnNextItem = !currentAccumulator(item);
  };
};
/**
 * Rule.accumulators: dictionary (name => builder) of the accumulators that can be used for defining Rule objects.
 * An accumulator builder is a function that may (or may not) accept parameters, to be provided by the developer of a parser.
 * It is called with the Rule object as `this`, and returns another function that will be run
 * on every following PDF item, in order to accumulate data.
 * The output of an accumulator is stored in this.output (field of its parent Rule object).
 **/
Rule.accumulators = {
  // accumulator that ignores every item it receives
  stopAccumulating() {
    return () => {};
  },
};
/**
 * Rule.addAccumulator(): registers (or replaces) an accumulator in Rule.accumulators.
 * methodName: name under which the accumulator is registered.
 * methodBuilder: the accumulator builder to store under that name.
 * Rule objects proxy the accumulators known when they are constructed, so an accumulator
 * must be added before creating the rules that use it.
 **/
Rule.addAccumulator = (methodName, methodBuilder) => {
  Rule.accumulators[methodName] = methodBuilder;
};
/**
 * This accumulator will store the group values extracted by the regexp of the Rule object,
 * on the current matching PDF item, into an array (this.output).
 * If the regexp does not match the current item's text, the output is an empty array.
 *
 * E.g. with regex: /hello ([a-z]+)/, the text "hello world" will yield ["world"].
 **/
Rule.addAccumulator("extractRegexpValues", function () {
  const regexp = this.regexp;
  // Rule.prototype.test() has just run this regexp against the same text: with a global (g)
  // or sticky (y) regexp, lastIndex now points after the match and exec() would find nothing.
  regexp.lastIndex = 0;
  const matches = regexp.exec(this.currentItem.text);
  regexp.lastIndex = 0; // leave the regexp in a clean state
  // exec() returns null when nothing matches: do not crash on null.slice()
  this.output = matches ? matches.slice(1) : [];
  return function () {}; // following lines are not to be processed by this accumulator
});
/**
 * This accumulator will store the text of the next PDF item in this.output.
 * Only the first item it receives is stored; the following ones are ignored.
 **/
Rule.addAccumulator("parseNextItemValue", function () {
  let isWaitingForValue = true;
  // arrow function: `this` remains the Rule object that the builder was applied to
  return (item) => {
    if (!isWaitingForValue) return;
    isWaitingForValue = false;
    this.output = item.text;
  };
});
/**
 * This accumulator will store the text of all following PDF items into an array (this.output).
 * The array is created when the accumulator is built, then filled item by item.
 **/
Rule.addAccumulator("accumulateAfterHeading", function () {
  const collectedTexts = [];
  this.output = collectedTexts;
  return function accumulate(item) {
    collectedTexts.push(item.text);
  };
});
/**
 * This accumulator will store the text of all following PDF items with equal x-coordinates.
 * The x-coordinate of the first item received is the reference: the text of that item, and of
 * every later item whose x equals the reference, is appended to the array in this.output.
 **/
Rule.addAccumulator("accumulateFromSameX", function () {
  const collectedTexts = [];
  let referenceX = null;
  this.output = collectedTexts;
  return function accumulate(item) {
    if (referenceX === null) {
      referenceX = item.x;
    }
    // loose comparison, on purpose: kept identical to the historical behaviour
    if (referenceX != item.x) return;
    collectedTexts.push(item.text);
  };
});
/**
* This accumulator will store a table by detecting its columns, given their names.
* Its builder is the parseColumns function imported from ./lib/parseColumns.js;
* it is registered here under the name "parseColumns".
**/
Rule.addAccumulator("parseColumns", parseColumns);
/**
* This accumulator will store a table by detecting its columns, given their count.
* Its builder is the parseTable function imported from ./lib/parseTable.js;
* it is registered here under the name "parseTable".
**/
Rule.addAccumulator("parseTable", parseTable);