pdf-parse-test
Version:
Pure TypeScript, cross-platform module for extracting text, images, and tabular data from PDFs. Run directly in your browser or in Node!
180 lines • 6.1 kB
JavaScript
/**
 * Enum-like lookup describing how two rectangles sit next to each other.
 *
 * The object maps both ways: name -> ordinal (`RelativeDirections.Left === 1`)
 * and ordinal -> name (`RelativeDirections[1] === 'Left'`).
 */
// `var` plus the `||` fallback keeps the original "reuse an existing object
// if one is already there" behaviour of the compiled enum.
var RelativeDirections = RelativeDirections || {};
// The position of each name in this list is its ordinal value.
['None', 'Left', 'Right', 'Top', 'Bottom'].forEach((label, ordinal) => {
    RelativeDirections[label] = ordinal;
    RelativeDirections[ordinal] = label;
});
/**
 * Axis-aligned rectangle that accumulates the text of every item whose
 * anchor point falls inside it.
 */
export class Rectangle {
    /** Caller-supplied identifier; only used by `toString()`. */
    id;
    /** Lower x bound. */
    x;
    /** Lower y bound. */
    y;
    width;
    height;
    /** Upper x bound, `x + width`. */
    x2;
    /** Upper y bound, `y + height`. */
    y2;
    /** Text collected so far through `tryAddText()`. */
    text;

    /**
     * @param {*} id - Identifier for this rectangle.
     * @param {number} x - Lower x bound.
     * @param {number} y - Lower y bound.
     * @param {number} width - Extent along x.
     * @param {number} height - Extent along y.
     */
    constructor(id, x, y, width, height) {
        this.id = id;
        this.x = x;
        this.y = y;
        this.width = width;
        this.height = height;
        this.x2 = x + width;
        this.y2 = y + height;
        this.text = '';
    }

    /**
     * @returns {string} The id and the collected text, separated by a space.
     */
    toString() {
        return `${this.id} ${this.text}`;
    }

    /**
     * Appends the item's text to this rectangle when the item's position lies
     * inside it (bounds inclusive).
     *
     * The position is read from `item.transform[4]` (x) and
     * `item.transform[5]` (y). A newline is appended after the text when
     * `item.hasEOL` is truthy.
     *
     * @param {{ transform: number[], str?: string, hasEOL?: boolean }} item -
     *   Text item to test (presumably a PDF text-content item; confirm with
     *   the caller).
     * @returns {boolean} `true` when the item lies inside this rectangle, even
     *   if nothing was appended; `false` otherwise.
     */
    tryAddText(item) {
        const x = item.transform[4];
        const y = item.transform[5];
        const isInside = x >= this.x && y >= this.y && x <= this.x2 && y <= this.y2;
        if (!isInside) {
            return false;
        }
        // A missing `str` is treated as empty text; interpolating it directly
        // would append the literal word "undefined" to the cell.
        const str = item.str ?? '';
        // Do not start a cell with an empty fragment (or its line break).
        if (str.length === 0 && this.text.length === 0) {
            return true;
        }
        this.text += `${str}${item.hasEOL ? '\n' : ''}`;
        return true;
    }

    /**
     * Reports on which side of this rectangle `rect` is attached.
     *
     * Horizontal neighbours must have (almost) the same `height` and `y`;
     * vertical neighbours must have (almost) the same `width` and `x`.
     * "Almost" means an absolute difference strictly below `distance`.
     *
     * - `Left`:   `rect.x2` meets `this.x`
     * - `Right`:  `rect.x` meets `this.x2`
     * - `Top`:    `rect.y2` meets `this.y`
     * - `Bottom`: `rect.y` meets `this.y2`
     *
     * Horizontal matches are checked first, so they win when both apply.
     *
     * @param {Rectangle} rect - Candidate neighbour.
     * @param {number} [distance=1] - Exclusive tolerance for every comparison.
     * @returns {number} A `RelativeDirections` value; `None` when the
     *   rectangles are not adjacent.
     */
    isNeighbour(rect, distance = 1) {
        const heightOk = Math.abs(this.height - rect.height) < distance;
        const yOk = Math.abs(this.y - rect.y) < distance;
        if (heightOk && yOk) {
            if (Math.abs(this.x - rect.x2) < distance) {
                return RelativeDirections.Left;
            }
            if (Math.abs(this.x2 - rect.x) < distance) {
                return RelativeDirections.Right;
            }
        }
        const widthOk = Math.abs(this.width - rect.width) < distance;
        const xOk = Math.abs(this.x - rect.x) < distance;
        if (widthOk && xOk) {
            if (Math.abs(this.y - rect.y2) < distance) {
                return RelativeDirections.Top;
            }
            if (Math.abs(this.y2 - rect.y) < distance) {
                return RelativeDirections.Bottom;
            }
        }
        return RelativeDirections.None;
    }
}
/**
 * A table assembled from adjacent rectangles (cells), stored row by row in
 * `grid`.
 */
export class Table {
    /** Rows of cells; `grid[row][col]`. Rows may have different lengths. */
    grid;
    // Bounding-box fields. They hold "empty box" sentinels until
    // `initMinMax()` is called. The max sentinels are `-Number.MAX_VALUE`
    // because `Number.MIN_VALUE` is the smallest *positive* number, not the
    // most negative one.
    minTableX1 = Number.MAX_VALUE;
    minTableY1 = Number.MAX_VALUE;
    maxTableX2 = -Number.MAX_VALUE;
    maxTableY2 = -Number.MAX_VALUE;
    /** Cached cell count; a negative value means "not computed yet". */
    _cellCount = -1;

    /**
     * @param {Rectangle} rect - First cell; becomes `grid[0][0]`.
     */
    constructor(rect) {
        this.grid = [[rect]];
    }

    /**
     * Total number of cells over all rows.
     *
     * The value is cached after the first read; `Table.addRectangle` resets
     * the cache whenever it inserts a cell. Direct edits of `grid` made after
     * the first read are not noticed.
     *
     * @returns {number}
     */
    get cellCount() {
        if (this._cellCount < 0) {
            // Sum into a fresh counter: adding onto the -1 sentinel would
            // report one cell too few.
            let total = 0;
            for (const row of this.grid) {
                total += row.length;
            }
            this._cellCount = total;
        }
        return this._cellCount;
    }

    /** @returns {number} Bounding-box extent along x (valid after `initMinMax()`). */
    get width() {
        return this.maxTableX2 - this.minTableX1;
    }

    /** @returns {number} Bounding-box extent along y (valid after `initMinMax()`). */
    get height() {
        return this.maxTableY2 - this.minTableY1;
    }

    /**
     * Hands `item` to the first cell, in any sufficiently large table, that
     * accepts it.
     *
     * Tables with fewer than 3 cells are skipped, as are tables whose
     * bounding box does not contain the item.
     *
     * @param {Table[]} pageTables - Tables to search, in order.
     * @param {object} item - Text item; forwarded to `Table#isInside` and
     *   `Rectangle#tryAddText`.
     * @returns {boolean} `true` when some cell accepted the item.
     */
    static tryAddText(pageTables, item) {
        for (const table of pageTables) {
            if (table.cellCount < 3) {
                continue;
            }
            if (!table.isInside(item)) {
                continue;
            }
            for (const row of table.grid) {
                for (const rectangle of row) {
                    if (rectangle.tryAddText(item)) {
                        return true;
                    }
                }
            }
        }
        return false;
    }

    /**
     * Attaches `rect` to the first table that has a cell it adjoins, or
     * starts a new table when none does.
     *
     * - `rect` to the right of a cell: appended to that cell's row.
     * - `rect` below a cell in the last row: becomes the start of a new row.
     *
     * @param {Table[]} pageTables - Tables built so far; may be extended.
     * @param {Rectangle} rect - Cell to place.
     * @returns {boolean} Always `true`; the rectangle is always stored.
     */
    static addRectangle(pageTables, rect) {
        for (const table of pageTables) {
            for (let rowIndex = 0; rowIndex < table.grid.length; rowIndex++) {
                const row = table.grid[rowIndex];
                if (!row) {
                    continue;
                }
                for (let colIndex = 0; colIndex < row.length; colIndex++) {
                    const dir = row[colIndex]?.isNeighbour(rect);
                    if (dir === RelativeDirections.Right) {
                        row.push(rect);
                        table._cellCount = -1; // grid changed: drop cached count
                        return true;
                    }
                    // A cell below only opens a new row when no row follows
                    // yet; otherwise keep scanning so a later row can claim
                    // it as a right-hand neighbour.
                    if (dir === RelativeDirections.Bottom && table.grid[rowIndex + 1] === undefined) {
                        table.grid.push([rect]);
                        table._cellCount = -1; // grid changed: drop cached count
                        return true;
                    }
                    // Left and Top neighbours are intentionally not attached
                    // here; such a rectangle falls through to the next
                    // cell/table and may end up starting its own table.
                }
            }
        }
        pageTables.push(new Table(rect));
        return true;
    }

    /**
     * @returns {string[][]} The trimmed text of every cell, row by row.
     */
    getTableArray() {
        return this.grid.map((row) => row.map((rect) => rect.text.trim()));
    }

    /**
     * Computes the table's bounding box from its cells and stores it in
     * `minTableX1`, `minTableY1`, `maxTableX2` and `maxTableY2`.
     *
     * The box is the union of all cells, so it stays correct when rows have
     * different lengths or do not start in the same column.
     *
     * @throws {Error} 'malformed table' when the grid is empty, or when the
     *   first cell of the first row or the last cell of the last row is
     *   missing.
     */
    initMinMax() {
        const firstRow = this.grid[0];
        const lastRow = this.grid[this.grid.length - 1];
        if (firstRow === undefined || lastRow === undefined) {
            throw new Error('malformed table');
        }
        if (firstRow[0] === undefined || lastRow[lastRow.length - 1] === undefined) {
            throw new Error('malformed table');
        }
        let minX = Number.POSITIVE_INFINITY;
        let minY = Number.POSITIVE_INFINITY;
        let maxX = Number.NEGATIVE_INFINITY;
        let maxY = Number.NEGATIVE_INFINITY;
        for (const row of this.grid) {
            // Tolerate holes in the grid or in a row instead of crashing.
            for (const rect of row ?? []) {
                if (rect === undefined) {
                    continue;
                }
                minX = Math.min(minX, rect.x);
                minY = Math.min(minY, rect.y);
                maxX = Math.max(maxX, rect.x2);
                maxY = Math.max(maxY, rect.y2);
            }
        }
        this.minTableX1 = minX;
        this.minTableY1 = minY;
        this.maxTableX2 = maxX;
        this.maxTableY2 = maxY;
    }

    /**
     * Tests the item's position (`transform[4]`, `transform[5]`) against the
     * bounding box, bounds inclusive. Always `false` before `initMinMax()`
     * has run.
     *
     * @param {{ transform: number[] }} item
     * @returns {boolean}
     */
    isInside(item) {
        const x = item.transform[4];
        const y = item.transform[5];
        return x >= this.minTableX1 && y >= this.minTableY1 && x <= this.maxTableX2 && y <= this.maxTableY2;
    }

    /**
     * @returns {string} Untrimmed cell texts, tab-separated within a row and
     *   newline-separated between rows.
     */
    toString() {
        return this.grid
            .map((row) => row.map((cell) => cell.text).join('\t'))
            .join('\n');
    }
}
//# sourceMappingURL=TableUtil.js.map