pdf-data-parser

Version:

Parse, search and stream PDF tabular data using Node.js with Mozilla's PDF.js library.

github.com/drewletcher/pdf-data-parser

drewletcher/pdf-data-parser

106 lines • 4.09 kB

TypeScript

/**
 * Type declarations for the PDF tabular-data parser class.
 *
 * Only signatures are declared here. Member descriptions restate the
 * original JSDoc; anything else is marked as an assumption to confirm
 * against the implementation.
 */
export default class PdfDataParser {
  /**
   *
   * @param {Object} options
   * @param {String|URL} [options.url] the URL or local file name of the .pdf
   * @param {String|ArrayBuffer} [options.data] pdf file data as an array, instead of using url
   * @param {String} [options.password] password for decrypting the pdf document, optional
   * @param {Number[]} [options.pages] array of page numbers to process, if undefined defaults to all pages
   * @param {String|RegExp} [options.heading] PDF section heading where data is located, default: none
   * @param {String|RegExp} [options.stopHeading] PDF section heading after data table, default: none
   * @param {Number|String} [options.cells] minimum number cells in a row for output, or "min-max" e.g. "7-9"
   * @param {Boolean} [options.newlines] preserve new lines in cell data, default: false
   * @param {Number} [options.pageHeader] height of page header area in points, default: 0
   * @param {Number} [options.pageFooter] height of page footer area in points, default: 0
   * @param {Boolean} [options.repeatingHeaders] indicates if table headers are repeated on each page, default: false
   * @param {Boolean|Number} [options.trim] trim whitespace, false (0) = none, true (1) = both, 2 = starting only, 3 = trailing only, default: true
   * @param {Boolean} [options.artifacts] parse artifacts content, default: false
   * @param {Number} [options.lineHeight] approximate line height ratio based on font size; default 1.67
   * @param {Boolean} [options.orderXY] order cells by XY coordinates on page; default true
   */
  constructor(options?: {
    url?: string | URL | undefined;
    data?: string | ArrayBuffer | undefined;
    password?: string | undefined;
    pages?: number[] | undefined;
    heading?: string | RegExp | undefined;
    stopHeading?: string | RegExp | undefined;
    cells?: number | string | undefined;
    newlines?: boolean | undefined;
    pageHeader?: number | undefined;
    pageFooter?: number | undefined;
    repeatingHeaders?: boolean | undefined;
    trim?: boolean | number | undefined;
    artifacts?: boolean | undefined;
    lineHeight?: number | undefined;
    orderXY?: boolean | undefined;
  });

  /**
   * Options held by the parser: `trim` and `orderXY` are always present,
   * intersected with the options accepted by the constructor.
   * NOTE(review): the intersection keeps `trim` typed as `boolean` for
   * backward compatibility even though the constructor documents numeric
   * values — confirm what the implementation actually stores.
   */
  options: {
    trim: boolean;
    orderXY: boolean;
  } & {
    url?: string | URL | undefined;
    data?: string | ArrayBuffer | undefined;
    password?: string | undefined;
    pages?: number[] | undefined;
    heading?: string | RegExp | undefined;
    stopHeading?: string | RegExp | undefined;
    cells?: number | string | undefined;
    newlines?: boolean | undefined;
    pageHeader?: number | undefined;
    pageFooter?: number | undefined;
    repeatingHeaders?: boolean | undefined;
    trim?: boolean | number | undefined;
    artifacts?: boolean | undefined;
    lineHeight?: number | undefined;
    orderXY?: boolean | undefined;
  };

  // Cell-count limits. Presumably derived from `options.cells` — TODO confirm.
  cells: {
    min: number;
    max: number;
    heading: number;
  };

  // Internal working state; semantics are not visible from this declaration.
  _cells: any[];
  _rows: any[];
  headingFound: boolean;
  tableFound: boolean;
  tableDone: boolean;
  firstPageNumber: any;
  headerY: number;
  footerY: number;
  started: boolean;
  paused: boolean;
  cancelled: boolean;

  /**
   * Load and parse the PDF document.
   * @returns Rows an array containing arrays of data values.
   * If using an event listener the return value will be an empty array.
   */
  parse(): Promise<any[] | undefined>;

  doc: any;
  page: any;

  // Flow-control methods. Presumably tied to the `paused` / `cancelled`
  // flags above — confirm against the implementation.
  pause(): void;
  resume(): void;
  cancel(): void;

  /**
   * Parse the content items returned by PDF.js.
   * Use PDF.js marked content to collect multiple items into cells.
   * The resulting cells array contains cells sorted in x,y order.
   */
  parseMarkedPage(): Promise<void>;

  // NOTE(review): undocumented in the original; by name the counterpart of
  // parseMarkedPage for a different page layout — confirm.
  parseLinedPage(): Promise<void>;

  /**
   * Add item to cells array in x,y order.
   *
   * Order of cells is top of page (max) to bottom of page (0).
   * Within a row order is left (0) to right (max).
   * Usually cells flow in order from pdf.js, but sometimes not.
   *
   * Filters out cells in page header and page footer areas.
   *
   * @param {*} cell
   */
  insertCell(cell: any): void;

  // NOTE(review): undocumented in the original; presumably tests a row length
  // against the `cells` min/max limits — confirm.
  inCellRange(rowlen: any): boolean;

  /**
   * Iterate the cells and determine rows.
   * Cells in a row are determined by overlapping Y boundaries.
   */
  parseCells(): Promise<void>;

  rowNum: number | undefined;

  /**
   * Performs row filtering.
   *
   * @param {*} row is an array of data values
   */
  filters(row: any): boolean;

  headersRow: any;

  /**
   * Emits or appends data to output.
   *
   * @param {*} row is an array of data values
   */
  output(row: any): Promise<void>;

  /**
   *
   * @param {Object} row - the row to check
   * @param {String} heading - text to compare against
   */
  compareHeading(row: Object, heading: string): any;

  // NOTE(review): undocumented in the original; by name compares two rows for
  // equality — confirm the comparison rules against the implementation.
  rowsEqual(row1: any, row2: any): boolean;
}
//# sourceMappingURL=PdfDataParser.d.ts.map