// pdf-parse-test
// Version:
// Pure TypeScript, cross-platform module for extracting text, images, and tabular data from PDFs. Run directly in your browser or in Node!
// 102 lines • 4.91 kB
// TypeScript
import { type HeaderResult } from './HeaderResult.js';
import { ImageResult } from './ImageResult.js';
import { InfoResult } from './InfoResult.js';
import { type ParseParameters } from './ParseParameters.js';
import { ScreenshotResult } from './ScreenshotResult.js';
import { TableResult } from './TableResult.js';
import { TextResult } from './TextResult.js';
import type { DocumentInitParameters } from 'pdfjs-dist/types/src/display/api.js';
/**
 * PDF loader that offers extraction helpers for text, images, tables,
 * document metadata, and rendered page screenshots.
 */
export declare class PDFParse {
  // NOTE(review): presumably the init parameters handed to the constructor; type is erased in this declaration.
  private readonly options;
  // NOTE(review): presumably the loaded document handle; type is erased in this declaration.
  private doc;
  /**
   * Progress counters exposed as a `loaded` / `total` pair.
   * NOTE(review): units and update timing are not visible from this declaration — confirm against the implementation.
   */
  progress: { loaded: number; total: number };

  /**
   * Build a parser from pdf.js `DocumentInitParameters`.
   * Node.js `Buffer` input is converted to `Uint8Array` automatically, and a default
   * verbosity level is applied when none is supplied.
   * @param options Initialization parameters for the document.
   */
  constructor(options: DocumentInitParameters);

  /**
   * NOTE(review): presumably releases the loaded document and related resources — confirm against the implementation.
   * @returns A promise that settles with no value.
   */
  destroy(): Promise<void>;

  /**
   * NOTE(review): presumably reports whether the code is running under Node.js — confirm against the implementation.
   */
  static get isNodeJS(): boolean;

  /**
   * NOTE(review): presumably configures the pdf.js worker source and returns the value in effect — confirm against the implementation.
   * @param workerSrc Optional worker source location.
   */
  static setWorker(workerSrc?: string): string;

  /**
   * Issue an HTTP HEAD request to learn the file size and confirm the resource exists.
   * With `check` enabled, a small byte range is also fetched and its magic number inspected
   * to confirm that the URL really points to a valid PDF.
   * @param check Set to `true` to download a small byte range and validate the file signature.
   * Default: `false`.
   */
  getHeader(check?: boolean): Promise<HeaderResult>;

  /**
   * Read document-level metadata (info, outline, permissions, page labels), and optionally
   * collect link details for each page.
   * @param params Parse options; enable `parsePageInfo` to gather the per-page metadata described in `ParseParameters`.
   * @returns An `InfoResult` aggregating the document metadata.
   */
  getInfo(params?: ParseParameters): Promise<InfoResult>;

  private getPageLinks;

  /**
   * Produce plain text for every requested page, with optional hyperlink enrichment and
   * enforced line or cell separators.
   * @param params Parse options governing pagination, link handling, and line/cell thresholds.
   * @returns A `TextResult` holding per-page text plus one concatenated document string.
   */
  getText(params?: ParseParameters): Promise<TextResult>;

  private load;
  private shouldParse;
  private getPageText;
  private getHyperlinks;

  /**
   * Pull embedded images out of the requested pages.
   *
   * Behavior notes:
   * - Page selection follows ParseParameters (partial, first, last).
   * - An image is skipped when its width OR height is below `params.imageThreshold`.
   * - The returned ImageResult groups PageImages per page; every image entry carries:
   *   - data: Uint8Array (present when params.imageBuffer === true)
   *   - dataUrl: string (present when params.imageDataUrl === true)
   *   - width, height, kind, name
   * - Supported in Node.js (canvas.toBuffer) as well as in the browser (canvas.toDataURL).
   *
   * @param params ParseParameters that control page selection, thresholds and output format.
   * @returns Promise<ImageResult> with the extracted images grouped by page.
   */
  getImage(params?: ParseParameters): Promise<ImageResult>;

  private convertToRGBA;
  private resolveEmbeddedImage;

  /**
   * Rasterize pages into screenshots.
   *
   * Behavior notes:
   * - Page selection follows ParseParameters (partial, first, last).
   * - params.scale sets the zoom; params.desiredWidth, when specified, takes precedence.
   * - Every page in the ScreenshotResult carries:
   *   - data: Uint8Array (when params.imageBuffer === true)
   *   - dataUrl: string (when params.imageDataUrl === true)
   *   - pageNumber, width, height, scale
   * - Supported in Node.js (canvas.toBuffer) as well as in the browser (canvas.toDataURL).
   *
   * @param parseParams ParseParameters that control page selection and render options.
   * @returns Promise<ScreenshotResult> with the rendered page images.
   */
  getScreenshot(parseParams?: ParseParameters): Promise<ScreenshotResult>;

  /**
   * Find tables on pages by analysing vector drawing operators, then fill the cells with text.
   *
   * Behavior notes:
   * - Operator lists are scanned for rectangles/lines forming table grids (uses PathGeometry and LineStore).
   * - Detected geometry is normalized and positioned text is matched to table cells.
   * - Page selection honors ParseParameters.
   *
   * @param params ParseParameters that control which pages are analysed (partial/first/last).
   * @returns Promise<TableResult> with the tables discovered on each page.
   */
  getTable(params?: ParseParameters): Promise<TableResult>;

  private getPathGeometry;
  private getPageTables;
  private fillPageTables;
}
//# sourceMappingURL=PDFParse.d.ts.map