UNPKG

pdf-parse-test

Version:

Pure TypeScript, cross-platform module for extracting text, images, and tabular data from PDFs. Run directly in your browser or in Node!

118 lines 4.95 kB
/**
 * Options accepted by the PDF parsing entry points. Every field is optional;
 * the documented `Default:` values describe what is used when a field is omitted.
 */
export interface ParseParameters {
  /**
   * Explicit list of page numbers to parse.
   * Only the listed pages are parsed, and they are returned in the order given.
   * Example: `[1, 3, 5]`. A single page: `[7]`.
   * Default: `undefined`.
   */
  partial?: number[];

  /**
   * Parse the first N pages (pages 1..N).
   * Has no effect when `partial` is provided. When `last` is set as well, the two
   * values are read as an explicit inclusive page range (first..last) and the
   * "first N" meaning no longer applies.
   * Default: `undefined`.
   */
  first?: number;

  /**
   * Parse the last N pages (pages total-N+1..total).
   * Has no effect when `partial` is provided. When `first` is set as well, the two
   * values are read as an explicit inclusive page range (first..last) and the
   * "last N" meaning no longer applies.
   * Default: `undefined`.
   */
  last?: number;

  /**
   * Gather per-page metadata such as embedded links, title, pageLabel, and dimensions.
   * ISBN, DOI, abstract, and references are still work in progress when getInfo() is used.
   * Default: `false`.
   */
  parsePageInfo?: boolean;

  /**
   * Try to detect hyperlink annotations (e.g. URLs) associated with text and include them.
   * Detected links are rendered as Markdown inline links, for example
   * `[text](https://example.com)`.
   * Default: `false`.
   */
  parseHyperlinks?: boolean;

  /**
   * Enforce logical line breaks: a newline is inserted when the vertical distance
   * between text items exceeds `lineThreshold`.
   * Helps preserve paragraph/line structure when text items arrive as separate segments.
   * Default: `true`.
   */
  lineEnforce?: boolean;

  /**
   * Threshold used to decide whether nearby text items belong to different lines.
   * The upstream documentation states that larger values make the parser more likely
   * to start a new line between items.
   * NOTE(review): that statement seems at odds with `lineEnforce`, which describes a
   * newline being inserted when the distance exceeds this threshold (implying larger
   * values give fewer breaks) - confirm against the implementation.
   * Default: `4.6`.
   */
  lineThreshold?: number;

  /**
   * String placed between text items on the same line when a sufficiently large
   * horizontal gap is detected.
   * Typically emulates a cell/column separator (for example, a tab character).
   * Default: `'\t'`.
   */
  cellSeparator?: string;

  /**
   * Horizontal distance threshold deciding when two text items on the same baseline
   * are treated as separate cells.
   * A larger value yields fewer (wider) cells; a smaller value yields more cell breaks.
   * Default: `7`.
   */
  cellThreshold?: number;

  /**
   * Optional string appended to the end of each page's extracted text to mark page
   * boundaries.
   * The placeholders `page_number` and `total_number` are substituted accordingly.
   * If omitted or empty, no page boundary marker is added.
   * Default: `'\n-- page_number of total_number --'`.
   */
  pageJoiner?: string;

  /**
   * Optional string used to join text items when returning a page's text.
   * When provided, it replaces the default empty-string joining behavior.
   * Default: `undefined`.
   */
  itemJoiner?: string;

  /**
   * Minimum image dimension (in pixels) applied to width or height.
   * When set, `getImage()` ignores images whose width OR height is below or equal
   * to this value. Useful for dropping tiny decorative or tracking images.
   * Default: `80`.
   * Disable: `0`.
   */
  imageThreshold?: number;

  /**
   * Screenshot scale factor: 1 keeps the original size, 1.5 gives a 50% larger image, etc.
   * Default: `1`.
   */
  scale?: number;

  /**
   * Desired screenshot width in pixels.
   * When set, the `scale` option is ignored.
   * Default: `undefined`.
   */
  desiredWidth?: number;

  /**
   * Applies to both getImage() and getScreenshot(): include the image as a base64
   * data URL string.
   * Default: `true`.
   */
  imageDataUrl?: boolean;

  /**
   * Applies to both getImage() and getScreenshot(): include the image as a binary buffer.
   * Default: `true`.
   */
  imageBuffer?: boolean;

  /**
   * Include marked content items in the items array of TextContent, capturing the
   * PDF's "marked content".
   * Enables tags (MCID, role/props) and structural/accessibility information that is
   * useful for mapping between text and structure.
   * Usually left false for plain text extraction (trade-off: larger output).
   * Default: `false`.
   */
  includeMarkedContent?: boolean;

  /**
   * When true, text normalization is NOT performed in the worker thread.
   * For plain text extraction, normalizing in the worker (false) is usually recommended.
   * Default: `false`.
   */
  disableNormalization?: boolean;
}

/**
 * `ParseParameters` in which `lineThreshold`, `cellThreshold`, and `scale` are
 * required (non-optional) while every other option stays optional.
 */
export type SafeParseParameters = Required<
  Pick<ParseParameters, 'lineThreshold' | 'cellThreshold' | 'scale'>
> &
  ParseParameters;

/**
 * Converts caller-supplied parameters into a `SafeParseParameters` value.
 *
 * Only the signature is declared here; the implementation lives elsewhere.
 * Presumably it fills in the documented defaults for the required numeric
 * fields - confirm against the implementation.
 *
 * @param params - Options provided by the caller.
 * @returns Parameters whose `lineThreshold`, `cellThreshold`, and `scale` are present.
 */
export declare function setDefaultParseParameters(params: ParseParameters): SafeParseParameters;
//# sourceMappingURL=ParseParameters.d.ts.map