pdf-parse-new

/**
 * Names of the parsing strategies `SmartPDFParser` can select or be forced to
 * use. Shared by `SmartParserOptions.forceMethod` and
 * `SmartParserStats.methodUsage` so the two lists cannot drift apart.
 */
export type ParseMethod =
  | 'sequential'
  | 'batch'
  | 'stream'
  | 'aggressive'
  | 'processes'
  | 'workers';

/**
 * Value resolved by every parsing entry point declared in this module.
 */
export interface Result {
  /** Page count of the parsed document. */
  numpages: number;
  /** NOTE(review): presumably the number of pages actually rendered — confirm against the implementation. */
  numrender: number;
  info: any;
  metadata: any;
  /** Extracted text (Markdown when produced by `markdown()`). */
  text: string;
  version?: string;
  /**
   * Method and performance information attached by `SmartPDFParser.parse()`.
   * Optional: guard with `?.` before reading its fields.
   */
  _meta?: {
    method?: string;
    duration?: number;
    analysis?: any;
    fastPath?: boolean;
    cached?: boolean;
    commonScenario?: boolean;
  };
}

/**
 * Base options accepted by all parsing entry points.
 */
export interface Options {
  /**
   * Custom page render function (for single-thread parsers)
   * @param pageData - PDF.js page object
   * @returns Extracted text from the page
   */
  pagerender?: ((pageData: any) => string | Promise<string>) | undefined;

  /**
   * Path to external module exporting custom render function
   * Used by workers/processes to load custom render logic without eval()
   * Module must export: module.exports = function(pageData) { ... }
   * @example './my-custom-render.js'
   */
  pagerenderModule?: string | undefined;

  max?: number | undefined;
  verbosityLevel?: 0 | 1 | 5 | undefined;
  parallelizePages?: boolean | undefined;
  batchSize?: number | undefined;

  /**
   * Password for encrypted PDFs. Forwarded as-is to PDF.js `getDocument({ password })`.
   * Ignored when the document is not encrypted.
   */
  password?: string | undefined;
}

/**
 * Constructor options for `SmartPDFParser`.
 */
export interface SmartParserOptions {
  /** Force a specific parsing method instead of auto-selection */
  forceMethod?: ParseMethod | null;
  /** Maximum memory usage in bytes (default: 70% of total RAM) */
  maxMemoryUsage?: number;
  /** Number of available CPUs (auto-detected by default) */
  availableCPUs?: number;
  /** Enable fast-path optimization for tiny PDFs (default: true) */
  enableFastPath?: boolean;
  /** Enable decision caching (default: true) */
  enableCache?: boolean;
  /** Oversaturation factor for worker/process count (default: 1.5) */
  oversaturationFactor?: number;
  /** Hard limit on maximum workers/processes (default: null = auto) */
  maxWorkerLimit?: number | null;
}

/**
 * Statistics object returned by `SmartPDFParser.getStats()`.
 */
export interface SmartParserStats {
  totalParses: number;
  /** One counter per parsing method. */
  methodUsage: Record<ParseMethod, number>;
  averageTimes: Record<string, number>;
  failedParses: number;
  fastPathHits: number;
  cacheHits: number;
  treeNavigations: number;
  optimizationRate: string;
  averageOverhead: string;
}

/**
 * Options for the chunked parsers `stream()` and `aggressive()`.
 */
export interface StreamOptions extends Options {
  chunkSize?: number | undefined;
  /** Progress callback; receives a `ChunkProgress` snapshot. */
  onChunkComplete?: ((progress: ChunkProgress) => void) | undefined;
}

/**
 * Options for the worker-thread parser `workers()`.
 */
export interface WorkersOptions extends Options {
  chunkSize?: number | undefined;
  maxWorkers?: number | undefined;
  /** Batch size for processing pages within each worker (default: 10) */
  batchSize?: number | undefined;
  /** Progress callback; receives a `WorkerProgress` snapshot. */
  onProgress?: ((progress: WorkerProgress) => void) | undefined;
}

/**
 * Options for the child-process parser `processes()`.
 */
export interface ProcessesOptions extends Options {
  chunkSize?: number | undefined;
  maxProcesses?: number | undefined;
  /** Timeout for each child process in milliseconds (default: 300000) */
  processTimeout?: number | undefined;
  /** Batch size for processing pages within each process (default: 10) */
  batchSize?: number | undefined;
  /** Progress callback; receives a `ProcessProgress` snapshot. */
  onProgress?: ((progress: ProcessProgress) => void) | undefined;
}

/** Payload passed to `StreamOptions.onChunkComplete`. */
export interface ChunkProgress {
  processedPages: number;
  totalPages: number;
  progress: string;
  currentChunk: number;
  totalChunks: number;
}

/** Payload passed to `WorkersOptions.onProgress`. */
export interface WorkerProgress {
  completedChunks: number;
  totalChunks: number;
  progress: string;
}

/** Payload passed to `ProcessesOptions.onProgress`. */
export interface ProcessProgress {
  completedChunks: number;
  totalChunks: number;
  progress: string;
}

/** Default values for the base `Options`. */
export declare const DEFAULT_OPTIONS: Options;

/**
 * Parse PDF with streaming/chunking approach for large files
 * Reduces memory pressure by processing in chunks
 * Best for 500-1000 page PDFs
 * @param dataBuffer - PDF file buffer
 * @param options - Streaming options
 * @returns Promise with parsed PDF data
 */
export declare function stream(dataBuffer: Buffer, options?: StreamOptions): Promise<Result>;

/**
 * Parse PDF with aggressive parallelization for maximum speed
 * Best for very large PDFs (1000+ pages)
 * All batches within a chunk run in parallel (single-thread)
 * @param dataBuffer - PDF file buffer
 * @param options - Aggressive parsing options
 * @returns Promise with parsed PDF data
 */
export declare function aggressive(dataBuffer: Buffer, options?: StreamOptions): Promise<Result>;

/**
 * Parse PDF using worker threads for true multi-core parallelism
 * May have compatibility issues with PDF.js in some environments
 * Best for very large PDFs (1000+ pages) on multi-core systems
 * @param dataBuffer - PDF file buffer
 * @param options - Worker threads options
 * @returns Promise with parsed PDF data
 */
export declare function workers(dataBuffer: Buffer, options?: WorkersOptions): Promise<Result>;

/**
 * Parse PDF using child processes for true multi-core parallelism
 * Most reliable multi-threading option, works in all environments
 * Best for very large PDFs (1000+ pages) on multi-core systems
 * @param dataBuffer - PDF file buffer
 * @param options - Child processes options
 * @returns Promise with parsed PDF data
 */
export declare function processes(dataBuffer: Buffer, options?: ProcessesOptions): Promise<Result>;

/**
 * Smart PDF Parser - Automatically selects optimal parsing method
 * based on PDF characteristics and system resources.
 *
 * Features:
 * - CPU-aware decision tree (adapts to available cores)
 * - Fast-path optimization (0.5ms overhead for tiny PDFs)
 * - LRU cache for repeated similar PDFs
 * - Common scenario matching (90%+ hit rate)
 * - Oversaturation for maximum CPU utilization
 *
 * @example
 * ```typescript
 * import { SmartPDFParser } from 'pdf-parse-new';
 * const parser = new SmartPDFParser();
 * const result = await parser.parse(pdfBuffer);
 * // `_meta` is optional on `Result`, hence the optional chaining.
 * console.log(`Parsed ${result.numpages} pages using ${result._meta?.method}`);
 * ```
 */
export declare class SmartPDFParser {
  constructor(options?: SmartParserOptions);

  /**
   * Parse PDF with automatic method selection
   * @param dataBuffer - PDF file buffer
   * @param userOptions - Optional parsing options to override defaults
   * @returns Promise with parsed PDF data including _meta with method and performance info
   */
  parse(dataBuffer: Buffer, userOptions?: Options): Promise<Result>;

  /**
   * Get parser statistics (in-memory for current session)
   * @returns Statistics object with parse counts, method usage, and optimization metrics
   */
  getStats(): SmartParserStats;
}

/**
 * Font-size statistics used to drive Markdown heading detection.
 */
export interface FontStats {
  /** Most common font size across the sampled pages (body text). */
  bodySize: number;
  /** Threshold (>=) above which a line is treated as `# h1`. */
  h1Size: number;
  /** Threshold (>=) above which a line is treated as `## h2`. */
  h2Size: number;
  /** Threshold (>=) above which a line is treated as `### h3`. */
  h3Size: number;
  /** Median vertical distance between consecutive lines of body text. */
  lineHeight: number;
}

/**
 * Options for the Markdown renderer, on top of the base `Options`.
 */
export interface MarkdownOptions extends Options {
  /** Number of pages to sample for font statistics (default: 5). */
  sampleSize?: number;
  /** Wrap items in `**...**` / `*...*` based on font name (default: true). */
  detectEmphasis?: boolean;
  /** Convert leading bullets and numbered prefixes to Markdown lists (default: true). */
  detectLists?: boolean;
  /** Wrap monospace runs in fenced code blocks (default: true). */
  detectCodeBlocks?: boolean;
}

/**
 * Parse a PDF and emit Markdown instead of plain text.
 *
 * Performs a two-pass analysis: first samples a few pages to build a font-size
 * histogram (used to infer headings), then parses the document with a renderer
 * that emits headings, lists, inline emphasis and (optionally) fenced code
 * blocks. Heuristic-based — works well on text-heavy PDFs, struggles with
 * tables and complex multi-column layouts (use a vision model for those).
 *
 * @param dataBuffer - PDF file buffer
 * @param options - Markdown rendering options
 * @returns Promise with `result.text` containing Markdown
 */
export declare function markdown(dataBuffer: Buffer, options?: MarkdownOptions): Promise<Result>;

/**
 * Drop-in `pagerender` that emits Markdown using only per-page statistics.
 * Lower quality than `markdown(buffer)` (no document-wide font stats) but
 * works in single-call contexts and as a `pagerenderModule` for workers.
 *
 * @param pageData - PDF.js page object
 * @returns Promise with the Markdown rendered for that page
 */
export declare function markdownRender(pageData: any): Promise<string>;

/**
 * Build a Markdown `pagerender` bound to pre-computed font statistics.
 * Useful when calling `pdf(buffer, { pagerender: createMarkdownRenderer(stats) })`
 * in a custom flow.
 *
 * @param stats - Font statistics, e.g. as resolved by `collectFontStats()`
 * @param options - Markdown rendering options
 * @returns A function usable as `Options.pagerender`
 */
export declare function createMarkdownRenderer(
  stats: FontStats,
  options?: MarkdownOptions,
): (pageData: any) => Promise<string>;

/**
 * Sample the PDF and compute font-size statistics used to drive Markdown
 * heading detection. Cheap: defaults to 5 evenly-distributed pages.
 *
 * @param dataBuffer - PDF file buffer
 * @param options - Sampling options (sample size, verbosity, password)
 * @returns Promise with the computed font statistics
 */
export declare function collectFontStats(
  dataBuffer: Buffer,
  options?: { sampleSize?: number; verbosityLevel?: number; password?: string },
): Promise<FontStats>;

/**
 * Absolute path to the standalone Markdown renderer module, suitable for
 * passing as `pagerenderModule` to `workers()` / `processes()`.
 */
export declare const markdownRenderModule: string;

/**
 * Main parsing function (backward compatible).
 *
 * @param dataBuffer - PDF file buffer
 * @param options - Base parsing options
 * @returns Promise with parsed PDF data
 */
declare function PdfParse(dataBuffer: Buffer, options?: Options): Promise<Result>;

export default PdfParse;