pdf-parse-new
Version:
Pure JavaScript cross-platform module to extract text from PDFs with AI-powered optimization and multi-core processing.
282 lines (245 loc) • 9.54 kB
TypeScript
/**
 * Shape of the value every parser entry point in this file resolves to.
 *
 * NOTE(review): the per-field notes marked "presumably" are inferred from the
 * field names only — confirm against the implementation.
 */
export interface Result {
  /** Presumably the total page count of the document. */
  numpages: number;
  /** Presumably the number of pages that were actually rendered. */
  numrender: number;
  /** Untyped document info payload. */
  info: any;
  /** Untyped document metadata payload. */
  metadata: any;
  /** Extracted text. */
  text: string;
  version?: string;
  /**
   * Optional diagnostics about how the parse was carried out
   * (selected method, timing, optimization flags).
   */
  _meta?: {
    method?: string;
    /** NOTE(review): unit is not stated in this file — confirm. */
    duration?: number;
    analysis?: any;
    fastPath?: boolean;
    cached?: boolean;
    commonScenario?: boolean;
  };
}
/**
 * Base parsing options. The method-specific option interfaces in this file
 * build on top of it.
 */
export interface Options {
  /**
   * Custom page render function (for single-thread parsers).
   * @param pageData - PDF.js page object
   * @returns Extracted text from the page, directly or through a promise
   */
  pagerender?: ((pageData: any) => string | Promise<string>) | undefined;

  /**
   * Path to an external module that exports a custom render function.
   * Workers/processes use it to load custom render logic without eval().
   * The module must export: module.exports = function(pageData) { ... }
   * @example './my-custom-render.js'
   */
  pagerenderModule?: string | undefined;

  /** NOTE(review): presumably the maximum number of pages to parse — confirm. */
  max?: number | undefined;

  /** NOTE(review): looks like the PDF.js verbosity levels (0, 1, 5) — confirm. */
  verbosityLevel?: 0 | 1 | 5 | undefined;

  /** NOTE(review): presumably toggles parallel page processing — confirm. */
  parallelizePages?: boolean | undefined;

  /** NOTE(review): presumably the number of pages per batch — confirm. */
  batchSize?: number | undefined;

  /**
   * Password for encrypted PDFs. Forwarded as-is to PDF.js
   * `getDocument({ password })`. Ignored when the document is not encrypted.
   */
  password?: string | undefined;
}
/** Construction-time settings accepted by `SmartPDFParser`. */
export interface SmartParserOptions {
  /** Pin one parsing method and bypass auto-selection. */
  forceMethod?: 'sequential' | 'batch' | 'stream' | 'aggressive' | 'processes' | 'workers' | null;

  /** Memory ceiling in bytes (default: 70% of total RAM). */
  maxMemoryUsage?: number;

  /** CPU count to plan for (auto-detected by default). */
  availableCPUs?: number;

  /** Turn the fast-path optimization for tiny PDFs on or off (default: true). */
  enableFastPath?: boolean;

  /** Turn decision caching on or off (default: true). */
  enableCache?: boolean;

  /** Oversaturation factor applied to the worker/process count (default: 1.5). */
  oversaturationFactor?: number;

  /** Hard cap on the number of workers/processes (default: null = auto). */
  maxWorkerLimit?: number | null;
}
/**
 * Statistics object returned by `SmartPDFParser.getStats()`.
 *
 * NOTE(review): the meaning and units of the individual counters are inferred
 * from their names — confirm against the implementation.
 */
export interface SmartParserStats {
  totalParses: number;
  /** One counter per parsing method. */
  methodUsage: {
    sequential: number;
    batch: number;
    stream: number;
    aggressive: number;
    processes: number;
    workers: number;
  };
  /** NOTE(review): presumably keyed by method name — confirm. */
  averageTimes: Record<string, number>;
  failedParses: number;
  fastPathHits: number;
  cacheHits: number;
  treeNavigations: number;
  /** Exposed as a string; the exact format is not visible here. */
  optimizationRate: string;
  /** Exposed as a string; the exact format is not visible here. */
  averageOverhead: string;
}
/** Options accepted by `stream()` and `aggressive()`; adds chunking controls to `Options`. */
export interface StreamOptions extends Options {
  /** NOTE(review): presumably the number of pages per chunk — confirm. */
  chunkSize?: number | undefined;

  /** Callback that receives a `ChunkProgress` report. */
  onChunkComplete?: ((progress: ChunkProgress) => void) | undefined;
}
/** Options accepted by `workers()`; adds worker-thread controls to `Options`. */
export interface WorkersOptions extends Options {
  /** NOTE(review): presumably the number of pages per chunk — confirm. */
  chunkSize?: number | undefined;

  /** NOTE(review): presumably the upper bound on worker threads — confirm. */
  maxWorkers?: number | undefined;

  /** Batch size for processing pages within each worker (default: 10). */
  batchSize?: number | undefined;

  /** Callback that receives a `WorkerProgress` report. */
  onProgress?: ((progress: WorkerProgress) => void) | undefined;
}
/** Options accepted by `processes()`; adds child-process controls to `Options`. */
export interface ProcessesOptions extends Options {
  /** NOTE(review): presumably the number of pages per chunk — confirm. */
  chunkSize?: number | undefined;

  /** NOTE(review): presumably the upper bound on child processes — confirm. */
  maxProcesses?: number | undefined;

  /** Timeout for each child process in milliseconds (default: 300000). */
  processTimeout?: number | undefined;

  /** Batch size for processing pages within each process (default: 10). */
  batchSize?: number | undefined;

  /** Callback that receives a `ProcessProgress` report. */
  onProgress?: ((progress: ProcessProgress) => void) | undefined;
}
/** Progress report passed to `StreamOptions.onChunkComplete`. */
export interface ChunkProgress {
  processedPages: number;
  totalPages: number;
  /** Progress as a string; the exact format is not visible in this file. */
  progress: string;
  currentChunk: number;
  totalChunks: number;
}
/** Progress report passed to `WorkersOptions.onProgress`. */
export interface WorkerProgress {
  completedChunks: number;
  totalChunks: number;
  /** Progress as a string; the exact format is not visible in this file. */
  progress: string;
}
/** Progress report passed to `ProcessesOptions.onProgress`. */
export interface ProcessProgress {
  completedChunks: number;
  totalChunks: number;
  /** Progress as a string; the exact format is not visible in this file. */
  progress: string;
}
/**
 * Exported `Options` value.
 * NOTE(review): presumably the defaults applied when a caller omits an option;
 * the concrete values are not visible in this declaration file — confirm.
 */
export const DEFAULT_OPTIONS: Options;
/**
 * Streaming/chunked PDF parse intended for large files.
 *
 * Processing the document in chunks reduces memory pressure.
 * Best suited to PDFs of roughly 500-1000 pages.
 *
 * @param dataBuffer - PDF file buffer
 * @param options - Streaming options
 * @returns Promise with parsed PDF data
 */
export function stream(
  dataBuffer: Buffer,
  options?: StreamOptions,
): Promise<Result>;
/**
 * PDF parse with aggressive parallelization for maximum speed.
 *
 * Every batch within a chunk runs in parallel (single-thread).
 * Best suited to very large PDFs (1000+ pages).
 *
 * @param dataBuffer - PDF file buffer
 * @param options - Aggressive parsing options
 * @returns Promise with parsed PDF data
 */
export function aggressive(
  dataBuffer: Buffer,
  options?: StreamOptions,
): Promise<Result>;
/**
 * PDF parse on worker threads for true multi-core parallelism.
 *
 * May have compatibility issues with PDF.js in some environments.
 * Best suited to very large PDFs (1000+ pages) on multi-core systems.
 *
 * @param dataBuffer - PDF file buffer
 * @param options - Worker threads options
 * @returns Promise with parsed PDF data
 */
export function workers(
  dataBuffer: Buffer,
  options?: WorkersOptions,
): Promise<Result>;
/**
 * PDF parse on child processes for true multi-core parallelism.
 *
 * Described as the most reliable multi-threading option, working in all
 * environments. Best suited to very large PDFs (1000+ pages) on multi-core
 * systems.
 *
 * @param dataBuffer - PDF file buffer
 * @param options - Child processes options
 * @returns Promise with parsed PDF data
 */
export function processes(
  dataBuffer: Buffer,
  options?: ProcessesOptions,
): Promise<Result>;
/**
 * Smart PDF Parser - Automatically selects the optimal parsing method
 * based on PDF characteristics and system resources.
 *
 * Features:
 * - CPU-aware decision tree (adapts to available cores)
 * - Fast-path optimization (0.5ms overhead for tiny PDFs)
 * - LRU cache for repeated similar PDFs
 * - Common scenario matching (90%+ hit rate)
 * - Oversaturation for maximum CPU utilization
 *
 * @example
 * ```typescript
 * import PdfParse, { SmartPDFParser } from 'pdf-parse-new';
 * const parser = new SmartPDFParser();
 * const result = await parser.parse(pdfBuffer);
 * // `_meta` is declared optional on `Result`, so read it with optional chaining.
 * console.log(`Parsed ${result.numpages} pages using ${result._meta?.method}`);
 * ```
 */
export class SmartPDFParser {
  /**
   * @param options - Optional parser configuration (method forcing, memory and
   *   CPU limits, fast-path and cache toggles)
   */
  constructor(options?: SmartParserOptions);

  /**
   * Parse PDF with automatic method selection
   * @param dataBuffer - PDF file buffer
   * @param userOptions - Optional parsing options to override defaults
   * @returns Promise with parsed PDF data including _meta with method and performance info
   */
  parse(dataBuffer: Buffer, userOptions?: Options): Promise<Result>;

  /**
   * Get parser statistics (in-memory for current session)
   * @returns Statistics object with parse counts, method usage, and optimization metrics
   */
  getStats(): SmartParserStats;
}
/** Font-size statistics used to drive Markdown heading detection. */
export interface FontStats {
  /** Body-text size: the most common font size across the sampled pages. */
  bodySize: number;

  /** A line whose size is >= this threshold is treated as `# h1`. */
  h1Size: number;

  /** A line whose size is >= this threshold is treated as `## h2`. */
  h2Size: number;

  /** A line whose size is >= this threshold is treated as `### h3`. */
  h3Size: number;

  /** Median vertical distance between consecutive lines of body text. */
  lineHeight: number;
}
/** Options for the Markdown-emitting entry points; adds rendering toggles to `Options`. */
export interface MarkdownOptions extends Options {
  /** How many pages to sample for font statistics (default: 5). */
  sampleSize?: number;

  /** Wrap items in `**...**` / `*...*` based on font name (default: true). */
  detectEmphasis?: boolean;

  /** Turn leading bullets and numbered prefixes into Markdown lists (default: true). */
  detectLists?: boolean;

  /** Wrap monospace runs in fenced code blocks (default: true). */
  detectCodeBlocks?: boolean;
}
/**
 * Parses a PDF and produces Markdown rather than plain text.
 *
 * Works in two passes. The first samples a few pages to build a font-size
 * histogram, which is used to infer headings. The second parses the document
 * with a renderer that emits headings, lists, inline emphasis and (optionally)
 * fenced code blocks.
 *
 * The approach is heuristic: it works well on text-heavy PDFs and struggles
 * with tables and complex multi-column layouts (use a vision model for those).
 *
 * @param dataBuffer - PDF file buffer
 * @param options - Markdown rendering options
 * @returns Promise with `result.text` containing Markdown
 */
export function markdown(
  dataBuffer: Buffer,
  options?: MarkdownOptions,
): Promise<Result>;
/**
 * Drop-in `pagerender` implementation that emits Markdown from per-page
 * statistics alone.
 *
 * Quality is lower than `markdown(buffer)` because no document-wide font
 * stats are available, but it works in single-call contexts and as a
 * `pagerenderModule` for workers.
 *
 * @param pageData - Page object handed to the renderer
 * @returns Promise with the Markdown produced for the page
 */
export function markdownRender(pageData: any): Promise<string>;
/**
 * Creates a Markdown `pagerender` bound to pre-computed font statistics.
 *
 * Handy in a custom flow, e.g.
 * `pdf(buffer, { pagerender: createMarkdownRenderer(stats) })`.
 *
 * @param stats - Pre-computed font statistics
 * @param options - Optional Markdown rendering options
 * @returns A page render function resolving to a string per page
 */
export function createMarkdownRenderer(
  stats: FontStats,
  options?: MarkdownOptions,
): (pageData: any) => Promise<string>;
/**
 * Samples the PDF and computes the font-size statistics that drive Markdown
 * heading detection. Cheap: defaults to 5 evenly-distributed pages.
 *
 * @param dataBuffer - PDF file buffer
 * @param options - Optional sampling settings (`sampleSize`, `verbosityLevel`, `password`)
 * @returns Promise with the computed font statistics
 */
export function collectFontStats(
  dataBuffer: Buffer,
  options?: {
    sampleSize?: number;
    verbosityLevel?: number;
    password?: string;
  },
): Promise<FontStats>;
/**
 * Absolute path to the standalone Markdown renderer module, suitable for
 * passing as `pagerenderModule` to `workers()` / `processes()`.
 *
 * @example
 * ```typescript
 * const result = await processes(buffer, { pagerenderModule: markdownRenderModule });
 * ```
 */
export const markdownRenderModule: string;
/**
 * Main parsing function (backward compatible), exposed as the module's
 * default export.
 *
 * @param dataBuffer - PDF file buffer
 * @param options - Optional parsing options
 * @returns Promise with parsed PDF data
 */
declare function PdfParse(
  dataBuffer: Buffer,
  options?: Options,
): Promise<Result>;

export default PdfParse;