ppu-pdf
Version:
Easily extract text from digital PDF files, including coordinates and font sizes, and optionally group text by lines or render scanned PDFs to canvas/PNG.
113 lines (112 loc) • 5.1 kB
TypeScript
/** @module Web entrypoint for ppu-pdf browser support.
*
* Provides `PdfReaderLegacyWeb`, a browser-compatible PDF reader based on pdfjs-dist.
* Supports text extraction, line grouping, compact lines, TOON format, scanned detection,
* page rendering to HTMLCanvasElement, and scanned PDF OCR via ppu-paddle-ocr/web.
*
* @example
* ```ts
* import { PdfReaderLegacyWeb } from "ppu-pdf/web";
*
* const reader = new PdfReaderLegacyWeb({ verbose: false });
* const response = await fetch("my-document.pdf");
* const buffer = await response.arrayBuffer();
*
* const pdf = await reader.open(buffer);
* const texts = await reader.getTexts(pdf);
* console.log(texts.get(1)?.fullText);
* await reader.destroy(pdf);
* ```
*/
import * as pdfjs from "pdfjs-dist";
import { BasePdfReaderCommon } from "../core/base-pdf-reader-common.js";
import { type CompactPageLines, type PageLines, type PageTexts, type PageToonLines, type PdfCompactLineAlgorithm, type PdfReaderOptions, type PdfScannedThreshold } from "../pdf.interface.js";
/**
* Canvas map type for web — uses HTMLCanvasElement instead of Node.js native Canvas.
*
* Keys are page numbers and values are the canvases those pages were rendered to;
* this is the type returned by `renderAll` and consumed by `getTextsScanned`.
*/
export type WebCanvasMap = Map<number, HTMLCanvasElement>;
/**
* Browser-compatible PDF reader based on pdfjs-dist.
*
* Supports all digital PDF features: text extraction, line grouping,
* compact lines, TOON format, and scanned detection.
* Also supports page rendering to HTMLCanvasElement and scanned PDF OCR
* when combined with ppu-paddle-ocr/web.
*
* This is a type declaration only; the implementation lives in the compiled module.
*/
export declare class PdfReaderLegacyWeb extends BasePdfReaderCommon {
// Reader options held by the instance; how they are merged with defaults is not visible here.
private options;
// NOTE(review): presumably the number of the first page in the returned maps (1-based) — confirm against BasePdfReaderCommon.
readonly startIndex = 1;
/**
* Creates a new web PDF reader.
* @param options - Optional subset of `PdfReaderOptions`; every field may be omitted.
*/
constructor(options?: Partial<PdfReaderOptions>);
/**
* Opens a PDF document from an ArrayBuffer.
* @param data - The ArrayBuffer containing the PDF data.
* @returns The opened PDFDocumentProxy instance.
*/
open(data: ArrayBuffer): Promise<pdfjs.PDFDocumentProxy>;
/**
* Renders all pages of a PDF document into HTMLCanvasElements.
* @param doc - The PDFDocumentProxy to render.
* @returns A map of page numbers to HTMLCanvasElement instances.
*/
renderAll(doc: pdfjs.PDFDocumentProxy): Promise<WebCanvasMap>;
/**
* Extracts text from scanned PDF pages using an OCR service.
* Compatible with ppu-paddle-ocr/web's PaddleOcrService.
*
* The OCR service is typed structurally, so any object exposing the two methods
* below is accepted. The result of `recognize` is declared as `any`, so its shape
* is not checked by this declaration.
* @param ocrService - Any OCR service with initialize() and recognize(canvas) methods.
* @param canvasMap - A map of page numbers to HTMLCanvasElement instances.
* @returns A map of page numbers to extracted text data with OCR results.
*/
getTextsScanned(ocrService: {
initialize(): Promise<void>;
recognize(canvas: HTMLCanvasElement): Promise<any>;
}, canvasMap: WebCanvasMap): Promise<PageTexts>;
// Internal helper; signature and behavior are not exposed by this declaration.
private getCanvas;
/**
* Extracts text from all pages of a PDF document.
* @param pdf - The PDFDocumentProxy to extract text from.
* @returns A map of page numbers to extracted text data.
*/
getTexts(pdf: pdfjs.PDFDocumentProxy): Promise<PageTexts>;
// Internal helpers below; their signatures and behavior are not exposed by this declaration.
private extractTexts;
private extractOcrTexts;
private convertOcrToPdfWords;
private mapTokenToPdfWord;
private mergeTextContent;
private filterTextContent;
/**
* Converts extracted text into structured lines.
* @param pageTexts - The extracted text data from a PDF.
* @returns A map of page numbers to structured lines.
*/
getLinesFromTexts(pageTexts: PageTexts): PageLines;
/**
* Converts extracted text into TOON format for LLM-friendly input.
* @param pageTexts - The extracted text data from a PDF.
* @returns The TOON-formatted output as `PageToonLines` (declared in pdf.interface;
* presumably keyed by page number like the other results — confirm).
*/
getLinesFromTextsInToon(pageTexts: PageTexts): PageToonLines;
/**
* Converts extracted text into compact structured lines using a specified algorithm.
* @param pageTexts - The extracted text data from a PDF.
* @param algorithm - The algorithm for compacting lines (default: "middleY").
* @returns A map of page numbers to compact structured lines.
*/
getCompactLinesFromTexts(pageTexts: PageTexts, algorithm?: PdfCompactLineAlgorithm): CompactPageLines;
/**
* Determines if the PDF document is scanned based on text thresholds.
* @param pageTexts - The extracted text data from a PDF.
* @param options - The threshold options for scanned detection.
* @returns True if the document is likely scanned, false otherwise.
*/
isScanned(pageTexts: PageTexts, options?: PdfScannedThreshold): boolean;
/**
* Determines if the individual PDF page is scanned/digital based on text thresholds.
* @param pageText - The extracted page text.
* @param options - The threshold options for scanned detection.
* @returns True if the page is likely scanned, false otherwise.
*/
isPageScanned(pageText: string, options?: PdfScannedThreshold): boolean;
/**
* Destroys the PDF document instance to free memory.
* @param pdf - The PDFDocumentProxy instance to destroy.
*/
destroy(pdf: pdfjs.PDFDocumentProxy): Promise<void>;
}