ppu-pdf

Version:

Easily extract text from digital PDF files, with coordinates and font sizes included, and optionally group the text into lines or render scanned PDFs to canvas/PNG.

PT-Perkasa-Pilar-Utama/ppu-pdf

113 lines (112 loc) • 5.1 kB

TypeScript

/**
 * Web entrypoint for ppu-pdf browser support.
 *
 * Provides `PdfReaderLegacyWeb`, a browser-compatible PDF reader based on pdfjs-dist.
 * Supports text extraction, line grouping, compact lines, TOON format, scanned detection,
 * page rendering to HTMLCanvasElement, and scanned PDF OCR via ppu-paddle-ocr/web.
 *
 * @example
 * ```ts
 * import { PdfReaderLegacyWeb } from "ppu-pdf/web";
 *
 * const reader = new PdfReaderLegacyWeb({ verbose: false });
 * const response = await fetch("my-document.pdf");
 * const buffer = await response.arrayBuffer();
 *
 * const pdf = await reader.open(buffer);
 * const texts = await reader.getTexts(pdf);
 * console.log(texts.get(1)?.fullText);
 * await reader.destroy(pdf);
 * ```
 *
 * @module
 */
import * as pdfjs from "pdfjs-dist";
import { BasePdfReaderCommon } from "../core/base-pdf-reader-common.js";
import {
    type CompactPageLines,
    type PageLines,
    type PageTexts,
    type PageToonLines,
    type PdfCompactLineAlgorithm,
    type PdfReaderOptions,
    type PdfScannedThreshold
} from "../pdf.interface.js";

/** Canvas map type for web — uses HTMLCanvasElement instead of Node.js native Canvas. */
export type WebCanvasMap = Map<number, HTMLCanvasElement>;

/**
 * Browser-compatible PDF reader based on pdfjs-dist.
 *
 * Supports all digital PDF features: text extraction, line grouping,
 * compact lines, TOON format, and scanned detection.
 * Also supports page rendering to HTMLCanvasElement and scanned PDF OCR
 * when combined with ppu-paddle-ocr/web.
 */
export declare class PdfReaderLegacyWeb extends BasePdfReaderCommon {
    // Private members appear in this declaration without types; their
    // signatures live in the implementation, not in this file.
    private options;
    /**
     * NOTE(review): presumably the number of the first page, i.e. the smallest
     * key used in the page maps (the module example reads `texts.get(1)`) —
     * confirm against the implementation.
     */
    readonly startIndex = 1;
    /**
     * Creates a new reader.
     * @param options - Optional partial set of reader options; every field may be omitted.
     */
    constructor(options?: Partial<PdfReaderOptions>);
    /**
     * Opens a PDF document from an ArrayBuffer.
     * @param data - The ArrayBuffer containing the PDF data.
     * @returns The opened PDFDocumentProxy instance.
     */
    open(data: ArrayBuffer): Promise<pdfjs.PDFDocumentProxy>;
    /**
     * Renders all pages of a PDF document into HTMLCanvasElements.
     * @param doc - The PDFDocumentProxy to render.
     * @returns A map of page numbers to HTMLCanvasElement instances.
     */
    renderAll(doc: pdfjs.PDFDocumentProxy): Promise<WebCanvasMap>;
    /**
     * Extracts text from scanned PDF pages using an OCR service.
     * Compatible with ppu-paddle-ocr/web's PaddleOcrService.
     *
     * NOTE(review): the result of `recognize` is typed `any`, so the shape of the
     * OCR result is not checked at this boundary — consider a narrower type.
     *
     * @param ocrService - Any OCR service with initialize() and recognize(canvas) methods.
     * @param canvasMap - A map of page numbers to HTMLCanvasElement instances.
     * @returns A map of page numbers to extracted text data with OCR results.
     */
    getTextsScanned(ocrService: {
        initialize(): Promise<void>;
        recognize(canvas: HTMLCanvasElement): Promise<any>;
    }, canvasMap: WebCanvasMap): Promise<PageTexts>;
    private getCanvas;
    /**
     * Extracts text from all pages of a PDF document.
     * @param pdf - The PDFDocumentProxy to extract text from.
     * @returns A map of page numbers to extracted text data.
     */
    getTexts(pdf: pdfjs.PDFDocumentProxy): Promise<PageTexts>;
    // Internal helpers; declared here without types (see the note on `options`).
    private extractTexts;
    private extractOcrTexts;
    private convertOcrToPdfWords;
    private mapTokenToPdfWord;
    private mergeTextContent;
    private filterTextContent;
    /**
     * Converts extracted text into structured lines.
     * @param pageTexts - The extracted text data from a PDF.
     * @returns A map of page numbers to structured lines.
     */
    getLinesFromTexts(pageTexts: PageTexts): PageLines;
    /**
     * Converts extracted text into TOON format for LLM-friendly input.
     * @param pageTexts - The extracted text data from a PDF.
     * @returns The TOON-formatted output as a `PageToonLines` value.
     */
    getLinesFromTextsInToon(pageTexts: PageTexts): PageToonLines;
    /**
     * Converts extracted text into compact structured lines using a specified algorithm.
     * @param pageTexts - The extracted text data from a PDF.
     * @param algorithm - The algorithm for compacting lines (default: "middleY").
     * @returns A map of page numbers to compact structured lines.
     */
    getCompactLinesFromTexts(pageTexts: PageTexts, algorithm?: PdfCompactLineAlgorithm): CompactPageLines;
    /**
     * Determines if the PDF document is scanned based on text thresholds.
     * @param pageTexts - The extracted text data from a PDF.
     * @param options - The threshold options for scanned detection.
     * @returns True if the document is likely scanned, false otherwise.
     */
    isScanned(pageTexts: PageTexts, options?: PdfScannedThreshold): boolean;
    /**
     * Determines if an individual PDF page is scanned (rather than digital) based on text thresholds.
     * @param pageText - The extracted page text.
     * @param options - The threshold options for scanned detection.
     * @returns True if the page is likely scanned, false otherwise.
     */
    isPageScanned(pageText: string, options?: PdfScannedThreshold): boolean;
    /**
     * Destroys the PDF document instance to free memory.
     * @param pdf - The PDFDocumentProxy instance to destroy.
     */
    destroy(pdf: pdfjs.PDFDocumentProxy): Promise<void>;
}