ppu-pdf

Version:

Easily extract text from digital PDF files, including coordinates and font sizes, and optionally group the text into lines or render scanned PDFs to canvas/PNG.

PT-Perkasa-Pilar-Utama/ppu-pdf

104 lines (103 loc) • 5.06 kB

TypeScript

import "pdfjs-dist/build/pdf.worker.min.mjs";
import "./pdfjs-workaround.js";
import { type Canvas } from "@napi-rs/canvas";
import * as pdfjs from "pdfjs-dist/legacy/build/pdf.mjs";
import { PdfReaderCommon } from "./pdf-reader-common.js";
import {
  type CanvasMap,
  type CompactPageLines,
  type PageLines,
  type PageTexts,
  type PageToonLines,
  type PdfCompactLineAlgorithm,
  type PdfReaderOptions,
  type PdfScannedThreshold
} from "./pdf.interface.js";
import { type PaddleOcrService } from "ppu-paddle-ocr";

/**
 * PDF reader built on the legacy build of pdfjs-dist, used to read and
 * process PDF documents.
 *
 * This is a declaration only: member bodies live in the compiled module.
 */
export declare class PdfReaderLegacy extends PdfReaderCommon {
  // Reader configuration; private, so its type is not exposed by this declaration.
  private options;

  // NOTE(review): presumably the number assigned to the first page — confirm
  // against the implementation.
  readonly startIndex = 1;

  /**
   * Creates a reader.
   * @param options - Optional subset of reader options.
   */
  constructor(options?: Partial<PdfReaderOptions>);

  /**
   * Opens a PDF document.
   * @param filename - Path of the PDF file, or an ArrayBuffer holding its contents.
   * @returns A promise for the opened document proxy.
   */
  open(filename: string | ArrayBuffer): Promise<pdfjs.PDFDocumentProxy>;

  /**
   * Renders every page of a document onto a canvas.
   * @param doc - The document proxy whose pages are rendered.
   * @returns A promise for a map from page number to the rendered Canvas.
   */
  renderAll(doc: pdfjs.PDFDocumentProxy): Promise<CanvasMap>;

  /**
   * Runs OCR over rendered pages of a scanned PDF using the ppu-paddle-ocr package.
   * @param paddleOcrService - OCR service instance from ppu-paddle-ocr that performs the text recognition.
   * @param canvasMap - Map from page number to the Canvas of the rendered page.
   * @returns A promise for a map from page number to the text data recognised by OCR.
   */
  getTextsScanned(
    paddleOcrService: PaddleOcrService,
    canvasMap: CanvasMap
  ): Promise<PageTexts>;

  private getCanvas;

  /**
   * Extracts the text of every page of a document.
   * @param pdf - The document proxy to read text from.
   * @returns A promise for a map from page number to the extracted text data.
   */
  getTexts(pdf: pdfjs.PDFDocumentProxy): Promise<PageTexts>;

  // Private helpers; only their names are visible in this declaration.
  private extractTexts;
  private extractOcrTexts;
  private convertOcrToPdfWords;
  private mapTokenToPdfWord;
  private mergeTextContent;
  private filterTextContent;

  /**
   * Groups extracted text into structured lines.
   * @param pageTexts - Text data previously extracted from a PDF.
   * @returns A map from page number to that page's structured lines.
   */
  getLinesFromTexts(pageTexts: PageTexts): PageLines;

  /**
   * Serialises extracted text as TOON, a format intended as LLM-friendly input.
   * @param pageTexts - Text data previously extracted from a PDF.
   * @returns The text in TOON format.
   */
  getLinesFromTextsInToon(pageTexts: PageTexts): PageToonLines;

  /**
   * Groups extracted text into compact structured lines.
   * @param pageTexts - Text data previously extracted from a PDF.
   * @param algorithm - Line-compaction algorithm to apply (default: "middleY").
   * @returns A map from page number to that page's compact structured lines.
   */
  getCompactLinesFromTexts(
    pageTexts: PageTexts,
    algorithm?: PdfCompactLineAlgorithm
  ): CompactPageLines;

  /**
   * Decides, from text thresholds, whether a whole document looks scanned.
   * @param pageTexts - Text data previously extracted from a PDF.
   * @param options - Thresholds used for scanned detection.
   * @returns True when the document is likely scanned, false otherwise.
   */
  isScanned(pageTexts: PageTexts, options?: PdfScannedThreshold): boolean;

  /**
   * Decides, from text thresholds, whether a single page looks scanned
   * rather than digital.
   * @param pageText - The text extracted from the page.
   * @param options - Thresholds used for scanned detection.
   * @returns True when the page is likely scanned, false otherwise.
   */
  isPageScanned(pageText: string, options?: PdfScannedThreshold): boolean;

  /**
   * Writes rendered canvases out as image files.
   * @param canvasMap - The canvases to save, keyed by number.
   * @param filename - Base filename for the output images.
   * @param foldername - Folder that receives the images (default: "out").
   */
  dumpCanvasMap(
    canvasMap: Map<number, Canvas>,
    filename: string,
    foldername?: string
  ): Promise<void>;

  /**
   * Rebuilds a scanned PDF by placing invisible text over the original images,
   * which makes the PDF searchable without altering its visual appearance.
   * @param doc - The document proxy to rebuild.
   * @param pageTexts - The extracted text data to overlay.
   * @param options - Optional rebuild settings; the default font is Helvetica.
   * @returns A promise for the rebuilt PDF as binary data.
   */
  rebuild(
    doc: pdfjs.PDFDocumentProxy,
    pageTexts: PageTexts,
    options?: {
      fontName?: string;
    }
  ): Promise<Uint8Array>;

  /**
   * Destroys a document instance to free memory.
   * @param pdf - The document proxy to destroy.
   */
  destroy(pdf: pdfjs.PDFDocumentProxy): Promise<void>;
}