ppu-pdf
Version:
Easily extract text from digital PDF files, with coordinates and font sizes included, and optionally group the text by lines or render scanned PDFs to canvas/PNG.
77 lines (76 loc) • 3.5 kB
TypeScript
import "./mupdf-workaround";
import { Canvas } from "@napi-rs/canvas";
import { type PDFDocument, type PDFPage } from "mupdf/mupdfjs";
import { PdfReaderCommon } from "./pdf-reader-common";
import { type CanvasMap, type CompactPageLines, type PageLines, type PageTexts, type PdfCompactLineAlgorithm, type PdfReaderOptions, type PdfScannedThreshold } from "./pdf.interface";
/**
* PdfReader class based on mupdfjs for reading and processing PDF documents.
*
* This is an ambient declaration: only the public signatures are described
* here. Private members are listed by name without types, so their exact
* shape and behavior cannot be determined from this file.
*/
export declare class PdfReader extends PdfReaderCommon {
/**
* Reader configuration held by the instance.
* NOTE(review): presumably derived from the constructor's `options`
* argument; confirm in the implementation.
*/
private options;
/**
* NOTE(review): purpose is not visible from this declaration; presumably
* the index used for the first page; confirm in the implementation.
*/
private startIndex;
/**
* Creates a new reader.
* @param options - Optional reader settings; any subset of
* `PdfReaderOptions` may be supplied. Defaults for omitted fields are not
* visible in this declaration.
*/
constructor(options?: Partial<PdfReaderOptions>);
/**
* Opens a PDF document from a file path or an ArrayBuffer.
* @param filename - The file path (string) or the ArrayBuffer holding the
* PDF document.
* @returns The opened PDFDocument instance, returned synchronously.
*/
open(filename: string | ArrayBuffer): PDFDocument;
/**
* Renders all pages of a PDF document into canvases.
* @param doc - The PDFDocument to render.
* @param dpi - Optional resolution (dots per inch) used to render the PDF
* pages. Higher values improve OCR accuracy but increase memory usage.
* The default applied when omitted is not visible in this declaration.
* @returns A promise resolving to a map of page numbers to Canvas
* instances, where each page number corresponds to its rendered canvas
* representation.
*/
renderAll(doc: PDFDocument, dpi?: number): Promise<CanvasMap>;
/** Internal helper; its signature is not exposed by this declaration. */
private getCanvas;
/**
* Extracts text from all pages of a PDF document.
* @param doc - The PDFDocument to extract text from.
* @returns A promise resolving to a map of page numbers to extracted text
* data.
*/
getTexts(doc: PDFDocument): Promise<PageTexts>;
/** Internal helper; its signature is not exposed by this declaration. */
private extractTexts;
/** Internal helper; its signature is not exposed by this declaration. */
private mapStructureToPdfWord;
/** Internal helper; its signature is not exposed by this declaration. */
private mergeTextContent;
/** Internal helper; its signature is not exposed by this declaration. */
private filterTextContent;
/**
* Converts extracted text into structured lines.
* @param pageTexts - The extracted text data from a PDF.
* @returns A map of page numbers to structured lines.
*/
getLinesFromTexts(pageTexts: PageTexts): PageLines;
/**
* Converts extracted text into compact structured lines using a specified algorithm.
* @param pageTexts - The extracted text data from a PDF.
* @param algorithm - Optional algorithm for compacting lines (default: "middleY").
* @returns A map of page numbers to compact structured lines.
*/
getCompactLinesFromTexts(pageTexts: PageTexts, algorithm?: PdfCompactLineAlgorithm): CompactPageLines;
/**
* Saves rendered canvases as image files.
* @param canvasMap - The map of page numbers to canvases to save.
* @param filename - The base filename for the output images.
* @param foldername - Optional folder to save the images in (default: "out").
* @returns A promise that resolves with no value.
* NOTE(review): presumably it settles once all images are written; confirm
* in the implementation.
*/
dumpCanvasMap(canvasMap: Map<number, Canvas>, filename: string, foldername?: string): Promise<void>;
/**
* Determines if the PDF document is scanned based on text thresholds.
* @param pageTexts - The extracted text data from a PDF.
* @param options - Optional threshold settings for scanned detection; the
* thresholds applied when omitted are not visible in this declaration.
* @returns True if the document is likely scanned, false otherwise.
*/
isScanned(pageTexts: PageTexts, options?: PdfScannedThreshold): boolean;
/**
* Destroys the PDF document instance to free memory.
* @param doc - The PDFDocument instance to destroy.
*/
destroy(doc: PDFDocument): void;
/**
* Destroys a PDF page instance to free memory.
* @param page - The PDFPage instance to destroy.
*/
destroyPage(page: PDFPage): void;
}