ppu-pdf
Version:
Easily extract text from digital PDF files with coordinates and font size included, and optionally group text by lines or render scanned PDFs to canvas/PNG.
188 lines (187 loc) • 5.74 kB
TypeScript
import type { Canvas } from "@napi-rs/canvas";
/**
* Represents a bounding box in a PDF document.
*
* The box is described by two corner points: (x0, y0) is the top-left
* corner and (x1, y1) is the bottom-right corner.
*
* NOTE(review): the unit and the origin of the coordinate system are not
* specified in this file — confirm against the extraction code.
*/
export interface PdfBbox {
/** The x-coordinate of the top-left corner. */
x0: number;
/** The y-coordinate of the top-left corner. */
y0: number;
/** The x-coordinate of the bottom-right corner. */
x1: number;
/** The y-coordinate of the bottom-right corner. */
y1: number;
}
/**
 * Metadata associated with a PDF text element.
 *
 * Throughout this interface the empty string `""` is the "unknown" value
 * for fields whose real value could not be determined.
 */
export interface PdfMetadata {
  /** Writing orientation: horizontal, vertical, or `""` if unknown. */
  writing: "horizontal" | "vertical" | "";
  /** Text direction: left-to-right (ltr), right-to-left (rtl), or `""` if unknown. */
  direction: "ltr" | "rtl" | "";
  /** Font details. */
  font: {
    /** Font name. */
    name: string;
    /** Font size. */
    size: number;
    /**
     * Font family, or empty string if unknown.
     *
     * Typed as plain `string`: `""` is already a `string`, so a
     * `string | ""` union adds nothing and collapses to `string`.
     */
    family: string;
    /** Font weight: normal, bold, or `""` if unknown. */
    weight: "normal" | "bold" | "";
    /** Font style: normal, italic, or `""` if unknown. */
    style: "normal" | "italic" | "";
  };
  /** Indicates if this element is at the end of a line. */
  hasEOL: boolean;
  /** The page number where this element appears. */
  pageNum: number;
}
/**
* Represents the width and height of a PDF element.
*
* NOTE(review): the unit is not specified in this file; presumably it is the
* same unit as the bounding-box coordinates — confirm.
*/
export interface PdfDimension {
/** The width of the element. */
width: number;
/** The height of the element. */
height: number;
}
/**
* Represents a single word extracted from a PDF.
*
* Bundles the word's text with its position (`bbox`), its size
* (`dimension`) and its descriptive metadata (`metadata`).
*/
export interface PdfWord {
/** The extracted text. */
text: string;
/** The bounding box of the word. */
bbox: PdfBbox;
/**
* The dimensions of the word.
* NOTE(review): presumably the width/height of `bbox` — confirm.
*/
dimension: PdfDimension;
/** Additional metadata about the word (writing mode, direction, font, page). */
metadata: PdfMetadata;
}
/**
* Represents a single word extracted from a PDF in TOON format.
*
* Typed as a string-keyed record of string values.
* NOTE(review): the set of keys is not defined in this file — confirm
* against the code that produces these records.
*/
export type PdfToonWord = Record<string, string>;
/**
* Represents all words extracted from a page, together with the page's
* full text, an extraction confidence value and a TOON rendering.
*/
export interface PdfTexts {
/** List of extracted words. */
words: PdfWord[];
/** The full text of the extracted words. */
fullText: string;
/**
* The overall confidence of the text extraction.
* NOTE(review): the value range (e.g. 0–1 vs 0–100) is not specified
* here — confirm.
*/
confidence: number;
/** The full text of the extracted words in TOON notation, for LLM-friendly input. */
toon: string;
}
/**
* Represents a single line of text in a PDF.
*
* A line carries its own text, position and size, plus the individual
* words it is made of.
*/
export interface PdfLine {
/** The extracted text content of the line. */
text: string;
/** The bounding box of the line. */
bbox: PdfBbox;
/** The dimensions of the line. */
dimension: PdfDimension;
/** The average font size of the text in this line. */
averageFontSize: number;
/** The words that make up this line. */
words: PdfWord[];
}
/**
* Represents a single line of text in a PDF in TOON format.
*
* Typed as a string-keyed record whose values are `PdfToonWord` records.
* NOTE(review): the meaning of the keys is not defined in this file — confirm.
*/
export type PdfToonLine = Record<string, PdfToonWord>;
/**
* Represents a mapping of page numbers to their corresponding extracted texts.
*
* NOTE(review): whether page numbers are 0-based or 1-based is not
* specified in this file — confirm.
*/
export type PageTexts = Map<number, PdfTexts>;
/**
* Represents a mapping of page numbers to their corresponding extracted lines.
*
* Each page number maps to the list of `PdfLine` entries for that page.
* NOTE(review): whether page numbers are 0-based or 1-based is not
* specified in this file — confirm.
*/
export type PageLines = Map<number, PdfLine[]>;
/**
* Represents lines serialized as a single string in TOON format.
*/
export type PageToonLines = string;
/**
* Represents a compact version of a PDF word with only text and bounding box.
*
* Has the same `text` and `bbox` fields as `PdfWord`, without `dimension`
* and `metadata`.
*/
export interface CompactPdfWord {
/** The extracted text. */
text: string;
/** The bounding box of the word. */
bbox: PdfBbox;
}
/**
* Represents a compact version of a PDF line with only text, bounding box, and words.
*
* Compared with `PdfLine`, it omits `dimension` and `averageFontSize`, and
* its words are `CompactPdfWord` entries rather than full `PdfWord` entries.
*/
export interface CompactPdfLine {
/** The extracted text content of the line. */
text: string;
/** The bounding box of the line. */
bbox: PdfBbox;
/** The words that make up this line in a compact format. */
words: CompactPdfWord[];
}
/**
* Represents a mapping of page numbers to their corresponding compact extracted lines.
*
* Each page number maps to the list of `CompactPdfLine` entries for that page.
*/
export type CompactPageLines = Map<number, CompactPdfLine[]>;
/**
* Algorithm types for compacting PDF lines.
*
* Allowed values are `"middleY"` and `"y0"`.
* NOTE(review): presumably these select whether words are grouped into
* lines by their vertical midpoint or by their `y0` coordinate — confirm
* against the implementation.
*/
export type PdfCompactLineAlgorithm = "middleY" | "y0";
/**
* Represents a font used in the PDF, identified by a file path and a name.
*/
export interface PdfFont {
/** The file path of the font. */
path: string;
/** The name of the font. */
name: string;
}
/**
* Options for configuring the PDF reader.
*
* NOTE(review): `fonts` and `enableToon` are required, while every other
* option is optional — confirm this is intentional. Defaults for the
* optional fields are not defined in this file.
*/
export interface PdfReaderOptions {
/** Enable verbose logging. */
verbose?: boolean;
/** Exclude footer text from extraction. */
excludeFooter?: boolean;
/** Exclude header text from extraction. */
excludeHeader?: boolean;
/** Extract raw text without additional processing. */
raw?: boolean;
/**
* Height percentage threshold for detecting headers.
* NOTE(review): the expected range (0–1 vs 0–100) is not specified here — confirm.
*/
headerFromHeightPercentage?: number;
/**
* Height percentage threshold for detecting footers.
* NOTE(review): the expected range (0–1 vs 0–100) is not specified here — confirm.
*/
footerFromHeightPercentage?: number;
/** Merge text elements that are close to each other. */
mergeCloseTextNeighbor?: boolean;
/** Use a simpler sorting algorithm for text layout. */
simpleSortAlgorithm?: boolean;
/** Scaling factor for text positioning adjustments. */
scale?: number;
/** List of fonts used in the document (required). */
fonts: PdfFont[];
/** Whether to turn on/off TOON notation format extraction for LLM-friendly text (required). */
enableToon: boolean;
}
/**
* Represents thresholds for determining whether a PDF is scanned.
*
* NOTE(review): presumably a page that falls below these minimums is
* treated as scanned — confirm how the two values are combined.
*/
export interface PdfScannedThreshold {
/** Minimum number of words per page. */
wordsPerPage: number;
/** Minimum text length per page. */
textLength: number;
}
/**
* Represents a mapping of page numbers to their corresponding canvas elements.
*
* `Canvas` is the type imported from `@napi-rs/canvas`.
*/
export type CanvasMap = Map<number, Canvas>;