pdf-tax-reader-cl
Version:
PDF scraping library for Chilean tax documents. Extracts the emitter name, economic activities, and address from structured PDF documents such as 'CARPETA TRIBUTARIA ELECTRÓNICA PARA SOLICITAR CRÉDITOS'.
102 lines (89 loc) • 2.81 kB
TypeScript
/**
* PDF Tax Scraping Library for Chilean Tax Documents
*
* This library extracts specific data from Chilean tax PDF documents
* like "CARPETA TRIBUTARIA ELECTRÓNICA PARA SOLICITAR CRÉDITOS"
*/
/**
* Fields extracted from a single tax PDF.
*
* All three properties are required keys: the two scalar fields are typed
* `string | null` (not optional), and the activities are always an array.
*/
export interface ExtractedTaxData {
/** Name of the document emitter/company; `null` presumably means it could not be found in the text (confirm against the implementation) */
emitterName: string | null;
/** List of economic activities, one string per activity */
economicActivities: string[];
/** Registered address; `null` presumably means it could not be found in the text (confirm against the implementation) */
address: string | null;
}
/**
* Outcome of a completeness check on extracted data
* (the return type of `validateExtractedData`).
*/
export interface ValidationResult {
/** Whether the data is valid */
isValid: boolean;
/** List of missing fields (presumably the names of the absent `ExtractedTaxData` properties; confirm against the implementation) */
missingFields: string[];
}
/**
* Per-file outcome, as returned in the array from `processMultiplePDFs`.
*
* NOTE(review): `data` and `error` are both optional, so the type itself does
* not guarantee that exactly one of them is set. Presumably `data` is present
* on success and `error` on failure; callers should check both.
*/
export interface ProcessingResult {
/** Original filename */
filename: string;
/** Extracted data; presumably present only when processing succeeded */
data?: ExtractedTaxData;
/** Error message if processing failed */
error?: string;
}
/**
* Extract tax data (emitter name, economic activities, address) from a PDF file
* @param pdfPath - Path to the PDF file
* @returns Promise that resolves with the extracted tax data
* @throws Error if file is invalid, not found, or not a tax document.
* NOTE(review): because the function returns a Promise, these failures
* presumably surface as a rejection rather than a synchronous throw; confirm
* against the implementation.
*/
export function extractTaxData(pdfPath: string): Promise<ExtractedTaxData>;
/**
* Process multiple PDF files in a directory
* @param directoryPath - Path to directory containing PDF files
* @returns Promise with array of processing results; each entry carries the
* file's name plus either extracted data or an error message (see
* `ProcessingResult`)
*/
export function processMultiplePDFs(directoryPath: string): Promise<ProcessingResult[]>;
/**
* Save data to a JSON file
*
* The return type is `void`, not a Promise, so callers have nothing to await.
*
* @param data - Value to save. Typed as `unknown` so that any value is accepted
* without leaking `any` into callers' type checking. NOTE(review): the value is
* presumably expected to be JSON-serializable; confirm against the implementation.
* @param outputPath - Output file path
*/
export function saveToJSON(data: unknown, outputPath: string): void;
/**
* Extract the emitter (company) name from PDF text
* @param text - PDF text content (already extracted from the PDF as a string)
* @returns Emitter name or null
*/
export function extractEmitterName(text: string): string | null;
/**
* Extract economic activities from PDF text
* @param text - PDF text content (already extracted from the PDF as a string)
* @returns Array of economic activities; the return type has no `null` case,
* so "none found" is presumably reported as an empty array (confirm against
* the implementation)
*/
export function extractEconomicActivities(text: string): string[];
/**
* Extract the registered address from PDF text
* @param text - PDF text content (already extracted from the PDF as a string)
* @returns Address or null
*/
export function extractAddress(text: string): string | null;
/**
* Validate if a file is a valid PDF, based on its contents
* @param dataBuffer - File buffer (the file's bytes as a Node.js `Buffer`)
* @returns True if valid PDF, false otherwise
*/
export function isValidPDF(dataBuffer: Buffer): boolean;
/**
* Validate file extension
* @param filePath - Path to the file
* @returns True if file has .pdf extension.
* NOTE(review): whether the comparison is case-insensitive (e.g. `.PDF`) is
* not visible from this declaration; confirm against the implementation.
*/
export function hasValidExtension(filePath: string): boolean;
/**
* Check if the document appears to be a Chilean tax document
* @param text - PDF text content (already extracted from the PDF as a string)
* @returns True if the text appears to be a tax document; per the wording
* "appears", this is presumably a heuristic check rather than a strict
* validation (confirm against the implementation)
*/
export function isTaxDocument(text: string): boolean;
/**
* Validate extracted data completeness
* @param data - Extracted data to check
* @returns Validation result, with an overall validity flag and the list of
* missing fields (see `ValidationResult`)
*/
export function validateExtractedData(data: ExtractedTaxData): ValidationResult;