UNPKG

pdf-tax-reader-cl

Version:

PDF scraping library for Chilean tax documents. Extract emitter name, economic activities, and address from structured PDF documents like 'CARPETA TRIBUTARIA ELECTRÓNICA PARA SOLICITAR CRÉDITOS'

323 lines (274 loc) 9.43 kB
const fs = require('fs'); const pdfParse = require('pdf-parse'); /** * Validate if a file is a valid PDF * @param {Buffer} dataBuffer - File buffer * @returns {boolean} - True if valid PDF, false otherwise */ function isValidPDF(dataBuffer) { try { // Check if file starts with PDF signature const pdfSignature = dataBuffer.toString('ascii', 0, 4); if (pdfSignature !== '%PDF') { return false; } // Check if file contains PDF end signature (more flexible) const fileContent = dataBuffer.toString('ascii'); if (!fileContent.includes('%%EOF')) { return false; } return true; } catch (error) { return false; } } /** * Validate file extension * @param {string} filePath - Path to the file * @returns {boolean} - True if file has .pdf extension */ function hasValidExtension(filePath) { const extension = filePath.toLowerCase().split('.').pop(); return extension === 'pdf'; } /** * Extract emitter name from PDF text * @param {string} text - PDF text content * @returns {string} - Emitter name */ function extractEmitterName(text) { // Look for "Nombre del emisor:" pattern const emitterPattern = /Nombre del emisor:\s*([^\n]+)/i; const match = text.match(emitterPattern); if (match && match[1]) { return match[1].trim(); } return null; } /** * Extract economic activities from PDF text * @param {string} text - PDF text content * @returns {Array<string>} - Array of economic activities */ function extractEconomicActivities(text) { const activities = []; // Find the "Actividades Económicas:" section const activitiesSection = text.split('Actividades Económicas:')[1]; if (!activitiesSection) { return activities; } // Split by lines and look for activity patterns const lines = activitiesSection.split('\n'); for (let i = 0; i < lines.length; i++) { const line = lines[i].trim(); // Skip empty lines and look for activity patterns if (line && !line.includes('Categoría tributaria:')) { // Check if line contains activity code (6 digits) or is a general description if (/^\d{6}\s/.test(line) || line.includes('ASES.COMER.PUBLICIDAD') || line.includes('VENTA') || line.includes('ACTIVIDADES') || line.includes('SERVICIOS') || line.includes('OTRAS ACTIVIDADES') || line.includes('ENSEÑANZA')) { activities.push(line); } } // Stop when we reach the next section if (line.includes('Categoría tributaria:')) { break; } } return activities; } /** * Extract address from PDF text * @param {string} text - PDF text content * @returns {string} - Address */ function extractAddress(text) { // Look for "Domicilio:" pattern const addressPattern = /Domicilio:\s*([^\n]+)/i; const match = text.match(addressPattern); if (match && match[1]) { return match[1].trim(); } return null; } /** * Check if the document appears to be a Chilean tax document * @param {string} text - PDF text content * @returns {boolean} - True if appears to be a tax document */ function isTaxDocument(text) { const taxDocumentIndicators = [ 'CARPETA TRIBUTARIA', 'SII', 'Contribuyente', 'RUT del emisor', 'Actividades Económicas', 'Domicilio', 'Categoría tributaria' ]; const foundIndicators = taxDocumentIndicators.filter(indicator => text.includes(indicator) ); // At least 3 indicators should be present return foundIndicators.length >= 3; } /** * Validate extracted data completeness * @param {Object} data - Extracted data * @returns {Object} - Validation result */ function validateExtractedData(data) { const missingFields = []; if (!data.emitterName) { missingFields.push('emitter name'); } if (!data.economicActivities || data.economicActivities.length === 0) { missingFields.push('economic activities'); } if (!data.address) { missingFields.push('address'); } return { isValid: missingFields.length === 0, missingFields: missingFields }; } /** * Main function to extract data from PDF tax documents * @param {string} pdfPath - Path to the PDF file * @returns {Promise<Object>} - Extracted data object */ async function extractTaxData(pdfPath) { try { // Check if file exists if (!fs.existsSync(pdfPath)) { throw new Error(`File not found: ${pdfPath}`); } // Check file extension if (!hasValidExtension(pdfPath)) { throw new Error(`Invalid file extension. Expected .pdf, got: ${pdfPath.split('.').pop()}`); } // Get file stats const stats = fs.statSync(pdfPath); if (stats.size === 0) { throw new Error('File is empty'); } // Read the PDF file const dataBuffer = fs.readFileSync(pdfPath); // Validate PDF format if (!isValidPDF(dataBuffer)) { throw new Error('Invalid PDF format. File does not appear to be a valid PDF document.'); } // Parse the PDF content const data = await pdfParse(dataBuffer); // Check if PDF has text content if (!data.text || data.text.trim().length === 0) { throw new Error('PDF appears to be empty or contains no extractable text (possibly scanned images)'); } const text = data.text; // Validate that this appears to be a tax document if (!isTaxDocument(text)) { throw new Error('Document does not appear to be a Chilean tax document. Missing expected tax document structure.'); } // Extract the required information const extractedData = { emitterName: extractEmitterName(text), economicActivities: extractEconomicActivities(text), address: extractAddress(text) }; // Validate extracted data const validationResult = validateExtractedData(extractedData); if (!validationResult.isValid) { throw new Error(`Data extraction incomplete: ${validationResult.missingFields.join(', ')}`); } return extractedData; } catch (error) { console.error('Error processing PDF:', error.message); throw error; } } /** * Process multiple PDF files in a directory * @param {string} directoryPath - Path to directory containing PDF files * @returns {Promise<Array>} - Array of extracted data objects */ async function processMultiplePDFs(directoryPath) { const results = []; try { const files = fs.readdirSync(directoryPath); const pdfFiles = files.filter(file => file.toLowerCase().endsWith('.pdf')); for (const pdfFile of pdfFiles) { const pdfPath = `${directoryPath}/${pdfFile}`; console.log(`Processing: ${pdfFile}`); try { const data = await extractTaxData(pdfPath); results.push({ filename: pdfFile, data: data }); } catch (error) { console.error(`Error processing ${pdfFile}:`, error.message); results.push({ filename: pdfFile, error: error.message }); } } return results; } catch (error) { console.error('Error reading directory:', error.message); throw error; } } /** * Save extracted data to JSON file * @param {Object} data - Data to save * @param {string} outputPath - Output file path */ function saveToJSON(data, outputPath) { try { fs.writeFileSync(outputPath, JSON.stringify(data, null, 2)); console.log(`Data saved to: ${outputPath}`); } catch (error) { console.error('Error saving data:', error.message); throw error; } } // Export functions for use in other modules module.exports = { extractTaxData, processMultiplePDFs, saveToJSON, extractEmitterName, extractEconomicActivities, extractAddress, isValidPDF, hasValidExtension, isTaxDocument, validateExtractedData }; // Main execution if run directly if (require.main === module) { const pdfPath = process.argv[2]; if (!pdfPath) { console.log('Usage: node index.js <pdf-file-path>'); console.log('Example: node index.js ./documents/tax-document.pdf'); process.exit(1); } extractTaxData(pdfPath) .then(data => { console.log('Extracted Data:'); console.log(JSON.stringify(data, null, 2)); // Save to JSON file const outputPath = pdfPath.replace('.pdf', '_extracted.json'); saveToJSON(data, outputPath); }) .catch(error => { console.error('Error:', error.message); process.exit(1); }); }