pdf-tax-reader-cl
Version:
PDF scraping library for Chilean tax documents. Extracts the emitter name, economic activities, and address from structured PDF documents such as the 'CARPETA TRIBUTARIA ELECTRÓNICA PARA SOLICITAR CRÉDITOS'.
323 lines (274 loc) • 9.43 kB
JavaScript
const fs = require('fs');
const pdfParse = require('pdf-parse');
/**
 * Check whether a buffer looks like a PDF file.
 *
 * Two byte-level markers are required: the buffer must begin with the
 * `%PDF` signature and must contain an `%%EOF` marker somewhere in it.
 * This is a cheap sanity check, not a structural validation of the PDF.
 *
 * @param {Buffer} dataBuffer - Raw file contents
 * @returns {boolean} True when both markers are present; false otherwise,
 *   including when `dataBuffer` is not a Buffer
 */
function isValidPDF(dataBuffer) {
  if (!Buffer.isBuffer(dataBuffer)) {
    return false;
  }
  // latin1 maps every byte to exactly one character, so only the exact
  // signature bytes compare equal ('ascii' decoding would clear the high bit
  // and let other byte values pass as '%PDF').
  if (dataBuffer.toString('latin1', 0, 4) !== '%PDF') {
    return false;
  }
  // Search the bytes directly instead of decoding the whole file into a string.
  return dataBuffer.includes('%%EOF');
}
/**
 * Check whether a path ends with the `.pdf` extension (case-insensitive).
 *
 * The dot is part of the check, so a name with no extension (for example a
 * file literally called `pdf`) is rejected.
 *
 * @param {string} filePath - Path to the file
 * @returns {boolean} True if the path ends with `.pdf`; false otherwise,
 *   including when `filePath` is not a string
 */
function hasValidExtension(filePath) {
  if (typeof filePath !== 'string') {
    return false;
  }
  return filePath.toLowerCase().endsWith('.pdf');
}
/**
 * Pull the emitter's name out of the PDF text.
 *
 * Finds the first "Nombre del emisor:" label (case-insensitive), skips any
 * whitespace after it (line breaks included), and takes the characters up to
 * the next newline.
 *
 * @param {string} text - PDF text content
 * @returns {string|null} The trimmed name, or null when the label is absent
 */
function extractEmitterName(text) {
  const found = text.match(/Nombre del emisor:\s*([^\n]+)/i);
  const rawName = found ? found[1] : undefined;
  return rawName ? rawName.trim() : null;
}
/**
 * Collect the economic activity lines listed in the PDF text.
 *
 * Only the text after the first "Actividades Económicas:" heading is scanned,
 * line by line, until a line containing "Categoría tributaria:" is reached.
 * A trimmed, non-empty line is kept when it starts with a six-digit code
 * followed by whitespace, or when it contains one of the known keywords.
 *
 * @param {string} text - PDF text content
 * @returns {Array<string>} Trimmed activity lines in document order; empty
 *   when the heading is missing or nothing follows it
 */
function extractEconomicActivities(text) {
  const sectionEndMarker = 'Categoría tributaria:';
  const activityKeywords = [
    'ASES.COMER.PUBLICIDAD',
    'VENTA',
    'ACTIVIDADES',
    'SERVICIOS',
    'OTRAS ACTIVIDADES',
    'ENSEÑANZA',
  ];

  const section = text.split('Actividades Económicas:')[1];
  if (!section) {
    return [];
  }

  const collected = [];
  for (const rawLine of section.split('\n')) {
    const entry = rawLine.trim();

    // The next section's heading ends the scan and is never itself collected.
    if (entry.includes(sectionEndMarker)) {
      break;
    }
    if (entry === '') {
      continue;
    }

    const startsWithCode = /^\d{6}\s/.test(entry);
    const mentionsKeyword = activityKeywords.some((keyword) => entry.includes(keyword));
    if (startsWithCode || mentionsKeyword) {
      collected.push(entry);
    }
  }
  return collected;
}
/**
 * Pull the address out of the PDF text.
 *
 * Finds the first "Domicilio:" label (case-insensitive), skips any whitespace
 * after it (line breaks included), and takes the characters up to the next
 * newline.
 *
 * @param {string} text - PDF text content
 * @returns {string|null} The trimmed address, or null when the label is absent
 */
function extractAddress(text) {
  const hit = text.match(/Domicilio:\s*([^\n]+)/i);
  if (hit === null) {
    return null;
  }
  const [, rawAddress] = hit;
  if (!rawAddress) {
    return null;
  }
  return rawAddress.trim();
}
/**
 * Heuristically decide whether the text comes from a Chilean tax document.
 *
 * Counts how many of a fixed list of marker phrases occur in the text
 * (case-sensitive substring match) and accepts the document when at least
 * three of them are present.
 *
 * @param {string} text - PDF text content
 * @returns {boolean} True if at least three markers are found
 */
function isTaxDocument(text) {
  const minimumHits = 3;
  const markers = [
    'CARPETA TRIBUTARIA',
    'SII',
    'Contribuyente',
    'RUT del emisor',
    'Actividades Económicas',
    'Domicilio',
    'Categoría tributaria',
  ];

  let hits = 0;
  for (const marker of markers) {
    if (text.includes(marker)) {
      hits += 1;
    }
  }
  return hits >= minimumHits;
}
/**
 * Report which of the expected fields are missing from the extracted data.
 *
 * A field counts as missing when it is falsy; `economicActivities` also
 * counts as missing when it is an empty array.
 *
 * @param {Object} data - Extracted data with `emitterName`,
 *   `economicActivities` and `address` properties
 * @returns {{isValid: boolean, missingFields: Array<string>}} `isValid` is
 *   true when nothing is missing; `missingFields` lists the human-readable
 *   names of the missing fields
 */
function validateExtractedData(data) {
  const fieldChecks = [
    ['emitter name', !data.emitterName],
    [
      'economic activities',
      !data.economicActivities || data.economicActivities.length === 0,
    ],
    ['address', !data.address],
  ];

  const missingFields = fieldChecks
    .filter(([, isMissing]) => isMissing)
    .map(([label]) => label);

  return {
    isValid: missingFields.length === 0,
    missingFields,
  };
}
/**
 * Read a PDF from disk and extract the emitter name, economic activities and
 * address from its text.
 *
 * The steps run in a fixed order and each one throws an Error on failure:
 * the file must exist, have a .pdf extension, be non-empty, pass the PDF
 * signature check, yield extractable text, look like a tax document, and
 * finally produce all three fields. Any error is logged with console.error
 * and then rethrown unchanged.
 *
 * @param {string} pdfPath - Path to the PDF file
 * @returns {Promise<Object>} Resolves to an object with `emitterName`,
 *   `economicActivities` and `address`
 * @throws {Error} When any of the checks above fails, or when reading or
 *   parsing the file throws
 */
async function extractTaxData(pdfPath) {
  try {
    // --- File-level checks -------------------------------------------------
    const fileExists = fs.existsSync(pdfPath);
    if (!fileExists) {
      throw new Error(`File not found: ${pdfPath}`);
    }

    if (!hasValidExtension(pdfPath)) {
      throw new Error(`Invalid file extension. Expected .pdf, got: ${pdfPath.split('.').pop()}`);
    }

    const { size: byteCount } = fs.statSync(pdfPath);
    if (byteCount === 0) {
      throw new Error('File is empty');
    }

    // --- Content checks ----------------------------------------------------
    const fileBytes = fs.readFileSync(pdfPath);
    if (!isValidPDF(fileBytes)) {
      throw new Error('Invalid PDF format. File does not appear to be a valid PDF document.');
    }

    const parsed = await pdfParse(fileBytes);
    const hasText = Boolean(parsed.text) && parsed.text.trim().length !== 0;
    if (!hasText) {
      throw new Error('PDF appears to be empty or contains no extractable text (possibly scanned images)');
    }

    const text = parsed.text;
    if (!isTaxDocument(text)) {
      throw new Error('Document does not appear to be a Chilean tax document. Missing expected tax document structure.');
    }

    // --- Field extraction --------------------------------------------------
    const emitterName = extractEmitterName(text);
    const economicActivities = extractEconomicActivities(text);
    const address = extractAddress(text);
    const extractedData = { emitterName, economicActivities, address };

    const { isValid, missingFields } = validateExtractedData(extractedData);
    if (!isValid) {
      throw new Error(`Data extraction incomplete: ${missingFields.join(', ')}`);
    }

    return extractedData;
  } catch (error) {
    console.error('Error processing PDF:', error.message);
    throw error;
  }
}
/**
 * Run extractTaxData over every `.pdf` file (case-insensitive match on the
 * file name) directly inside a directory, one file at a time.
 *
 * A failure on one file does not stop the batch: it is logged and recorded
 * in that file's result entry. A failure to list the directory is logged and
 * rethrown.
 *
 * @param {string} directoryPath - Path to directory containing PDF files
 * @returns {Promise<Array>} One entry per PDF file, in directory listing
 *   order: `{ filename, data }` on success or `{ filename, error }` (the
 *   error message) on failure
 */
async function processMultiplePDFs(directoryPath) {
  const outcomes = [];
  try {
    const pdfNames = fs
      .readdirSync(directoryPath)
      .filter((entryName) => entryName.toLowerCase().endsWith('.pdf'));

    for (const name of pdfNames) {
      const fullPath = `${directoryPath}/${name}`;
      console.log(`Processing: ${name}`);

      const outcome = { filename: name };
      try {
        outcome.data = await extractTaxData(fullPath);
      } catch (error) {
        console.error(`Error processing ${name}:`, error.message);
        outcome.error = error.message;
      }
      outcomes.push(outcome);
    }

    return outcomes;
  } catch (error) {
    console.error('Error reading directory:', error.message);
    throw error;
  }
}
/**
 * Write data to a file as pretty-printed JSON (two-space indentation).
 *
 * Logs the destination on success. If serializing or writing fails, the
 * error is logged with console.error and rethrown.
 *
 * @param {Object} data - Data to save
 * @param {string} outputPath - Output file path
 */
function saveToJSON(data, outputPath) {
  try {
    const serialized = JSON.stringify(data, null, 2);
    fs.writeFileSync(outputPath, serialized);
    console.log(`Data saved to: ${outputPath}`);
  } catch (error) {
    console.error('Error saving data:', error.message);
    throw error;
  }
}
// Public API of this module: the single-file and directory entry points and
// the JSON writer come first, followed by the individual extraction and
// validation helpers so callers can also use each step on its own.
module.exports = {
extractTaxData,
processMultiplePDFs,
saveToJSON,
extractEmitterName,
extractEconomicActivities,
extractAddress,
isValidPDF,
hasValidExtension,
isTaxDocument,
validateExtractedData
};
// Command-line entry point. Runs only when this file is executed directly
// (require.main === module), not when it is loaded by another module.
// Usage: node index.js <pdf-file-path>
// Prints the extracted data and writes it next to the input as
// <name>_extracted.json; exits with status 1 on a missing argument or on error.
if (require.main === module) {
  const pdfPath = process.argv[2];
  if (!pdfPath) {
    console.log('Usage: node index.js <pdf-file-path>');
    console.log('Example: node index.js ./documents/tax-document.pdf');
    process.exit(1);
  }
  extractTaxData(pdfPath)
    .then(data => {
      console.log('Extracted Data:');
      console.log(JSON.stringify(data, null, 2));
      // Replace only the trailing extension, ignoring case. A plain-string
      // replace of '.pdf' would leave 'DOC.PDF' unchanged (so the JSON would
      // overwrite the source PDF) and would rewrite the first '.pdf' anywhere
      // in the path, e.g. inside a directory name.
      const outputPath = pdfPath.replace(/\.pdf$/i, '_extracted.json');
      saveToJSON(data, outputPath);
    })
    .catch(error => {
      console.error('Error:', error.message);
      process.exit(1);
    });
}