@thecodingwhale/cv-processor
Version:
CV Processor to extract structured data from PDF resumes using TypeScript
223 lines (222 loc) • 9.07 kB
JavaScript
;
/**
 * Interop helper: exposes property `k` of module object `m` on target `o`
 * under the name `k2` (defaults to `k`). Reuses an existing helper found on
 * `this` when one is present.
 */
var __createBinding = (this && this.__createBinding) || (function () {
    // Without Object.create, fall back to a plain value copy.
    if (!Object.create) {
        return function (o, m, k, k2) {
            var plainAlias = k2 === undefined ? k : k2;
            o[plainAlias] = m[k];
        };
    }
    // Decides whether the source descriptor must be replaced by a forwarding getter.
    var needsForwardingGetter = function (desc, m) {
        if (!desc) {
            return true;
        }
        if ("get" in desc) {
            return !m.__esModule;
        }
        return desc.writable || desc.configurable;
    };
    return function (o, m, k, k2) {
        var alias = k2 === undefined ? k : k2;
        var descriptor = Object.getOwnPropertyDescriptor(m, k);
        if (needsForwardingGetter(descriptor, m)) {
            // Read through to the source on every access so later changes stay visible.
            descriptor = { enumerable: true, get: function () { return m[k]; } };
        }
        Object.defineProperty(o, alias, descriptor);
    };
})();
/**
 * Interop helper: stores `v` as the `default` member of `o`. Reuses an
 * existing helper found on `this` when one is present.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        return function (o, v) {
            var defaultDescriptor = { enumerable: true, value: v };
            Object.defineProperty(o, "default", defaultDescriptor);
        };
    }
    // Without Object.create, fall back to a plain assignment.
    return function (o, v) {
        o["default"] = v;
    };
})();
/**
 * Interop helper: returns `mod` unchanged when it carries the `__esModule`
 * flag; otherwise builds a wrapper object that re-exposes every own property
 * except "default" via `__createBinding` and stores `mod` itself as the
 * wrapper's `default` via `__setModuleDefault`.
 */
var __importStar = (this && this.__importStar) || (function () {
    // Picks the key-listing strategy on first use, then replaces itself with it.
    var listOwnKeys = function (target) {
        if (Object.getOwnPropertyNames) {
            listOwnKeys = Object.getOwnPropertyNames;
        }
        else {
            listOwnKeys = function (obj) {
                var collected = [];
                for (var key in obj) {
                    if (Object.prototype.hasOwnProperty.call(obj, key)) {
                        collected[collected.length] = key;
                    }
                }
                return collected;
            };
        }
        return listOwnKeys(target);
    };
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var result = {};
        if (mod != null) {
            var names = listOwnKeys(mod);
            for (var idx = 0; idx < names.length; idx++) {
                if (names[idx] === "default") {
                    continue;
                }
                __createBinding(result, mod, names[idx]);
            }
        }
        __setModuleDefault(result, mod);
        return result;
    };
})();
// Set the `__esModule` marker on this module's exports object (the same flag
// `__importStar` above checks before deciding whether to wrap a module).
Object.defineProperty(exports, "__esModule", { value: true });
// Placeholder value; the real class is assigned to this export further down,
// after the class definition.
exports.CSVGenerator = void 0;
// Node's `fs` and `path` modules, loaded through the `__importStar` helper.
const fs = __importStar(require("fs"));
const path = __importStar(require("path"));
/**
 * Coerce a value read from JSON into a finite number.
 * Numbers pass through; numeric strings are parsed; anything else
 * (missing, NaN, booleans, objects, non-numeric text) becomes 0.
 * @param value Raw value
 * @returns A finite number
 */
function toFiniteNumber(value) {
    const candidate = typeof value === 'string' && value.trim() !== '' ? Number(value) : value;
    return typeof candidate === 'number' && Number.isFinite(candidate) ? candidate : 0;
}
/**
 * Coerce a value read from JSON into text. Falsy values become the empty
 * string (matching the previous `|| ''` fallback); everything else is
 * stringified so later string operations cannot throw.
 * @param value Raw value
 * @returns A string
 */
function toText(value) {
    return value ? String(value) : '';
}
/**
 * CSV Generator class for processing output directories and creating CSV summaries.
 *
 * Expected layout: `<base>/<subdirectory>/*.json`. One CSV row is produced per
 * JSON file and the result is written to `<base>/summary.csv`.
 */
class CSVGenerator {
    /**
     * Generate CSV summary from a base folder containing subdirectories with JSON files.
     * Files that cannot be read or parsed are skipped and counted; they do not abort the run.
     * @param baseFolderPath Path to the base folder
     * @returns Promise that resolves once `summary.csv` is written (or when there is nothing to process)
     * @throws Error when the base folder does not exist or the CSV cannot be written
     */
    async generateCSV(baseFolderPath) {
        try {
            console.log(`🔍 Scanning base folder: ${baseFolderPath}`);
            // Validate base folder exists
            if (!fs.existsSync(baseFolderPath)) {
                throw new Error(`Base folder not found: ${baseFolderPath}`);
            }
            // Discover all JSON files in subdirectories
            const jsonFiles = await this.discoverJSONFiles(baseFolderPath);
            console.log(`📁 Found ${jsonFiles.length} JSON files to process`);
            if (jsonFiles.length === 0) {
                console.log('⚠️ No JSON files found in subdirectories');
                return;
            }
            // Extract data from each JSON file
            const csvData = [];
            let processedCount = 0;
            let errorCount = 0;
            for (const filePath of jsonFiles) {
                try {
                    const row = await this.extractDataFromJSON(filePath, baseFolderPath);
                    csvData.push(row);
                    processedCount++;
                    if (processedCount % 10 === 0) {
                        console.log(`📊 Processed ${processedCount}/${jsonFiles.length} files...`);
                    }
                }
                catch (error) {
                    // Best effort: one bad file must not prevent the summary of the others.
                    console.warn(`⚠️ Error processing ${filePath}: ${error}`);
                    errorCount++;
                }
            }
            console.log(`✅ Successfully processed ${processedCount} files`);
            if (errorCount > 0) {
                console.log(`❌ Failed to process ${errorCount} files`);
            }
            // Write CSV file
            const outputPath = path.join(baseFolderPath, 'summary.csv');
            await this.writeCSV(csvData, outputPath);
            console.log(`📄 CSV summary generated: ${outputPath}`);
            console.log(`📈 Total rows: ${csvData.length}`);
        }
        catch (error) {
            console.error('❌ Error generating CSV:', error);
            throw error;
        }
    }
    /**
     * Discover JSON files located directly inside the immediate subdirectories
     * of `basePath`. The scan is exactly one level deep: files in `basePath`
     * itself and in nested sub-subdirectories are not returned.
     *
     * Directory read failures are logged, not thrown. A subdirectory that
     * cannot be read is skipped so the remaining ones are still scanned.
     * @param basePath Base path to scan
     * @returns Array of JSON file paths
     */
    async discoverJSONFiles(basePath) {
        const jsonFiles = [];
        let entries;
        try {
            entries = fs.readdirSync(basePath, { withFileTypes: true });
        }
        catch (error) {
            console.error(`Error scanning directory ${basePath}:`, error);
            return jsonFiles;
        }
        for (const entry of entries) {
            if (!entry.isDirectory()) {
                continue;
            }
            const subdirPath = path.join(basePath, entry.name);
            try {
                // Look for JSON files directly in this subdirectory
                const files = fs.readdirSync(subdirPath, { withFileTypes: true });
                for (const file of files) {
                    if (file.isFile() && file.name.endsWith('.json')) {
                        jsonFiles.push(path.join(subdirPath, file.name));
                    }
                }
            }
            catch (error) {
                // Isolate the failure to this subdirectory only.
                console.error(`Error scanning directory ${subdirPath}:`, error);
            }
        }
        return jsonFiles;
    }
    /**
     * Extract data from a JSON file and convert to CSV row format.
     * Missing or malformed fields fall back to `0` (numeric columns) or `''`
     * (text columns); numeric strings such as `"0.5"` are parsed.
     * @param filePath Path to the JSON file
     * @param baseFolderPath Base folder path for relative subdirectory calculation
     * @returns CSV row data
     * @throws Error when the file cannot be read or parsed; the original error is attached as `cause`
     */
    async extractDataFromJSON(filePath, baseFolderPath) {
        try {
            // Strip a leading UTF-8 BOM, which JSON.parse rejects.
            const jsonContent = fs.readFileSync(filePath, 'utf-8').replace(/^\uFEFF/, '');
            const data = JSON.parse(jsonContent);
            // Calculate relative subdirectory path
            const relativePath = path.relative(baseFolderPath, filePath);
            const subdirectory = path.dirname(relativePath);
            // Extract data with fallbacks for missing values
            const tokenUsage = data.tokenUsage || {};
            const metadata = data.metadata || {};
            const emptinessPercentage = metadata.emptinessPercentage || {};
            return {
                subdirectory,
                totalTokens: toFiniteNumber(tokenUsage.totalTokens),
                estimatedCost: toFiniteNumber(tokenUsage.estimatedCost),
                processingTime: toFiniteNumber(metadata.processingTime),
                conversionType: toText(metadata.conversionType),
                provider: toText(metadata.provider),
                model: toText(metadata.model),
                emptinessPercentage: toFiniteNumber(emptinessPercentage.percentage),
                totalFields: toFiniteNumber(emptinessPercentage.totalFields),
                nonEmptyFields: toFiniteNumber(emptinessPercentage.nonEmptyFields),
                expectedTotalFields: toFiniteNumber(emptinessPercentage.expectedTotalFields),
                expectedPercentage: toFiniteNumber(emptinessPercentage.expectedPercentage),
            };
        }
        catch (error) {
            throw new Error(`Failed to parse JSON file ${filePath}: ${error}`, { cause: error });
        }
    }
    /**
     * Write CSV data to file. Headers and cell renderers live in one column
     * table so they cannot drift apart. Numeric cells are coerced before
     * formatting, so a row holding a non-number cannot abort the whole file.
     * @param data Array of CSV row data
     * @param outputPath Output file path
     * @throws Error when the file cannot be written; the original error is attached as `cause`
     */
    async writeCSV(data, outputPath) {
        try {
            const text = (key) => (row) => this.escapeCsvValue(row[key]);
            const integer = (key) => (row) => String(toFiniteNumber(row[key]));
            const fixed = (key, digits) => (row) => toFiniteNumber(row[key]).toFixed(digits);
            // Column order defines both the header row and every data row.
            const columns = [
                { header: 'subdirectory', render: text('subdirectory') },
                { header: 'totalTokens', render: integer('totalTokens') },
                { header: 'estimatedCost', render: fixed('estimatedCost', 6) },
                { header: 'processingTime', render: fixed('processingTime', 3) },
                { header: 'conversionType', render: text('conversionType') },
                { header: 'provider', render: text('provider') },
                { header: 'model', render: text('model') },
                { header: 'emptinessPercentage', render: fixed('emptinessPercentage', 2) },
                { header: 'totalFields', render: integer('totalFields') },
                { header: 'nonEmptyFields', render: integer('nonEmptyFields') },
                { header: 'expectedTotalFields', render: integer('expectedTotalFields') },
                { header: 'expectedPercentage', render: fixed('expectedPercentage', 2) },
            ];
            const csvLines = [columns.map((column) => column.header).join(',')];
            for (const row of data) {
                csvLines.push(columns.map((column) => column.render(row)).join(','));
            }
            // Write to file
            const csvContent = csvLines.join('\n');
            fs.writeFileSync(outputPath, csvContent, 'utf-8');
        }
        catch (error) {
            throw new Error(`Failed to write CSV file ${outputPath}: ${error}`, { cause: error });
        }
    }
    /**
     * Escape CSV values that contain commas, quotes, carriage returns, or newlines.
     * Non-string input is stringified first; `null` and `undefined` become the
     * empty string.
     * @param value Value to escape
     * @returns Escaped value
     */
    escapeCsvValue(value) {
        const textValue = value == null ? '' : String(value);
        if (/[",\r\n]/.test(textValue)) {
            return `"${textValue.replace(/"/g, '""')}"`;
        }
        return textValue;
    }
}
// Publish the class, replacing the `void 0` placeholder assigned to this export near the top of the file.
exports.CSVGenerator = CSVGenerator;