UNPKG

@thecodingwhale/cv-processor

Version:

CV Processor to extract structured data from PDF resumes using TypeScript

223 lines (222 loc) 9.07 kB
"use strict";
// --- TypeScript-emitted interop helpers (kept as generated; they make
// `import * as x from "mod"` work for CommonJS modules). ---
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
      desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.CSVGenerator = void 0;
const fs = __importStar(require("fs"));
const path = __importStar(require("path"));

/** Column order of the generated CSV; also the keys of each row object. */
const CSV_HEADERS = [
    'subdirectory',
    'totalTokens',
    'estimatedCost',
    'processingTime',
    'conversionType',
    'provider',
    'model',
    'emptinessPercentage',
    'totalFields',
    'nonEmptyFields',
    'expectedTotalFields',
    'expectedPercentage',
];

/**
 * Coerce a value read from JSON into a finite number.
 * Numbers pass through, numeric strings are parsed, and anything else
 * (missing, NaN, objects, blank strings) becomes 0 so that `.toFixed()` /
 * `.toString()` in the CSV writer can never throw.
 * @param value Raw value
 * @returns A finite number (0 as fallback)
 */
function toFiniteNumber(value) {
    let num = NaN;
    if (typeof value === 'number') {
        num = value;
    }
    else if (typeof value === 'string' && value.trim() !== '') {
        num = Number(value);
    }
    return Number.isFinite(num) ? num : 0;
}

/**
 * CSV Generator class for processing output directories and creating CSV summaries
 */
class CSVGenerator {
    /**
     * Generate `summary.csv` inside a base folder from the JSON files found
     * in its immediate subdirectories. Files that fail to parse are logged
     * and skipped; they do not abort the run.
     * @param baseFolderPath Path to the base folder
     * @throws Error if the base folder does not exist or the CSV cannot be written
     */
    async generateCSV(baseFolderPath) {
        try {
            console.log(`🔍 Scanning base folder: ${baseFolderPath}`);
            // Validate base folder exists
            if (!fs.existsSync(baseFolderPath)) {
                throw new Error(`Base folder not found: ${baseFolderPath}`);
            }
            // Discover all JSON files in subdirectories
            const jsonFiles = await this.discoverJSONFiles(baseFolderPath);
            console.log(`📁 Found ${jsonFiles.length} JSON files to process`);
            if (jsonFiles.length === 0) {
                console.log('⚠️ No JSON files found in subdirectories');
                return;
            }
            // Extract data from each JSON file
            const csvData = [];
            let processedCount = 0;
            let errorCount = 0;
            for (const filePath of jsonFiles) {
                try {
                    const row = await this.extractDataFromJSON(filePath, baseFolderPath);
                    csvData.push(row);
                    processedCount++;
                    // Progress heartbeat every 10 files
                    if (processedCount % 10 === 0) {
                        console.log(`📊 Processed ${processedCount}/${jsonFiles.length} files...`);
                    }
                }
                catch (error) {
                    console.warn(`⚠️ Error processing ${filePath}: ${error}`);
                    errorCount++;
                }
            }
            console.log(`✅ Successfully processed ${processedCount} files`);
            if (errorCount > 0) {
                console.log(`❌ Failed to process ${errorCount} files`);
            }
            // Write CSV file
            const outputPath = path.join(baseFolderPath, 'summary.csv');
            await this.writeCSV(csvData, outputPath);
            console.log(`📄 CSV summary generated: ${outputPath}`);
            console.log(`📈 Total rows: ${csvData.length}`);
        }
        catch (error) {
            console.error('❌ Error generating CSV:', error);
            throw error;
        }
    }
    /**
     * Discover JSON files located directly inside each immediate
     * subdirectory of `basePath` (one level deep; files in `basePath` itself
     * and in deeper levels are not returned).
     *
     * Scan errors are logged, not thrown. A subdirectory that cannot be read
     * is skipped without affecting the others.
     * @param basePath Base path to scan
     * @returns Array of JSON file paths
     */
    async discoverJSONFiles(basePath) {
        const jsonFiles = [];
        let entries;
        try {
            entries = fs.readdirSync(basePath, { withFileTypes: true });
        }
        catch (error) {
            console.error(`Error scanning directory ${basePath}:`, error);
            return jsonFiles;
        }
        for (const entry of entries) {
            if (!entry.isDirectory()) {
                continue;
            }
            const subdirPath = path.join(basePath, entry.name);
            try {
                // Look for JSON files directly in this subdirectory
                const files = fs.readdirSync(subdirPath, { withFileTypes: true });
                for (const file of files) {
                    if (file.isFile() && file.name.endsWith('.json')) {
                        jsonFiles.push(path.join(subdirPath, file.name));
                    }
                }
            }
            catch (error) {
                // One unreadable subdirectory must not hide the remaining ones.
                console.error(`Error scanning directory ${subdirPath}:`, error);
            }
        }
        return jsonFiles;
    }
    /**
     * Extract data from a JSON file and convert to CSV row format.
     * Reads `tokenUsage`, `metadata` and `metadata.emptinessPercentage`;
     * missing or non-numeric numeric fields become 0 and missing text
     * fields become ''.
     * @param filePath Path to the JSON file
     * @param baseFolderPath Base folder path for relative subdirectory calculation
     * @returns CSV row data
     * @throws Error if the file cannot be read or parsed
     */
    async extractDataFromJSON(filePath, baseFolderPath) {
        try {
            // Read and parse JSON file
            const jsonContent = fs.readFileSync(filePath, 'utf-8');
            const data = JSON.parse(jsonContent);
            // Calculate relative subdirectory path
            const relativePath = path.relative(baseFolderPath, filePath);
            const subdirectory = path.dirname(relativePath);
            // Extract data with fallbacks for missing values
            const tokenUsage = data.tokenUsage || {};
            const metadata = data.metadata || {};
            const emptinessPercentage = metadata.emptinessPercentage || {};
            return {
                subdirectory,
                totalTokens: toFiniteNumber(tokenUsage.totalTokens),
                estimatedCost: toFiniteNumber(tokenUsage.estimatedCost),
                processingTime: toFiniteNumber(metadata.processingTime),
                // String(...) guards against non-string JSON values (e.g. a numeric model id)
                conversionType: String(metadata.conversionType || ''),
                provider: String(metadata.provider || ''),
                model: String(metadata.model || ''),
                emptinessPercentage: toFiniteNumber(emptinessPercentage.percentage),
                totalFields: toFiniteNumber(emptinessPercentage.totalFields),
                nonEmptyFields: toFiniteNumber(emptinessPercentage.nonEmptyFields),
                expectedTotalFields: toFiniteNumber(emptinessPercentage.expectedTotalFields),
                expectedPercentage: toFiniteNumber(emptinessPercentage.expectedPercentage),
            };
        }
        catch (error) {
            throw new Error(`Failed to parse JSON file ${filePath}: ${error}`, { cause: error });
        }
    }
    /**
     * Write CSV data to file: one header row followed by one row per entry,
     * joined with '\n' and no trailing newline.
     * Numeric columns are coerced defensively so rows built by other callers
     * cannot crash the writer.
     * @param data Array of CSV row data
     * @param outputPath Output file path
     * @throws Error if the file cannot be written
     */
    async writeCSV(data, outputPath) {
        try {
            // Header row first, then one line per data row
            const csvLines = [CSV_HEADERS.join(',')];
            for (const row of data) {
                const values = [
                    this.escapeCsvValue(row.subdirectory),
                    toFiniteNumber(row.totalTokens).toString(),
                    toFiniteNumber(row.estimatedCost).toFixed(6),
                    toFiniteNumber(row.processingTime).toFixed(3),
                    this.escapeCsvValue(row.conversionType),
                    this.escapeCsvValue(row.provider),
                    this.escapeCsvValue(row.model),
                    toFiniteNumber(row.emptinessPercentage).toFixed(2),
                    toFiniteNumber(row.totalFields).toString(),
                    toFiniteNumber(row.nonEmptyFields).toString(),
                    toFiniteNumber(row.expectedTotalFields).toString(),
                    toFiniteNumber(row.expectedPercentage).toFixed(2),
                ];
                csvLines.push(values.join(','));
            }
            // Write to file
            const csvContent = csvLines.join('\n');
            fs.writeFileSync(outputPath, csvContent, 'utf-8');
        }
        catch (error) {
            throw new Error(`Failed to write CSV file ${outputPath}: ${error}`, { cause: error });
        }
    }
    /**
     * Escape CSV values that contain commas, quotes, or line breaks (LF or CR)
     * by wrapping them in double quotes and doubling embedded quotes.
     * Non-string input is converted with String(); null/undefined become ''.
     * @param value Value to escape
     * @returns Escaped value
     */
    escapeCsvValue(value) {
        const text = value === undefined || value === null ? '' : String(value);
        if (/[",\r\n]/.test(text)) {
            return `"${text.replace(/"/g, '""')}"`;
        }
        return text;
    }
}
exports.CSVGenerator = CSVGenerator;