UNPKG

@thecodingwhale/cv-processor

Version:

CV Processor to extract structured data from PDF resumes using TypeScript

254 lines (224 loc) 7.51 kB
import * as fs from 'fs'
import * as path from 'path'

/**
 * One row of the generated `summary.csv`.
 * Field names double as the CSV column headers (see `writeCSV`).
 */
interface CSVRow {
  subdirectory: string
  totalTokens: number
  estimatedCost: number
  processingTime: number
  conversionType: string
  provider: string
  model: string
  emptinessPercentage: number
  totalFields: number
  nonEmptyFields: number
  expectedTotalFields: number
  expectedPercentage: number
}

/**
 * The subset of a processed-CV JSON file that the summary reads.
 * Every field is optional; missing values fall back to `0` or `''`.
 */
interface ProcessedCVData {
  tokenUsage?: {
    totalTokens?: number
    estimatedCost?: number
  }
  metadata?: {
    processingTime?: number
    conversionType?: string
    provider?: string
    model?: string
    emptinessPercentage?: {
      percentage?: number
      totalFields?: number
      nonEmptyFields?: number
      expectedTotalFields?: number
      expectedPercentage?: number
    }
  }
}

/**
 * Coerce an untrusted JSON value to a finite number.
 * Finite numbers pass through, numeric strings are parsed, and anything else
 * (missing, NaN, objects, non-numeric text) becomes `0`, so that `toFixed` /
 * `toString` in `writeCSV` can never throw on a malformed input file.
 */
function toFiniteNumber(value: unknown): number {
  if (typeof value === 'number') {
    return Number.isFinite(value) ? value : 0
  }
  if (typeof value === 'string' && value.trim() !== '') {
    const parsed = Number(value)
    return Number.isFinite(parsed) ? parsed : 0
  }
  return 0
}

/**
 * Coerce an untrusted JSON value to text.
 * Strings pass through, numbers are stringified, and anything else becomes `''`.
 */
function toText(value: unknown): string {
  if (typeof value === 'string') {
    return value
  }
  if (typeof value === 'number') {
    return String(value)
  }
  return ''
}

/**
 * CSV Generator class for processing output directories and creating CSV summaries
 */
export class CSVGenerator {
  /**
   * Generate `summary.csv` inside a base folder from the JSON files found in
   * its immediate subdirectories. Files that cannot be read or parsed are
   * logged and skipped; they do not abort the run.
   *
   * @param baseFolderPath Path to the base folder
   * @throws Error if the base folder does not exist or the CSV cannot be written
   */
  async generateCSV(baseFolderPath: string): Promise<void> {
    try {
      console.log(`🔍 Scanning base folder: ${baseFolderPath}`)

      // Validate base folder exists
      if (!fs.existsSync(baseFolderPath)) {
        throw new Error(`Base folder not found: ${baseFolderPath}`)
      }

      // Discover all JSON files in subdirectories
      const jsonFiles = await this.discoverJSONFiles(baseFolderPath)
      console.log(`📁 Found ${jsonFiles.length} JSON files to process`)

      if (jsonFiles.length === 0) {
        console.log('⚠️ No JSON files found in subdirectories')
        return
      }

      // Extract data from each JSON file
      const csvData: CSVRow[] = []
      let processedCount = 0
      let errorCount = 0

      for (const filePath of jsonFiles) {
        try {
          const row = await this.extractDataFromJSON(filePath, baseFolderPath)
          csvData.push(row)
          processedCount++

          if (processedCount % 10 === 0) {
            console.log(
              `📊 Processed ${processedCount}/${jsonFiles.length} files...`
            )
          }
        } catch (error: unknown) {
          // A single bad file is reported and counted, not fatal.
          console.warn(`⚠️ Error processing ${filePath}: ${error}`)
          errorCount++
        }
      }

      console.log(`✅ Successfully processed ${processedCount} files`)
      if (errorCount > 0) {
        console.log(`❌ Failed to process ${errorCount} files`)
      }

      // Write CSV file
      const outputPath = path.join(baseFolderPath, 'summary.csv')
      await this.writeCSV(csvData, outputPath)

      console.log(`📄 CSV summary generated: ${outputPath}`)
      console.log(`📈 Total rows: ${csvData.length}`)
    } catch (error: unknown) {
      console.error('❌ Error generating CSV:', error)
      throw error
    }
  }

  /**
   * Discover JSON files located directly inside the immediate subdirectories
   * of `basePath`. The scan is one level deep: files in `basePath` itself and
   * in nested sub-subdirectories are not returned.
   *
   * Scanning is best-effort: a directory that cannot be read is logged and
   * skipped, and whatever was found elsewhere is still returned.
   *
   * @param basePath Base path to scan
   * @returns Array of JSON file paths
   */
  private async discoverJSONFiles(basePath: string): Promise<string[]> {
    const jsonFiles: string[] = []

    let entries: fs.Dirent[]
    try {
      entries = await fs.promises.readdir(basePath, { withFileTypes: true })
    } catch (error: unknown) {
      console.error(`Error scanning directory ${basePath}:`, error)
      return jsonFiles
    }

    for (const entry of entries) {
      if (!entry.isDirectory()) {
        continue
      }

      const subdirPath = path.join(basePath, entry.name)
      try {
        // Look for JSON files directly in this subdirectory
        const files = await fs.promises.readdir(subdirPath, {
          withFileTypes: true,
        })
        for (const file of files) {
          if (file.isFile() && file.name.endsWith('.json')) {
            jsonFiles.push(path.join(subdirPath, file.name))
          }
        }
      } catch (error: unknown) {
        // One unreadable subdirectory must not hide the others.
        console.error(`Error scanning directory ${subdirPath}:`, error)
      }
    }

    return jsonFiles
  }

  /**
   * Read one JSON file and convert it to a CSV row. Missing or wrongly-typed
   * fields fall back to `0` (numeric columns) or `''` (text columns).
   *
   * @param filePath Path to the JSON file
   * @param baseFolderPath Base folder path for relative subdirectory calculation
   * @returns CSV row data
   * @throws Error if the file cannot be read, is not valid JSON, or its
   *   top-level value is not an object
   */
  private async extractDataFromJSON(
    filePath: string,
    baseFolderPath: string
  ): Promise<CSVRow> {
    try {
      // Read and parse JSON file
      const jsonContent = await fs.promises.readFile(filePath, 'utf-8')
      const parsed: unknown = JSON.parse(jsonContent)
      if (typeof parsed !== 'object' || parsed === null) {
        throw new Error('expected a JSON object at the top level')
      }
      // The declared shape is only a hint; each field is coerced below.
      const data = parsed as ProcessedCVData

      // Calculate relative subdirectory path
      const relativePath = path.relative(baseFolderPath, filePath)
      const subdirectory = path.dirname(relativePath)

      // Extract data with fallbacks for missing values
      const tokenUsage = data.tokenUsage ?? {}
      const metadata = data.metadata ?? {}
      const emptiness = metadata.emptinessPercentage ?? {}

      return {
        subdirectory,
        totalTokens: toFiniteNumber(tokenUsage.totalTokens),
        estimatedCost: toFiniteNumber(tokenUsage.estimatedCost),
        processingTime: toFiniteNumber(metadata.processingTime),
        conversionType: toText(metadata.conversionType),
        provider: toText(metadata.provider),
        model: toText(metadata.model),
        emptinessPercentage: toFiniteNumber(emptiness.percentage),
        totalFields: toFiniteNumber(emptiness.totalFields),
        nonEmptyFields: toFiniteNumber(emptiness.nonEmptyFields),
        expectedTotalFields: toFiniteNumber(emptiness.expectedTotalFields),
        expectedPercentage: toFiniteNumber(emptiness.expectedPercentage),
      }
    } catch (error: unknown) {
      throw new Error(`Failed to parse JSON file ${filePath}: ${error}`)
    }
  }

  /**
   * Write CSV data to file: one header line followed by one line per row,
   * joined with `\n` and without a trailing newline.
   *
   * @param data Array of CSV row data
   * @param outputPath Output file path
   * @throws Error if the file cannot be written
   */
  private async writeCSV(data: CSVRow[], outputPath: string): Promise<void> {
    try {
      // Define CSV headers
      const headers = [
        'subdirectory',
        'totalTokens',
        'estimatedCost',
        'processingTime',
        'conversionType',
        'provider',
        'model',
        'emptinessPercentage',
        'totalFields',
        'nonEmptyFields',
        'expectedTotalFields',
        'expectedPercentage',
      ]

      // Header row first, then one line per data row (same column order).
      const csvLines: string[] = [headers.join(',')]

      for (const row of data) {
        const values = [
          this.escapeCsvValue(row.subdirectory),
          row.totalTokens.toString(),
          row.estimatedCost.toFixed(6),
          row.processingTime.toFixed(3),
          this.escapeCsvValue(row.conversionType),
          this.escapeCsvValue(row.provider),
          this.escapeCsvValue(row.model),
          row.emptinessPercentage.toFixed(2),
          row.totalFields.toString(),
          row.nonEmptyFields.toString(),
          row.expectedTotalFields.toString(),
          row.expectedPercentage.toFixed(2),
        ]
        csvLines.push(values.join(','))
      }

      // Write to file
      await fs.promises.writeFile(outputPath, csvLines.join('\n'), 'utf-8')
    } catch (error: unknown) {
      throw new Error(`Failed to write CSV file ${outputPath}: ${error}`)
    }
  }

  /**
   * Escape a CSV field: values containing a comma, double quote, carriage
   * return, or newline are wrapped in double quotes with embedded quotes
   * doubled. Other values are returned unchanged.
   *
   * @param value Value to escape
   * @returns Escaped value
   */
  private escapeCsvValue(value: string): string {
    if (/[",\r\n]/.test(value)) {
      return `"${value.replace(/"/g, '""')}"`
    }
    return value
  }
}