UNPKG

@thecodingwhale/cv-processor

Version:

CV Processor to extract structured data from PDF resumes using TypeScript

320 lines (276 loc) 11.4 kB
import * as fs from 'fs'
import { glob } from 'glob'
import * as path from 'path'
import { CVData } from '../types'

/** Provider/model identification together with the CV data it produced. */
interface ExecutionData {
  provider: string
  model: string
  cvData: CVData
}

/** One result file loaded from the output directory. */
interface LoadedExecution extends ExecutionData {
  /** Base name of the JSON file the data was read from. */
  file: string
  /** `metadata.processingTime` of the file, or 0 when it is missing. */
  time: number
}

/** A run that ended with an error message instead of CV data. */
interface FailedExecution {
  provider: string
  model: string
  error: string
}

/** Placeholder printed in table cells when a metric is missing. */
const NOT_AVAILABLE = 'N/A'

/** Maximum number of characters of an error message shown in the report. */
const MAX_ERROR_LENGTH = 50

/**
 * Format an estimated cost with four decimals.
 * A missing cost and a cost of 0 are both rendered as "N/A".
 */
function formatCost(cost: number | null | undefined): string {
  return cost ? `$${cost.toFixed(4)}` : NOT_AVAILABLE
}

/** Overall accuracy recorded in the metadata, or 0 when none is recorded. */
function overallAccuracy(execution: LoadedExecution): number {
  return execution.cvData.metadata?.accuracy?.overall ?? 0
}

/** Total token count recorded in the metadata, or 0 when none is recorded. */
function totalTokens(execution: LoadedExecution): number {
  return execution.cvData.metadata?.tokenUsage?.totalTokens ?? 0
}

/**
 * Read and parse every JSON file, keeping only those that carry `metadata`.
 * Files that cannot be read or parsed are logged and skipped so that one
 * broken file does not prevent the report from being generated.
 */
function loadExecutions(jsonFiles: readonly string[]): LoadedExecution[] {
  const executions: LoadedExecution[] = []

  for (const file of jsonFiles) {
    try {
      const data: CVData = JSON.parse(fs.readFileSync(file, 'utf8'))

      // Skip files without proper metadata
      if (!data.metadata) continue

      executions.push({
        cvData: data,
        file: path.basename(file),
        provider: data.metadata.provider || 'unknown',
        model: data.metadata.model || 'default',
        time: data.metadata.processingTime || 0,
      })
    } catch (error) {
      console.error(`Error loading data file ${file}:`, error)
    }
  }

  return executions
}

/** Title block: CV name and date (both derived from the directory name) and total time. */
function renderHeader(outputDir: string, totalTime: number): string {
  const dirName = path.basename(outputDir)

  // The CV name is whatever precedes the first underscore of the directory name
  const cvName = dirName.split('_')[0]

  // Use a YYYY-MM-DD date embedded in the directory name, otherwise today (UTC)
  const dateMatch = dirName.match(/(\d{4}-\d{2}-\d{2})/)
  const date = dateMatch ? dateMatch[1] : new Date().toISOString().split('T')[0]

  return (
    `# CV Processing Report\n\n` +
    `**CV**: ${cvName}.pdf\n` +
    `**Date**: ${date}\n` +
    `**Total Execution Time**: ${totalTime.toFixed(2)} seconds\n\n`
  )
}

/** Summary list with success/failure counts and the success rate. */
function renderSummary(successCount: number, failedCount: number): string {
  const totalProviders = successCount + failedCount
  const successRate =
    totalProviders > 0 ? (successCount / totalProviders) * 100 : 0

  return (
    `## Summary\n\n` +
    `- **Total Providers**: ${totalProviders}\n` +
    `- **Successful**: ${successCount}\n` +
    `- **Failed**: ${failedCount}\n` +
    // Blank line after the list, like every other section of the report
    `- **Success Rate**: ${successRate.toFixed(1)}%\n\n`
  )
}

/** Main table: one row per successful execution with a link to its output file. */
function renderSuccessfulTable(successful: readonly LoadedExecution[]): string {
  const rows = successful.map((execution) => {
    const tokenUsage = execution.cvData.metadata?.tokenUsage
    const tokenCount = tokenUsage?.totalTokens ?? NOT_AVAILABLE

    return `| ${execution.provider} | ${execution.model} | ${execution.time.toFixed(
      2
    )} | ${overallAccuracy(execution)}% | ${tokenCount} | ${formatCost(
      tokenUsage?.estimatedCost
    )} | [View](./${execution.file}) |\n`
  })

  return (
    `## Successful Executions\n\n` +
    `| Provider | Model | Time (s) | Accuracy | Token Usage | Est. Cost | Output File |\n` +
    `|----------|-------|----------|----------|-------------|-----------|-------------|\n` +
    rows.join('') +
    '\n'
  )
}

/** Table of failed executions; empty string when there are none. */
function renderFailedTable(failed: readonly FailedExecution[]): string {
  if (failed.length === 0) return ''

  const rows = failed.map((execution) => {
    // Only mark the message as truncated when something was actually cut off
    const errorMessage =
      execution.error.length > MAX_ERROR_LENGTH
        ? execution.error.substring(0, MAX_ERROR_LENGTH) + '...'
        : execution.error
    return `| ${execution.provider} | ${execution.model} | ${errorMessage} |\n`
  })

  return (
    `## Failed Executions\n\n` +
    `| Provider | Model | Error |\n` +
    `|----------|-------|-------|\n` +
    rows.join('') +
    '\n'
  )
}

/** Fastest, slowest and average processing time of the successful executions. */
function renderPerformance(successful: readonly LoadedExecution[]): string {
  const heading = `## Performance Comparison\n\n`

  const [first, ...rest] = successful
  if (first === undefined) {
    // Nothing to compare; avoid dereferencing a missing fastest/slowest entry
    return heading + `_No successful executions to compare._\n\n`
  }

  let fastest = first
  let slowest = first
  let successfulTime = first.time
  for (const execution of rest) {
    // Strict comparisons keep the earliest entry on ties
    if (execution.time < fastest.time) fastest = execution
    if (execution.time > slowest.time) slowest = execution
    successfulTime += execution.time
  }

  // Average over the executions that are actually listed, not over every loaded file
  const avgTime = successfulTime / successful.length

  return (
    heading +
    `- **Fastest**: ${fastest.provider} (${fastest.model}) - ${fastest.time.toFixed(2)}s\n` +
    `- **Slowest**: ${slowest.provider} (${slowest.model}) - ${slowest.time.toFixed(2)}s\n` +
    `- **Average Time**: ${avgTime.toFixed(2)}s\n\n`
  )
}

/** Table comparing accuracy, token usage and cost per execution. */
function renderAccuracyComparison(
  successful: readonly LoadedExecution[]
): string {
  const rows = successful.map((execution) => {
    const tokenUsage = execution.cvData.metadata?.tokenUsage
    return `| ${execution.provider} | ${execution.model} | ${overallAccuracy(
      execution
    )}% | ${tokenUsage?.totalTokens ?? NOT_AVAILABLE} | ${formatCost(
      tokenUsage?.estimatedCost
    )} |\n`
  })

  return (
    `### Accuracy Comparison\n\n` +
    `| Provider | Model | Accuracy | Token Usage | Cost |\n` +
    `|----------|-------|----------|-------------|------|\n` +
    rows.join('')
  )
}

/** Table comparing input/output/total tokens and cost per execution. */
function renderTokenUsageComparison(
  successful: readonly LoadedExecution[]
): string {
  const rows = successful.map((execution) => {
    const tokenUsage = execution.cvData.metadata?.tokenUsage
    return `| ${execution.provider} | ${execution.model} | ${
      tokenUsage?.inputTokens ?? NOT_AVAILABLE
    } | ${tokenUsage?.outputTokens ?? NOT_AVAILABLE} | ${
      tokenUsage?.totalTokens ?? NOT_AVAILABLE
    } | ${formatCost(tokenUsage?.estimatedCost)} |\n`
  })

  return (
    `\n### Token Usage Comparison\n\n` +
    `| Provider | Model | Input Tokens | Output Tokens | Total Tokens | Cost |\n` +
    `|----------|-------|--------------|---------------|--------------|------|\n` +
    rows.join('')
  )
}

/** Table comparing populated/total field counts and the emptiness percentage. */
function renderEmptinessComparison(
  successful: readonly LoadedExecution[]
): string {
  const rows = successful.map((execution) => {
    const emptiness = execution.cvData.metadata?.emptinessPercentage
    const percentage = emptiness?.percentage ?? NOT_AVAILABLE
    const nonEmptyFields = emptiness?.nonEmptyFields ?? NOT_AVAILABLE
    const totalFields = emptiness?.totalFields ?? NOT_AVAILABLE

    // NOTE(review): the value is scaled by 100 here but printed unscaled in
    // "Accuracy Details"; one of the two is presumably wrong — confirm the
    // scale of `emptinessPercentage.percentage` against its producer.
    const shown =
      typeof percentage === 'number'
        ? `${(percentage * 100).toFixed(1)}%`
        : percentage

    return `| ${execution.provider} | ${
      execution.model || 'default'
    } | ${nonEmptyFields} | ${totalFields} | ${shown} |\n`
  })

  return (
    `\n### Field Emptiness Comparison\n\n` +
    `| Provider | Model | Populated Fields | Total Fields | Emptiness % |\n` +
    `|----------|-------|-----------------|--------------|------------|\n` +
    rows.join('')
  )
}

/** Per-execution accuracy breakdown, in the order given by the caller. */
function renderAccuracyDetails(
  sortedByAccuracy: readonly LoadedExecution[]
): string {
  let section = `\n### Accuracy Details\n\n`

  for (const execution of sortedByAccuracy) {
    const accuracy = execution.cvData.metadata?.accuracy
    const emptiness = execution.cvData.metadata?.emptinessPercentage
    if (!accuracy) continue

    section += `#### ${execution.provider} (${execution.model})\n`
    section += `- Overall Accuracy: ${accuracy.overall}%\n`
    if (accuracy.fieldAccuracy) {
      section += `- Field Accuracy: ${accuracy.fieldAccuracy}%\n`
    }
    section += `- Completeness: ${accuracy.completeness}%\n`
    if (accuracy.structuralFidelity) {
      section += `- Structural Fidelity: ${accuracy.structuralFidelity}%\n`
    }

    // Add emptiness percentage information if available
    if (emptiness) {
      section += `- Field Emptiness: ${emptiness.percentage}% (${emptiness.nonEmptyFields}/${emptiness.totalFields} fields populated)\n`
    }

    if (accuracy.missingFields?.length) {
      section += `- Missing Fields: ${accuracy.missingFields.join(', ')}\n`
    }

    section += '\n'
  }

  return section
}

/**
 * Per-execution token breakdown, ordered by total token count (fewest first).
 * Executions without token usage metadata are left out.
 */
function renderTokenUsageDetails(
  successful: readonly LoadedExecution[]
): string {
  const sortedByTokens = successful
    .filter((execution) => Boolean(execution.cvData.metadata?.tokenUsage))
    .sort((a, b) => totalTokens(a) - totalTokens(b))

  let section = `\n### Token Usage Details\n\n`

  for (const execution of sortedByTokens) {
    const tokenUsage = execution.cvData.metadata?.tokenUsage
    if (!tokenUsage) continue

    section += `#### ${execution.provider} (${execution.model})\n`
    section += `- Input Tokens: ${tokenUsage.inputTokens}\n`
    section += `- Output Tokens: ${tokenUsage.outputTokens}\n`
    section += `- Total Tokens: ${tokenUsage.totalTokens}\n`
    if (tokenUsage.estimatedCost) {
      section += `- Estimated Cost: $${tokenUsage.estimatedCost.toFixed(4)}\n`
    }
    section += '\n'
  }

  return section
}

/** Closing summary naming the execution with the highest overall accuracy. */
function renderBestAccuracy(
  sortedByAccuracy: readonly LoadedExecution[]
): string {
  const best = sortedByAccuracy[0]
  const accuracy = best?.cvData.metadata?.accuracy
  if (!best || !accuracy) return ''

  return (
    `\n### Best Accuracy\n` +
    `Provider: ${best.provider}\n` +
    `Model: ${best.model}\n` +
    `Accuracy: ${accuracy.overall}%\n\n`
  )
}

/**
 * ReportGenerator class is responsible for generating markdown reports
 * from CV processing results, including token usage metrics.
 */
export class ReportGenerator {
  /**
   * Generate a markdown report for a specific output directory.
   *
   * Every `*.json` file directly inside `outputDir` is parsed; files without
   * `metadata` are ignored, and files whose metadata has no `accuracy` are
   * loaded but not listed as successful executions.
   *
   * @param outputDir - Directory that holds the JSON result files
   * @param verbose - When true, logs the directory being processed
   * @returns The markdown report, or a short message when there are no JSON
   * files, no valid data files, or an unexpected error occurred (this method
   * does not reject)
   */
  public static async generateReport(
    outputDir: string,
    verbose: boolean = false
  ): Promise<string> {
    if (verbose) {
      console.log(`Generating report for ${outputDir}`)
    }

    try {
      // Find all JSON files in the directory
      const jsonFiles = await glob(`${outputDir}/*.json`)

      if (jsonFiles.length === 0) {
        return 'No JSON files found'
      }

      const executions = loadExecutions(jsonFiles)

      if (executions.length === 0) {
        return 'No valid data files found'
      }

      // Total time covers every loaded file, successful or not
      const totalTime = executions.reduce((sum, data) => sum + data.time, 0)

      const successful = executions.filter((data) =>
        Boolean(data.cvData.metadata?.accuracy)
      )

      // For now we don't have failed executions in our data structure
      // If needed, we can detect them based on missing accuracy or other criteria
      const failed: FailedExecution[] = []

      // Best accuracy first; shared by the details section and the final summary
      const sortedByAccuracy = [...successful].sort(
        (a, b) => overallAccuracy(b) - overallAccuracy(a)
      )

      return [
        renderHeader(outputDir, totalTime),
        renderSummary(successful.length, failed.length),
        renderSuccessfulTable(successful),
        renderFailedTable(failed),
        renderPerformance(successful),
        renderAccuracyComparison(successful),
        renderTokenUsageComparison(successful),
        renderEmptinessComparison(successful),
        renderAccuracyDetails(sortedByAccuracy),
        renderTokenUsageDetails(successful),
        renderBestAccuracy(sortedByAccuracy),
      ].join('')
    } catch (error) {
      console.error(`Error generating report: ${error}`)
      return `Error generating report: ${error}`
    }
  }

  /**
   * Save the report as `report.md` inside `outputDir`.
   * Write errors are logged, not thrown.
   *
   * @param report - Markdown text to write
   * @param outputDir - Directory that receives `report.md`
   */
  public static saveReport(report: string, outputDir: string): void {
    try {
      const reportPath = path.join(outputDir, 'report.md')
      fs.writeFileSync(reportPath, report)
      console.log(`Report saved to ${reportPath}`)
    } catch (error) {
      console.error(`Error saving report: ${error}`)
    }
  }

  /**
   * Generate and save a report for a specific directory.
   *
   * @param outputDir - Directory that holds the JSON result files and receives `report.md`
   * @param verbose - Forwarded to {@link ReportGenerator.generateReport}
   */
  public static async generateAndSaveReport(
    outputDir: string,
    verbose: boolean = false
  ): Promise<void> {
    const report = await this.generateReport(outputDir, verbose)
    this.saveReport(report, outputDir)
  }
}