UNPKG

@thecodingwhale/cv-processor

Version:

CV Processor to extract structured data from PDF resumes using TypeScript

225 lines (224 loc) 12.6 kB
"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.mergeReports = mergeReports; const fs_1 = __importDefault(require("fs")); const glob_1 = require("glob"); const path_1 = __importDefault(require("path")); async function mergeReports(outputDir) { // Find all report.md files in subdirectories const reportFiles = await (0, glob_1.glob)(`${outputDir}/**/report.md`); if (reportFiles.length === 0) { return 'No report files found'; } const report = { providers: {}, models: {}, allRuns: [], }; // Process each report file for (const reportFile of reportFiles) { const content = fs_1.default.readFileSync(reportFile, 'utf-8'); const dirName = path_1.default.dirname(reportFile); const cvName = path_1.default.basename(dirName).split('_')[0]; // Extract accuracy table from report const accuracyTableMatch = content.match(/## Accuracy Comparison\n\n\|.*\n\|.*\n((?:\|.*\n)*)/); if (!accuracyTableMatch) continue; const accuracyTableRows = accuracyTableMatch[1].trim().split('\n'); // Process each row in the accuracy table for (const row of accuracyTableRows) { const [_, provider, model, accuracy, fieldAccuracy, completeness, structure,] = row.split('|').map((item) => item.trim()); // Extract processing time from the successful executions table const timeMatch = content.match(new RegExp(`\\| ${provider} \\| ${model.replace(/(\(|\))/g, '\\$1')} \\| (\\d+\\.\\d+)`)); const processingTime = timeMatch ? parseFloat(timeMatch[2]) : 0; // Find the output file const outputFileMatch = content.match(new RegExp(`\\| ${provider} \\| ${model.replace(/(\(|\))/g, '\\$1')} \\| ${processingTime.toFixed(2)} \\| \\d+% \\| \\[View\\]\\(\\.\\/(.+?)\\)`)); const outputFile = outputFileMatch ? outputFileMatch[1] : ''; const accuracyValue = parseInt(accuracy.replace('%', '')) / 100; const fieldAccuracyValue = fieldAccuracy !== '-' ? parseInt(fieldAccuracy.replace('%', '')) / 100 : undefined; const completenessValue = completeness !== '-' ? parseInt(completeness.replace('%', '')) / 100 : undefined; const structureValue = structure !== '-' ? parseInt(structure.replace('%', '')) / 100 : undefined; // Skip if we can't parse accuracy if (isNaN(accuracyValue)) continue; // Update provider metrics const providerKey = provider; if (!report.providers[providerKey]) { report.providers[providerKey] = { provider, model: 'Various', processingTime: 0, accuracy: 0, fieldAccuracy: 0, completeness: 0, structure: 0, count: 0, successRate: 0, files: [], }; } report.providers[providerKey].processingTime += processingTime; report.providers[providerKey].accuracy += accuracyValue; if (fieldAccuracyValue) report.providers[providerKey].fieldAccuracy += fieldAccuracyValue; if (completenessValue) report.providers[providerKey].completeness += completenessValue; if (structureValue) report.providers[providerKey].structure += structureValue; report.providers[providerKey].count += 1; report.providers[providerKey].files.push(path_1.default.join(dirName, outputFile)); // Update model metrics const modelKey = `${provider}_${model}`; if (!report.models[modelKey]) { report.models[modelKey] = { provider, model, processingTime: 0, accuracy: 0, fieldAccuracy: 0, completeness: 0, structure: 0, count: 0, successRate: 0, files: [], }; } report.models[modelKey].processingTime += processingTime; report.models[modelKey].accuracy += accuracyValue; if (fieldAccuracyValue) report.models[modelKey].fieldAccuracy += fieldAccuracyValue; if (completenessValue) report.models[modelKey].completeness += completenessValue; if (structureValue) report.models[modelKey].structure += structureValue; report.models[modelKey].count += 1; report.models[modelKey].files.push(path_1.default.join(dirName, outputFile)); // Add to all runs report.allRuns.push({ cvName, provider, model, processingTime, accuracy: accuracyValue, fieldAccuracy: fieldAccuracyValue, completeness: completenessValue, structure: structureValue, outputFile: path_1.default.join(dirName, outputFile), }); } // Extract success rate from the summary for (const provider of Object.values(report.providers)) { // Get total number of executions for this provider const totalExecutionsMatch = content.match(/- \*\*Total Providers\*\*: (\d+)/); const successfulExecutionsMatch = content.match(/- \*\*Successful\*\*: (\d+)/); if (totalExecutionsMatch && successfulExecutionsMatch) { const total = parseInt(totalExecutionsMatch[1]); const successful = parseInt(successfulExecutionsMatch[1]); provider.successRate = successful / total; } } } // Calculate averages for (const provider of Object.values(report.providers)) { provider.processingTime = provider.processingTime / provider.count; provider.accuracy = provider.accuracy / provider.count; if (provider.fieldAccuracy) provider.fieldAccuracy = provider.fieldAccuracy / provider.count; if (provider.completeness) provider.completeness = provider.completeness / provider.count; if (provider.structure) provider.structure = provider.structure / provider.count; } for (const model of Object.values(report.models)) { model.processingTime = model.processingTime / model.count; model.accuracy = model.accuracy / model.count; if (model.fieldAccuracy) model.fieldAccuracy = model.fieldAccuracy / model.count; if (model.completeness) model.completeness = model.completeness / model.count; if (model.structure) model.structure = model.structure / model.count; } // Generate markdown report return generateMarkdownReport(report); } function generateMarkdownReport(report) { const sortedProviders = Object.values(report.providers).sort((a, b) => b.accuracy - a.accuracy); const sortedModels = Object.values(report.models).sort((a, b) => b.accuracy - a.accuracy); const fastestProviders = [...Object.values(report.providers)].sort((a, b) => a.processingTime - b.processingTime); const fastestModels = [...Object.values(report.models)].sort((a, b) => a.processingTime - b.processingTime); // Calculate a combined score (weighted average of accuracy and speed) // Higher is better const combinedScore = (metrics) => { // Normalize processing time to a 0-1 scale (reversed, so faster is better) const maxTime = Math.max(...Object.values(report.models).map((m) => m.processingTime)); const normalizedTime = 1 - metrics.processingTime / maxTime; // Weight accuracy more heavily than speed return metrics.accuracy * 0.7 + normalizedTime * 0.3; }; const bestOverallProviders = [...Object.values(report.providers)].sort((a, b) => combinedScore(b) - combinedScore(a)); const bestOverallModels = [...Object.values(report.models)].sort((a, b) => combinedScore(b) - combinedScore(a)); let markdown = `# Merged CV Processing Report\n\n`; markdown += `**Date**: ${new Date().toISOString().split('T')[0]}\n`; markdown += `**Total CV Samples**: ${new Set(report.allRuns.map((r) => r.cvName)).size}\n`; markdown += `**Total Runs Analyzed**: ${report.allRuns.length}\n\n`; markdown += `## Best Providers by Accuracy\n\n`; markdown += `| Provider | Avg Accuracy | Avg Field Accuracy | Avg Completeness | Avg Structure | Runs |\n`; markdown += `|----------|-------------|-------------------|-----------------|--------------|------|\n`; for (const provider of sortedProviders) { markdown += `| ${provider.provider} | ${(provider.accuracy * 100).toFixed(1)}% | ${provider.fieldAccuracy ? (provider.fieldAccuracy * 100).toFixed(1) : '-'}% | ${provider.completeness ? (provider.completeness * 100).toFixed(1) : '-'}% | ${provider.structure ? (provider.structure * 100).toFixed(1) : '-'}% | ${provider.count} |\n`; } markdown += `\n## Best Models by Accuracy\n\n`; markdown += `| Provider | Model | Avg Accuracy | Avg Field Accuracy | Avg Completeness | Avg Structure | Runs |\n`; markdown += `|----------|-------|-------------|-------------------|-----------------|--------------|------|\n`; for (const model of sortedModels) { markdown += `| ${model.provider} | ${model.model} | ${(model.accuracy * 100).toFixed(1)}% | ${model.fieldAccuracy ? (model.fieldAccuracy * 100).toFixed(1) : '-'}% | ${model.completeness ? (model.completeness * 100).toFixed(1) : '-'}% | ${model.structure ? (model.structure * 100).toFixed(1) : '-'}% | ${model.count} |\n`; } markdown += `\n## Fastest Providers\n\n`; markdown += `| Provider | Avg Processing Time (s) | Runs |\n`; markdown += `|----------|--------------------------|------|\n`; for (const provider of fastestProviders) { markdown += `| ${provider.provider} | ${provider.processingTime.toFixed(2)} | ${provider.count} |\n`; } markdown += `\n## Fastest Models\n\n`; markdown += `| Provider | Model | Avg Processing Time (s) | Runs |\n`; markdown += `|----------|-------|--------------------------|------|\n`; for (const model of fastestModels) { markdown += `| ${model.provider} | ${model.model} | ${model.processingTime.toFixed(2)} | ${model.count} |\n`; } markdown += `\n## Best Overall (Combined Accuracy & Speed)\n\n`; markdown += `| Provider | Model | Accuracy | Processing Time (s) | Combined Score |\n`; markdown += `|----------|-------|----------|---------------------|---------------|\n`; for (const model of bestOverallModels.slice(0, 5)) { markdown += `| ${model.provider} | ${model.model} | ${(model.accuracy * 100).toFixed(1)}% | ${model.processingTime.toFixed(2)} | ${combinedScore(model).toFixed(2)} |\n`; } markdown += `\n## Recommendations\n\n`; // Best overall model const bestModel = bestOverallModels[0]; markdown += `### Best Overall Model\n`; markdown += `**${bestModel.provider} (${bestModel.model})** with ${(bestModel.accuracy * 100).toFixed(1)}% accuracy and ${bestModel.processingTime.toFixed(2)}s average processing time.\n\n`; // Best for accuracy const bestAccuracyModel = sortedModels[0]; markdown += `### Best for Accuracy\n`; markdown += `**${bestAccuracyModel.provider} (${bestAccuracyModel.model})** with ${(bestAccuracyModel.accuracy * 100).toFixed(1)}% accuracy.\n\n`; // Best for speed const bestSpeedModel = fastestModels[0]; markdown += `### Best for Speed\n`; markdown += `**${bestSpeedModel.provider} (${bestSpeedModel.model})** with ${bestSpeedModel.processingTime.toFixed(2)}s average processing time.\n\n`; markdown += `## All Runs\n\n`; markdown += `| CV | Provider | Model | Accuracy | Processing Time (s) |\n`; markdown += `|----|----------|-------|----------|---------------------|\n`; for (const run of report.allRuns) { markdown += `| ${run.cvName} | ${run.provider} | ${run.model} | ${(run.accuracy * 100).toFixed(1)}% | ${run.processingTime.toFixed(2)} |\n`; } return markdown; }