/*
 * Package: @thecodingwhale/cv-processor
 * Version: (not specified)
 * Description: CV Processor to extract structured data from PDF resumes using TypeScript
 * File info: 225 lines (224 loc), 12.6 kB, JavaScript (compiled output)
 */
;
// Interop shim for default imports: reuses a previously installed
// `this.__importDefault` when one exists, otherwise falls back to the
// implementation below.
var __importDefault = (this && this.__importDefault) || function (mod) {
    // Modules flagged with `__esModule` are passed through untouched.
    if (mod && mod.__esModule) {
        return mod;
    }
    // Anything else is wrapped so that callers can always read `.default`.
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.mergeReports = mergeReports;
const fs_1 = __importDefault(require("fs"));
const glob_1 = require("glob");
const path_1 = __importDefault(require("path"));
/** Metrics that an accuracy-table row may omit (rendered as `-` in the report). */
const OPTIONAL_METRICS = ['fieldAccuracy', 'completeness', 'structure'];

/**
 * Escapes every regular-expression metacharacter in `text` so it can be
 * embedded in a `RegExp` source and match literally.
 *
 * @param {string} text - Raw text taken from a report table cell.
 * @returns {string} The text with metacharacters backslash-escaped.
 */
function escapeRegExpLiteral(text) {
    return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Converts a table cell such as `87%` into a fraction (`0.87`).
 *
 * @param {string | undefined} cell - Trimmed cell text; may be missing or `-`.
 * @returns {number | undefined} The fraction, or `undefined` when the cell is
 *   absent, is the `-` placeholder, or does not start with an integer.
 */
function parsePercentCell(cell) {
    if (cell === undefined || cell === '-') {
        return undefined;
    }
    const percent = Number.parseInt(cell.replace('%', ''), 10);
    return Number.isNaN(percent) ? undefined : percent / 100;
}

/**
 * Creates a zeroed accumulator for one provider or one provider/model pair.
 *
 * @param {string} provider - Provider name.
 * @param {string} model - Model name (or a placeholder such as `Various`).
 * @returns {object} A metrics entry with all sums and counters set to zero.
 */
function createMetricsEntry(provider, model) {
    return {
        provider,
        model,
        processingTime: 0,
        accuracy: 0,
        fieldAccuracy: 0,
        completeness: 0,
        structure: 0,
        count: 0,
        successRate: 0,
        files: [],
    };
}

/**
 * Adds one run to a metrics entry and records which optional metrics the run
 * actually supplied, so that averages can later use the right denominator.
 *
 * @param {object} entry - Accumulator created by `createMetricsEntry`.
 * @param {Map<object, object>} sampleCounts - Per-entry counts of optional metric samples.
 * @param {object} run - Parsed run (see the objects pushed to `allRuns`).
 */
function accumulateRun(entry, sampleCounts, run) {
    entry.processingTime += run.processingTime;
    entry.accuracy += run.accuracy;
    entry.count += 1;
    entry.files.push(run.outputFile);
    let counts = sampleCounts.get(entry);
    if (!counts) {
        counts = { fieldAccuracy: 0, completeness: 0, structure: 0 };
        sampleCounts.set(entry, counts);
    }
    for (const metric of OPTIONAL_METRICS) {
        if (run[metric] !== undefined) {
            entry[metric] += run[metric];
            counts[metric] += 1;
        }
    }
}

/**
 * Turns the accumulated sums of each entry into averages, in place.
 * Optional metrics are divided by the number of runs that reported them
 * (not by the total run count); metrics with no samples stay at 0.
 *
 * @param {object[]} entries - Metrics entries to finalize.
 * @param {Map<object, object>} sampleCounts - Counts recorded by `accumulateRun`.
 */
function finalizeAverages(entries, sampleCounts) {
    for (const entry of entries) {
        entry.processingTime /= entry.count;
        entry.accuracy /= entry.count;
        const counts = sampleCounts.get(entry);
        for (const metric of OPTIONAL_METRICS) {
            if (counts[metric] > 0) {
                entry[metric] /= counts[metric];
            }
        }
    }
}

/**
 * Merges every `report.md` found beneath `outputDir` into a single Markdown
 * summary with per-provider and per-model averages.
 *
 * For each report the "Accuracy Comparison" table is parsed row by row; the
 * processing time and output file of a row are looked up in the same report
 * by matching a table line that starts with the same provider and model.
 *
 * @param {string} outputDir - Directory that is searched recursively for `report.md` files.
 * @returns {Promise<string>} The merged Markdown report, or the text
 *   `No report files found` when no report file exists.
 */
async function mergeReports(outputDir) {
    // Find all report.md files in subdirectories
    const reportFiles = await (0, glob_1.glob)(`${outputDir}/**/report.md`);
    if (reportFiles.length === 0) {
        return 'No report files found';
    }
    const report = {
        providers: {},
        models: {},
        allRuns: [],
    };
    const sampleCounts = new Map();
    for (const reportFile of reportFiles) {
        // Normalize CRLF so the newline-sensitive table regex also matches
        // reports written with Windows line endings.
        const content = fs_1.default
            .readFileSync(reportFile, 'utf-8')
            .replace(/\r\n/g, '\n');
        const dirName = path_1.default.dirname(reportFile);
        const cvName = path_1.default.basename(dirName).split('_')[0];
        // Header row + separator row, then capture all data rows.
        const accuracyTableMatch = content.match(/## Accuracy Comparison\n\n\|.*\n\|.*\n((?:\|.*\n)*)/);
        if (!accuracyTableMatch) {
            continue;
        }
        for (const row of accuracyTableMatch[1].trim().split('\n')) {
            // Index 0 is the empty text before the leading pipe.
            const [, provider, model, accuracy, fieldAccuracy, completeness, structure] = row
                .split('|')
                .map((cell) => cell.trim());
            // Skip empty/short rows and rows whose accuracy cannot be parsed.
            const accuracyValue = parsePercentCell(accuracy);
            if (accuracyValue === undefined) {
                continue;
            }
            const rowPrefix = `\\| ${escapeRegExpLiteral(provider)} \\| ${escapeRegExpLiteral(model)} \\| `;
            // Processing time comes from the first table line for this
            // provider/model whose next cell is a decimal number.
            const timeMatch = content.match(new RegExp(`${rowPrefix}(\\d+\\.\\d+)`));
            const processingTime = timeMatch ? Number.parseFloat(timeMatch[1]) : 0;
            // Reuse the exact time text so the output-file lookup hits the
            // same line regardless of how many decimals the report printed.
            const timeText = timeMatch ? timeMatch[1] : processingTime.toFixed(2);
            const outputFileMatch = content.match(new RegExp(`${rowPrefix}${escapeRegExpLiteral(timeText)} \\| \\d+% \\| \\[View\\]\\(\\.\\/(.+?)\\)`));
            const run = {
                cvName,
                provider,
                model,
                processingTime,
                accuracy: accuracyValue,
                fieldAccuracy: parsePercentCell(fieldAccuracy),
                completeness: parsePercentCell(completeness),
                structure: parsePercentCell(structure),
                outputFile: path_1.default.join(dirName, outputFileMatch ? outputFileMatch[1] : ''),
            };
            // Update provider metrics
            if (!report.providers[provider]) {
                report.providers[provider] = createMetricsEntry(provider, 'Various');
            }
            accumulateRun(report.providers[provider], sampleCounts, run);
            // Update model metrics
            const modelKey = `${provider}_${model}`;
            if (!report.models[modelKey]) {
                report.models[modelKey] = createMetricsEntry(provider, model);
            }
            accumulateRun(report.models[modelKey], sampleCounts, run);
            report.allRuns.push(run);
        }
        // Success rate from this report's summary lines. NOTE(review): the
        // value is report-wide and is written to every provider collected so
        // far, so the last processed report wins; this mirrors the previous
        // behavior and should be confirmed against the intended semantics.
        const totalExecutionsMatch = content.match(/- \*\*Total Providers\*\*: (\d+)/);
        const successfulExecutionsMatch = content.match(/- \*\*Successful\*\*: (\d+)/);
        if (totalExecutionsMatch && successfulExecutionsMatch) {
            const total = Number.parseInt(totalExecutionsMatch[1], 10);
            const successful = Number.parseInt(successfulExecutionsMatch[1], 10);
            // Guard against a zero total, which would yield NaN/Infinity.
            if (total > 0) {
                for (const entry of Object.values(report.providers)) {
                    entry.successRate = successful / total;
                }
            }
        }
    }
    // Calculate averages
    finalizeAverages(Object.values(report.providers), sampleCounts);
    finalizeAverages(Object.values(report.models), sampleCounts);
    // Generate markdown report
    return generateMarkdownReport(report);
}
/**
 * Renders the merged metrics as a Markdown document.
 *
 * Sections, in order: header (date, sample and run counts), providers and
 * models ranked by accuracy, providers and models ranked by speed, the top
 * five models by combined score, recommendations, and the list of all runs.
 *
 * @param {object} report - Aggregated data with `providers` and `models`
 *   (objects whose values carry `provider`, `model`, `accuracy`,
 *   `processingTime`, `fieldAccuracy`, `completeness`, `structure`, `count`)
 *   and `allRuns` (array of runs with `cvName`, `provider`, `model`,
 *   `accuracy`, `processingTime`).
 * @returns {string} The Markdown report.
 */
function generateMarkdownReport(report) {
    const providers = Object.values(report.providers);
    const models = Object.values(report.models);
    const sortedProviders = [...providers].sort((a, b) => b.accuracy - a.accuracy);
    const sortedModels = [...models].sort((a, b) => b.accuracy - a.accuracy);
    const fastestProviders = [...providers].sort((a, b) => a.processingTime - b.processingTime);
    const fastestModels = [...models].sort((a, b) => a.processingTime - b.processingTime);
    // Slowest model time, computed once instead of on every comparator call.
    const maxTime = models.reduce((max, m) => Math.max(max, m.processingTime), 0);
    // Calculate a combined score (weighted average of accuracy and speed)
    // Higher is better
    const combinedScore = (metrics) => {
        // Normalize processing time to a 0-1 scale (reversed, so faster is better).
        // When no positive time is known the speed term contributes nothing,
        // which avoids a 0/0 = NaN score.
        const normalizedTime = maxTime > 0 ? 1 - metrics.processingTime / maxTime : 0;
        // Weight accuracy more heavily than speed
        return metrics.accuracy * 0.7 + normalizedTime * 0.3;
    };
    // An optional metric with a falsy value (0 included) is shown as a bare
    // `-`, without a trailing percent sign.
    const formatOptionalPercent = (value) => (value ? `${(value * 100).toFixed(1)}%` : '-');
    const bestOverallModels = [...models].sort((a, b) => combinedScore(b) - combinedScore(a));
    let markdown = `# Merged CV Processing Report\n\n`;
    markdown += `**Date**: ${new Date().toISOString().split('T')[0]}\n`;
    markdown += `**Total CV Samples**: ${new Set(report.allRuns.map((r) => r.cvName)).size}\n`;
    markdown += `**Total Runs Analyzed**: ${report.allRuns.length}\n\n`;
    markdown += `## Best Providers by Accuracy\n\n`;
    markdown += `| Provider | Avg Accuracy | Avg Field Accuracy | Avg Completeness | Avg Structure | Runs |\n`;
    markdown += `|----------|-------------|-------------------|-----------------|--------------|------|\n`;
    for (const provider of sortedProviders) {
        markdown += `| ${provider.provider} | ${(provider.accuracy * 100).toFixed(1)}% | ${formatOptionalPercent(provider.fieldAccuracy)} | ${formatOptionalPercent(provider.completeness)} | ${formatOptionalPercent(provider.structure)} | ${provider.count} |\n`;
    }
    markdown += `\n## Best Models by Accuracy\n\n`;
    markdown += `| Provider | Model | Avg Accuracy | Avg Field Accuracy | Avg Completeness | Avg Structure | Runs |\n`;
    markdown += `|----------|-------|-------------|-------------------|-----------------|--------------|------|\n`;
    for (const model of sortedModels) {
        markdown += `| ${model.provider} | ${model.model} | ${(model.accuracy * 100).toFixed(1)}% | ${formatOptionalPercent(model.fieldAccuracy)} | ${formatOptionalPercent(model.completeness)} | ${formatOptionalPercent(model.structure)} | ${model.count} |\n`;
    }
    markdown += `\n## Fastest Providers\n\n`;
    markdown += `| Provider | Avg Processing Time (s) | Runs |\n`;
    markdown += `|----------|--------------------------|------|\n`;
    for (const provider of fastestProviders) {
        markdown += `| ${provider.provider} | ${provider.processingTime.toFixed(2)} | ${provider.count} |\n`;
    }
    markdown += `\n## Fastest Models\n\n`;
    markdown += `| Provider | Model | Avg Processing Time (s) | Runs |\n`;
    markdown += `|----------|-------|--------------------------|------|\n`;
    for (const model of fastestModels) {
        markdown += `| ${model.provider} | ${model.model} | ${model.processingTime.toFixed(2)} | ${model.count} |\n`;
    }
    markdown += `\n## Best Overall (Combined Accuracy & Speed)\n\n`;
    markdown += `| Provider | Model | Accuracy | Processing Time (s) | Combined Score |\n`;
    markdown += `|----------|-------|----------|---------------------|---------------|\n`;
    for (const model of bestOverallModels.slice(0, 5)) {
        markdown += `| ${model.provider} | ${model.model} | ${(model.accuracy * 100).toFixed(1)}% | ${model.processingTime.toFixed(2)} | ${combinedScore(model).toFixed(2)} |\n`;
    }
    markdown += `\n## Recommendations\n\n`;
    if (models.length === 0) {
        // Nothing was parsed, so there is no model to recommend.
        markdown += `No runs were available to base a recommendation on.\n\n`;
    }
    else {
        // Best overall model
        const bestModel = bestOverallModels[0];
        markdown += `### Best Overall Model\n`;
        markdown += `**${bestModel.provider} (${bestModel.model})** with ${(bestModel.accuracy * 100).toFixed(1)}% accuracy and ${bestModel.processingTime.toFixed(2)}s average processing time.\n\n`;
        // Best for accuracy
        const bestAccuracyModel = sortedModels[0];
        markdown += `### Best for Accuracy\n`;
        markdown += `**${bestAccuracyModel.provider} (${bestAccuracyModel.model})** with ${(bestAccuracyModel.accuracy * 100).toFixed(1)}% accuracy.\n\n`;
        // Best for speed
        const bestSpeedModel = fastestModels[0];
        markdown += `### Best for Speed\n`;
        markdown += `**${bestSpeedModel.provider} (${bestSpeedModel.model})** with ${bestSpeedModel.processingTime.toFixed(2)}s average processing time.\n\n`;
    }
    markdown += `## All Runs\n\n`;
    markdown += `| CV | Provider | Model | Accuracy | Processing Time (s) |\n`;
    markdown += `|----|----------|-------|----------|---------------------|\n`;
    for (const run of report.allRuns) {
        markdown += `| ${run.cvName} | ${run.provider} | ${run.model} | ${(run.accuracy * 100).toFixed(1)}% | ${run.processingTime.toFixed(2)} |\n`;
    }
    return markdown;
}