@thecodingwhale/cv-processor
Version:
CV Processor to extract structured data from PDF resumes using TypeScript
352 lines (351 loc) • 15.5 kB
JavaScript
;
/**
 * Compiler helper: expose `source[key]` on `target` under `alias` (or `key`
 * when no alias is given).
 *
 * An existing helper found on `this` is reused. Where `Object.create` exists,
 * the property is defined through a descriptor: the source's own descriptor is
 * reused only when it is an accessor on an `__esModule` source, or a data
 * property that is neither writable nor configurable; in every other case an
 * enumerable getter that reads `source[key]` on each access is installed.
 * Without `Object.create`, the current value is copied by plain assignment.
 */
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (target, source, key, alias) {
        var exportName = alias === undefined ? key : alias;
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        var reuseDescriptor = false;
        if (descriptor) {
            reuseDescriptor = "get" in descriptor
                ? Boolean(source.__esModule)
                : !(descriptor.writable || descriptor.configurable);
        }
        if (!reuseDescriptor) {
            // Forward every read to the source so later changes stay visible.
            descriptor = { enumerable: true, get: function () { return source[key]; } };
        }
        Object.defineProperty(target, exportName, descriptor);
    }
    : function (target, source, key, alias) {
        target[alias === undefined ? key : alias] = source[key];
    });
/**
 * Compiler helper: attach `value` to `target` as its `default` property.
 *
 * An existing helper found on `this` is reused. Where `Object.create` exists
 * the property is defined as enumerable (and, by `defineProperty` defaults,
 * read-only and non-configurable); otherwise it is set by plain assignment.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function (target, value) {
        Object.defineProperty(target, "default", { value: value, enumerable: true });
    }
    : function (target, value) {
        target["default"] = value;
    });
/**
 * Compiler helper: wrap a required module so it can be used like a namespace
 * import. An existing helper found on `this` is reused; otherwise the
 * immediately-invoked function below builds one.
 */
var __importStar = (this && this.__importStar) || (function () {
// `ownKeys` replaces itself on first call: it becomes
// Object.getOwnPropertyNames when available, else a for-in/hasOwnProperty
// fallback, and then delegates to the chosen implementation.
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
// Modules already flagged with __esModule are returned untouched.
if (mod && mod.__esModule) return mod;
var result = {};
// Bind every own key except "default" onto the wrapper via __createBinding.
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
// The original module object itself becomes the wrapper's `default`.
__setModuleDefault(result, mod);
return result;
};
})();
// Flag the CommonJS exports object with `__esModule`.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare the export as undefined; the class is assigned at the end of the file.
exports.ConsensusAccuracyScorer = void 0;
// Node built-ins, wrapped by __importStar so they are accessed as namespaces.
const fs = __importStar(require("fs"));
const path = __importStar(require("path"));
/** Credit fields compared inside each category of a `resume` structure. */
const RESUME_CREDIT_FIELDS = ['title', 'role', 'year', 'director'];
/** Credit fields compared for a flat, top-level `credits` structure. */
const FLAT_CREDIT_FIELDS = ['title', 'role', 'year', 'director', 'type'];
/** Weight applied to a field when the consensus carries no usable confidence for it. */
const DEFAULT_FIELD_CONFIDENCE = 0.5;
/** A fuzzy title match must score strictly above this word-overlap ratio. */
const MIN_TITLE_SIMILARITY = 0.6;
/** Upper bound on the number of missing-field messages returned. */
const MAX_REPORTED_MISSING_FIELDS = 10;
/** Score reported when data and consensus share no array structure. */
const STRUCTURE_MISMATCH_SCORE = 30;

/** Return `value` when it is an array, otherwise an empty array. */
function asArray(value) {
    return Array.isArray(value) ? value : [];
}

/**
 * Ratio (0..1) of the smaller count to the larger one.
 * Two zero counts are identical, so they score 1 instead of producing 0/0 = NaN.
 */
function countSimilarity(countA, countB) {
    const larger = Math.max(countA, countB);
    return larger === 0 ? 1 : Math.min(countA, countB) / larger;
}

/**
 * Convert a field value to text for comparison.
 * Strings pass through, finite numbers are stringified (so `2020` and `"2020"`
 * compare equal); anything else yields `null`.
 */
function toComparableText(value) {
    if (typeof value === 'string') {
        return value;
    }
    if (typeof value === 'number' && Number.isFinite(value)) {
        return String(value);
    }
    return null;
}

/** Lower-case, trim and collapse whitespace runs to single spaces. */
function normalizeText(text) {
    return text.toLowerCase().trim().replace(/\s+/g, ' ');
}

/**
 * Compare every consensus credit against the data credits and accumulate the
 * counters and missing-field messages into `tally`.
 *
 * @param {ConsensusAccuracyScorer} scorer - supplies `findMatchingCredit` and `calculateFieldSimilarity`
 * @param {object} tally - mutable counters: totalFields, matchedFields, totalExpectedFields, presentExpectedFields, missingFields
 * @param {Array} consensusCredits - credits expected by the consensus
 * @param {*} dataCredits - credits extracted from the CV (non-arrays match nothing)
 * @param {string[]} fields - field names to compare on each matched credit
 * @param {object} confidenceFields - map of confidence key to weight
 * @param {string} keyPrefix - prefix of the confidence key (`"<category>."` or `""`)
 * @param {string} missingSuffix - suffix of the "Credit: ..." message (`" in <category>"` or `""`)
 */
function tallyCreditFields(scorer, tally, consensusCredits, dataCredits, fields, confidenceFields, keyPrefix, missingSuffix) {
    for (const consensusCredit of consensusCredits) {
        if (!consensusCredit) {
            continue;
        }
        const matchingCredit = scorer.findMatchingCredit(consensusCredit, dataCredits);
        if (!matchingCredit) {
            tally.missingFields.push(`Credit: ${consensusCredit.title}${missingSuffix}`);
            // A missing credit costs exactly as many fields as a matched one is checked for.
            tally.totalExpectedFields += fields.length;
            continue;
        }
        for (const field of fields) {
            tally.totalExpectedFields++;
            if (!matchingCredit[field]) {
                tally.missingFields.push(`${field} in ${consensusCredit.title}`);
                continue;
            }
            tally.presentExpectedFields++;
            tally.totalFields++;
            const similarity = scorer.calculateFieldSimilarity(consensusCredit[field], matchingCredit[field]);
            // Confidence is always looked up under index 0 of the credits list.
            // Any falsy stored confidence (including 0) falls back to the default.
            const fieldConfidence = confidenceFields[`${keyPrefix}credits[0].${field}`] || DEFAULT_FIELD_CONFIDENCE;
            tally.matchedFields += similarity * fieldConfidence;
        }
    }
}

/**
 * The ConsensusAccuracyScorer evaluates extracted CV data against
 * established consensus baseline data loaded from `baseMetrics.json`.
 */
class ConsensusAccuracyScorer {
    /**
     * Initialize the scorer and load the base metrics.
     *
     * @param {string} [cacheDir] - directory containing `baseMetrics.json`;
     *   defaults to `<cwd>/cache`
     */
    constructor(cacheDir) {
        this.baseMetricsFile = path.join(cacheDir || path.join(process.cwd(), 'cache'), 'baseMetrics.json');
        this.loadBaseMetrics();
    }
    /**
     * Load base metrics from the cache file into `this.baseMetrics`.
     * A missing, unreadable, unparsable or `metrics`-less file is logged and
     * leaves `this.baseMetrics` as `null`; this method never throws.
     */
    loadBaseMetrics() {
        this.baseMetrics = null;
        try {
            if (!fs.existsSync(this.baseMetricsFile)) {
                console.warn(`Base metrics file not found: ${this.baseMetricsFile}`);
                return;
            }
            const parsed = JSON.parse(fs.readFileSync(this.baseMetricsFile, 'utf8'));
            if (!parsed || typeof parsed.metrics !== 'object' || parsed.metrics === null) {
                throw new Error('base metrics file has no "metrics" object');
            }
            this.baseMetrics = parsed;
            console.log(`Loaded base metrics from ${this.baseMetricsFile}`);
            console.log(`Base metrics contain ${Object.keys(parsed.metrics).length} CV templates`);
        }
        catch (error) {
            console.error(`Error loading base metrics: ${error}`);
            this.baseMetrics = null;
        }
    }
    /**
     * Find the consensus entry for a CV, matched by exact source file name.
     *
     * @param {object} cvData - extracted CV data; `metadata.sourceFile` is the lookup key
     * @returns {{consensus: object, baseCV: string, confidence: *}|null}
     *   the match, or `null` when there are no base metrics, no file name, or
     *   no own entry with a `consensus` for that name
     */
    findBestMatchingConsensus(cvData) {
        const metrics = this.baseMetrics?.metrics;
        if (!metrics) {
            return null;
        }
        const cvName = cvData?.metadata?.sourceFile;
        // Own-property check so names like "constructor" cannot resolve to
        // members inherited from Object.prototype.
        if (!cvName || !Object.prototype.hasOwnProperty.call(metrics, cvName)) {
            return null;
        }
        const entry = metrics[cvName];
        if (!entry || !entry.consensus) {
            return null;
        }
        // No fuzzy matching is implemented: only the exact file name is tried.
        return {
            consensus: entry.consensus,
            baseCV: cvName,
            confidence: entry.confidence,
        };
    }
    /**
     * Evaluate extracted CV data against its consensus.
     *
     * @param {object} cvData - extracted CV data with a `resume` or `credits` array
     * @returns {{overall: number, fieldAccuracy: number, structuralFidelity: number,
     *   completeness: number, missingFields: string[], metadata: object}}
     *   scores on a 0-100 scale; all zero when no consensus matches, and
     *   30/30 for structural fidelity/overall when the structures differ
     */
    evaluateAccuracy(cvData) {
        const result = {
            overall: 0,
            fieldAccuracy: 0,
            structuralFidelity: 0,
            completeness: 0,
            missingFields: [],
            metadata: {
                consensusSource: 'none',
                consensusStrength: 0,
                comparedFields: 0,
            },
        };
        const consensusMatch = this.findBestMatchingConsensus(cvData);
        if (!consensusMatch) {
            console.warn('No matching consensus found for this CV');
            return result;
        }
        const { consensus, baseCV } = consensusMatch;
        // A base-metrics entry may lack confidence data; treat that as "none recorded".
        const confidence = consensusMatch.confidence || {};
        const sharesResume = Array.isArray(cvData.resume) && Array.isArray(consensus.resume);
        const sharesCredits = Array.isArray(cvData.credits) && Array.isArray(consensus.credits);
        if (!sharesResume && !sharesCredits) {
            console.warn('Data structure does not match consensus structure');
            result.structuralFidelity = STRUCTURE_MISMATCH_SCORE;
            result.overall = STRUCTURE_MISMATCH_SCORE;
            return result;
        }
        result.structuralFidelity = this.calculateStructuralFidelity(cvData, consensus);
        const fieldResults = this.calculateFieldAccuracy(cvData, consensus, confidence);
        result.fieldAccuracy = fieldResults.accuracy;
        result.completeness = fieldResults.completeness;
        result.missingFields = fieldResults.missingFields;
        // Weighted blend: 30% structure, 40% field accuracy, 30% completeness.
        result.overall = Math.round(result.structuralFidelity * 0.3 +
            result.fieldAccuracy * 0.4 +
            result.completeness * 0.3);
        result.metadata = {
            consensusSource: baseCV,
            consensusStrength: confidence.overall || 0,
            comparedFields: fieldResults.comparedFields,
        };
        return result;
    }
    /**
     * Score (0-100) how closely the shape of the data follows the consensus.
     *
     * For `resume` arrays: 60% share of consensus categories present in the
     * data, 40% average credit-count similarity over the matched categories.
     * For flat `credits` arrays: the credit-count similarity alone.
     * Returns 0 when neither structure is an array on both sides.
     *
     * @param {object} cvData - extracted CV data
     * @param {object} consensus - consensus data
     * @returns {number} rounded score
     */
    calculateStructuralFidelity(cvData, consensus) {
        if (Array.isArray(cvData?.resume) && Array.isArray(consensus?.resume)) {
            // Index data categories once; the first occurrence of a name wins.
            const dataByCategory = new Map();
            for (const entry of cvData.resume) {
                if (entry && !dataByCategory.has(entry.category)) {
                    dataByCategory.set(entry.category, entry);
                }
            }
            const consensusEntries = consensus.resume.filter(Boolean);
            const consensusCategories = new Set(consensusEntries.map((entry) => entry.category));
            let sharedCategories = 0;
            for (const name of consensusCategories) {
                if (dataByCategory.has(name)) {
                    sharedCategories++;
                }
            }
            const categoryScore = consensusCategories.size > 0
                ? (sharedCategories / consensusCategories.size) * 100
                : 0;
            let creditsScore = 0;
            let matchedCategories = 0;
            for (const entry of consensusEntries) {
                const dataEntry = dataByCategory.get(entry.category);
                if (!dataEntry) {
                    continue;
                }
                matchedCategories++;
                // A missing or non-array `credits` counts as zero credits.
                creditsScore += countSimilarity(asArray(dataEntry.credits).length, asArray(entry.credits).length) * 100;
            }
            const averageCreditsScore = matchedCategories > 0 ? creditsScore / matchedCategories : 0;
            return Math.round(categoryScore * 0.6 + averageCreditsScore * 0.4);
        }
        if (Array.isArray(cvData?.credits) && Array.isArray(consensus?.credits)) {
            return Math.round(countSimilarity(cvData.credits.length, consensus.credits.length) * 100);
        }
        return 0;
    }
    /**
     * Compare field values of matching credits and measure completeness.
     *
     * Accuracy is the confidence-weighted similarity sum divided by the number
     * of compared fields; completeness is the share of expected fields that are
     * present (truthy) in the data. Both are rounded percentages.
     *
     * @param {object} cvData - extracted CV data
     * @param {object} consensus - consensus data
     * @param {object} [confidence] - optional; `confidence.fields` maps keys such as
     *   `"<category>.credits[0].<field>"` or `"credits[0].<field>"` to weights
     * @returns {{accuracy: number, completeness: number, missingFields: string[],
     *   comparedFields: number}} `missingFields` is de-duplicated and capped at 10
     */
    calculateFieldAccuracy(cvData, consensus, confidence) {
        const confidenceFields = (confidence && confidence.fields) || {};
        const tally = {
            totalFields: 0,
            matchedFields: 0,
            totalExpectedFields: 0,
            presentExpectedFields: 0,
            missingFields: [],
        };
        if (Array.isArray(cvData?.resume) && Array.isArray(consensus?.resume)) {
            for (const consensusCategory of consensus.resume) {
                if (!consensusCategory) {
                    continue;
                }
                const categoryName = consensusCategory.category;
                const consensusCredits = asArray(consensusCategory.credits);
                const dataCategory = cvData.resume.find((cat) => cat?.category === categoryName);
                if (!dataCategory) {
                    tally.missingFields.push(`Category: ${categoryName}`);
                    tally.totalExpectedFields += consensusCredits.length * RESUME_CREDIT_FIELDS.length;
                    continue;
                }
                tallyCreditFields(this, tally, consensusCredits, dataCategory.credits, RESUME_CREDIT_FIELDS, confidenceFields, `${categoryName}.`, ` in ${categoryName}`);
            }
        }
        if (Array.isArray(cvData?.credits) && Array.isArray(consensus?.credits)) {
            tallyCreditFields(this, tally, consensus.credits, cvData.credits, FLAT_CREDIT_FIELDS, confidenceFields, '', '');
        }
        const accuracy = tally.totalFields > 0
            ? Math.round((tally.matchedFields / tally.totalFields) * 100)
            : 0;
        const completeness = tally.totalExpectedFields > 0
            ? Math.round((tally.presentExpectedFields / tally.totalExpectedFields) * 100)
            : 0;
        const uniqueMissingFields = [...new Set(tally.missingFields)];
        return {
            accuracy,
            completeness,
            missingFields: uniqueMissingFields.slice(0, MAX_REPORTED_MISSING_FIELDS),
            comparedFields: tally.totalFields,
        };
    }
    /**
     * Find the data credit that corresponds to a consensus credit.
     * An exact title match wins; otherwise the credit whose title has the
     * highest word overlap above 0.6 is returned.
     *
     * @param {object} consensusCredit - credit from the consensus
     * @param {*} credits - candidate credits; non-arrays yield `null`
     * @returns {object|null} the matching credit or `null`
     */
    findMatchingCredit(consensusCredit, credits) {
        if (!consensusCredit || !Array.isArray(credits)) {
            return null;
        }
        const wantedTitle = consensusCredit.title;
        // Ignore null/primitive entries instead of failing on property access.
        const candidates = credits.filter((credit) => credit !== null && typeof credit === 'object');
        const exactMatch = candidates.find((credit) => credit.title === wantedTitle);
        if (exactMatch) {
            return exactMatch;
        }
        let bestMatch = null;
        let bestSimilarity = 0;
        for (const credit of candidates) {
            if (!credit.title || !wantedTitle) {
                continue;
            }
            const similarity = this.calculateStringSimilarity(credit.title, wantedTitle);
            if (similarity > bestSimilarity && similarity > MIN_TITLE_SIMILARITY) {
                bestMatch = credit;
                bestSimilarity = similarity;
            }
        }
        return bestMatch;
    }
    /**
     * Similarity (0..1) between two field values.
     * Strictly equal values score 1. Strings and finite numbers are compared
     * as text; any other combination of types scores 0.
     *
     * @param {*} value1
     * @param {*} value2
     * @returns {number}
     */
    calculateFieldSimilarity(value1, value2) {
        if (value1 === value2) {
            return 1.0;
        }
        const text1 = toComparableText(value1);
        const text2 = toComparableText(value2);
        if (text1 === null || text2 === null) {
            return 0;
        }
        return this.calculateStringSimilarity(text1, text2);
    }
    /**
     * Case- and whitespace-insensitive similarity (0..1) of two texts:
     * 1 when equal after normalization, otherwise the Jaccard index of their
     * word sets. Empty or non-text inputs score 0; finite numbers are
     * stringified first.
     *
     * @param {*} str1
     * @param {*} str2
     * @returns {number}
     */
    calculateStringSimilarity(str1, str2) {
        const text1 = toComparableText(str1);
        const text2 = toComparableText(str2);
        if (!text1 || !text2) {
            return 0;
        }
        const norm1 = normalizeText(text1);
        const norm2 = normalizeText(text2);
        if (norm1 === norm2) {
            return 1.0;
        }
        const words1 = new Set(norm1.split(/\s+/));
        const words2 = new Set(norm2.split(/\s+/));
        let sharedWords = 0;
        for (const word of words1) {
            if (words2.has(word)) {
                sharedWords++;
            }
        }
        // |A ∪ B| = |A| + |B| - |A ∩ B|; both sets hold at least one entry here.
        return sharedWords / (words1.size + words2.size - sharedWords);
    }
}
// Publish the class as the module's named export.
exports.ConsensusAccuracyScorer = ConsensusAccuracyScorer;