// @thecodingwhale/cv-processor
// Version:
// CV Processor to extract structured data from PDF resumes using TypeScript
// 278 lines (277 loc) • 10.6 kB
// JavaScript
;
/**
* AccuracyScorer - Evaluates the accuracy of extracted CV data
*
* This utility helps measure how well the AI extraction performed by:
* - Checking structural validity (schema compliance)
* - Measuring field completeness
* - Validating category/type assignments
* - Calculating an overall accuracy score
*/
// Set the `__esModule` flag on the CommonJS exports object. NOTE(review): this
// looks like the interop marker emitted by a TypeScript/Babel build — confirm
// against the build configuration before editing by hand.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare the named export as undefined; the class itself is assigned to
// it after its definition further down in this file.
exports.AccuracyScorer = void 0;
/**
 * The official categories/types that are allowed in the CV.
 *
 * Spelling variants are listed side by side ('Television'/'TV',
 * 'Theatre'/'Theater'). The list is frozen because it is a shared lookup
 * table: an accidental in-place edit would silently change every later score.
 */
const OFFICIAL_CATEGORIES = Object.freeze([
    'Film',
    'Television',
    'TV',
    'Commercial',
    'Theatre',
    'Theater',
    'Print',
    'Fashion',
    'Training',
    'Voice',
    'Stunt',
    'Corporate',
    'MC',
    'Presenting',
    'Extras',
    'Other',
]);
/**
 * Required fields for each credit.
 *
 * Every credit is expected to carry all of these; each blank one lowers the
 * completeness score. Frozen so the shared list cannot be altered in place.
 */
const REQUIRED_CREDIT_FIELDS = Object.freeze(['title', 'role', 'year', 'director']);
/** Returns true for any non-null object (plain objects and arrays alike). */
function isRecord(value) {
    return value !== null && typeof value === 'object';
}
/** Returns `value` when it is a non-null object, otherwise an empty object so property reads are safe. */
function toRecord(value) {
    return isRecord(value) ? value : {};
}
/** Returns true when `value` is truthy and its string form is not blank. */
function isFilled(value) {
    return Boolean(value) && String(value).trim() !== '';
}
/** Picks the `resume` / `credits` arrays out of the payload; each is null when absent or not an array. */
function detectLayout(cvData) {
    const data = toRecord(cvData);
    return {
        resume: Array.isArray(data.resume) ? data.resume : null,
        credits: Array.isArray(data.credits) ? data.credits : null,
    };
}
/**
 * AccuracyScorer - scores how well structured CV data was extracted.
 *
 * Two payload layouts are recognised:
 * - hierarchical: `cvData.resume` is an array of `{ category, credits: [...] }`
 * - flat: `cvData.credits` is an array of credits that carry their own `type`
 *
 * When both arrays are present, structural validity and category assignment
 * look at `resume` only, while completeness tallies the fields of both.
 *
 * The input is unvalidated extraction output, so malformed data (a missing
 * payload, `null` entries inside the arrays) is scored as invalid/missing
 * rather than raising a TypeError.
 */
class AccuracyScorer {
    /**
     * Evaluate the accuracy of CV data extraction.
     *
     * @param {*} cvData - Extracted CV payload (hierarchical or flat layout).
     * @returns {{overall: number, categoryAssignment: number, completeness: number,
     *   structuralValidity: number, missingFields: string[]}} Scores from 0-100
     *   plus up to 10 distinct missing-field descriptions.
     */
    static evaluateAccuracy(cvData) {
        const result = {
            overall: 0,
            categoryAssignment: 0,
            completeness: 0,
            structuralValidity: 0,
            missingFields: [],
        };
        result.structuralValidity = this.checkStructuralValidity(cvData);
        // A structurally broken payload is not scored any further: the overall
        // score is the structural score and the other components stay at 0.
        if (result.structuralValidity < 50) {
            result.overall = result.structuralValidity;
            return result;
        }
        result.categoryAssignment = this.checkCategoryAssignment(cvData);
        const completeness = this.checkCompleteness(cvData);
        result.completeness = completeness.score;
        result.missingFields = completeness.missingFields;
        result.overall = this.calculateOverallScore(result);
        return result;
    }
    /**
     * Check the structural validity of the CV data.
     *
     * @param {*} cvData - Extracted CV payload.
     * @returns {number} Rounded score from 0-100; 0 when neither `resume` nor
     *   `credits` is an array.
     */
    static checkStructuralValidity(cvData) {
        const { resume, credits } = detectLayout(cvData);
        // Hierarchical layout takes precedence when both are present.
        if (resume) {
            // A boolean `resume_show_years` flag is worth 10 bonus points.
            const bonus = typeof cvData.resume_show_years === 'boolean' ? 10 : 0;
            const totalCategories = resume.length;
            if (totalCategories === 0) {
                return Math.min(bonus + 40, 50); // Cap at 50 if no categories found
            }
            let validCategories = 0;
            let totalCredits = 0;
            let validCredits = 0;
            for (const category of resume) {
                // A category is valid when it has a name and a credits array;
                // null or non-object entries simply do not count as valid.
                if (!isRecord(category) || !category.category || !Array.isArray(category.credits)) {
                    continue;
                }
                validCategories++;
                totalCredits += category.credits.length;
                for (const credit of category.credits) {
                    if (this.isValidCredit(credit)) {
                        validCredits++;
                    }
                }
            }
            const categoryScore = (validCategories / totalCategories) * 100;
            const creditScore = totalCredits > 0 ? (validCredits / totalCredits) * 100 : 0;
            // Weighted sum: 40% categories + 50% credits + the 10-point bonus.
            return Math.round(bonus + categoryScore * 0.4 + creditScore * 0.5);
        }
        // Flat layout.
        if (credits) {
            if (credits.length === 0) {
                return 50; // Basic structure exists but no content
            }
            let validCredits = 0;
            for (const credit of credits) {
                if (this.isValidCredit(credit)) {
                    validCredits++;
                }
            }
            return Math.round((validCredits / credits.length) * 100);
        }
        return 0;
    }
    /**
     * Check if a credit object has the expected structure.
     *
     * @param {*} credit - Candidate credit entry.
     * @returns {boolean} True when the entry is an object with a truthy
     *   `title` and `role`; false for anything else, including null.
     */
    static isValidCredit(credit) {
        return isRecord(credit) && Boolean(credit.title) && Boolean(credit.role);
    }
    /**
     * Check how well categories are assigned based on the official categories.
     *
     * Hierarchical layout: share of resume entries whose `category` is
     * recognised. Flat layout: share of credits whose `type` is recognised.
     *
     * @param {*} cvData - Extracted CV payload.
     * @returns {number} Rounded score from 0-100; 0 for empty or missing arrays.
     */
    static checkCategoryAssignment(cvData) {
        const { resume, credits } = detectLayout(cvData);
        if (resume) {
            if (resume.length === 0) {
                return 0;
            }
            const recognised = resume.filter((entry) => isRecord(entry) && this.isValidCategoryName(entry.category)).length;
            return Math.round((recognised / resume.length) * 100);
        }
        if (credits) {
            if (credits.length === 0) {
                return 0;
            }
            const recognised = credits.filter((entry) => isRecord(entry) && this.isValidCategoryName(entry.type)).length;
            return Math.round((recognised / credits.length) * 100);
        }
        return 0;
    }
    /**
     * Check if a category string is valid by comparing to official categories.
     *
     * Matching is a case-insensitive substring test, so variations such as
     * "Feature Film" or "TV Series" are accepted. NOTE: this also means short
     * official names match inside longer words (e.g. "other" in "Brother").
     *
     * @param {*} category - Category/type label to test.
     * @returns {boolean} True when the label contains any official category.
     */
    static isValidCategoryName(category) {
        if (!category || typeof category !== 'string') {
            return false;
        }
        const normalizedCategory = category.toLowerCase().trim();
        return OFFICIAL_CATEGORIES.some((officialCat) => normalizedCategory.includes(officialCat.toLowerCase()));
    }
    /**
     * Check completeness of fields in the CV data.
     *
     * Every credit is expected to have each of REQUIRED_CREDIT_FIELDS; flat
     * credits are additionally expected to have `type`. A null or non-object
     * credit counts as having all of its expected fields missing.
     *
     * @param {*} cvData - Extracted CV payload.
     * @returns {{score: number, missingFields: string[]}} Rounded 0-100 score
     *   and at most 10 distinct missing-field descriptions.
     */
    static checkCompleteness(cvData) {
        const { resume, credits } = detectLayout(cvData);
        const missingFields = [];
        let totalFields = 0;
        let filledFields = 0;
        // Counts one expected field and records it as missing when blank.
        const tally = (credit, field, where) => {
            totalFields++;
            if (isFilled(credit[field])) {
                filledFields++;
            }
            else {
                missingFields.push(`${field} in ${where}`);
            }
        };
        // Hierarchical layout.
        if (resume) {
            for (const rawCategory of resume) {
                const category = toRecord(rawCategory);
                const where = `${category.category || 'unknown'} category`;
                if (!Array.isArray(category.credits)) {
                    missingFields.push(`credits array missing in ${where}`);
                    continue;
                }
                for (const rawCredit of category.credits) {
                    const credit = toRecord(rawCredit);
                    for (const field of REQUIRED_CREDIT_FIELDS) {
                        tally(credit, field, where);
                    }
                }
            }
        }
        // Flat layout (tallied in addition to the hierarchical one if both exist).
        if (credits) {
            const optionalFields = ['productionCompany', 'location', 'link'];
            for (const rawCredit of credits) {
                const credit = toRecord(rawCredit);
                const where = `credit titled "${credit.title || 'unknown'}"`;
                for (const field of REQUIRED_CREDIT_FIELDS) {
                    tally(credit, field, where);
                }
                tally(credit, 'type', where);
                // Optional fields only ever help: they are counted when filled
                // and ignored (not penalised) when absent.
                for (const field of optionalFields) {
                    if (isFilled(credit[field])) {
                        filledFields++;
                        totalFields++;
                    }
                }
            }
        }
        const score = totalFields > 0 ? Math.round((filledFields / totalFields) * 100) : 0;
        // Report each distinct problem once and cap the list at 10 entries.
        const topMissingFields = [...new Set(missingFields)].slice(0, 10);
        return {
            score,
            missingFields: topMissingFields,
        };
    }
    /**
     * Calculate the overall accuracy score from component scores.
     *
     * @param {{structuralValidity: number, categoryAssignment: number, completeness: number}} result
     *   Component scores, each expected in the 0-100 range.
     * @returns {number} Rounded weighted average: 40% structural validity,
     *   30% category assignment, 30% completeness.
     */
    static calculateOverallScore(result) {
        const score = result.structuralValidity * 0.4 +
            result.categoryAssignment * 0.3 +
            result.completeness * 0.3;
        return Math.round(score);
    }
}
// Publish the class as the module's named CommonJS export `AccuracyScorer`.
exports.AccuracyScorer = AccuracyScorer;