@thecodingwhale/cv-processor
Version:
CV Processor to extract structured data from PDF resumes using TypeScript
378 lines (377 loc) • 15.1 kB
JavaScript
;
// TypeScript-emitted helper: re-exports property `key` of `source` on `target`
// (optionally under the name `alias`). Reuses an already-installed helper found
// on `this` when one exists.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function (target, source, key, alias) {
    if (alias === undefined) {
        alias = key;
    }
    var descriptor = Object.getOwnPropertyDescriptor(source, key);
    // The source descriptor is reused as-is only when it is an accessor on an
    // ES module, or a data property that is neither writable nor configurable.
    var reusable = descriptor && !("get" in descriptor ? !source.__esModule : descriptor.writable || descriptor.configurable);
    if (!reusable) {
        // Otherwise expose a getter so `target` always reflects the current
        // value held by `source`.
        descriptor = {
            enumerable: true,
            get: function () {
                return source[key];
            }
        };
    }
    Object.defineProperty(target, alias, descriptor);
}) : (function (target, source, key, alias) {
    // Fallback taken when Object.create is unavailable: plain value copy.
    target[alias === undefined ? key : alias] = source[key];
}));
// TypeScript-emitted helper: stores `value` as the `default` member of the
// namespace object `target`. Reuses an already-installed helper found on `this`
// when one exists.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function (target, value) {
    var defaultDescriptor = { enumerable: true, value: value };
    Object.defineProperty(target, "default", defaultDescriptor);
}) : function (target, value) {
    // Fallback taken when Object.create is unavailable: plain assignment.
    target["default"] = value;
});
// TypeScript-emitted helper: wraps a required module in a namespace object.
// Modules flagged `__esModule` are returned untouched; otherwise every own
// property except "default" is re-exported through __createBinding and the
// module itself is stored as `default` through __setModuleDefault.
var __importStar = (this && this.__importStar) || (function () {
    // Manual own-key enumeration, used only when Object.getOwnPropertyNames
    // is not available.
    var enumerateOwnKeys = function (target) {
        var names = [];
        for (var name in target) {
            if (Object.prototype.hasOwnProperty.call(target, name)) {
                names[names.length] = name;
            }
        }
        return names;
    };
    // Picks the key-listing strategy on first use and replaces itself with it.
    var listKeys = function (target) {
        listKeys = Object.getOwnPropertyNames || enumerateOwnKeys;
        return listKeys(target);
    };
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            var keys = listKeys(mod);
            for (var idx = 0; idx < keys.length; idx++) {
                if (keys[idx] === "default") {
                    continue;
                }
                __createBinding(namespace, mod, keys[idx]);
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.ConsensusBuilder = void 0;
const fs = __importStar(require("fs"));
/**
 * The ConsensusBuilder class analyzes multiple CV data extractions
 * and builds a consensus version that represents the most likely correct data.
 *
 * Two input shapes are handled:
 *  - hierarchical: `{ resume: [{ category, credits: [...] }], resume_show_years }`
 *  - flat:         `{ credits: [...] }`
 *
 * Credits whose titles are similar are grouped together, and each field of a
 * group is decided by majority vote. A confidence score (share of agreeing
 * values, 0..1) is reported for every decided field.
 */
class ConsensusBuilder {
    /**
     * Build consensus from multiple CV data files.
     *
     * Files are read synchronously and parsed as JSON. A file that cannot be
     * read or parsed is logged with console.error and skipped.
     *
     * @param {string[]} dataFiles - Paths of the JSON files to combine.
     * @returns {Promise<{consensus: object, confidence: {overall: number, fields: Object<string, number>}, metadata: {providerCount: number, consensusStrength: number, generatedAt: string}}>}
     * @throws {Error} If no file could be loaded, or if no loaded file has a
     *   `resume` or `credits` array.
     */
    async buildConsensus(dataFiles) {
        console.log(`Building consensus from ${dataFiles.length} data files...`);
        // Load all data files; failures become null and are dropped below.
        const allData = dataFiles
            .map((file) => {
                try {
                    return JSON.parse(fs.readFileSync(file, 'utf8'));
                }
                catch (error) {
                    console.error(`Error loading data file ${file}: ${error}`);
                    return null;
                }
            })
            .filter((data) => data !== null);
        if (allData.length === 0) {
            throw new Error('No valid data files to build consensus from');
        }
        // Determine if we're dealing with resume array or credits array
        const hasResumeStructure = allData.some((data) => Array.isArray(data.resume));
        const hasCreditsStructure = allData.some((data) => Array.isArray(data.credits));
        console.log(`Data format: ${hasResumeStructure ? 'resume structure' : ''} ${hasCreditsStructure ? 'credits structure' : ''}`);
        // The resume structure takes precedence when both shapes are present.
        let built;
        if (hasResumeStructure) {
            built = this.buildResumeConsensus(allData);
        }
        else if (hasCreditsStructure) {
            built = this.buildCreditsConsensus(allData);
        }
        else {
            throw new Error('Unknown data structure, cannot build consensus');
        }
        const { consensusData: consensus, confidence: confidenceData } = built;
        // Overall strength is the mean of all per-field confidences,
        // rounded to two decimals.
        const confidenceValues = Object.values(confidenceData);
        const consensusStrength = confidenceValues.length > 0
            ? confidenceValues.reduce((sum, val) => sum + val, 0) / confidenceValues.length
            : 0;
        const roundedStrength = Math.round(consensusStrength * 100) / 100;
        return {
            consensus,
            confidence: {
                overall: roundedStrength,
                fields: confidenceData,
            },
            metadata: {
                providerCount: allData.length,
                consensusStrength: roundedStrength,
                generatedAt: new Date().toISOString(),
            },
        };
    }
    /**
     * Build consensus for resume structure (hierarchical).
     *
     * Categories that end up with no consensus credits are omitted, together
     * with their confidence entries.
     *
     * @param {object[]} allData - Parsed extractions.
     * @returns {{consensusData: {resume: object[], resume_show_years: boolean}, confidence: Object<string, number>}}
     *   Confidence keys have the form `<category>.<field path>`.
     */
    buildResumeConsensus(allData) {
        const result = {
            resume: [],
            resume_show_years: this.determineShowYears(allData),
        };
        const confidence = {
            resume_show_years: this.calculateConfidence(allData.map((data) => data.resume_show_years)),
        };
        for (const categoryName of this.extractAllCategories(allData)) {
            const categoryConsensus = this.buildCategoryConsensus(categoryName, allData);
            if (categoryConsensus.category.credits.length === 0) {
                continue;
            }
            result.resume.push(categoryConsensus.category);
            // Prefix this category's confidence scores with its name.
            for (const [key, value] of Object.entries(categoryConsensus.confidence)) {
                confidence[`${categoryName}.${key}`] = value;
            }
        }
        return { consensusData: result, confidence };
    }
    /**
     * Build consensus for credits structure (flat).
     *
     * @param {object[]} allData - Parsed extractions.
     * @returns {{consensusData: {credits: object[]}, confidence: Object<string, number>}}
     *   Confidence keys have the form `credits[<group>].<field>`.
     */
    buildCreditsConsensus(allData) {
        const result = {
            credits: [],
        };
        const confidence = {};
        // Collect all credits from all data
        const allCredits = [];
        for (const data of allData) {
            if (Array.isArray(data.credits)) {
                allCredits.push(...data.credits);
            }
        }
        const groupedCredits = this.groupSimilarCredits(allCredits);
        for (const [groupKey, creditsGroup] of Object.entries(groupedCredits)) {
            const consensusCredit = this.buildCreditConsensus(creditsGroup);
            result.credits.push(consensusCredit.credit);
            for (const [key, value] of Object.entries(consensusCredit.confidence)) {
                confidence[`credits[${groupKey}].${key}`] = value;
            }
        }
        return { consensusData: result, confidence };
    }
    /**
     * Extract all unique categories from all data.
     *
     * @param {object[]} allData - Parsed extractions.
     * @returns {Array} Distinct truthy `category` values in first-seen order.
     */
    extractAllCategories(allData) {
        const categories = new Set();
        for (const data of allData) {
            if (!Array.isArray(data.resume)) {
                continue;
            }
            for (const category of data.resume) {
                // Entries come from parsed JSON and may be null.
                if (category && category.category) {
                    categories.add(category.category);
                }
            }
        }
        return Array.from(categories);
    }
    /**
     * Build consensus for a specific category.
     *
     * For every extraction, only the first resume entry whose `category`
     * equals `categoryName` contributes credits.
     *
     * @param {string} categoryName - Category to build.
     * @param {object[]} allData - Parsed extractions.
     * @returns {{category: {category: string, category_id: string, credits: object[]}, confidence: Object<string, number>}}
     */
    buildCategoryConsensus(categoryName, allData) {
        // Extract all credits for this category
        const allCategoryCredits = [];
        for (const data of allData) {
            if (!Array.isArray(data.resume)) {
                continue;
            }
            const categoryData = data.resume.find((cat) => cat && cat.category === categoryName);
            if (categoryData && Array.isArray(categoryData.credits)) {
                allCategoryCredits.push(...categoryData.credits);
            }
        }
        const groupedCredits = this.groupSimilarCredits(allCategoryCredits);
        const consensusCredits = [];
        const confidence = {
            category: 1.0, // Category name has perfect confidence as it's our grouping key
        };
        for (const [groupKey, creditsGroup] of Object.entries(groupedCredits)) {
            const consensusCredit = this.buildCreditConsensus(creditsGroup);
            consensusCredits.push(consensusCredit.credit);
            for (const [key, value] of Object.entries(consensusCredit.confidence)) {
                confidence[`credits[${groupKey}].${key}`] = value;
            }
        }
        return {
            category: {
                category: categoryName,
                category_id: this.generateConsistentId(categoryName),
                credits: consensusCredits,
            },
            confidence,
        };
    }
    /**
     * Build consensus for a specific credit from similar credits.
     *
     * For each of `title`, `role`, `year`, `director` and `id` the most common
     * truthy value wins; a field with no truthy value is left out. When no
     * `id` is available, one is derived from the fields decided so far and
     * given confidence 1. `attached_media` is always set to an empty array
     * with confidence 1.
     *
     * @param {object[]} credits - Credits believed to describe the same entry.
     * @returns {{credit: object, confidence: Object<string, number>}}
     */
    buildCreditConsensus(credits) {
        const fields = ['title', 'role', 'year', 'director', 'id'];
        const result = {};
        const confidence = {};
        for (const field of fields) {
            const fieldValues = credits.map((credit) => credit[field]).filter(Boolean);
            const { value, confidence: fieldConfidence } = this.findConsensusValue(fieldValues);
            if (value !== null) {
                result[field] = value;
                confidence[field] = fieldConfidence;
            }
            else if (field === 'id') {
                // 'id' is processed last, so `result` already holds every
                // other decided field and the generated ID depends on them.
                result[field] = this.generateConsistentId(JSON.stringify(result));
                confidence[field] = 1.0;
            }
        }
        result.attached_media = [];
        confidence['attached_media'] = 1.0;
        return { credit: result, confidence };
    }
    /**
     * Group similar credits based on title similarity.
     *
     * A credit joins the first existing group whose first credit has a
     * normalized title with Jaccard word similarity above 0.8; otherwise it
     * starts a new group. Entries that are null or have no title are skipped.
     *
     * @param {object[]} credits - Credits to group.
     * @returns {Object<string, object[]>} Groups keyed "0", "1", ... in
     *   creation order.
     */
    groupSimilarCredits(credits) {
        const groups = {};
        // Normalized title of each group's first credit; index i belongs to
        // groups[String(i)]. Computed once per group instead of per comparison.
        const groupTitles = [];
        for (const credit of credits) {
            if (!credit || !credit.title) {
                continue;
            }
            const normalizedTitle = this.normalizeTitle(credit.title);
            const matchIndex = groupTitles.findIndex((groupTitle) => this.calculateStringSimilarity(normalizedTitle, groupTitle) > 0.8);
            if (matchIndex !== -1) {
                groups[String(matchIndex)].push(credit);
            }
            else {
                groups[String(groupTitles.length)] = [credit];
                groupTitles.push(normalizedTitle);
            }
        }
        return groups;
    }
    /**
     * Normalize a title for comparison: lower-case, drop punctuation,
     * collapse whitespace and trim.
     *
     * Punctuation is removed before whitespace is collapsed so that a title
     * such as "Batman -" does not keep a dangling space. Letters and digits of
     * any script are kept, so non-Latin titles are not reduced to an empty
     * string.
     *
     * @param {*} title - Title to normalize; coerced to a string.
     * @returns {string} Normalized title.
     */
    normalizeTitle(title) {
        return String(title)
            .toLowerCase()
            .replace(/[^\p{L}\p{N}_\s]/gu, '')
            .replace(/\s+/g, ' ')
            .trim();
    }
    /**
     * Count values after normalizing them (string form, lower-cased, trimmed).
     *
     * A Map is used rather than a plain object so that values such as
     * "constructor" or "__proto__" cannot collide with inherited properties.
     *
     * @param {Array} values - Values to count.
     * @returns {Map<string, {count: number, first: *}>} Per normalized value:
     *   number of occurrences and the first original value seen.
     */
    countNormalizedValues(values) {
        const tallies = new Map();
        for (const value of values) {
            const key = String(value).toLowerCase().trim();
            const tally = tallies.get(key);
            if (tally === undefined) {
                tallies.set(key, { count: 1, first: value });
            }
            else {
                tally.count += 1;
            }
        }
        return tallies;
    }
    /**
     * Find the consensus value from a list of values.
     *
     * Values are compared case-insensitively and ignoring surrounding
     * whitespace. Ties are broken in favour of the value seen first.
     *
     * @param {Array} values - Candidate values.
     * @returns {{value: *, confidence: number}} The first original spelling of
     *   the most common value (or null when the list is empty or that value is
     *   falsy) and the share of values that agree with it.
     */
    findConsensusValue(values) {
        if (values.length === 0) {
            return { value: null, confidence: 0 };
        }
        let winner = null;
        for (const tally of this.countNormalizedValues(values).values()) {
            if (winner === null || tally.count > winner.count) {
                winner = tally;
            }
        }
        return {
            value: winner.first || null,
            confidence: winner.count / values.length,
        };
    }
    /**
     * Calculate similarity between two strings (Jaccard similarity of their
     * whitespace-separated word sets).
     *
     * @param {string} str1 - First string.
     * @param {string} str2 - Second string.
     * @returns {number} Value in [0, 1]; 0 when either string is empty or
     *   contains no words.
     */
    calculateStringSimilarity(str1, str2) {
        if (!str1 || !str2) {
            return 0;
        }
        // Empty tokens (from leading/trailing whitespace) are not words.
        const toWordSet = (text) => new Set(String(text).split(/\s+/).filter((word) => word !== ''));
        const words1 = toWordSet(str1);
        const words2 = toWordSet(str2);
        let sharedCount = 0;
        for (const word of words1) {
            if (words2.has(word)) {
                sharedCount += 1;
            }
        }
        const unionSize = words1.size + words2.size - sharedCount;
        return unionSize === 0 ? 0 : sharedCount / unionSize;
    }
    /**
     * Determine if years should be shown based on majority.
     *
     * Only boolean `resume_show_years` values are counted. Returns true when
     * there are none, and also on an exact tie.
     *
     * @param {object[]} allData - Parsed extractions.
     * @returns {boolean}
     */
    determineShowYears(allData) {
        const values = allData
            .filter((data) => typeof data.resume_show_years === 'boolean')
            .map((data) => data.resume_show_years);
        if (values.length === 0) {
            return true; // Default to true if no data
        }
        const trueCount = values.filter(Boolean).length;
        return trueCount >= values.length / 2;
    }
    /**
     * Calculate confidence score for a list of values.
     *
     * null and undefined entries are ignored. The score is the share of the
     * remaining values that agree with the most common one.
     *
     * @param {Array} values - Values reported by the extractions.
     * @returns {number} Value in [0, 1]; 0 when nothing usable is given.
     */
    calculateConfidence(values) {
        if (!values || values.length === 0) {
            return 0;
        }
        const filteredValues = values.filter((v) => v !== undefined && v !== null);
        if (filteredValues.length === 0) {
            return 0;
        }
        // For boolean values
        if (typeof filteredValues[0] === 'boolean') {
            const trueCount = filteredValues.filter(Boolean).length;
            const falseCount = filteredValues.length - trueCount;
            return Math.max(trueCount, falseCount) / filteredValues.length;
        }
        // For string or number values
        let highestCount = 0;
        for (const tally of this.countNormalizedValues(filteredValues).values()) {
            highestCount = Math.max(highestCount, tally.count);
        }
        return highestCount / filteredValues.length;
    }
    /**
     * Generate a consistent ID from input data.
     *
     * The same input always yields the same ID. The ID is built from an
     * 8-hex-digit, 32-bit string hash laid out as `h-h[0..4]-h[4..8]-h[0..4]-h`;
     * it is not a real UUID and collisions are possible.
     *
     * @param {*} input - Value to hash; coerced to a string.
     * @returns {string} UUID-like identifier.
     */
    generateConsistentId(input) {
        const text = String(input);
        let hash = 0;
        for (let i = 0; i < text.length; i++) {
            // hash * 31 + char code, truncated to a signed 32-bit integer
            hash = ((hash << 5) - hash + text.charCodeAt(i)) | 0;
        }
        const hashStr = Math.abs(hash).toString(16).padStart(8, '0');
        const head = hashStr.slice(0, 4);
        const tail = hashStr.slice(4, 8);
        // The last segment repeats the full 8-digit hash; kept so that IDs
        // generated for string inputs stay the same as before.
        return `${hashStr}-${head}-${tail}-${head}-${hashStr}`;
    }
}
exports.ConsensusBuilder = ConsensusBuilder;