UNPKG

@thecodingwhale/cv-processor

Version:

CV Processor to extract structured data from PDF resumes using TypeScript

378 lines (377 loc) 15.1 kB
"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); Object.defineProperty(exports, "__esModule", { value: true }); exports.ConsensusBuilder = void 0; const fs = __importStar(require("fs")); /** * The ConsensusBuilder class analyzes multiple CV data extractions * and builds a consensus version that represents the most likely correct data. */ class ConsensusBuilder { /** * Build consensus from multiple CV data files */ async buildConsensus(dataFiles) { console.log(`Building consensus from ${dataFiles.length} data files...`); // Load all data files const allData = dataFiles .map((file) => { try { const data = JSON.parse(fs.readFileSync(file, 'utf8')); return data; } catch (error) { console.error(`Error loading data file ${file}: ${error}`); return null; } }) .filter((data) => data !== null); if (allData.length === 0) { throw new Error('No valid data files to build consensus from'); } // Determine if we're dealing with resume array or credits array const hasResumeStructure = allData.some((data) => data.resume && Array.isArray(data.resume)); const hasCreditsStructure = allData.some((data) => data.credits && Array.isArray(data.credits)); console.log(`Data format: ${hasResumeStructure ? 'resume structure' : ''} ${hasCreditsStructure ? 'credits structure' : ''}`); let consensus; let confidenceData = {}; // Build consensus based on the structure if (hasResumeStructure) { const { consensusData, confidence } = this.buildResumeConsensus(allData); consensus = consensusData; confidenceData = confidence; } else if (hasCreditsStructure) { const { consensusData, confidence } = this.buildCreditsConsensus(allData); consensus = consensusData; confidenceData = confidence; } else { throw new Error('Unknown data structure, cannot build consensus'); } // Calculate overall consensus strength const confidenceValues = Object.values(confidenceData); const consensusStrength = confidenceValues.length > 0 ? confidenceValues.reduce((sum, val) => sum + val, 0) / confidenceValues.length : 0; return { consensus, confidence: { overall: Math.round(consensusStrength * 100) / 100, fields: confidenceData, }, metadata: { providerCount: allData.length, consensusStrength: Math.round(consensusStrength * 100) / 100, generatedAt: new Date().toISOString(), }, }; } /** * Build consensus for resume structure (hierarchical) */ buildResumeConsensus(allData) { const result = { resume: [], resume_show_years: this.determineShowYears(allData), }; const confidence = { resume_show_years: this.calculateConfidence(allData.map((data) => data.resume_show_years)), }; // Get all categories from all data const allCategories = this.extractAllCategories(allData); // For each unique category, build consensus for (const categoryName of allCategories) { const categoryConsensus = this.buildCategoryConsensus(categoryName, allData); if (categoryConsensus.category.credits.length > 0) { result.resume.push(categoryConsensus.category); // Add confidence scores for this category Object.entries(categoryConsensus.confidence).forEach(([key, value]) => { confidence[`${categoryName}.${key}`] = value; }); } } return { consensusData: result, confidence }; } /** * Build consensus for credits structure (flat) */ buildCreditsConsensus(allData) { const result = { credits: [], }; const confidence = {}; // Collect all credits from all data const allCredits = []; allData.forEach((data) => { if (data.credits && Array.isArray(data.credits)) { allCredits.push(...data.credits); } }); // Group similar credits const groupedCredits = this.groupSimilarCredits(allCredits); // For each group, build consensus credit for (const [groupKey, creditsGroup] of Object.entries(groupedCredits)) { const consensusCredit = this.buildCreditConsensus(creditsGroup); result.credits.push(consensusCredit.credit); // Add confidence scores for this credit Object.entries(consensusCredit.confidence).forEach(([key, value]) => { confidence[`credits[${groupKey}].${key}`] = value; }); } return { consensusData: result, confidence }; } /** * Extract all unique categories from all data */ extractAllCategories(allData) { const categories = new Set(); allData.forEach((data) => { if (data.resume && Array.isArray(data.resume)) { data.resume.forEach((category) => { if (category.category) { categories.add(category.category); } }); } }); return Array.from(categories); } /** * Build consensus for a specific category */ buildCategoryConsensus(categoryName, allData) { // Extract all credits for this category const allCategoryCredits = []; allData.forEach((data) => { if (data.resume && Array.isArray(data.resume)) { const categoryData = data.resume.find((cat) => cat.category === categoryName); if (categoryData && categoryData.credits && Array.isArray(categoryData.credits)) { allCategoryCredits.push(...categoryData.credits); } } }); // Group similar credits const groupedCredits = this.groupSimilarCredits(allCategoryCredits); // Build consensus credits const consensusCredits = []; const confidence = { category: 1.0, // Category name has perfect confidence as it's our grouping key }; for (const [groupKey, creditsGroup] of Object.entries(groupedCredits)) { const consensusCredit = this.buildCreditConsensus(creditsGroup); consensusCredits.push(consensusCredit.credit); // Add confidence scores for this credit Object.entries(consensusCredit.confidence).forEach(([key, value]) => { confidence[`credits[${groupKey}].${key}`] = value; }); } return { category: { category: categoryName, category_id: this.generateConsistentId(categoryName), credits: consensusCredits, }, confidence, }; } /** * Build consensus for a specific credit from similar credits */ buildCreditConsensus(credits) { const fields = ['title', 'role', 'year', 'director', 'id']; const result = {}; const confidence = {}; // For each field, find the most common value for (const field of fields) { const fieldValues = credits.map((credit) => credit[field]).filter(Boolean); const { value, confidence: fieldConfidence } = this.findConsensusValue(fieldValues); if (value !== null) { result[field] = value; confidence[field] = fieldConfidence; } else if (field === 'id') { // Generate a consistent ID if none exists result[field] = this.generateConsistentId(JSON.stringify(result)); confidence[field] = 1.0; } } // Add attached_media if present result.attached_media = []; confidence['attached_media'] = 1.0; return { credit: result, confidence }; } /** * Group similar credits based on title and role similarity */ groupSimilarCredits(credits) { const groups = {}; credits.forEach((credit) => { if (!credit.title) return; // Create a key based on normalized title const normalizedTitle = credit.title .toLowerCase() .trim() .replace(/\s+/g, ' ') // normalize whitespace .replace(/[^\w\s]/g, ''); // remove special characters // Try to find an existing group that's similar let foundGroup = false; for (const [groupKey, groupCredits] of Object.entries(groups)) { const firstCredit = groupCredits[0]; const similarity = this.calculateStringSimilarity(normalizedTitle, firstCredit.title .toLowerCase() .trim() .replace(/\s+/g, ' ') .replace(/[^\w\s]/g, '')); if (similarity > 0.8) { // 80% similarity threshold groups[groupKey].push(credit); foundGroup = true; break; } } // Create a new group if no match found if (!foundGroup) { const groupKey = Object.keys(groups).length.toString(); groups[groupKey] = [credit]; } }); return groups; } /** * Find the consensus value from a list of values */ findConsensusValue(values) { if (values.length === 0) { return { value: null, confidence: 0 }; } // Count occurrences of each value const valueCounts = {}; values.forEach((value) => { const valueStr = String(value).toLowerCase().trim(); valueCounts[valueStr] = (valueCounts[valueStr] || 0) + 1; }); // Find the most common value let mostCommonValue = null; let highestCount = 0; for (const [value, count] of Object.entries(valueCounts)) { if (count > highestCount) { mostCommonValue = value; highestCount = count; } } // Find the original case version of the value const originalValue = values.find((v) => String(v).toLowerCase().trim() === mostCommonValue); // Calculate confidence as the percentage of agreement const confidence = highestCount / values.length; return { value: originalValue || null, confidence, }; } /** * Calculate similarity between two strings (Jaccard similarity) */ calculateStringSimilarity(str1, str2) { if (!str1 || !str2) return 0; // Create sets of words const words1 = new Set(str1.split(/\s+/)); const words2 = new Set(str2.split(/\s+/)); // Calculate intersection const intersection = new Set([...words1].filter((word) => words2.has(word))); // Calculate union const union = new Set([...words1, ...words2]); // Return Jaccard similarity return intersection.size / union.size; } /** * Determine if years should be shown based on majority */ determineShowYears(allData) { const values = allData .filter((data) => typeof data.resume_show_years === 'boolean') .map((data) => data.resume_show_years); if (values.length === 0) { return true; // Default to true if no data } const trueCount = values.filter(Boolean).length; return trueCount >= values.length / 2; } /** * Calculate confidence score for a list of values */ calculateConfidence(values) { if (!values || values.length === 0) return 0; const filteredValues = values.filter((v) => v !== undefined && v !== null); if (filteredValues.length === 0) return 0; // For boolean values if (typeof filteredValues[0] === 'boolean') { const trueCount = filteredValues.filter(Boolean).length; const falseCount = filteredValues.length - trueCount; return Math.max(trueCount, falseCount) / filteredValues.length; } // For string or number values const valueCounts = {}; filteredValues.forEach((value) => { const valueStr = String(value).toLowerCase().trim(); valueCounts[valueStr] = (valueCounts[valueStr] || 0) + 1; }); const highestCount = Math.max(...Object.values(valueCounts)); return highestCount / filteredValues.length; } /** * Generate a consistent ID from input data */ generateConsistentId(input) { // Simple hash function for demonstration let hash = 0; for (let i = 0; i < input.length; i++) { const char = input.charCodeAt(i); hash = (hash << 5) - hash + char; hash = hash & hash; // Convert to 32bit integer } // Format as a UUID-like string const hashStr = Math.abs(hash).toString(16).padStart(8, '0'); return `${hashStr}-${hashStr.substr(0, 4)}-${hashStr.substr(4, 4)}-${hashStr.substr(0, 4)}-${hashStr.substr(0, 12)}`; } } exports.ConsensusBuilder = ConsensusBuilder;