// @thecodingwhale/cv-processor
// Version:
// CV Processor to extract structured data from PDF resumes using TypeScript
// 407 lines (403 loc) • 16.6 kB
// JavaScript
;
// Module-interop helper: exposes property `key` of `source` on `target`
// under `alias` (or under `key` itself when no alias is given). An existing
// helper on `this` is reused when present.
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (target, source, key, alias) {
        const exportedName = alias === undefined ? key : alias;
        let descriptor = Object.getOwnPropertyDescriptor(source, key);
        // Decide whether the source descriptor can be copied as-is or must be
        // replaced by a getter that always reads the current source value.
        let useLiveGetter;
        if (!descriptor) {
            useLiveGetter = true;
        }
        else if ("get" in descriptor) {
            useLiveGetter = !source.__esModule;
        }
        else {
            useLiveGetter = descriptor.writable || descriptor.configurable;
        }
        if (useLiveGetter) {
            descriptor = { enumerable: true, get: () => source[key] };
        }
        Object.defineProperty(target, exportedName, descriptor);
    }
    : function (target, source, key, alias) {
        // Without Object.create, fall back to a plain value copy.
        target[alias === undefined ? key : alias] = source[key];
    });
// Module-interop helper: attaches `defaultValue` to `target` as its
// "default" property. An existing helper on `this` is reused when present.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function (target, defaultValue) {
        Object.defineProperty(target, "default", { value: defaultValue, enumerable: true });
    }
    : function (target, defaultValue) {
        target["default"] = defaultValue;
    });
// Module-interop helper: returns `mod` untouched when it is flagged
// `__esModule`; otherwise builds a namespace-like object that re-exposes every
// own property of `mod` except "default" and stores `mod` itself as `default`.
var __importStar = (this && this.__importStar) || (function () {
    // Chosen on first call, then replaced by the selected implementation.
    let listOwnKeys = function (obj) {
        listOwnKeys = Object.getOwnPropertyNames || function (target) {
            const names = [];
            for (const name in target) {
                if (Object.prototype.hasOwnProperty.call(target, name)) {
                    names[names.length] = name;
                }
            }
            return names;
        };
        return listOwnKeys(obj);
    };
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        const namespace = {};
        if (mod != null) {
            for (const key of listOwnKeys(mod)) {
                if (key !== "default") {
                    __createBinding(namespace, mod, key);
                }
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
// Flag the export object as an ES-module-shaped export set; interop helpers
// such as __importStar return objects carrying this flag unchanged.
Object.defineProperty(exports, "__esModule", { value: true });
// Public API of this module. The functions are declared further down in the
// file; function declarations are hoisted, so they can be referenced here.
exports.extractTextFromPDF = extractTextFromPDF;
exports.calculateAccuracy = calculateAccuracy;
exports.meetsAccuracyThreshold = meetsAccuracyThreshold;
exports.saveCVDataToJson = saveCVDataToJson;
exports.processCvWithAI = processCvWithAI;
const fs = __importStar(require("fs"));
const path = __importStar(require("path"));
/**
 * Obtain the text of a PDF through the given AI provider.
 *
 * When the provider exposes `processPDF`, the file path and an extraction
 * prompt are handed to it. Otherwise a placeholder string mentioning
 * `pdfPath` is sent to `aiProvider.processText` together with a clean-up
 * prompt; no real PDF parsing happens on that path.
 *
 * @param pdfPath Path of the PDF; used in log output, the placeholder text and the `processPDF` call.
 * @param aiProvider Object offering `processText` and, optionally, `processPDF`.
 * @returns Promise resolving to `{ text, tokenUsage }` copied from the provider's response.
 * @throws Re-throws any error raised while talking to the provider, after logging it.
 */
async function extractTextFromPDF(pdfPath, aiProvider) {
    console.log(`Extracting text from PDF: ${pdfPath}`);
    // Both prompts start and end with a line break.
    const nativeExtractionPrompt = [
        '',
        'Extract all text content from this PDF document.',
        'Maintain the formatting and structure as much as possible.',
        'Include all sections, headers, bullet points, tables, and other content.',
        'Do not summarize or interpret the content, just extract the raw text.',
        '',
    ].join('\n');
    const cleanupPrompt = [
        '',
        'Improve the formatting of this extracted PDF text.',
        'Maintain the structure but fix any obvious extraction errors.',
        'Do not add or remove information.',
        '',
    ].join('\n');
    try {
        let response;
        if (aiProvider.processPDF) {
            // Provider can read the PDF itself.
            console.log("Using AI provider's native PDF processing");
            response = await aiProvider.processPDF(pdfPath, nativeExtractionPrompt);
        }
        else {
            // Fallback: a real implementation would run a PDF parsing library
            // here; for now only placeholder text is produced and then passed
            // to the provider for formatting.
            console.log('Using fallback PDF text extraction');
            const placeholderText = `This is placeholder text from ${pdfPath}. In a real implementation, this would be the actual content of the PDF.`;
            response = await aiProvider.processText(placeholderText, cleanupPrompt);
        }
        return {
            text: response.text,
            tokenUsage: response.tokenUsage,
        };
    }
    catch (error) {
        console.error('Error extracting text from PDF:', error);
        throw error;
    }
}
/**
 * Calculate accuracy scores for the extracted CV data.
 *
 * Section scores (0-100) are computed for personal info, the FIRST education
 * entry, the FIRST experience entry and the skills object, then combined into
 * a weighted average.
 *
 * @param cvData Extracted CV with `personalInfo`, `education`, `experience` and `skills`.
 * @param options Optional settings; `options.accuracyWeights` may override any of the
 *   per-section weights (defaults: personalInfo 0.3, education 0.25, experience 0.3, skills 0.15).
 * @returns `{ score, completeness, confidence, fieldScores, missingFields }`, where
 *   `completeness` is 100 minus 5 per missing field (floored at 0) and
 *   `confidence` is 90% of the weighted score, capped at 95.
 */
function calculateAccuracy(cvData, options = {}) {
    // Ensure weights have default values to avoid 'possibly undefined' errors
    const weights = {
        personalInfo: options.accuracyWeights?.personalInfo ?? 0.3,
        education: options.accuracyWeights?.education ?? 0.25,
        experience: options.accuracyWeights?.experience ?? 0.3,
        skills: options.accuracyWeights?.skills ?? 0.15,
    };
    // Collects the names of required fields that turned out to be empty
    const missingFields = [];
    // Check personal info
    const personalInfoScore = calculateSectionScore(cvData.personalInfo, ['name', 'email', 'phone', 'location'], missingFields);
    // Check education (only the first entry, if any exist)
    const educationScore = cvData.education && cvData.education.length > 0
        ? calculateSectionScore(cvData.education[0], ['institution', 'degree', 'fieldOfStudy', 'startDate', 'endDate'], missingFields)
        : 0;
    // Check experience (only the first entry, if any exist)
    const experienceScore = cvData.experience && cvData.experience.length > 0
        ? calculateSectionScore(cvData.experience[0], ['company', 'position', 'startDate', 'endDate', 'description'], missingFields)
        : 0;
    // Check skills
    const skillsScore = calculateSkillsScore(cvData.skills, missingFields);
    // Calculate weighted average score
    let weightedScore = personalInfoScore * weights.personalInfo +
        educationScore * weights.education +
        experienceScore * weights.experience +
        skillsScore * weights.skills;
    // Partial overrides are merged with the defaults, so the four weights may
    // not sum to 1 (e.g. `{ personalInfo: 1 }` yields a total of 1.7). Rescale
    // in that case so the result stays a true 0-100 weighted average. Weights
    // that already sum to 1 (including the defaults) are left untouched.
    const totalWeight = weights.personalInfo + weights.education + weights.experience + weights.skills;
    if (totalWeight > 0 && Math.abs(totalWeight - 1) > 1e-9) {
        weightedScore /= totalWeight;
    }
    // Completeness heuristic: every missing field costs 5 points
    const completeness = 100 - missingFields.length * 5;
    // For now, we'll use a simple confidence score based on the weighted score
    const confidence = Math.min(95, weightedScore * 0.9);
    return {
        score: Math.round(weightedScore),
        completeness: Math.max(0, Math.round(completeness)),
        confidence: Math.round(confidence),
        fieldScores: {
            personalInfo: Math.round(personalInfoScore),
            education: Math.round(educationScore),
            experience: Math.round(experienceScore),
            skills: Math.round(skillsScore),
        },
        missingFields,
    };
}
/**
 * Score one CV section by the share of required fields that hold a value.
 *
 * A field counts as missing when it is `null`, `undefined`, an empty string
 * or an empty array; each missing field name is appended to `missingFields`.
 *
 * @param section Object holding the section's fields (may be absent).
 * @param requiredFields Names of the fields that must be present.
 * @param missingFields Array that receives the names of missing fields (mutated).
 * @returns Percentage (0-100) of required fields present; 0 when `section` is falsy.
 */
function calculateSectionScore(section, requiredFields, missingFields) {
    if (!section) {
        return 0;
    }
    const isBlank = (value) => value === null ||
        value === undefined ||
        value === '' ||
        (Array.isArray(value) && value.length === 0);
    const absentFields = requiredFields.filter((field) => isBlank(section[field]));
    missingFields.push(...absentFields);
    const presentCount = requiredFields.length - absentFields.length;
    return (presentCount / requiredFields.length) * 100;
}
/**
 * Score the skills section from the number of skills listed.
 *
 * Every array-valued property of `skills` contributes its length. With no
 * skills at all, 'skills' is appended to `missingFields` and the score is 0;
 * otherwise the score is 50 plus 5 per skill, capped at 100.
 *
 * @param skills Object mapping a category name to a list of skills (may be absent).
 * @param missingFields Array that receives 'skills' when nothing is listed (mutated).
 * @returns Score between 0 and 100; 0 when `skills` is falsy.
 */
function calculateSkillsScore(skills, missingFields) {
    if (!skills) {
        return 0;
    }
    const skillCount = Object.values(skills)
        .filter((entry) => Array.isArray(entry))
        .reduce((sum, list) => sum + list.length, 0);
    if (skillCount === 0) {
        missingFields.push('skills');
        return 0;
    }
    // More skills should result in higher score, up to a maximum
    return Math.min(100, 50 + skillCount * 5);
}
/**
 * Tell whether a processed CV reaches the required accuracy score.
 *
 * @param cvData Processed CV; its `accuracy.score` is read, and a missing or falsy score counts as 0.
 * @param minAccuracyThreshold Minimum acceptable score (defaults to 70).
 * @returns `true` when the score is at least the threshold.
 */
function meetsAccuracyThreshold(cvData, minAccuracyThreshold = 70) {
    const achievedScore = cvData.accuracy?.score || 0;
    return achievedScore >= minAccuracyThreshold;
}
/**
 * Save CV data to a JSON file.
 *
 * The file is not written to `outputPath` itself: the base name of
 * `outputPath` (without a `.json` extension) is extended with the provider,
 * the model and a timestamp, i.e. `<name>_<provider>_<model>_<timestamp>.json`,
 * and placed in the directory of `outputPath`, which is created if needed.
 *
 * @param cvData Data to serialise; `cvData.metadata.provider` / `.model` feed the
 *   filename, with 'unknown' used when either (or `metadata` itself) is missing.
 * @param outputPath Desired output location; supplies the directory and the base name.
 * @returns The full path of the file that was written.
 * @throws Re-throws any file-system or serialisation error after logging it.
 */
function saveCVDataToJson(cvData, outputPath) {
    // Provider and model ids are free-form (e.g. "meta-llama/llama-3:8b").
    // Path separators and other characters that are invalid in filenames are
    // replaced so they cannot redirect the write into a missing directory.
    const toFilenamePart = (value) => String(value || 'unknown').replace(/[\\/:*?"<>|]/g, '-');
    try {
        // Create directory if it doesn't exist; with `recursive: true` this is
        // a no-op for an existing directory.
        const dir = path.dirname(outputPath);
        fs.mkdirSync(dir, { recursive: true });
        // Generate proper filename with provider, model and timestamp
        const filename = path.basename(outputPath, '.json');
        const provider = toFilenamePart(cvData.metadata?.provider);
        const model = toFilenamePart(cvData.metadata?.model);
        // ISO timestamp without milliseconds, with ':' swapped for '-'
        const timestamp = new Date().toISOString().replace(/:/g, '-').split('.')[0];
        const enhancedFilename = `${filename}_${provider}_${model}_${timestamp}.json`;
        const fullPath = path.join(dir, enhancedFilename);
        // Write data to file
        fs.writeFileSync(fullPath, JSON.stringify(cvData, null, 2));
        console.log(`CV data saved to ${fullPath}`);
        return fullPath;
    }
    catch (error) {
        console.error('Error saving CV data to JSON:', error);
        throw error;
    }
}
/**
 * Add one token-usage report onto a running total, mutating `total`.
 * Absent counters count as 0; a missing report is ignored.
 */
function accumulateTokenUsage(total, usage) {
    if (!usage) {
        return;
    }
    total.promptTokens += usage.promptTokens || 0;
    total.completionTokens += usage.completionTokens || 0;
    total.totalTokens += usage.totalTokens || 0;
    total.estimatedCost = (total.estimatedCost || 0) + (usage.estimatedCost || 0);
}
/**
 * Fresh personal-info record with every default contact field set to null.
 */
function createEmptyPersonalInfo() {
    return {
        name: null,
        email: null,
        phone: null,
        location: null,
        linkedin: null,
        github: null,
    };
}
/**
 * Build a new JSON schema describing the CV structure requested from the provider.
 */
function buildCvDataSchema() {
    const stringArray = () => ({ type: 'array', items: { type: 'string' } });
    const stringFields = (...names) => Object.fromEntries(names.map((name) => [name, { type: 'string' }]));
    return {
        type: 'object',
        properties: {
            personalInfo: {
                type: 'object',
                properties: stringFields('name', 'email', 'phone', 'location', 'linkedin', 'github', 'website', 'summary'),
            },
            education: {
                type: 'array',
                items: {
                    type: 'object',
                    properties: stringFields('institution', 'degree', 'fieldOfStudy', 'startDate', 'endDate', 'gpa', 'location'),
                },
            },
            experience: {
                type: 'array',
                items: {
                    type: 'object',
                    properties: {
                        ...stringFields('company', 'position', 'startDate', 'endDate', 'location'),
                        description: stringArray(),
                    },
                },
            },
            skills: {
                type: 'object',
                properties: {
                    programmingLanguages: stringArray(),
                    frameworks: stringArray(),
                    tools: stringArray(),
                    softSkills: stringArray(),
                    other: stringArray(),
                },
            },
        },
    };
}
/**
 * Process a CV using AI.
 *
 * Extracts the PDF text via `extractTextFromPDF`, asks the provider for
 * structured data matching the CV schema, fills in empty defaults for missing
 * sections, then attaches metadata, accuracy scores and the aggregated token
 * usage of both provider calls. A warning is logged when the accuracy score
 * is below the threshold.
 *
 * Failures do not reject: the error is logged and an empty CV record is
 * returned whose `metadata.error` carries the message (it has no `accuracy`).
 *
 * @param pdfPath Path of the PDF to process.
 * @param aiProvider Provider offering `extractStructuredData` and `getModelInfo`
 *   (plus whatever `extractTextFromPDF` needs).
 * @param options Optional settings: `verbose` (extra logging),
 *   `minAccuracyThreshold` (default 70; an explicit 0 is honoured) and
 *   `accuracyWeights` (forwarded to `calculateAccuracy`).
 * @returns Promise of the structured CV data, or of the error record described above.
 */
async function processCvWithAI(pdfPath, aiProvider, options = {}) {
    console.log(`Processing CV with AI: ${pdfPath}`);
    const verbose = options.verbose || false;
    // `??` rather than `||` so that a deliberate threshold of 0 is not replaced by 70
    const minAccuracyThreshold = options.minAccuracyThreshold ?? 70;
    // Track token usage across both provider calls
    const tokenUsage = {
        promptTokens: 0,
        completionTokens: 0,
        totalTokens: 0,
        estimatedCost: 0,
    };
    // Extraction instructions; the text starts and ends with a line break
    const instructions = [
        '',
        'You are a CV parser specializing in extracting structured information from resumes.',
        'Analyze the provided CV/resume and extract key information including:',
        '1. Personal information (name, contact details, etc.)',
        '2. Education history',
        '3. Work experience',
        '4. Skills (technical and soft skills)',
        'Structure the data according to the provided JSON schema and ensure all fields are correctly populated.',
        'If information is not found, use null for string fields and empty arrays for arrays.',
        'For descriptions in work experience, extract bullet points as separate strings in an array.',
        'IMPORTANT: Return ONLY the JSON object with no additional text.',
        '',
    ].join('\n');
    try {
        // Extract text from PDF
        const { text, tokenUsage: extractionTokenUsage } = await extractTextFromPDF(pdfPath, aiProvider);
        accumulateTokenUsage(tokenUsage, extractionTokenUsage);
        if (verbose) {
            console.log(`Extracted ${text.length} characters of text from PDF`);
        }
        // Use AI to extract structured data
        const extractionResult = await aiProvider.extractStructuredData(text, buildCvDataSchema(), instructions);
        if (extractionResult.tokenUsage) {
            accumulateTokenUsage(tokenUsage, extractionResult.tokenUsage);
            // Remove token usage from result as we'll add our aggregated version
            delete extractionResult.tokenUsage;
        }
        // Create default objects if any are missing
        if (!extractionResult.personalInfo) {
            extractionResult.personalInfo = createEmptyPersonalInfo();
        }
        if (!extractionResult.education) {
            extractionResult.education = [];
        }
        if (!extractionResult.experience) {
            extractionResult.experience = [];
        }
        if (!extractionResult.skills) {
            extractionResult.skills = {};
        }
        // Add metadata, including which provider/model produced the data
        const modelInfo = aiProvider.getModelInfo();
        extractionResult.metadata = {
            processedDate: new Date().toISOString(),
            sourceFile: path.basename(pdfPath),
            provider: modelInfo.provider,
            model: modelInfo.model,
        };
        // Calculate accuracy
        extractionResult.accuracy = calculateAccuracy(extractionResult, options);
        // Add aggregated token usage
        extractionResult.tokenUsage = tokenUsage;
        // Check accuracy threshold
        if (!meetsAccuracyThreshold(extractionResult, minAccuracyThreshold)) {
            console.warn(`Warning: CV extraction accuracy (${extractionResult.accuracy.score}%) is below the threshold (${minAccuracyThreshold}%)`);
        }
        return extractionResult;
    }
    catch (error) {
        console.error('Error processing CV with AI:', error);
        // Return basic error information with proper error message handling
        const errorMessage = error instanceof Error ? error.message : String(error);
        return {
            personalInfo: createEmptyPersonalInfo(),
            education: [],
            experience: [],
            skills: {},
            // Whatever usage was accumulated before the failure
            tokenUsage,
            metadata: {
                processedDate: new Date().toISOString(),
                sourceFile: path.basename(pdfPath),
                error: `Error processing CV: ${errorMessage}`,
            },
        };
    }
}