UNPKG

@thecodingwhale/cv-processor

Version:

CV Processor to extract structured data from PDF resumes using TypeScript

407 lines (403 loc) 16.6 kB
"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); Object.defineProperty(exports, "__esModule", { value: true }); exports.extractTextFromPDF = extractTextFromPDF; exports.calculateAccuracy = calculateAccuracy; exports.meetsAccuracyThreshold = meetsAccuracyThreshold; exports.saveCVDataToJson = saveCVDataToJson; exports.processCvWithAI = processCvWithAI; const fs = __importStar(require("fs")); const path = __importStar(require("path")); /** * Extract text from a PDF file using AI capabilities */ async function extractTextFromPDF(pdfPath, aiProvider) { console.log(`Extracting text from PDF: ${pdfPath}`); try { // Check if the provider has a dedicated PDF processing method if (aiProvider.processPDF) { console.log("Using AI provider's native PDF processing"); const result = await aiProvider.processPDF(pdfPath, ` Extract all text content from this PDF document. Maintain the formatting and structure as much as possible. Include all sections, headers, bullet points, tables, and other content. Do not summarize or interpret the content, just extract the raw text. `); return { text: result.text, tokenUsage: result.tokenUsage, }; } // No native PDF processing, use a fallback method console.log('Using fallback PDF text extraction'); // Here, we'd use a PDF parsing library like pdf-parse // For simplicity, we'll just read the file and assume text extraction is handled elsewhere // Just a placeholder - in a real implementation, would use pdf-parse or similar const extractedText = `This is placeholder text from ${pdfPath}. In a real implementation, this would be the actual content of the PDF.`; // Process the extracted text with AI to improve formatting const result = await aiProvider.processText(extractedText, ` Improve the formatting of this extracted PDF text. Maintain the structure but fix any obvious extraction errors. Do not add or remove information. `); return { text: result.text, tokenUsage: result.tokenUsage, }; } catch (error) { console.error('Error extracting text from PDF:', error); throw error; } } /** * Calculate accuracy scores for the extracted CV data */ function calculateAccuracy(cvData, options = {}) { // Ensure weights have default values to avoid 'possibly undefined' errors const weights = { personalInfo: options.accuracyWeights?.personalInfo ?? 0.3, education: options.accuracyWeights?.education ?? 0.25, experience: options.accuracyWeights?.experience ?? 0.3, skills: options.accuracyWeights?.skills ?? 0.15, }; // Initialize missing fields array const missingFields = []; // Check personal info const personalInfoScore = calculateSectionScore(cvData.personalInfo, ['name', 'email', 'phone', 'location'], missingFields); // Check education (if any entries exist) const educationScore = cvData.education && cvData.education.length > 0 ? calculateSectionScore(cvData.education[0], ['institution', 'degree', 'fieldOfStudy', 'startDate', 'endDate'], missingFields) : 0; // Check experience (if any entries exist) const experienceScore = cvData.experience && cvData.experience.length > 0 ? calculateSectionScore(cvData.experience[0], ['company', 'position', 'startDate', 'endDate', 'description'], missingFields) : 0; // Check skills const skillsScore = calculateSkillsScore(cvData.skills, missingFields); // Calculate weighted average score const weightedScore = personalInfoScore * weights.personalInfo + educationScore * weights.education + experienceScore * weights.experience + skillsScore * weights.skills; // Calculate completeness (percentage of fields that are not null) const completeness = 100 - missingFields.length * 5; // For now, we'll use a simple confidence score based on the weighted score const confidence = Math.min(95, weightedScore * 0.9); return { score: Math.round(weightedScore), completeness: Math.max(0, Math.round(completeness)), confidence: Math.round(confidence), fieldScores: { personalInfo: Math.round(personalInfoScore), education: Math.round(educationScore), experience: Math.round(experienceScore), skills: Math.round(skillsScore), }, missingFields, }; } /** * Calculate score for a specific section */ function calculateSectionScore(section, requiredFields, missingFields) { if (!section) return 0; let validCount = 0; let totalFields = requiredFields.length; for (const field of requiredFields) { const value = section[field]; if (value === null || value === undefined || value === '') { missingFields.push(field); } else if (Array.isArray(value) && value.length === 0) { missingFields.push(field); } else { validCount++; } } return (validCount / totalFields) * 100; } /** * Calculate score for skills section */ function calculateSkillsScore(skills, missingFields) { if (!skills) return 0; let hasSkills = false; let totalSkills = 0; // Check for any non-empty skills arrays for (const [category, skillList] of Object.entries(skills)) { if (Array.isArray(skillList) && skillList.length > 0) { hasSkills = true; totalSkills += skillList.length; } } if (!hasSkills) { missingFields.push('skills'); return 0; } // More skills should result in higher score, up to a maximum return Math.min(100, 50 + totalSkills * 5); } /** * Check if CV data meets the minimum accuracy threshold */ function meetsAccuracyThreshold(cvData, minAccuracyThreshold = 70) { return (cvData.accuracy?.score || 0) >= minAccuracyThreshold; } /** * Save CV data to a JSON file */ function saveCVDataToJson(cvData, outputPath) { try { // Create directory if it doesn't exist const dir = path.dirname(outputPath); if (!fs.existsSync(dir)) { fs.mkdirSync(dir, { recursive: true }); } // Generate proper filename with provider, model and timestamp const filename = path.basename(outputPath, '.json'); const provider = cvData.metadata.provider || 'unknown'; const model = cvData.metadata.model || 'unknown'; const timestamp = new Date().toISOString().replace(/:/g, '-').split('.')[0]; const enhancedFilename = `${filename}_${provider}_${model}_${timestamp}.json`; const fullPath = path.join(dir, enhancedFilename); // Write data to file fs.writeFileSync(fullPath, JSON.stringify(cvData, null, 2)); console.log(`CV data saved to ${fullPath}`); } catch (error) { console.error('Error saving CV data to JSON:', error); throw error; } } /** * Process a CV using AI */ async function processCvWithAI(pdfPath, aiProvider, options = {}) { console.log(`Processing CV with AI: ${pdfPath}`); const verbose = options.verbose || false; const minAccuracyThreshold = options.minAccuracyThreshold || 70; // Track token usage let tokenUsage = { promptTokens: 0, completionTokens: 0, totalTokens: 0, estimatedCost: 0, }; try { // Extract text from PDF const { text, tokenUsage: extractionTokenUsage } = await extractTextFromPDF(pdfPath, aiProvider); // Add token usage from text extraction if (extractionTokenUsage) { tokenUsage.promptTokens += extractionTokenUsage.promptTokens || 0; tokenUsage.completionTokens += extractionTokenUsage.completionTokens || 0; tokenUsage.totalTokens += extractionTokenUsage.totalTokens || 0; tokenUsage.estimatedCost = (tokenUsage.estimatedCost || 0) + (extractionTokenUsage.estimatedCost || 0); } if (verbose) { console.log(`Extracted ${text.length} characters of text from PDF`); } // Define the data schema to match our CVData type const dataSchema = { type: 'object', properties: { personalInfo: { type: 'object', properties: { name: { type: 'string' }, email: { type: 'string' }, phone: { type: 'string' }, location: { type: 'string' }, linkedin: { type: 'string' }, github: { type: 'string' }, website: { type: 'string' }, summary: { type: 'string' }, }, }, education: { type: 'array', items: { type: 'object', properties: { institution: { type: 'string' }, degree: { type: 'string' }, fieldOfStudy: { type: 'string' }, startDate: { type: 'string' }, endDate: { type: 'string' }, gpa: { type: 'string' }, location: { type: 'string' }, }, }, }, experience: { type: 'array', items: { type: 'object', properties: { company: { type: 'string' }, position: { type: 'string' }, startDate: { type: 'string' }, endDate: { type: 'string' }, location: { type: 'string' }, description: { type: 'array', items: { type: 'string' }, }, }, }, }, skills: { type: 'object', properties: { programmingLanguages: { type: 'array', items: { type: 'string' }, }, frameworks: { type: 'array', items: { type: 'string' }, }, tools: { type: 'array', items: { type: 'string' }, }, softSkills: { type: 'array', items: { type: 'string' }, }, other: { type: 'array', items: { type: 'string' }, }, }, }, }, }; // Create extraction instructions const instructions = ` You are a CV parser specializing in extracting structured information from resumes. Analyze the provided CV/resume and extract key information including: 1. Personal information (name, contact details, etc.) 2. Education history 3. Work experience 4. Skills (technical and soft skills) Structure the data according to the provided JSON schema and ensure all fields are correctly populated. If information is not found, use null for string fields and empty arrays for arrays. For descriptions in work experience, extract bullet points as separate strings in an array. IMPORTANT: Return ONLY the JSON object with no additional text. `; // Use AI to extract structured data const extractionResult = await aiProvider.extractStructuredData(text, dataSchema, instructions); // Add token usage from structured extraction if (extractionResult.tokenUsage) { tokenUsage.promptTokens += extractionResult.tokenUsage.promptTokens || 0; tokenUsage.completionTokens += extractionResult.tokenUsage.completionTokens || 0; tokenUsage.totalTokens += extractionResult.tokenUsage.totalTokens || 0; tokenUsage.estimatedCost = (tokenUsage.estimatedCost || 0) + (extractionResult.tokenUsage.estimatedCost || 0); // Remove token usage from result as we'll add our aggregated version delete extractionResult.tokenUsage; } // Create default objects if any are missing if (!extractionResult.personalInfo) { extractionResult.personalInfo = { name: null, email: null, phone: null, location: null, linkedin: null, github: null, }; } if (!extractionResult.education) { extractionResult.education = []; } if (!extractionResult.experience) { extractionResult.experience = []; } if (!extractionResult.skills) { extractionResult.skills = {}; } // Get model info const modelInfo = aiProvider.getModelInfo(); // Add metadata extractionResult.metadata = { processedDate: new Date().toISOString(), sourceFile: path.basename(pdfPath), provider: modelInfo.provider, model: modelInfo.model, }; // Calculate accuracy extractionResult.accuracy = calculateAccuracy(extractionResult, options); // Add token usage extractionResult.tokenUsage = tokenUsage; // Check accuracy threshold if (!meetsAccuracyThreshold(extractionResult, minAccuracyThreshold)) { console.warn(`Warning: CV extraction accuracy (${extractionResult.accuracy.score}%) is below the threshold (${minAccuracyThreshold}%)`); } return extractionResult; } catch (error) { console.error('Error processing CV with AI:', error); // Return basic error information with proper error message handling const errorMessage = error instanceof Error ? error.message : String(error); return { personalInfo: { name: null, email: null, phone: null, location: null, linkedin: null, github: null, }, education: [], experience: [], skills: {}, tokenUsage, metadata: { processedDate: new Date().toISOString(), sourceFile: path.basename(pdfPath), error: `Error processing CV: ${errorMessage}`, }, }; } }