/**
 * cv-parser-ai-tb
 * AI-powered CV/Resume parser with multi-provider support (Gemini, OpenAI, Claude)
 */
'use strict';
const DocumentExtractor = require('./parsers/documentExtractor');
const AIProcessor = require('./parsers/aiProcessor');
const DataValidator = require('./validators/dataValidator');
const FieldNormalizer = require('./validators/fieldNormalizer');
const CVSchema = require('./schemas/CVSchema');
const Helpers = require('./utils/helpers');
const {
CVParserError,
DocumentExtractionError,
AIProcessingError,
ValidationError
} = require('./utils/errors');
/**
* Main CV Parser Class
* Provides flexible CV parsing with customizable schemas and multiple AI providers
*/
class CVParser {
constructor(options = {}) {
// Validate required options
if (!options.apiKey) {
throw new CVParserError('AI API key is required');
}
// Validate and set provider
this.provider = (options.provider || 'gemini').toLowerCase();
this.validateProvider(this.provider);
console.log(`Initializing CV Parser with provider: ${this.provider}`);
// Initialize AI processor with provider support
this.aiProcessor = new AIProcessor(options.apiKey, {
provider: this.provider,
model: options.model || AIProcessor.getRecommendedModel(this.provider),
temperature: options.temperature || 0.1
});
// Set schema (default or custom)
this.schema = options.schema || new CVSchema();
this.validator = new DataValidator(this.schema);
// Parser options with enhanced defaults
this.options = {
includeMetadata: options.includeMetadata !== false,
includeKeywords: options.includeKeywords !== false,
validateData: options.validateData !== false,
normalizeData: options.normalizeData !== false,
strictValidation: options.strictValidation || false,
confidenceThreshold: options.confidenceThreshold || 0.5,
retryOnFailure: options.retryOnFailure !== false,
maxRetries: options.maxRetries || 2,
...options
};
console.log('CV Parser initialized successfully');
}
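// Illustrative construction sketch (not part of this module). The environment
// variable name is a placeholder; every option shown maps to a key handled in
// the constructor above.
//
//   const parser = new CVParser({
//     apiKey: process.env.AI_API_KEY,   // placeholder env var name
//     provider: 'gemini',               // defaults to 'gemini' when omitted
//     confidenceThreshold: 0.6,         // warn below 60% parse confidence
//     maxRetries: 3                     // used when retryOnFailure is enabled
//   });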
/**
* Validate if provider is supported
*/
validateProvider(provider) {
const availableProviders = AIProcessor.getAvailableProviders();
if (!availableProviders.includes(provider)) {
const supportedList = availableProviders.join(', ');
throw new CVParserError(`Provider '${provider}' is not supported or not installed. ` + `Available providers: ${supportedList}. ` + `Install missing providers with: npm install openai @anthropic-ai/sdk`);
}
}
/**
* Parse CV from file path
*/
async parse(filePath, options = {}) {
console.log(`Parsing CV from file: ${filePath}`);
try {
const mergedOptions = {
...this.options,
...options
};
// Extract text from document
console.log('Extracting text from document...');
const extractedData = await DocumentExtractor.extractText(filePath);
const preprocessedData = DocumentExtractor.preprocessText(extractedData);
console.log(`Extracted ${preprocessedData.wordCount} words from document`);
// Process with AI (with retry logic)
const aiResult = await this._processWithRetry(preprocessedData.text, mergedOptions);
if (!aiResult.success) {
throw new AIProcessingError(aiResult.error);
}
// Post-process the results
console.log('Post-processing results...');
const finalResult = await this._postProcessResults(aiResult.data, preprocessedData, mergedOptions);
console.log('✅ CV parsing completed successfully');
return finalResult;
} catch (error) {
console.error('❌ CV parsing failed:', error.message);
if (error instanceof CVParserError) {
throw error;
}
throw new CVParserError(`Failed to parse CV: ${error.message}`);
}
}
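// Usage sketch for parse(); the file path and variable names are placeholders.
// Per-call options are merged over the constructor options, so individual
// flags can be overridden for a single document.
//
//   const cv = await parser.parse('./resume.pdf', { includeKeywords: false });
//   console.log(cv.personal, cv.metadata && cv.metadata.parseConfidence);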
/**
* Parse CV from buffer (for file uploads)
*/
async parseBuffer(buffer, fileType, options = {}) {
console.log(`Parsing CV from buffer, type: ${fileType}`);
try {
const mergedOptions = {
...this.options,
...options
};
// Auto-detect file type if not provided
if (!fileType) {
fileType = Helpers.detectFileType(buffer);
console.log(`Auto-detected file type: ${fileType}`);
}
// Extract text from buffer
console.log('Extracting text from buffer...');
const extractedData = await DocumentExtractor.extractTextFromBuffer(buffer, fileType);
const preprocessedData = DocumentExtractor.preprocessText(extractedData);
console.log(`Extracted ${preprocessedData.wordCount} words from buffer`);
// Process with AI (with retry logic)
const aiResult = await this._processWithRetry(preprocessedData.text, mergedOptions);
if (!aiResult.success) {
throw new AIProcessingError(aiResult.error);
}
// Post-process the results
console.log('Post-processing results...');
const finalResult = await this._postProcessResults(aiResult.data, preprocessedData, mergedOptions);
console.log('✅ CV parsing from buffer completed successfully');
return finalResult;
} catch (error) {
console.error('❌ CV parsing from buffer failed:', error.message);
if (error instanceof CVParserError) {
throw error;
}
throw new CVParserError(`Failed to parse CV from buffer: ${error.message}`);
}
}
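// Buffer usage sketch, e.g. inside an Express/multer upload handler. The route,
// field name and handler shape are illustrative assumptions, not part of this
// package.
//
//   app.post('/upload', upload.single('cv'), async (req, res) => {
//     // fileType omitted: Helpers.detectFileType(buffer) guesses it
//     const result = await parser.parseBuffer(req.file.buffer);
//     res.json(result);
//   });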
/**
* Parse multiple CVs in batch
*/
async parseBatch(files, options = {}) {
console.log(`Starting batch processing of ${files.length} files`);
const results = [];
const mergedOptions = {
...this.options,
...options
};
let successCount = 0;
let failureCount = 0;
for (let i = 0; i < files.length; i++) {
const file = files[i];
const fileName = file.name || (typeof file === 'string' ? file : `file-${i + 1}`);
console.log(`Processing file ${i + 1}/${files.length}: ${fileName}`);
try {
let result;
if (typeof file === 'string') {
// File path
result = await this.parse(file, mergedOptions);
} else if (file.buffer && file.type) {
// Buffer with type
result = await this.parseBuffer(file.buffer, file.type, mergedOptions);
} else {
throw new CVParserError('Invalid file format in batch');
}
results.push({
success: true,
data: result,
file: fileName,
index: i
});
successCount++;
console.log(`✅ Successfully processed: ${fileName}`);
} catch (error) {
results.push({
success: false,
error: error.message,
file: fileName,
index: i
});
failureCount++;
console.error(`❌ Failed to process: ${fileName} - ${error.message}`);
}
}
console.log(`Batch processing completed. Success: ${successCount}, Failed: ${failureCount}`);
return {
results,
summary: {
total: files.length,
successful: successCount,
failed: failureCount,
successRate: files.length ? Math.round(successCount / files.length * 100) : 0
}
};
}
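// Batch usage sketch. Entries may be file paths or { buffer, type } objects
// (optionally with a name used for logging); the paths and the 'pdf' type
// string below are placeholder assumptions.
//
//   const { results, summary } = await parser.parseBatch([
//     './cv-alice.pdf',
//     { buffer: uploadedBuffer, type: 'pdf', name: 'bob.pdf' }
//   ]);
//   console.log(`Parsed ${summary.successful}/${summary.total} CVs`);
//   results.filter(r => !r.success).forEach(r => console.error(r.file, r.error));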
/**
* Process with AI with retry logic
*/
async _processWithRetry(text, options) {
let lastError;
const maxRetries = options.retryOnFailure ? options.maxRetries : 0;
for (let attempt = 0; attempt <= maxRetries; attempt++) {
try {
if (attempt > 0) {
console.log(`Retry attempt ${attempt}/${maxRetries}`);
// Add small delay between retries
await new Promise(resolve => setTimeout(resolve, 1000 * attempt));
}
const aiResult = await this.aiProcessor.processWithSchema(text, this.schema);
if (aiResult.success) {
if (attempt > 0) {
console.log(`✅ Succeeded on retry attempt ${attempt}`);
}
return aiResult;
} else {
lastError = new Error(aiResult.error);
}
} catch (error) {
lastError = error;
console.warn(`Attempt ${attempt + 1} failed:`, error.message);
}
}
throw lastError;
}
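// Retry behaviour is controlled by the retryOnFailure (on by default) and
// maxRetries (default 2) options; attempts are spaced by a linear back-off of
// 1000 ms * attempt number. Sketch of disabling retries for a latency-sensitive
// caller (apiKey is a placeholder):
//
//   const fastParser = new CVParser({ apiKey, retryOnFailure: false });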
/**
* Post-process AI results
*/
async _postProcessResults(data, extractedData, options) {
let processedData = {
...data
};
// Validate data if enabled
if (options.validateData) {
console.log('Validating extracted data...');
const validationResult = this.validator.validate(processedData);
if (!validationResult.isValid) {
console.warn('Validation warnings:', validationResult.warnings);
console.warn('Validation errors:', validationResult.errors);
if (options.strictValidation) {
throw new ValidationError(`Validation failed: ${validationResult.errors.join(', ')}`);
}
}
processedData = validationResult.data;
}
// Normalize data if enabled
if (options.normalizeData) {
console.log('Normalizing data...');
processedData = this._normalizeData(processedData);
}
// Add metadata if enabled
if (options.includeMetadata) {
const confidence = this._calculateOverallConfidence(processedData);
processedData.metadata = {
parseDate: new Date().toISOString(),
parseConfidence: confidence,
provider: this.provider,
model: this.aiProcessor.model,
wordCount: extractedData.wordCount,
lineCount: extractedData.lineCount,
processingTime: Date.now(),
// Can be enhanced to track actual time
...(processedData.metadata || {})
};
console.log(`Parse confidence: ${Math.round(confidence * 100)}%`);
}
// Add keywords if enabled
if (options.includeKeywords) {
console.log('Extracting keywords...');
const keywords = Helpers.extractKeywords(extractedData.text);
if (processedData.metadata) {
processedData.metadata.keywords = keywords;
} else {
processedData.keywords = keywords;
}
}
// Check confidence threshold
const confidence = (processedData.metadata && processedData.metadata.parseConfidence) || 0;
if (confidence < options.confidenceThreshold) {
console.warn(`⚠️ Parse confidence (${Math.round(confidence * 100)}%) below threshold (${Math.round(options.confidenceThreshold * 100)}%)`);
}
return processedData;
}
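// Shape of the metadata block added when includeMetadata is enabled (values
// are illustrative, not real output):
//
//   {
//     parseDate: '2024-01-01T00:00:00.000Z',
//     parseConfidence: 0.85,
//     provider: 'gemini',
//     model: '<model name>',
//     wordCount: 412,
//     lineCount: 57,
//     processingTime: 1704067200000,  // Date.now() timestamp, not a duration
//     keywords: [/* present only when includeKeywords is enabled */]
//   }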
/**
* Normalize parsed data
*/
_normalizeData(data) {
const normalized = {
...data
};
// Normalize personal information
if (normalized.personal) {
if (normalized.personal.email) {
normalized.personal.email = FieldNormalizer.normalizeEmail(normalized.personal.email);
}
if (normalized.personal.phone) {
normalized.personal.phone = FieldNormalizer.normalizePhone(normalized.personal.phone);
}
if (normalized.personal.fullName) {
normalized.personal.fullName = FieldNormalizer.normalizeName(normalized.personal.fullName);
}
if (normalized.personal.linkedIn) {
normalized.personal.linkedIn = FieldNormalizer.normalizeURL(normalized.personal.linkedIn);
}
if (normalized.personal.github) {
normalized.personal.github = FieldNormalizer.normalizeURL(normalized.personal.github);
}
if (normalized.personal.website) {
normalized.personal.website = FieldNormalizer.normalizeURL(normalized.personal.website);
}
}
// Normalize experience dates and calculate durations
if (normalized.experience && Array.isArray(normalized.experience)) {
normalized.experience = normalized.experience.map(exp => ({
...exp,
startDate: exp.startDate ? FieldNormalizer.normalizeDate(exp.startDate) : null,
endDate: exp.endDate ? FieldNormalizer.normalizeDate(exp.endDate) : null,
duration: FieldNormalizer.calculateDuration(exp.startDate, exp.endDate)
}));
}
// Normalize education dates
if (normalized.education && Array.isArray(normalized.education)) {
normalized.education = normalized.education.map(edu => ({
...edu,
startDate: edu.startDate ? FieldNormalizer.normalizeDate(edu.startDate) : null,
endDate: edu.endDate ? FieldNormalizer.normalizeDate(edu.endDate) : null
}));
}
// Normalize skills
if (normalized.skills) {
Object.keys(normalized.skills).forEach(key => {
if (Array.isArray(normalized.skills[key])) {
normalized.skills[key] = FieldNormalizer.normalizeSkills(normalized.skills[key]);
}
});
}
// Normalize certifications
if (normalized.certifications && Array.isArray(normalized.certifications)) {
normalized.certifications = normalized.certifications.map(cert => ({
...cert,
issueDate: cert.issueDate ? FieldNormalizer.normalizeDate(cert.issueDate) : null,
expiryDate: cert.expiryDate ? FieldNormalizer.normalizeDate(cert.expiryDate) : null,
url: cert.url ? FieldNormalizer.normalizeURL(cert.url) : null
}));
}
return normalized;
}
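// Hypothetical before/after illustration of the personal block; the exact
// output format is decided by FieldNormalizer, so treat these values as
// assumptions rather than guaranteed results:
//
//   // before: { email: ' Jane.Doe@Example.COM ', phone: '(555) 123-4567' }
//   // after:  { email: 'jane.doe@example.com', phone: '+15551234567' }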
/**
* Calculate overall confidence score
*/
_calculateOverallConfidence(data) {
let score = 0;
let maxScore = 0;
// Personal information (40% weight)
if (data.personal) {
if (data.personal.fullName) score += 15;
if (data.personal.email) score += 15;
if (data.personal.phone) score += 10;
maxScore += 40;
}
// Experience (30% weight)
if (data.experience && Array.isArray(data.experience) && data.experience.length > 0) {
const expScore = Math.min(data.experience.length * 10, 30);
score += expScore;
maxScore += 30;
}
// Education (15% weight)
if (data.education && Array.isArray(data.education) && data.education.length > 0) {
const eduScore = Math.min(data.education.length * 7.5, 15);
score += eduScore;
maxScore += 15;
}
// Skills (15% weight)
if (data.skills && (data.skills.technical || data.skills.soft)) {
score += 15;
maxScore += 15;
}
return maxScore > 0 ? Math.round(score / maxScore * 100) / 100 : 0;
}
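// Worked example of the scoring above: a CV with fullName + email (30/40),
// two experience entries (20/30), one education entry (7.5/15) and technical
// skills (15/15) scores 72.5/100, giving a confidence of 0.73 after rounding.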
/**
* Get parser information and capabilities
*/
getInfo() {
return {
provider: this.provider,
model: this.aiProcessor.model,
availableProviders: AIProcessor.getAvailableProviders(),
schema: this.schema.getRequiredFields(),
options: this.options
};
}
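// Inspection sketch (values illustrative):
//
//   const info = parser.getInfo();
//   // => { provider: 'gemini', model: '...', availableProviders: [...],
//   //      schema: [/* required field names */], options: {...} }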
/**
* Static method to get available providers
*/
static getAvailableProviders() {
return AIProcessor.getAvailableProviders();
}
/**
* Static method to create parser with custom schema
*/
static withSchema(schema, options = {}) {
return new CVParser({
...options,
schema: schema
});
}
/**
* Static method to create parser with minimal schema
*/
static minimal(options = {}) {
return new CVParser({
...options,
schema: CVSchema.getMinimalSchema()
});
}
/**
* Static method to create parser optimized for ATS
*/
static forATS(options = {}) {
return new CVParser({
...options,
schema: CVSchema.getATSSchema()
});
}
/**
* Static method to create parser with specific provider
*/
static withProvider(provider, options = {}) {
return new CVParser({
...options,
provider: provider
});
}
/**
* Static method for quick parsing with sensible defaults
*/
static async quickParse(filePath, apiKey, provider = 'gemini') {
const parser = new CVParser({
apiKey,
provider,
includeMetadata: true,
normalizeData: true
});
return await parser.parse(filePath);
}
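// The static factories above are thin wrappers around the constructor; a few
// illustrative one-liners (apiKey and the file path are placeholders):
//
//   const atsParser = CVParser.forATS({ apiKey });
//   const miniParser = CVParser.minimal({ apiKey });
//   const openaiParser = CVParser.withProvider('openai', { apiKey });
//   const data = await CVParser.quickParse('./resume.pdf', apiKey, 'gemini');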
}
// Export main class and utilities
module.exports = CVParser;
module.exports.CVSchema = CVSchema;
module.exports.FIELD_TYPES = require('./schemas/fieldTypes').FIELD_TYPES;
module.exports.DocumentExtractor = DocumentExtractor;
module.exports.AIProcessor = AIProcessor;
module.exports.DataValidator = DataValidator;
module.exports.FieldNormalizer = FieldNormalizer;
module.exports.Helpers = Helpers;
module.exports.errors = require('./utils/errors');
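// Consumer sketch, assuming the package is installed under the name shown in
// the header comment:
//
//   const CVParser = require('cv-parser-ai-tb');
//   const { CVSchema, FieldNormalizer, errors } = CVParser;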