/*
 * @thecodingwhale/cv-processor
 * Version:
 * CV Processor to extract structured data from PDF resumes using TypeScript
 * 172 lines (171 loc) • 7.38 kB
 * JavaScript
 */
;
// Module-interop helper: exposes property `k` of `m` on `o` under the name `k2`
// (defaulting to `k`). Reuses an already-installed helper on `this` if present.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function (o, m, k, k2) {
    if (k2 === undefined) {
        k2 = k;
    }
    var descriptor = Object.getOwnPropertyDescriptor(m, k);
    // Decide whether the source descriptor can be reused verbatim or has to be
    // replaced by a forwarding getter that always reads the current m[k].
    var mustWrap;
    if (!descriptor) {
        mustWrap = true;
    }
    else if ("get" in descriptor) {
        mustWrap = !m.__esModule;
    }
    else {
        mustWrap = descriptor.writable || descriptor.configurable;
    }
    if (mustWrap) {
        descriptor = { enumerable: true, get: function () { return m[k]; } };
    }
    Object.defineProperty(o, k2, descriptor);
}) : (function (o, m, k, k2) {
    // Fallback when Object.create is unavailable: plain value copy.
    o[k2 === undefined ? k : k2] = m[k];
}));
// Module-interop helper: attaches `v` to `o` as its "default" property.
// Reuses an already-installed helper on `this` if present.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? (function (o, v) {
        var defaultDescriptor = { enumerable: true, value: v };
        Object.defineProperty(o, "default", defaultDescriptor);
    })
    : function (o, v) {
        // Fallback when Object.create is unavailable: plain assignment.
        o["default"] = v;
    });
// Module-interop helper: returns `mod` unchanged when it is flagged
// `__esModule`; otherwise builds a fresh object that re-exposes every own
// property of `mod` except "default" and sets `mod` itself as "default".
// Reuses an already-installed helper on `this` if present.
var __importStar = (this && this.__importStar) || (function () {
    // Picks the key-listing implementation on first use, then replaces itself
    // with it so later calls skip the selection.
    var listOwnKeys = function (target) {
        listOwnKeys = Object.getOwnPropertyNames || function (obj) {
            var names = [];
            for (var key in obj) {
                if (Object.prototype.hasOwnProperty.call(obj, key)) {
                    names.push(key);
                }
            }
            return names;
        };
        return listOwnKeys(target);
    };
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            var names = listOwnKeys(mod);
            for (var idx = 0; idx < names.length; idx++) {
                if (names[idx] === "default") {
                    continue;
                }
                __createBinding(namespace, mod, names[idx]);
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.CVProcessor = void 0;
const fs = __importStar(require("fs"));
const path = __importStar(require("path"));
const EducationExtractor_1 = require("./extractors/EducationExtractor");
const ExperienceExtractor_1 = require("./extractors/ExperienceExtractor");
const PersonalInfoExtractor_1 = require("./extractors/PersonalInfoExtractor");
const SectionExtractor_1 = require("./extractors/SectionExtractor");
const SkillsExtractor_1 = require("./extractors/SkillsExtractor");
const TextExtractor_1 = require("./extractors/TextExtractor");
const AccuracyCalculator_1 = require("./utils/AccuracyCalculator");
/**
 * Make a value safe to embed in a single file name by replacing path
 * separators and characters that common file systems reject with `-`.
 *
 * @param {*} value - Value to embed; coerced to a string.
 * @returns {string} The sanitized string.
 */
function toSafeFileNamePart(value) {
    return String(value).replace(/[\\/:*?"<>|]/g, '-');
}
/**
 * Main CV Processor class to extract structured data from PDF resumes.
 *
 * Wires together the text, section, personal-info, education, experience and
 * skills extractors plus an accuracy calculator, and offers helpers to persist
 * the result as JSON.
 */
class CVProcessor {
    /**
     * Initialize the CV processor.
     *
     * @param {object} [options] - Also forwarded unchanged to the AccuracyCalculator.
     * @param {boolean} [options.verbose=false] - Log progress and accuracy details.
     * @param {number} [options.minAccuracyThreshold=70] - Threshold quoted in
     *   warnings. An explicit `0` is honoured.
     */
    constructor(options = {}) {
        this.textExtractor = new TextExtractor_1.TextExtractor();
        this.sectionExtractor = new SectionExtractor_1.SectionExtractor();
        this.personalInfoExtractor = new PersonalInfoExtractor_1.PersonalInfoExtractor();
        this.educationExtractor = new EducationExtractor_1.EducationExtractor();
        this.experienceExtractor = new ExperienceExtractor_1.ExperienceExtractor();
        this.skillsExtractor = new SkillsExtractor_1.SkillsExtractor();
        this.accuracyCalculator = new AccuracyCalculator_1.AccuracyCalculator(options);
        this.verbose = options.verbose || false;
        // `??` rather than `||`: 0 is a valid threshold (the setter accepts it)
        // and must not be silently replaced by the default.
        this.minAccuracyThreshold = options.minAccuracyThreshold ?? 70;
        if (this.verbose) {
            console.log('CV Processor initialized');
        }
    }
    /**
     * Process a CV PDF and extract structured information.
     *
     * @param {string} pdfPath - Path of the PDF to read.
     * @returns {Promise<object>} Object with `personalInfo`, `education`,
     *   `experience`, `skills`, `metadata` and `accuracy`.
     */
    async processCv(pdfPath) {
        console.log(`Processing CV: ${pdfPath}`);
        // Extract text from PDF
        const text = await this.textExtractor.extractTextFromPDF(pdfPath);
        // Segment into sections
        const sections = this.sectionExtractor.segmentCVIntoSections(text);
        // Extract information from each section
        const personalInfo = this.personalInfoExtractor.extractPersonalInfo(sections.header || '');
        if (sections.summary) {
            personalInfo.summary = this.personalInfoExtractor.extractSummary(sections.summary);
        }
        const education = this.educationExtractor.extractEducation(sections.education || null);
        const experience = this.experienceExtractor.extractWorkExperience(sections.experience || null);
        const skills = this.skillsExtractor.extractSkills(sections.skills || null);
        // Build complete CV data
        const cvData = {
            personalInfo,
            education,
            experience,
            skills,
            metadata: {
                processedDate: new Date().toISOString(),
                sourceFile: path.basename(pdfPath),
                // Single source of truth for provider/model identification.
                ...this.getModelInfo(),
            },
        };
        // Calculate accuracy score
        const accuracy = this.accuracyCalculator.calculateAccuracy(cvData);
        cvData.accuracy = accuracy;
        if (this.verbose) {
            console.log(`CV Accuracy Score: ${accuracy.score}`);
            console.log(`Completeness: ${accuracy.completeness}`);
            console.log(`Confidence: ${accuracy.confidence}`);
            if (accuracy.missingFields.length > 0) {
                console.log('Missing Fields:', accuracy.missingFields);
            }
            if (!this.accuracyCalculator.meetsThreshold(accuracy)) {
                console.warn(`Warning: CV data does not meet minimum accuracy threshold of ${this.minAccuracyThreshold}%`);
            }
        }
        return cvData;
    }
    /**
     * Save CV data to a JSON file.
     *
     * The file is NOT written to `outputPath` itself: provider, model and a
     * timestamp are inserted before the extension, so the actual path is
     * returned to the caller.
     *
     * @param {object} cvData - Data to serialize.
     * @param {string} outputPath - Base path; its directory, base name and
     *   extension are reused for the generated file name.
     * @returns {string} The path that was actually written.
     * @throws Rethrows any error raised while building the path or writing.
     */
    saveToJson(cvData, outputPath) {
        try {
            // Generate a filename that includes provider, model, and timestamp
            const timestamp = new Date()
                .toISOString()
                .replace(/:/g, '-')
                .replace(/\./g, '-');
            // Provider/model come from caller-supplied data and may contain
            // path separators (e.g. "vendor/model"); keep them inside one
            // file name instead of letting them create nested paths.
            const providerName = toSafeFileNamePart(cvData.metadata?.provider || 'unknown');
            const modelName = toSafeFileNamePart(cvData.metadata?.model || 'unknown');
            // Extract base path and extension
            const outputDir = path.dirname(outputPath);
            const outputExt = path.extname(outputPath);
            const outputBaseName = path.basename(outputPath, outputExt);
            // Create filename with provider, model, and timestamp
            const newOutputPath = path.join(outputDir, `${outputBaseName}_${providerName}_${modelName}_${timestamp}${outputExt}`);
            fs.writeFileSync(newOutputPath, JSON.stringify(cvData, null, 2));
            console.log(`Results saved to ${newOutputPath}`);
            // Log accuracy information if available
            if (cvData.accuracy) {
                console.log(`CV Accuracy: ${cvData.accuracy.score}%`);
                if (!this.accuracyCalculator.meetsThreshold(cvData.accuracy)) {
                    console.warn(`Warning: This CV scored below the minimum accuracy threshold (${this.minAccuracyThreshold}%)`);
                }
            }
            return newOutputPath;
        }
        catch (error) {
            console.error(`Error saving JSON file: ${error}`);
            throw error;
        }
    }
    /**
     * Identify the extraction backend used by this processor.
     *
     * @returns {{provider: string, model: string}} Fixed provider/model pair.
     */
    getModelInfo() {
        return {
            provider: 'traditional',
            model: 'rule-based',
        };
    }
    /**
     * Check if the CV meets the minimum accuracy threshold.
     *
     * @param {object} cvData - CV data, expected to carry an `accuracy` field.
     * @returns {boolean} `false` when `accuracy` is missing; otherwise the
     *   accuracy calculator's verdict.
     */
    meetsAccuracyThreshold(cvData) {
        if (!cvData.accuracy) {
            return false;
        }
        return this.accuracyCalculator.meetsThreshold(cvData.accuracy);
    }
    /**
     * Set minimum accuracy threshold.
     *
     * NOTE(review): this only updates the value quoted in warning messages;
     * the AccuracyCalculator built in the constructor is not touched here.
     * Confirm whether it needs to be updated as well.
     *
     * @param {number} threshold - Number in the inclusive range 0..100.
     * @throws {Error} If `threshold` is not a number, is NaN, or is out of range.
     */
    setMinAccuracyThreshold(threshold) {
        // NaN fails neither `< 0` nor `> 100`, so it has to be rejected explicitly.
        const isValid = typeof threshold === 'number' &&
            !Number.isNaN(threshold) &&
            threshold >= 0 &&
            threshold <= 100;
        if (!isValid) {
            throw new Error('Accuracy threshold must be between 0 and 100');
        }
        this.minAccuracyThreshold = threshold;
    }
}
exports.CVProcessor = CVProcessor;