// @thecodingwhale/cv-processor
// Version:
// CV Processor to extract structured data from PDF resumes using TypeScript
// 106 lines (105 loc) • 4 kB
// JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.EducationExtractor = void 0;
const nlp_1 = require("../utils/nlp");
const patterns_1 = require("../utils/patterns");
/**
 * Extracts structured education records from the education section of a CV.
 *
 * Institution and location detection is delegated to `nlp_1.NLPUtils`; dates,
 * degrees, field of study and GPA are matched with the regular expressions
 * exposed by `patterns_1.Patterns`.
 */
class EducationExtractor {
  /**
   * Extract education information from the education section text.
   *
   * The text is split into entries on blank lines and every entry is parsed
   * independently. Any field that cannot be found is left as `null`.
   *
   * @param {string} educationText - Raw text of the education section.
   * @returns {Array<Object>} One record per entry with the keys `institution`,
   *   `degree`, `fieldOfStudy`, `startDate`, `endDate`, `gpa` and `location`.
   *   Empty when `educationText` is falsy or contains only whitespace.
   */
  extractEducation(educationText) {
    if (!educationText) {
      return [];
    }
    // Split into education entries (usually separated by double newlines)
    const educationEntries = educationText
      .split(/\n\s*\n/)
      .filter((entry) => entry.trim());
    const entries = [];
    for (const entryText of educationEntries) {
      const entry = {
        institution: null,
        degree: null,
        fieldOfStudy: null,
        startDate: null,
        endDate: null,
        gpa: null,
        location: null,
      };
      // Institution: first organization entity reported for this entry
      const organizations = nlp_1.NLPUtils.extractOrganizations(entryText);
      if (organizations.length > 0) {
        entry.institution = organizations[0];
      }
      this.extractDates(entryText, entry);
      this.extractDegree(entryText, entry);
      // GPA: only accept a capture group that actually participated in the
      // match, so a missing value stays `null` instead of becoming `undefined`.
      const gpaMatch = entryText.match(patterns_1.Patterns.gpa);
      if (gpaMatch && gpaMatch[1] !== undefined) {
        entry.gpa = gpaMatch[1];
      }
      // Location: first location entity reported for this entry
      const locations = nlp_1.NLPUtils.extractLocations(entryText);
      if (locations.length > 0) {
        entry.location = locations[0];
      }
      entries.push(entry);
    }
    return entries;
  }

  /**
   * Extract dates from education entry text and store them on `entry`.
   *
   * - Two or more dates: the first is the start date, the second the end date.
   * - Exactly one date: it is the start date when the text marks the education
   *   as ongoing (end date becomes the literal `'Present'`), otherwise it is
   *   taken as the end date.
   * - No dates: `entry` is left untouched.
   *
   * @param {string} entryText - Text of a single education entry.
   * @param {Object} entry - Record to update in place (`startDate`, `endDate`).
   * @returns {void}
   */
  extractDates(entryText, entry) {
    // matchAll requires the global flag, so scan with a fresh 'gi' copy of the
    // shared pattern rather than the shared instance itself.
    const allMatches = [
      ...entryText.matchAll(new RegExp(patterns_1.Patterns.date, 'gi')),
    ];
    if (allMatches.length === 0) {
      return;
    }
    if (allMatches.length >= 2) {
      // Assume first is start date, second is end date
      entry.startDate = allMatches[0][0];
      entry.endDate = allMatches[1][0];
      return;
    }
    // Word boundaries keep words such as "Knowledge", "Snow" or "presentation"
    // from being mistaken for an "ongoing" marker.
    const isPresentOrCurrent =
      /\b(?:present(?:ly)?|current(?:ly)?|now)\b/i.test(entryText);
    if (isPresentOrCurrent) {
      entry.startDate = allMatches[0][0];
      entry.endDate = 'Present';
    } else {
      // Just one date, assume it's the end date
      entry.endDate = allMatches[0][0];
    }
  }

  /**
   * Extract degree and field of study from education entry text.
   *
   * The first pattern in `Patterns.degreePatterns` that matches decides the
   * degree. The field of study is then searched for directly after that degree
   * text using `Patterns.fieldOfStudy`, case-insensitively.
   *
   * @param {string} entryText - Text of a single education entry.
   * @param {Object} entry - Record to update in place (`degree`, `fieldOfStudy`).
   * @returns {void}
   */
  extractDegree(entryText, entry) {
    for (const pattern of patterns_1.Patterns.degreePatterns) {
      const matches = entryText.match(pattern);
      if (matches && matches.length > 0) {
        entry.degree = matches[0].trim();
        break;
      }
    }
    if (!entry.degree) {
      return;
    }
    // The degree is literal text here, so escape regex metacharacters before
    // splicing it in front of the field-of-study pattern.
    const escapedDegree = entry.degree.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&');
    const fieldMatch = entryText.match(
      new RegExp(`${escapedDegree}${patterns_1.Patterns.fieldOfStudy.source}`, 'i'),
    );
    // An optional capture group that did not participate is `undefined`;
    // guard it so `.trim()` cannot throw.
    if (fieldMatch && typeof fieldMatch[1] === 'string') {
      entry.fieldOfStudy = fieldMatch[1].trim();
    }
  }
}
exports.EducationExtractor = EducationExtractor;