UNPKG

@thecodingwhale/cv-processor

Version:

CV Processor to extract structured data from PDF resumes using TypeScript

106 lines (105 loc) 4 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.EducationExtractor = void 0;
const nlp_1 = require("../utils/nlp");
const patterns_1 = require("../utils/patterns");

/**
 * Whole-word markers indicating that an education entry is still ongoing.
 * The word boundaries matter: an unanchored /now/ also matches inside
 * "knowledge" or "Snow", and /present/ inside "representative", which would
 * wrongly turn a single graduation date into an open-ended range.
 */
const ONGOING_MARKER = /\b(?:present(?:ly)?|current(?:ly)?|now)\b/i;

/** Characters that must be escaped before embedding text in a RegExp source. */
const REGEX_SPECIAL_CHARS = /[-\/\\^$*+?.()|[\]{}]/g;

/**
 * Class for extracting education information from CV text
 */
class EducationExtractor {
    /**
     * Extract education information from the education section text.
     *
     * The text is split into entries on blank lines; each non-blank entry
     * yields one record. Fields that cannot be detected stay `null`.
     *
     * @param {string} educationText - Raw text of the education section.
     * @returns {Array<Object>} One record per entry with the keys
     *   `institution`, `degree`, `fieldOfStudy`, `startDate`, `endDate`,
     *   `gpa` and `location`. Empty array when the input is falsy.
     */
    extractEducation(educationText) {
        if (!educationText) {
            return [];
        }
        // Split into education entries (usually separated by double newlines)
        const educationEntries = educationText
            .split(/\n\s*\n/)
            .filter((entry) => entry.trim());
        return educationEntries.map((entryText) => {
            const entry = {
                institution: null,
                degree: null,
                fieldOfStudy: null,
                startDate: null,
                endDate: null,
                gpa: null,
                location: null,
            };
            // Institution: first organization entity reported for the entry
            const organizations = nlp_1.NLPUtils.extractOrganizations(entryText);
            if (organizations.length > 0) {
                entry.institution = organizations[0];
            }
            this.extractDates(entryText, entry);
            this.extractDegree(entryText, entry);
            // GPA: only accept a capture group that actually participated in
            // the match, so the field never degrades from null to undefined.
            const gpaMatch = entryText.match(patterns_1.Patterns.gpa);
            if (gpaMatch && gpaMatch.length > 1 && gpaMatch[1] !== undefined) {
                entry.gpa = gpaMatch[1];
            }
            // Location: first location entity reported for the entry
            const locations = nlp_1.NLPUtils.extractLocations(entryText);
            if (locations.length > 0) {
                entry.location = locations[0];
            }
            return entry;
        });
    }

    /**
     * Extract dates from education entry text.
     *
     * With two or more date matches the first becomes `startDate` and the
     * second `endDate`. With exactly one match it becomes `startDate` (and
     * `endDate` is set to 'Present') when the text contains an ongoing
     * marker as a whole word; otherwise it is taken as `endDate`.
     *
     * @param {string} entryText - Text of a single education entry.
     * @param {Object} entry - Record to update in place (`startDate`, `endDate`).
     * @returns {void}
     */
    extractDates(entryText, entry) {
        const datePattern = patterns_1.Patterns.date;
        // Pre-check with the pattern as configured; nothing to do without a date.
        const dateMatches = entryText.match(datePattern);
        if (!dateMatches || dateMatches.length === 0) {
            return;
        }
        // Re-scan globally (case-insensitive) to collect every date occurrence.
        const allMatches = [
            ...entryText.matchAll(new RegExp(datePattern, 'gi')),
        ];
        if (allMatches.length >= 2) {
            // Assume first is start date, second is end date
            entry.startDate = allMatches[0][0];
            entry.endDate = allMatches[1][0];
            return;
        }
        if (allMatches.length === 1) {
            if (ONGOING_MARKER.test(entryText)) {
                entry.startDate = allMatches[0][0];
                entry.endDate = 'Present';
            }
            else {
                // Just one date, assume it's the end date
                entry.endDate = allMatches[0][0];
            }
        }
    }

    /**
     * Extract degree and field of study from education entry text.
     *
     * The first pattern in `Patterns.degreePatterns` that matches supplies
     * the degree. The field of study is then searched for directly after the
     * degree text using `Patterns.fieldOfStudy`.
     *
     * @param {string} entryText - Text of a single education entry.
     * @param {Object} entry - Record to update in place (`degree`, `fieldOfStudy`).
     * @returns {void}
     */
    extractDegree(entryText, entry) {
        // Check each degree pattern; the first hit wins
        for (const pattern of patterns_1.Patterns.degreePatterns) {
            const matches = entryText.match(pattern);
            if (matches && matches.length > 0) {
                entry.degree = matches[0].trim();
                break;
            }
        }
        if (!entry.degree) {
            return;
        }
        // Extract field of study (often after "in" following degree). The
        // degree is escaped so characters like "." in "B.Sc." match literally.
        const escapedDegree = entry.degree.replace(REGEX_SPECIAL_CHARS, '\\$&');
        const fieldMatch = entryText.match(new RegExp(`${escapedDegree}${patterns_1.Patterns.fieldOfStudy.source}`, 'i'));
        // A capture group that did not participate is undefined even though
        // the match array has length > 1; guard before calling trim().
        if (fieldMatch && fieldMatch.length > 1 && fieldMatch[1] !== undefined) {
            entry.fieldOfStudy = fieldMatch[1].trim();
        }
    }
}
exports.EducationExtractor = EducationExtractor;