@thecodingwhale/cv-processor
Version:
CV Processor to extract structured data from PDF resumes using TypeScript
123 lines (122 loc) • 4.61 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.ExperienceExtractor = void 0;
const nlp_1 = require("../utils/nlp");
const patterns_1 = require("../utils/patterns");
/**
 * Extracts structured work-experience entries from the "experience" section
 * of a CV.
 *
 * The extraction is heuristic: the section is split into entries on blank
 * lines, and each entry is mined for a company, job title, date range,
 * location and description lines. Fields that cannot be determined stay
 * `null` (or an empty array for `description`).
 */
class ExperienceExtractor {
    /**
     * Extract work experience information from the experience section text.
     *
     * @param {string} experienceText Raw text of the experience section.
     * @returns {Array<{company: ?string, position: ?string, startDate: ?string,
     *   endDate: ?string, location: ?string, description: string[]}>}
     *   One entry per blank-line-separated block; an empty array when the
     *   input is missing, blank, or not a string.
     */
    extractWorkExperience(experienceText) {
        // Anything that is not a non-blank string has nothing to parse; bail out
        // instead of failing on `.split`.
        if (typeof experienceText !== 'string' || !experienceText.trim()) {
            return [];
        }
        // Entries are separated by one or more blank lines.
        const blocks = experienceText
            .split(/\n\s*\n/)
            .filter((block) => block.trim());
        return blocks.map((expText) => {
            const entry = {
                company: null,
                position: null,
                startDate: null,
                endDate: null,
                location: null,
                description: [],
            };
            // Company: first organization entity reported for this block.
            const organizations = nlp_1.NLPUtils.extractOrganizations(expText);
            if (organizations.length > 0) {
                entry.company = organizations[0];
            }
            this.extractJobTitle(expText, entry);
            this.extractDates(expText, entry);
            // Location: first location entity reported for this block.
            const locations = nlp_1.NLPUtils.extractLocations(expText);
            if (locations.length > 0) {
                entry.location = locations[0];
            }
            // Must run after company/position are known: they identify the header.
            this.extractDescription(expText, entry);
            return entry;
        });
    }
    /**
     * Extract the job title from an experience block.
     *
     * Tries each pattern in `Patterns.titlePatterns` in order and stores the
     * first non-empty value found at index 1 of the match result in
     * `entry.position`. `entry` is left untouched when nothing matches.
     *
     * @param {string} expText Text of a single experience block.
     * @param {{position: ?string}} entry Entry to update in place.
     */
    extractJobTitle(expText, entry) {
        for (const pattern of patterns_1.Patterns.titlePatterns) {
            const match = expText.match(pattern);
            // A pattern can match while its capture group did not participate
            // (index 1 is then undefined); treat that as "no title" rather than
            // calling `.trim()` on undefined.
            const title = match && typeof match[1] === 'string' ? match[1].trim() : '';
            if (title) {
                entry.position = title;
                return;
            }
        }
    }
    /**
     * Extract the start and end dates from an experience block.
     *
     * Resolution order:
     *  1. The first date is directly followed by a separator and a "present"
     *     marker ("Jan 2020 - Present"): open-ended range, even if other dates
     *     appear later in the block.
     *  2. Two or more dates: the first is the start, the second the end.
     *  3. A single date: it is the start of a current position when the block
     *     contains a "present"/"current(ly)"/"now" word, otherwise it is
     *     assumed to be the end date.
     *
     * @param {string} expText Text of a single experience block.
     * @param {{startDate: ?string, endDate: ?string}} entry Entry to update in place.
     */
    extractDates(expText, entry) {
        const datePattern = patterns_1.Patterns.date;
        if (!expText.match(datePattern)) {
            return;
        }
        // A fresh global, case-insensitive copy so every occurrence is found and
        // no `lastIndex` state leaks into the shared pattern object.
        const found = [...expText.matchAll(new RegExp(datePattern, 'gi'))];
        if (found.length === 0) {
            return;
        }
        const [first, second] = found;
        const afterFirstDate = expText.slice(first.index + first[0].length);
        const isOpenEndedRange = /^\s*(?:[-–—]+|to|until|through)\s*(?:present|current(?:ly)?|now)\b/i
            .test(afterFirstDate);
        if (isOpenEndedRange) {
            entry.startDate = first[0];
            entry.endDate = 'Present';
            return;
        }
        if (second) {
            entry.startDate = first[0];
            entry.endDate = second[0];
            return;
        }
        // Whole words only, so that "known", "snow" or "represent" do not turn a
        // finished job into a current one.
        const mentionsCurrent = /\b(?:present|current(?:ly)?|now)\b/i.test(expText);
        if (mentionsCurrent) {
            entry.startDate = first[0];
            entry.endDate = 'Present';
        }
        else {
            // Just one date and no hint of an ongoing role: assume it is the end date.
            entry.endDate = first[0];
        }
    }
    /**
     * Extract the job description lines from an experience block.
     *
     * The description starts at the first bullet line (`•`, `-` or `*`) that is
     * not the entry header; every later non-empty line is treated as part of
     * the description. Bullet markers are stripped with `Patterns.bulletPoint`
     * and lines of 10 characters or fewer are discarded as noise.
     *
     * @param {string} expText Text of a single experience block.
     * @param {{company: ?string, position: ?string, description: string[]}} entry
     *   Entry whose `description` array is appended to in place.
     */
    extractDescription(expText, entry) {
        const bulletMarkers = ['•', '-', '*'];
        const minMeaningfulLength = 10;
        let headerSeen = false;
        let descriptionStarted = false;
        for (const rawLine of expText.split('\n')) {
            const line = rawLine.trim();
            if (!line) {
                continue;
            }
            if (!descriptionStarted) {
                const isBullet = bulletMarkers.some((marker) => line.startsWith(marker));
                const mentionsHeaderField = Boolean((entry.company && line.includes(entry.company)) ||
                    (entry.position && line.includes(entry.position)));
                // The header (company / title line) can only precede the
                // description. A bullet that mentions the company after the
                // header has already been seen is real content, not a header.
                if (mentionsHeaderField && !(isBullet && headerSeen)) {
                    headerSeen = true;
                    continue;
                }
                if (!isBullet) {
                    // Dates, locations and other preamble before the first bullet.
                    continue;
                }
                descriptionStarted = true;
            }
            const cleanLine = line.replace(patterns_1.Patterns.bulletPoint, '').trim();
            if (cleanLine.length > minMeaningfulLength) {
                entry.description.push(cleanLine);
            }
        }
    }
}
exports.ExperienceExtractor = ExperienceExtractor;