/*
 * @thecodingwhale/cv-processor
 * Version:
 * CV Processor to extract structured data from PDF resumes using TypeScript
 * 880 lines (879 loc) • 21.6 kB
 * JavaScript
 */
;
/**
* Regex patterns for CV parsing
*/
// Flag this CommonJS module as carrying ES-module-style exports by defining a
// non-enumerable `__esModule: true` property on `exports`.
// NOTE(review): presumably compiler-generated interop boilerplate — keep the
// exact statement shape, since static export detection tools key off it.
Object.defineProperty(exports, "__esModule", { value: true });
// Initialise every named export to `undefined` (`void 0`) up front; the real
// values are assigned by the `exports.X = ...` statements further down.
exports.TrainingPatterns = exports.PhysicalAttributes = exports.MediaPatterns = exports.CreditPatterns = exports.SkillCategories = exports.Patterns = void 0;
exports.Patterns = {
// Contact information
email: /[\w.+-]+@[\w-]+\.[\w.-]+/g,
phone: /(\+\d{1,3}\s?)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}/g,
// Social profiles
linkedin: /linkedin\.com\/in\/[\w-]+/g,
github: /github\.com\/[\w-]+/g,
imdb: /imdb\.com\/name\/[\w-]+/g,
instagram: /instagram\.com\/[\w.-]+/g,
spotlight: /spotlight\.com\/[\w-]+/g,
castingNetworks: /castingnetworks\.com\/[\w-]+/g,
actorsAccess: /actorsaccess\.com\/[\w-]+/g,
// Media and Portfolio
videoReel: /(?:video|demo|showreel|reel|demoreel)(?:\s*:)?\s*(https?:\/\/[^\s,]+)/gi,
audioReel: /(?:audio|voice|vocal)(?:\s*reel|\s*demo)(?:\s*:)?\s*(https?:\/\/[^\s,]+)/gi,
headshots: /(?:headshots?|photos?|images?|pictures?|portfolio)(?:\s*:)?\s*(https?:\/\/[^\s,]+)/gi,
performanceVideos: /(?:performance|scenes?|clips?|videos?)(?:\s*:)?\s*(https?:\/\/[^\s,]+)/gi,
// Date formats for experience and education
date: /(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)\.?\s+\d{4}\s*[-–—]?\s*(?:(Present|Current|Now)|(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)\.?\s+\d{4})?/i,
// Section headers
sections: {
education: /education|academic|qualification|degree|training/i,
experience: /experience|employment|work history|professional|credits|filmography|television|film/i,
skills: /skills|expertise|competencies|technical|abilities|special skills|performance skills/i,
projects: /projects|portfolio|productions/i,
certifications: /certifications|certificates/i,
languages: /languages|language proficiency|dialects|accents/i,
summary: /summary|profile|objective|about|bio/i,
representation: /representation|agency|agent|manager|management/i,
training: /training|coaching|workshop|conservatory|studio|technique/i,
reels: /reels?|demo reels?|show reels?|video/i,
credits: /credits|filmography|productions|performances|appearances|television|film/i,
sizes: /sizes|measurements|physical|appearance|stats/i,
},
// Degree and training patterns
degreePatterns: [
/(?:Bachelor|Master|PhD|Doctorate|BSc|BA|MSc|MA|Ph\.D|B\.S|M\.S|B\.A|M\.A)[^\n,]*/i,
/(?:Associate|Diploma|Certificate)[^\n,]*/i,
/(?:Conservatory|Studio|Workshop|Intensive|Program|Course)[^\n,]*/i,
/(?:Acting|Performance|Theatre|Theater|Drama)[^\n,]*/i,
/(?:Method|Technique|Training)[^\n,]*/i,
],
// Job title patterns for film/TV industry
titlePatterns: [
/(?:^|\n)([^,\n]+?Actor[^,\n]*)/i,
/(?:^|\n)([^,\n]+?Actress[^,\n]*)/i,
/(?:^|\n)([^,\n]+?Performer[^,\n]*)/i,
/(?:^|\n)([^,\n]+?Model[^,\n]*)/i,
/(?:^|\n)([^,\n]+?Extra[^,\n]*)/i,
/(?:^|\n)([^,\n]+?Stand-in[^,\n]*)/i,
/(?:^|\n)([^,\n]+?Voice Artist[^,\n]*)/i,
/(?:^|\n)([^,\n]+?Voice Over[^,\n]*)/i,
/(?:^|\n)([^,\n]+?Dancer[^,\n]*)/i,
/(?:^|\n)([^,\n]+?Singer[^,\n]*)/i,
/(?:^|\n)([^,\n]+?Host[^,\n]*)/i,
/(?:^|\n)([^,\n]+?Presenter[^,\n]*)/i,
/(?:^|\n)([^,\n]+?Stunt[^,\n]*)/i,
],
// Film/TV credits extraction
credits: /([^,\n]+?)(?:\s*[-–—]\s*|\s*\(\s*)([^,\n]+?)(?:\)|,)(?:\s*dir\.?\s*|\s*directed by\s*)([^,\n]+?)(?:,|\(|$)/i,
filmTitle: /(?:film|movie|feature)\s*:?\s*([^,\n]+)/i,
tvTitle: /(?:tv|television|series|show)\s*:?\s*([^,\n]+)/i,
role: /(?:role|character|played|as)\s*:?\s*([^,\n]+)/i,
director: /(?:dir\.?|director|directed by)\s*:?\s*([^,\n]+)/i,
year: /(?:year|produced|released|filmed)\s*:?\s*(\d{4})/i,
// GPA extraction
gpa: /GPA[:of\s]+(\d+\.\d+|\d+)/i,
// Field of study
fieldOfStudy: /\s+in\s+([^,\n]+)/i,
// Bullet points
bulletPoint: /^[•\-*]\s*/,
// Actor sizes/measurements patterns
sizes: {
height: /(?:height|hgt)(?:\s*:|\s*-)\s*(\d+'\s*\d+"|(?:\d+)(?:\.\d+)?\s*(?:cm|m|ft|feet))/i,
weight: /(?:weight|wgt)(?:\s*:|\s*-)\s*(\d+(?:\.\d+)?\s*(?:kg|lbs|pounds))/i,
tShirt: /(?:t-?shirt|shirt|top)(?:\s*:|\s*-)\s*([XxSsLlMm\d]+)/i,
shoe: /(?:shoe|foot)(?:\s*:|\s*-)\s*(\d+(?:\.\d+)?)\s*(?:\(\s*(?:us|uk|eu|female|male|f|m|women'?s|men'?s|normal|wide)(?:\s*,\s*(?:us|uk|eu|female|male|f|m|women'?s|men'?s|normal|wide))*\s*\))?/i,
pants: /(?:pants|trouser|bottom)(?:\s*:|\s*-)\s*(\d+(?:\.\d+)?)/i,
waist: /(?:waist)(?:\s*:|\s*-)\s*(\d+(?:\.\d+)?"|\d+(?:\.\d+)?(?:\s*in)?)/i,
glove: /(?:glove)(?:\s*:|\s*-)\s*([XxSsLlMm\d]+)/i,
hat: /(?:hat|head)(?:\s*:|\s*-)\s*([XxSsLlMm\d]+\s*(?:\d+\/\d+)?)/i,
dress: /(?:dress)(?:\s*:|\s*-)\s*([XxSsLlMm\d]+)/i,
bust: /(?:bust|chest)(?:\s*:|\s*-)\s*(\d+(?:\.\d+)?"|\d+(?:\.\d+)?(?:\s*in)?)/i,
cup: /(?:cup)(?:\s*:|\s*-)\s*([A-Ea-e]{1,2})/i,
hip: /(?:hip)(?:\s*:|\s*-)\s*(\d+(?:\.\d+)?"|\d+(?:\.\d+)?(?:\s*in)?)/i,
collar: /(?:collar|neck)(?:\s*:|\s*-)\s*(\d+(?:\.\d+)?"|\d+(?:\.\d+)?(?:\s*in)?)/i,
suit: /(?:suit)(?:\s*:|\s*-)\s*(\d+[RSL]?)/i,
inseam: /(?:inseam)(?:\s*:|\s*-)\s*(\d+(?:\.\d+)?"|\d+(?:\.\d+)?(?:\s*in)?)/i,
sleeve: /(?:sleeve)(?:\s*:|\s*-)\s*(\d+(?:\.\d+)?"|\d+(?:\.\d+)?(?:\s*in)?)/i,
hairColor: /(?:hair\s*(?:color|colour))(?:\s*:|\s*-)\s*([A-Za-z]+)/i,
eyeColor: /(?:eye\s*(?:color|colour))(?:\s*:|\s*-)\s*([A-Za-z]+)/i,
ethnicity: /(?:ethnicity|ethnic\s*(?:appearance|look))(?:\s*:|\s*-)\s*([A-Za-z]+(?:\s+[A-Za-z]+)*)/i,
},
};
exports.SkillCategories = {
// Original technical skills
programmingLanguages: new Set([
'python',
'java',
'c++',
'c#',
'javascript',
'typescript',
'ruby',
'go',
'swift',
'php',
'kotlin',
'scala',
'rust',
'r',
'matlab',
'sql',
'html',
'css',
'bash',
'shell',
]),
frameworks: new Set([
'react',
'angular',
'vue',
'django',
'flask',
'spring',
'asp.net',
'laravel',
'express',
'tensorflow',
'pytorch',
'scikit-learn',
'pandas',
'numpy',
'bootstrap',
'jquery',
'node.js',
'rails',
'hibernate',
'symfony',
]),
tools: new Set([
'git',
'docker',
'kubernetes',
'aws',
'azure',
'gcp',
'jira',
'jenkins',
'travis ci',
'circleci',
'terraform',
'ansible',
'puppet',
'chef',
'nginx',
'apache',
'postgresql',
'mongodb',
'mysql',
'redis',
'elasticsearch',
'kafka',
'rabbitmq',
]),
softSkills: new Set([
'communication',
'teamwork',
'leadership',
'problem solving',
'critical thinking',
'time management',
'creativity',
'adaptability',
'collaboration',
'project management',
]),
// Acting and performance related skills
actingStyles: new Set([
'method acting',
'stanislavski',
'meisner technique',
'classical',
'shakespeare',
'improvisation',
"commedia dell'arte",
'melodrama',
'naturalism',
'realism',
'comedy',
'drama',
'farce',
'tragedy',
'soap opera',
'sitcom',
'character acting',
'physical comedy',
'mime',
'sketch comedy',
'stand-up comedy',
]),
dialects: new Set([
'american',
'british',
'scottish',
'irish',
'australian',
'new zealand',
'canadian',
'southern american',
'new york',
'boston',
'midwestern',
'texan',
'cockney',
'received pronunciation',
'london',
'liverpool',
'manchester',
'welsh',
'french',
'german',
'italian',
'spanish',
'russian',
'eastern european',
'indian',
'african',
]),
languages: new Set([
'english',
'spanish',
'french',
'german',
'italian',
'portuguese',
'russian',
'mandarin',
'cantonese',
'japanese',
'korean',
'arabic',
'hindi',
'bengali',
'punjabi',
'urdu',
'dutch',
'greek',
'swedish',
'norwegian',
'danish',
'finnish',
'polish',
'czech',
'turkish',
'thai',
'vietnamese',
'tagalog',
'hebrew',
'sign language',
]),
performanceSkills: new Set([
'stage combat',
'swordplay',
'stunts',
'firearms handling',
'choreography',
'horseback riding',
'dance',
'singing',
'monologues',
'cold reading',
'teleprompter',
'puppetry',
'mask work',
'clowning',
'green screen',
'motion capture',
'voice over',
'narration',
'commercial',
'hosting',
'presenting',
'autocue',
'impersonation',
'character voices',
'accent work',
'emotional range',
'crying on cue',
]),
singingStyles: new Set([
'pop',
'rock',
'jazz',
'blues',
'r&b',
'soul',
'gospel',
'country',
'folk',
'musical theater',
'opera',
'classical',
'choral',
'a cappella',
'belting',
'falsetto',
'harmonizing',
'rap',
'hip hop',
'spoken word',
]),
vocalRange: new Set([
'soprano',
'mezzo-soprano',
'alto',
'countertenor',
'tenor',
'baritone',
'bass',
'bass-baritone',
'high',
'middle',
'low',
]),
instruments: new Set([
'piano',
'guitar',
'acoustic guitar',
'electric guitar',
'bass guitar',
'drums',
'percussion',
'violin',
'viola',
'cello',
'double bass',
'flute',
'piccolo',
'clarinet',
'oboe',
'bassoon',
'saxophone',
'trumpet',
'trombone',
'french horn',
'tuba',
'harmonica',
'accordion',
'banjo',
'mandolin',
'ukulele',
'harp',
'keyboard',
'synthesizer',
'djembe',
'bongos',
'bagpipes',
]),
danceStyles: new Set([
'ballet',
'contemporary',
'jazz',
'tap',
'hip hop',
'breakdance',
'street',
'modern',
'ballroom',
'latin',
'salsa',
'swing',
'tango',
'waltz',
'foxtrot',
'quickstep',
'folk',
'irish',
'flamenco',
'bollywood',
'belly dancing',
'pole dancing',
'burlesque',
'musical theater',
'commercial',
'choreography',
]),
sports: new Set([
// Combat Sports
'boxing',
'karate',
'judo',
'jiu-jitsu',
'taekwondo',
'kung fu',
'mma',
'wrestling',
'fencing',
'krav maga',
'muay thai',
// Team Sports
'football',
'soccer',
'basketball',
'baseball',
'volleyball',
'rugby',
'hockey',
'cricket',
'lacrosse',
'ultimate frisbee',
// Racquet Sports
'tennis',
'badminton',
'squash',
'racquetball',
'table tennis',
// Water Sports
'swimming',
'diving',
'surfing',
'water skiing',
'wakeboarding',
'sailing',
'canoeing',
'kayaking',
'rowing',
'windsurfing',
// Winter Sports
'skiing',
'snowboarding',
'ice skating',
'figure skating',
'ice hockey',
'curling',
// Outdoor Sports
'climbing',
'hiking',
'mountain biking',
'archery',
'horseback riding',
'skateboarding',
'rollerblading',
// Extreme Sports
'skydiving',
'bungee jumping',
'parkour',
'rock climbing',
'motocross',
'bmx',
// Gymnastic Sports
'gymnastics',
'tumbling',
'trampoline',
'acrobatics',
'aerial silks',
'trapeze',
]),
weapons: new Set([
'sword',
'fencing',
'rapier',
'broadsword',
'katana',
'knife',
'dagger',
'staff',
'bow and arrow',
'archery',
'axe',
'spear',
'firearms',
'pistol',
'rifle',
'shotgun',
'whip',
'nunchaku',
'bo staff',
'throwing stars',
]),
drivingSkills: new Set([
'car',
'manual',
'automatic',
'motorcycle',
'scooter',
'bus',
'truck',
'heavy vehicle',
'forklift',
'boat',
'jet ski',
'atv',
'snowmobile',
'stunt driving',
'precision driving',
'drifting',
'racing',
'license',
]),
specializedSkills: new Set([
'juggling',
'magic',
'fire breathing',
'fire eating',
'contortion',
'acrobatics',
'aerial arts',
'tightrope',
'unicycle',
'ventriloquism',
'puppetry',
'makeup',
'special effects makeup',
'body painting',
'prosthetics',
'circus arts',
'baton twirling',
'plate spinning',
'mime',
'balloon animals',
'quick change',
]),
};
/**
* Patterns for extracting film, TV, and theater credits
*/
exports.CreditPatterns = {
// Production types
film: new Set([
'feature film',
'film',
'movie',
'short film',
'documentary',
'indie film',
'independent film',
'student film',
'web film',
'web movie',
'feature',
'motion picture',
'cinema',
]),
television: new Set([
'television',
'tv',
'series',
'tv series',
'tv show',
'sitcom',
'soap opera',
'drama series',
'miniseries',
'limited series',
'web series',
'reality tv',
'television movie',
'tv movie',
'pilot',
'tv pilot',
'episode',
]),
theater: new Set([
'theater',
'theatre',
'stage',
'play',
'musical',
'broadway',
'off-broadway',
'off-off-broadway',
'national tour',
'regional theater',
'regional theatre',
'improv',
'sketch comedy',
'live performance',
'one-person show',
'solo performance',
]),
commercial: new Set([
'commercial',
'ad',
'advertisement',
'tv commercial',
'radio commercial',
'voice over',
'print ad',
'industrial',
'corporate video',
'promo',
'infomercial',
]),
new_media: new Set([
'web series',
'youtube',
'tiktok',
'instagram',
'social media',
'streaming',
'internet',
'podcast',
'web content',
'digital series',
]),
// Role types
roles: new Set([
'lead',
'supporting',
'featured',
'principal',
'co-star',
'guest star',
'recurring',
'series regular',
'day player',
'featured extra',
'background',
'extra',
'stand-in',
'double',
'stunt double',
'voice',
'narrator',
'host',
'presenter',
'spokesperson',
'ensemble',
'understudy',
'swing',
'cameo',
]),
};
/**
* Patterns for media and portfolio formats
*/
exports.MediaPatterns = {
videoFormats: new Set([
'mp4',
'mov',
'avi',
'wmv',
'flv',
'mkv',
'webm',
'm4v',
]),
audioFormats: new Set(['mp3', 'wav', 'ogg', 'aac', 'm4a', 'flac', 'wma']),
imageFormats: new Set([
'jpg',
'jpeg',
'png',
'gif',
'bmp',
'tiff',
'webp',
'heic',
]),
hostingSites: new Set([
'youtube',
'vimeo',
'instagram',
'facebook',
'tiktok',
'twitter',
'soundcloud',
'spotify',
'apple music',
'bandcamp',
'flickr',
'pinterest',
'dropbox',
'google drive',
'icloud',
'onedrive',
'box',
]),
portfolioTypes: new Set([
'reel',
'demo reel',
'showreel',
'acting reel',
'voice reel',
'commercial reel',
'comedy reel',
'dramatic reel',
'dance reel',
'headshot',
'portfolio',
'website',
'gallery',
'lookbook',
'comp card',
'resume',
'cv',
'press kit',
]),
};
/**
* Patterns for actor physical attributes and sizes
*/
exports.PhysicalAttributes = {
hairColors: new Set([
'blonde',
'brunette',
'black',
'red',
'auburn',
'brown',
'chestnut',
'gray',
'grey',
'white',
'salt and pepper',
'strawberry blonde',
'platinum',
'sandy',
'dirty blonde',
'dark brown',
'light brown',
'ginger',
]),
eyeColors: new Set([
'blue',
'brown',
'green',
'hazel',
'grey',
'gray',
'amber',
'black',
'dark brown',
'light brown',
]),
ethnicities: new Set([
'caucasian',
'white',
'black',
'african american',
'asian',
'hispanic',
'latino',
'latina',
'latinx',
'middle eastern',
'native american',
'indigenous',
'pacific islander',
'indian',
'south asian',
'southeast asian',
'east asian',
'mediterranean',
'multiracial',
'biracial',
'mixed race',
]),
bodyTypes: new Set([
'slim',
'slender',
'thin',
'athletic',
'fit',
'toned',
'muscular',
'average',
'medium build',
'full-figured',
'plus size',
'curvy',
'petite',
'tall',
]),
sizeCategories: new Set([
'height',
'weight',
't-shirt',
'shirt',
'pant',
'trouser',
'dress',
'suit',
'jacket',
'shoe',
'waist',
'inseam',
'chest',
'bust',
'cup',
'hip',
'collar',
'neck',
'sleeve',
'hat',
'glove',
]),
};
/**
* Patterns for acting training and education
*/
exports.TrainingPatterns = {
trainingTypes: new Set([
'acting school',
'drama school',
'theater school',
'theatre school',
'conservatory',
'studio',
'mfa',
'bfa',
'bachelor',
'master',
'academy',
'workshop',
'class',
'course',
'intensive',
'seminar',
'private coaching',
'scene study',
'technique',
'method',
'bootcamp',
]),
actingTechniques: new Set([
'method',
'meisner',
'stanislavski',
'strasberg',
'adler',
'hagen',
'chekhov',
'practical aesthetics',
'viewpoints',
'suzuki',
'laban',
'alexander',
'feldenkrais',
'improvisation',
'improv',
'clowning',
"commedia dell'arte",
'neutral mask',
'character mask',
'lecoq',
]),
specializedTraining: new Set([
'voice',
'speech',
'dialect',
'accent',
'movement',
'dance',
'combat',
'stage combat',
'fight',
'camera technique',
'on-camera',
'scene study',
'character development',
'script analysis',
'audition technique',
'cold reading',
'monologue',
'shakespeare',
'classical text',
'improv',
'singing',
'musical theater',
'on-set etiquette',
]),
};