UNPKG

linkedin-pdf-to-json

Version:
722 lines (674 loc) 33.4 kB
// JavaScript recursive descent parser for storing text retrieved from LinkedIn profile PDFs in JSON format. // Author: Isaac Mast <isaac.k.mast@gmail.com> [https://github.com/isaacmast] // GitHub repo: https://github.com/isaacmast/linkedin-pdf-to-json var jsonfile = require('jsonfile'); module.exports = LinkedInPdfToJson; function LinkedInPdfToJson() { this.bold = false; // a boolean variable that determines if the current text chunk is bold this.content = []; // the array of objects retrieved from pdf-text module this.count = -1; // a count variable for keeping track of the number of entries in a section this.index = 0; // the current index of the content array this.json = {}; // the object to hold the parsed data this.last = {}; // object to hold data about the previous chunk this.parsingErrorMsg = 'LinkedInPdfToJson ParsingError: could not successfully parse the following text chunk: '; this.pdf = undefined; // the path to the PDF to parse this.pdfText = require('pdf-text'); // import pdf-text module this.section = undefined; // the current section of the PDF this.target = undefined; // the target path for the JSON file this.text = undefined; // the current text string being parsed this.token = 'START'; // the current token this.tokenErrorMsg = 'LinkedInPdfToJson TokenError: could not successfully set the token for the following text chunk: '; this.whiteSpace = 4; // white space amount in output JSON this.y = undefined; // possible section headers that are currently supported this.SECTION_HEADERS = { 'Summary': 'bio', 'Languages': 'languages', 'Education': 'educationExperience', 'Experience': 'workExperience', 'Skills & Expertise': 'skills', 'Volunteer Experience': 'volunteerExperience', 'Unsupported': 'unsupported' }; // currently unsupported sections this.UNSUPPORTED_SECTIONS = ['Publications', 'Projects', 'Certifications', 'Organizations', 'Test Scores', 'Specialties', 'Honors and Awards', 'Interests', 'Courses', 'recommendations', 'Patents']; // available token values this.TOKENS = { 'EOF': 'eof', 'SECTION_HEADER': 'section_header', 'NAME': 'name', 'JOB_TITLE': 'job_title', 'JOB_DATE': 'date_range', 'JOB_DURATION': 'duration', 'SECTION_CONTENT': 'section_content', 'SCHOOL': 'school', 'EDU_BASIC_INFO': 'basic_info', 'EDU_GRADE_LABEL': 'grade', 'EDU_GRADE': 'grade_received', 'EDU_ACTIVITIES_SOCIETIES_LABEL': 'activities_and_societies', 'EDU_ACTIVITY_OR_SOCIETY': 'activity_or_society', 'SKILL': 'skill', 'LANGUAGE': 'language', 'LANGUAGE_PROFICIENCY': 'proficiency', 'UNSUPPORTED': 'unsupported', 'UNKNOWN': 'unknown' }; } // Main runner function LinkedInPdfToJson.prototype.run = function(source, target, options) { this.pdf = source; this.target = target; this.whiteSpace = options.space || this.whiteSpace; var linkedinPdfToJson = this; // Callback function for the pdf-text module. // This function starts the first call to actually parse the chunks array. this.pdfText(source, function(error, chunks) { if (error) { throw new Error('LinkedInPdfToJson: invalid PDF file'); } // console.log(chunks); // console.log(); // console.log('Parsing (' + linkedinPdfToJson.pdf + ')...'); linkedinPdfToJson.parse(chunks); if (linkedinPdfToJson.target) { jsonfile.writeFile(linkedinPdfToJson.target, linkedinPdfToJson.json, { spaces: linkedinPdfToJson.whiteSpace }, function(err) { if (err) { throw new Error('LinkedInPdfToJson: could not successfully write JSON to file'); } return; }); } else { return console.log(JSON.stringify(linkedinPdfToJson.json, null, linkedinPdfToJson.whiteSpace)); } }); }; //=========================== // GRAMMAR LOGIC //=========================== // Parses the PDF using the chunks array retrieved from the pdf-text node module LinkedInPdfToJson.prototype.parse = function(chunks) { // console.log('ZZZ PARSE'); this.sanitize(chunks); this.content = chunks; this.setBasicInfo(); this.getNextToken(); while (this.token !== this.TOKENS.EOF) { if (this.token === this.TOKENS.SECTION_HEADER || this.token === this.TOKENS.UNSUPPORTED || this.token === this.TOKENS.UNKNOWN) { this.parseSection(); } else { throw new Error(this.parsingErrorMsg + '\'' + this.text + '\''); } } // console.log('ZZZ END PARSE'); }; // Parses a section of the PDF. LinkedInPdfToJson.prototype.parseSection = function() { // console.log('ZZZ SECTION'); // console.log('section = ' + this.section); if (this.section !== 'unsupported') { this.json[this.section] = this.json[this.section] || {}; } if (this.section === this.SECTION_HEADERS.Summary) { this.getNextToken(); if (this.token === this.TOKENS.SECTION_CONTENT) { this.json[this.section] = []; this.parseSummary(); } else { throw new Error(this.parsingErrorMsg + '\'' + this.text + '\''); } } else if (this.section === this.SECTION_HEADERS.Education) { this.getNextToken(); if (this.token === this.TOKENS.SCHOOL) { this.json[this.section] = []; while (this.token === this.TOKENS.SCHOOL) { this.count++; this.parseEducation(); } this.resetCount(); } else { throw new Error(this.parsingErrorMsg + '\'' + this.text + '\''); } } else if (this.section === this.SECTION_HEADERS.Experience || this.section === this.SECTION_HEADERS['Volunteer Experience']) { this.getNextToken(); if (this.token === this.TOKENS.JOB_TITLE) { this.json[this.section] = []; while (this.token === this.TOKENS.JOB_TITLE) { this.count++; this.parseJob(); } this.resetCount(); } else { throw new Error(this.parsingErrorMsg + '\'' + this.text + '\''); } } else if (this.section === this.SECTION_HEADERS.Languages) { this.getNextToken(); if (this.token === this.TOKENS.LANGUAGE) { while (this.section === this.SECTION_HEADERS.Languages) { this.parseLanguages(); } } else { throw new Error(this.parsingErrorMsg + '\'' + this.text + '\''); } } else if (this.section === this.SECTION_HEADERS['Skills & Expertise']) { this.getNextToken(); if (this.token === this.TOKENS.SKILL) { this.parseSkillsAndExpertise(); } else { throw new Error(this.parsingErrorMsg + '\'' + this.text + '\''); } // TODO: Implement the rest of the unsupported this.sections // This marks the start of the unsupported this.sections // See global UNSUPPORTED_SECTIONS variable at the top for a list of // all the unsupported this.sections. } else if (this.token === this.TOKENS.UNSUPPORTED || this.token === this.TOKENS.UNKNOWN) { this.json[this.section] = this.json[this.section] || []; while (this.token === this.TOKENS.UNSUPPORTED || this.token === this.TOKENS.UNKNOWN) { if (this.json[this.section].indexOf(this.text) === -1) { // check is mainly for '...........' section separators to avoid redundant '.' elements this.json[this.section].push(this.text); } this.getNextToken(); } } else { throw new Error(this.parsingErrorMsg + '\'' + this.text + '\''); } // console.log('ZZZ END SECTION'); }; // Parses the summary section of the PDF. // For description fields in a LinkedIn profile section, the user may choose to format their descriptions // by outlining them with letters or numbers or by using bullet points/bullet-like symbols e.g. 1., a., -, •, #, ~, * . // The goal of this function is to retain that user defined formatting by putting each bulleted line in its own object property. // For non-formatted descriptions the text is simply concatenated into a single string. LinkedInPdfToJson.prototype.parseSummary = function() { // console.log('ZZZ SUMMARY'); var textCount = -1; var inBulleted = false; // var hasBulleted = this.hasBulletedText(); if (this.hasBulletedText()) { while (this.token === this.TOKENS.SECTION_CONTENT) { var bulleted = this.isBulleted(); var newline = this.isSeparatedByNewline(); if (bulleted || newline) { inBulleted = true; textCount++; this.json[this.section][textCount] = this.text; } else if (inBulleted && this.text.match(/^\s\S/)) { this.json[this.section][textCount] = this.json[this.section][textCount] + this.text; } else { textCount = textCount === -1 ? 0 : textCount; inBulleted = false; var text = this.json[this.section][textCount]; if (text && text.match(/\S$/) && this.text.match(/^\S/)) { text = text + ' '; } this.json[this.section][textCount] = text ? text + this.text : this.text; } this.getNextToken(); } } else { textCount = 0; while (this.token === this.TOKENS.SECTION_CONTENT) { var summaryText = this.json[this.section][textCount]; this.json[this.section][textCount] = summaryText ? summaryText + this.text : this.text; this.getNextToken(); } } // console.log('ZZZ END SUMMARY'); }; // Parses the Education section of the PDF. // Individual education sections only require an institution name, so any additional section info needs // to be parsed in separate IF statements since none of it is guaranteed to be present. LinkedInPdfToJson.prototype.parseEducation = function() { // console.log('ZZZ EDUCATION'); var currentSection = this.section; this.json[currentSection][this.count] = this.json[currentSection][this.count] || {}; this.json[currentSection][this.count].school = this.text; this.getNextToken(); if (this.token === this.TOKENS.EDU_BASIC_INFO) { var basicInfo = ''; while (this.token === this.TOKENS.EDU_BASIC_INFO) { basicInfo += this.text; this.getNextToken(); } this.json[currentSection][this.count].basicInfo = basicInfo.split(/\,\s*/); if (this.json[currentSection][this.count].basicInfo[0].match(/(Bachelor|B\.?A\.?|B\.?S\.?|A\.?B\.?|Master|Ph\.D\.)/)) { this.json[currentSection][this.count].degree = this.json[currentSection][this.count].basicInfo[0]; this.json[currentSection][this.count].basicInfo.splice(0, 1); } var length = this.json[currentSection][this.count].basicInfo.length; if (this.json[currentSection][this.count].basicInfo[length - 1].match(/^\w*\s*\d+\s+\-\s+\w*\s*\d*/)) { var dates = this.json[currentSection][this.count].basicInfo[length - 1].split(/\s\-\s/); this.json[currentSection][this.count].startDate = dates[0]; this.json[currentSection][this.count].endDate = dates[1]; this.json[currentSection][this.count].basicInfo.splice(length - 1, 1); } } if (this.token === this.TOKENS.EDU_GRADE_LABEL) { this.getNextToken(); if (this.token === this.TOKENS.EDU_GRADE) { this.json[currentSection][this.count].grade = this.text; } else { throw new Error(this.parsingErrorMsg + '\'' + this.text + '\''); } this.getNextToken(); } if (this.token === this.TOKENS.EDU_ACTIVITIES_SOCIETIES_LABEL) { this.getNextToken(); if (this.token === this.TOKENS.EDU_ACTIVITY_OR_SOCIETY) { this.json[currentSection][this.count].activitiesAndSocieties = ''; while (this.token === this.TOKENS.EDU_ACTIVITY_OR_SOCIETY) { this.json[currentSection][this.count].activitiesAndSocieties += this.text; this.getNextToken(); } } else { throw new Error(this.parsingErrorMsg + '\'' + this.text + '\''); } } // console.log('ZZZ END EDUCATION'); }; // Creates and populates a new JSON job object under the appropriate section header. // For description fields in a LinkedIn profile section, the user may choose to format their descriptions // by outlining them with letters or numbers or by using bullet points/bullet-like symbols e.g. 1., a., -, •, #, ~, * . // The goal of this function is to retain that user defined formatting by putting each text chunk in its own object property if the // job description contains bulleted text. // For non-formatted descriptions the text is simply concatenated into a single string. LinkedInPdfToJson.prototype.parseJob = function() { // console.log('ZZZ JOB'); var currentSection = this.section; var currentTitle = ''; this.json[currentSection][this.count] = this.json[this.section][this.count] || {}; // TODO: Look into volunteer experience with job title, organization, and description, // but no date range specified. See JacobStelman.pdf for example. while (this.token === this.TOKENS.JOB_TITLE && this.token !== this.TOKENS.JOB_DATE) { currentTitle += this.text; this.getNextToken(); } var titleAndOrganization = currentTitle.trim().split(/\s{2,}at\s{2,}/); if (titleAndOrganization.length === 2) { this.json[currentSection][this.count].jobTitle = titleAndOrganization[0]; this.json[currentSection][this.count].organization = titleAndOrganization[1]; } else { throw new Error(this.parsingErrorMsg + '\'' + this.text + '\''); } if (this.token === this.TOKENS.JOB_DATE) { var dates = this.text.trim().split(/\s{2,}\-\s{2,}/); if (dates.length === 2) { this.json[this.section][this.count].startDate = dates[0]; this.json[this.section][this.count].endDate = dates[1]; } else { throw new Error(this.parsingErrorMsg + '\'' + this.text + '\''); } this.getNextToken(); if (this.token === this.TOKENS.JOB_DURATION) { var splits = this.text.split(/[()]/); if (splits.length === 3) { var amount = splits[1]; this.json[this.section][this.count].duration = amount; } else { throw new Error(this.parsingErrorMsg + '\'' + this.text + '\''); } this.getNextToken(); } } if (this.token === this.TOKENS.SECTION_CONTENT) { this.json[this.section][this.count].responsibilities = this.json[this.section][this.count].responsibilities || []; var textCount = -1; var inBulleted = false; if (this.hasBulletedText()) { while (this.token === this.TOKENS.SECTION_CONTENT) { var bulleted = this.isBulleted(); var newline = this.isSeparatedByNewline(); if (bulleted || newline) { inBulleted = true; textCount++; this.json[this.section][this.count].responsibilities[textCount] = this.text; } else if (inBulleted && this.text.match(/^\s\S/)) { this.json[this.section][this.count].responsibilities[textCount] = this.json[this.section][this.count].responsibilities[textCount] + this.text; } else { textCount = textCount === -1 ? 0 : textCount; inBulleted = false; // var text = this.text.match(/^\S/) ? this.json[this.section][this.count].responsibilities[textCount] + ' ' : this.json[this.section][this.count].responsibilities[textCount]; var text = this.json[this.section][this.count].responsibilities[textCount]; if (text && text.match(/\S$/) && this.text.match(/^\S/)) { text = text + ' '; } this.json[this.section][this.count].responsibilities[textCount] = text ? text + this.text : this.text; } this.getNextToken(); } } else { textCount = 0; while (this.token === this.TOKENS.SECTION_CONTENT) { var jobText = this.json[this.section][this.count].responsibilities[textCount]; this.json[this.section][this.count].responsibilities[textCount] = jobText ? jobText + this.text : this.text; this.getNextToken(); } } } // console.log('ZZZ END JOB'); }; // Parses the languages section of a LinkedIn profile PDF. // The language section is fairly straightforward and simple with the name of the language listed // and the proficiency immediately afterwards if it's available. LinkedInPdfToJson.prototype.parseLanguages = function() { // console.log('ZZZ LANGUAGES'); this.json[this.section] = []; var languageCount = -1; while (this.token === this.TOKENS.LANGUAGE) { languageCount++; this.json[this.section][languageCount] = this.json[this.section][languageCount] || {}; this.json[this.section][languageCount][this.TOKENS.LANGUAGE] = this.text; this.getNextToken(); if (this.token === this.TOKENS.LANGUAGE_PROFICIENCY) { this.json[this.section][languageCount][this.TOKENS.LANGUAGE_PROFICIENCY] = this.text; this.getNextToken(); } } // console.log('ZZZ END LANGUAGES'); }; // Parses the skills section of a LinkedIn profile PDF. LinkedInPdfToJson.prototype.parseSkillsAndExpertise = function() { // console.log('ZZZ SKILLS'); this.json[this.section] = []; while (this.token === this.TOKENS.SKILL) { this.json[this.section].push(this.text); this.getNextToken(); } // console.log('ZZZ END SKILLS'); }; //=========================== // GENERATORS/SETTERS/HELPERS //=========================== // Resets count to -1. LinkedInPdfToJson.prototype.resetCount = function() { this.count = -1; }; // Removes unnecessary 'Page' and '{0}' elements and 'Contact {person} on LinkedIn' element from chunks array. // @param chunks - array of string elements representing the top-to-bottom // flow of text from the PDF. LinkedInPdfToJson.prototype.sanitize = function(chunks) { for (var i = 0; i < chunks.length; i++) { if (chunks[i].text === 'Page' && chunks[i + 1].text.match(/\d+/)) { chunks.splice(i, 2); } } chunks.splice(chunks.length - 1, 1); }; // Sets the name, current job, and potentially email properties of the json object // based on the PDF text. // The email property may not be set if it's not provided in the PDF. // These properties can just be assumed since it's standard across all LinkedIn profile PDFs. LinkedInPdfToJson.prototype.setBasicInfo = function() { this.json.name = this.content[this.index].text; this.index++; this.json.currentJob = this.content[this.index].text; if (!this.isSectionHeader(this.content[this.index + 1].text) && this.UNSUPPORTED_SECTIONS.indexOf(this.content[this.index + 1].text) === -1) { this.index++; this.json.email = this.content[this.index].text; } }; LinkedInPdfToJson.prototype.setLastInfo = function() { this.last.bold = this.bold; this.last.index = this.index; this.last.section = this.section; this.last.text = this.text; this.last.token = this.token; this.last.y = this.y; }; // Searches through the current job description for bulleted text. // @return true if a text chunk from the current job description is bulleted. // @return false otherwise. LinkedInPdfToJson.prototype.hasBulletedText = function() { // console.log('ZZZ HAS_BULLETED_TEXT'); var currentBold = this.bold, currentToken = this.token, currentText = this.text, currentSection = this.section, currentIndex = this.index, currentY = this.y; while (this.token === this.TOKENS.SECTION_CONTENT) { if (this.isBulleted() || this.isSeparatedByNewline()) { this.bold = currentBold; this.token = currentToken; this.text = currentText; this.section = currentSection; this.index = currentIndex; this.y = currentY; // console.log('ZZZ END HAS_BULLETED_TEXT (true)'); return true; } this.getNextToken(); } this.bold = currentBold; this.token = currentToken; this.text = currentText; this.section = currentSection; this.index = currentIndex; this.y = currentY; // console.log('ZZZ END HAS_BULLETED_TEXT (false)'); return false; }; //=========================== // TOKEN CHECKS //=========================== // TODO: Simplify by using subsections for grade, activities, etc. // Determines the next token based on the next text chunk LinkedInPdfToJson.prototype.getNextToken = function() { this.setLastInfo(); this.index++; this.text = this.content[this.index] && this.content[this.index].text || undefined; this.bold = this.content[this.index] && this.content[this.index].bold || false; this.y = this.content[this.index] && this.content[this.index].y || undefined; // console.log(); // console.log(JSON.stringify(this.json, null, this.whiteSpace)); // console.log('Setting token...'); // console.log('previous token = ' + this.token); // console.log('previous section = ' + this.section); // console.log('text = ' + '"' + this.text + '"'); // console.log('this.bold = ' + this.bold); if (this.isEOF()) { this.token = this.section = this.TOKENS.EOF; this.section = this.SECTION_HEADERS.Unsupported; } else if (this.isSectionHeader()) { this.token = this.TOKENS.SECTION_HEADER; this.section = this.SECTION_HEADERS[this.text.trim()]; } else if (this.isUnsupported()) { this.token = this.TOKENS.UNSUPPORTED; this.section = this.SECTION_HEADERS.Unsupported; } else if (this.isInUnsupported()) { this.token = this.TOKENS.UNKNOWN; } else if (this.section === this.SECTION_HEADERS.Summary) { if (this.isSectionContent()) { this.token = this.TOKENS.SECTION_CONTENT; } else { throw new Error(this.tokenErrorMsg + '\'' + this.text + '\''); } } else if (this.section === this.SECTION_HEADERS.Education) { if (this.isSchool()) { this.token = this.TOKENS.SCHOOL; } else if (this.isGradeLabel()) { this.token = this.TOKENS.EDU_GRADE_LABEL; } else if (this.isGrade()) { this.token = this.TOKENS.EDU_GRADE; } else if (this.isActivitiesAndSocietiesLabel()) { this.token = this.TOKENS.EDU_ACTIVITIES_SOCIETIES_LABEL; } else if (this.isActivityOrSociety()) { this.token = this.TOKENS.EDU_ACTIVITY_OR_SOCIETY; } else if (this.isEduBasicInfo()) { this.token = this.TOKENS.EDU_BASIC_INFO; } else { throw new Error(this.tokenErrorMsg + '\'' + this.text + '\''); } } else if (this.section === this.SECTION_HEADERS.Experience || this.section === this.SECTION_HEADERS['Volunteer Experience']) { if (this.isJobTitle()) { this.token = this.TOKENS.JOB_TITLE; } else if (this.isDateRange()) { this.token = this.TOKENS.JOB_DATE; } else if (this.isJobDuration()) { this.token = this.TOKENS.JOB_DURATION; } else if (this.isSectionContent()) { this.token = this.TOKENS.SECTION_CONTENT; } else { throw new Error(this.tokenErrorMsg + '\'' + this.text + '\''); } } else if (this.section === this.SECTION_HEADERS.Languages) { if (this.isLanguageProficiency()) { this.token = this.TOKENS.LANGUAGE_PROFICIENCY; } else if (this.isLanguage()) { this.token = this.TOKENS.LANGUAGE; } else { throw new Error(this.tokenErrorMsg + '\'' + this.text + '\''); } } else if (this.section === this.SECTION_HEADERS['Skills & Expertise']) { if (this.isSkill()) { this.token = this.TOKENS.SKILL; } else { throw new Error(this.tokenErrorMsg + '\'' + this.text + '\''); } } else { throw new Error(this.tokenErrorMsg + '\'' + this.text + '\''); } // console.log('new section = ' + this.section); // console.log('new token = ' + this.token); // console.log('Token set!'); }; // Determines if the text chunk is preceded by a bullet/bullet-like symbol or outlined with numbers or letters e.g. •, -, A., 1., etc. // @param previous (optional) - a specific text chunk to evaluate. // @return true if the chunk has been preceded by a bullet or bullet-like symbol. // @return false otherwise. LinkedInPdfToJson.prototype.isBulleted = function(previous) { var chunk = previous || this.text; return chunk.match(/^([A-z0-9](?=\.)|[\-\•\#\~\*])/); }; LinkedInPdfToJson.prototype.isSeparatedByNewline = function() { return this.content[this.index].y - this.last.y > 2; }; // Checks if the text chunk is the end of the file. // @return true if the text chunk is the end of the file i.e. undefined. // @return false otherwise. LinkedInPdfToJson.prototype.isEOF = function() { return !this.text; }; // Determines whether the passed in text chunk is a LinkedIn profile section header. // @param chunk (optional) - a specific text chunk to evaluate. // @return true if text chunk is present in this.SECTION_HEADERS object. // @return false otherwise. LinkedInPdfToJson.prototype.isSectionHeader = function(chunk) { chunk = chunk || this.text; return this.SECTION_HEADERS.hasOwnProperty(chunk.trim()); }; // Checks if the text chunk is a section that is currently unsupported // @return true if the text chunk is the section header of an unsupported section. // @return false otherwise. LinkedInPdfToJson.prototype.isUnsupported = function() { var chunk = this.text; if (chunk === this.json.name && this.content[this.index + 1].text === this.json.currentJob) { chunk = 'recommendations'; } return chunk ? this.UNSUPPORTED_SECTIONS.indexOf(chunk.trim()) !== -1 : false; }; // Checks if the text chunk is in a section that is currently unsupported // @return true if the text chunk is in an unsupported section. // @return false otherwise. LinkedInPdfToJson.prototype.isInUnsupported = function() { return this.token === this.TOKENS.UNSUPPORTED || this.token === this.TOKENS.UNKNOWN; }; // Checks if the text chunk is a skill. // @return true if the text chunk is a skill under the Skills & Expertise section. // @return false otherwise. LinkedInPdfToJson.prototype.isSkill = function() { return (this.token === this.TOKENS.SKILL || this.token === this.TOKENS.SECTION_HEADER) && this.section === this.SECTION_HEADERS['Skills & Expertise']; }; // Checks if the text chunk is a school. // @return true if the text chunk is a school. // @return false otherwise. LinkedInPdfToJson.prototype.isSchool = function() { return this.bold && this.section === this.SECTION_HEADERS.Education; }; // Checks if the text chunk is the grade label. // @return true if the text chunk is the grade label in an Education experience section. // @return false otherwise. LinkedInPdfToJson.prototype.isGradeLabel = function() { return this.text.match(/^Grade:/); }; // Checks if the text chunk a grade. // @return true if the text chunk is the grade in an Education experience section. // @return false otherwise. LinkedInPdfToJson.prototype.isGrade = function() { return this.token === this.TOKENS.EDU_GRADE_LABEL; }; // Checks if the text chunk is the Actvities and Societies label. // @return true if the text chunk is the Activites and Societies label in an Education experience section. // @return false otherwise. LinkedInPdfToJson.prototype.isActivitiesAndSocietiesLabel = function() { return this.text.match(/^Activities and Societies:/); }; // Checks if the text chunk is an activity or society. // @return true if the text chunk is an activity or society listed in the Activities and Societies section of an Education section. // @return false otherwise. LinkedInPdfToJson.prototype.isActivityOrSociety = function() { return this.token === this.TOKENS.EDU_ACTIVITIES_SOCIETIES_LABEL || this.token === this.TOKENS.EDU_ACTIVITY_OR_SOCIETY; }; // Checks if the text chunk is the basic info of an education section // NOTE: This token check needs to come after all the other education section token checks because of it's simplicity. // @return true if the text chunk is the basic info of an Education experience section. // @return false otherwise. LinkedInPdfToJson.prototype.isEduBasicInfo = function() { return this.token === this.TOKENS.SCHOOL || this.token === this.TOKENS.EDU_BASIC_INFO; }; // Checks if the text chunk is a job title. // Job titles follow this general format: 'job_title at company'. // NOTE: LinkedIn PDF job titles have two spaces before and three spaces after the 'at'. // Job titles are also required by LinkedIn to fill out an Experience or Volunteer Experience section. // @return true if the text chunk is the job title of the currently parsed job. // @return false otherwise. LinkedInPdfToJson.prototype.isJobTitle = function() { if (this.section === this.SECTION_HEADERS.Experience) { return this.bold && (this.token === this.TOKENS.SECTION_HEADER || this.token === this.TOKENS.JOB_DURATION || this.token === this.TOKENS.SECTION_CONTENT || this.token === this.TOKENS.JOB_TITLE); } else if (this.section === this.SECTION_HEADERS['Volunteer Experience']) { return this.bold && (this.token === this.TOKENS.SECTION_HEADER || this.token === this.TOKENS.JOB_DATE || this.token === this.TOKENS.SECTION_CONTENT || this.token === this.TOKENS.JOB_TITLE); } return false; }; // Checks if the text chunk is a job date range e.g. 'September 2014 - December 2014'. // Job dates follow this general format: '[month_name] year - [present|[[month_name] year]]]'. // NOTE: Job dates are required by LinkedIn to fill out an Experience section, but not Volunteer Experience. // This is also used when parsing the Education section to gather basic education info. // @param chunk (optional) - a specific text chunk to evaluate. // @return true if the text chunk is a date range of the currently parse job. // @return false otherwise. LinkedInPdfToJson.prototype.isDateRange = function(chunk) { chunk = chunk || this.text; return (this.token === this.TOKENS.JOB_TITLE || this.token === this.TOKENS.EDU_BASIC_INFO) && chunk.match(/^\w*\s*\d+\s+\-\s+\w*\s*\d*/); }; // Checks if the text chunk is a job period e.g. '(1 year 2 months)'. // Job periods follow this general format: '(number month(s)|year)|(number year(s)[ number month(s)])'. // NOTE: Job durations are always present for jobs since they are calculated by LinkedIn based on the job date, // which is required by LinkedIn to fill out an Experience or Volunteer Experience section. // @return true if the text chunk is a time duration of the currently parsed job. // @return false otherwise. LinkedInPdfToJson.prototype.isJobDuration = function() { return this.token === this.TOKENS.JOB_DATE && this.text.match(/\(\d+\s\w+\s*\d*\s*\w*\)|^\(less than a year\)/); }; // Checks if the text chunk is the proficiency level of a language. // @return true if the text chunk is the proficiency level of a language in the Languages section. // @return false otherwise. LinkedInPdfToJson.prototype.isLanguageProficiency = function() { return this.token === this.TOKENS.LANGUAGE && this.text.match(/proficiency\)$/); }; // Checks if the text chunk is a language. // @return true if the text chunk is a language listed under the Languages section. // @return false otherwise. LinkedInPdfToJson.prototype.isLanguage = function() { return (this.section === this.SECTION_HEADERS.Languages || this.token === this.TOKENS.LANGUAGE); }; // Checks if the text chunk is part of a section. // @return true if the text chunk is part the current sections text content // @return false otherwise. LinkedInPdfToJson.prototype.isSectionContent = function() { if (this.section === this.SECTION_HEADERS['Volunteer Experience']) { return !this.bold && (this.token === this.TOKENS.JOB_TITLE || this.token === this.TOKENS.JOB_DURATION || this.token === this.TOKENS.JOB_DATE || this.token === this.TOKENS.SECTION_CONTENT || this.token === this.TOKENS.SECTION_HEADER); } return !this.bold && (this.token === this.TOKENS.JOB_DURATION || this.token === this.TOKENS.JOB_DATE || this.token === this.TOKENS.SECTION_CONTENT || this.token === this.TOKENS.SECTION_HEADER); };