/*
 * job-hoarder
 * Version:
 * Job board aggregator to pull in standardized job postings from company job pages
 * 104 lines (103 loc) • 4.29 kB
 * JavaScript
 */
;
/**
 * Interop helper for default imports. Reuses a helper already attached to
 * `this` when there is one; otherwise falls back to the local implementation.
 *
 * @param {*} mod Value returned by `require`.
 * @returns {*} `mod` itself when it carries a truthy `__esModule` flag,
 *   otherwise a new object exposing `mod` under the `default` key.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
// Set exports.__esModule = true, the flag that helpers such as
// __importDefault check before deciding whether to wrap a module.
Object.defineProperty(exports, "__esModule", { value: true });
// cheerio goes through __importDefault so it can be reached as
// `cheerio_1.default` further down in this file.
var cheerio_1 = __importDefault(require("cheerio"));
// Project-local helpers; this file calls `getMatch` from it.
var utils_1 = require("../utils");
/**
 * Parser that turns Jazz job-board HTML into plain job objects.
 *
 * Every method is attached to the instance in the constructor, so the
 * methods can be passed around as callbacks without losing their receiver.
 */
var JazzScrapeParser = /** @class */ (function () {
    // Pattern applied to a Jazz job URL; its single capture group is the
    // path segment that follows "apply/".
    var JAZZ_ID_PATTERN = /.+apply\/(.+)\/.+/;

    /**
     * Formats a JSON-LD address as "locality, region".
     * Missing parts are skipped instead of being rendered as "undefined".
     * @param {object} [address] Object with optional addressLocality / addressRegion
     * @returns {string|undefined} Formatted location, or undefined when nothing usable is present
     */
    function formatAddress(address) {
        if (!address)
            return undefined;
        var parts = [address.addressLocality, address.addressRegion].filter(Boolean);
        return parts.length > 0 ? parts.join(', ') : undefined;
    }

    /**
     * Reads and parses the embedded JSON-LD script, if any.
     * @param $ Cheerio object for the job page
     * @returns {*} Parsed JSON value, or null when the script is absent, empty or not valid JSON
     */
    function readEmbeddedJSON($) {
        var raw = $('script[type="application/ld+json"]').html();
        if (!raw)
            return null;
        try {
            return JSON.parse(raw);
        }
        catch (err) {
            if (!(err instanceof SyntaxError))
                throw err;
            // Deliberate best-effort: a malformed JSON-LD payload must not
            // prevent the caller from falling back to HTML scraping.
            return null;
        }
    }

    function JazzScrapeParser() {
        var _this = this;
        /**
         * Parse jobs from request result
         * @param {string} data String of jobs
         * @returns {array} List of parsed jobs
         * @throws {Error} When data is falsy
         */
        this.parseJobs = function (data) {
            if (!data)
                throw new Error('No jobs to parse');
            var jobs = [];
            var $ = cheerio_1.default.load(data);
            $('.jobs-list ul li.list-group-item').each(function (_index, elem) {
                var job = $(elem);
                var link = job.find('a');
                var url = link.attr('href');
                jobs.push({
                    // Same extraction as single-job parsing, kept in one place.
                    id: _this.parseIdFromUrl(url),
                    url: url,
                    title: link.text().trim(),
                    jobLocation: job.find('.fa-map-marker').parent().text(),
                    department: job.find('.fa-sitemap').parent().text(),
                });
            });
            return jobs;
        };
        /**
         * Parsed ID from the Jazz URL
         * @param url URL to parse ID from
         * @returns Whatever utils.getMatch yields for the ID pattern
         *   (presumably the captured ID segment - confirm against utils)
         */
        this.parseIdFromUrl = function (url) { return utils_1.getMatch(url, JAZZ_ID_PATTERN); };
        /**
         * Parses and normalizes job based on Embedded JSON object
         * @param parsedData JSON object embedded in HTML
         * @param $ Cheerio object to extract extra features
         * @returns {object} Normalized job; jobLocation is undefined when the
         *   embedded data carries no usable address
         */
        this.parseJobJSON = function (parsedData, $) {
            // A missing url becomes '' so the ID lookup always receives a string.
            var url = parsedData.url === undefined ? '' : parsedData.url;
            // jobLocation is optional in the embedded data; do not throw without it.
            var address = parsedData.jobLocation ? parsedData.jobLocation.address : undefined;
            return {
                id: _this.parseIdFromUrl(url),
                url: url,
                title: parsedData.title,
                datePosted: new Date(parsedData.datePosted),
                jobLocation: formatAddress(address),
                department: $('li[title="Department"]').text().trim(),
                description: parsedData.description,
            };
        };
        /**
         * Scrapes key fields from HTML
         * @param $ Cheerio object to extract features from
         * @returns {object} Job built purely from page markup (no datePosted)
         */
        this.parseJobHTML = function ($) {
            var url = $('meta[property="og:url"]').attr('content');
            return {
                id: _this.parseIdFromUrl(url),
                url: url,
                title: $('h1:not(.brand-text)').text().trim(),
                jobLocation: $('li[title="Location"]').text().trim(),
                department: $('li[title="Department"]').text().trim(),
                description: $('div.description').first().html(),
            };
        };
        /**
         * Parses job from request result
         * @param {string} data String of job result
         * @returns {object} Object of parsed job
         * @throws {Error} When data is falsy
         */
        this.parseJob = function (data) {
            if (!data)
                throw new Error('No job to parse');
            var $ = cheerio_1.default.load(data);
            // Prefer the embedded JSON object when it is present and usable
            var parsedData = readEmbeddedJSON($);
            if (parsedData && parsedData.url)
                return _this.parseJobJSON(parsedData, $);
            // Parse HTML as fallback
            return _this.parseJobHTML($);
        };
    }
    return JazzScrapeParser;
}());
// Expose the parser class as this module's default export.
exports.default = JazzScrapeParser;