@jomari-wp/linkedin-jobs-scraper
Version:
Scrape public available jobs on Linkedin using headless browser
351 lines (350 loc) • 16.1 kB
JavaScript
;
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
var _a;
Object.defineProperty(exports, "__esModule", { value: true });
exports.AnonymousStrategy = exports.Selectors = void 0;
const RunStrategy_1 = require("./RunStrategy");
const events_1 = require("../events");
const utils_1 = require("../../utils/utils");
const logger_1 = require("../../logger/logger");
class Selectors {
static get container() {
return !this.switchSelectors ? '.results__container.results__container--two-pane' :
'.two-pane-serp-page__results-list';
}
static get jobs() {
return !this.switchSelectors ? '.jobs-search__results-list li' :
'.jobs-search__results-list li';
}
static get links() {
return !this.switchSelectors ? '.jobs-search__results-list li a.result-card__full-card-link' :
'a.base-card__full-link';
}
static get applyLink() {
return 'a[data-is-offsite-apply=true]';
}
static get dates() {
return 'time';
}
static get companies() {
return !this.switchSelectors ? '.result-card__subtitle.job-result-card__subtitle' :
'.base-search-card__subtitle';
}
static get places() {
return !this.switchSelectors ? '.job-result-card__location' :
'.job-search-card__location';
}
static get detailsPanel() {
return '.details-pane__content';
}
static get description() {
return '.description__text';
}
static get seeMoreJobs() {
return 'button.infinite-scroller__show-more-button';
}
}
exports.Selectors = Selectors;
Selectors.switchSelectors = false;
/**
* @class AnonymousStrategy
* @extends RunStrategy
*/
class AnonymousStrategy extends RunStrategy_1.RunStrategy {
constructor() {
super(...arguments);
/**
* Run strategy
* @param browser
* @param page
* @param cdpSession
* @param url
* @param query
* @param location
*/
this.run = (browser, page, cdpSession, url, query, location) => __awaiter(this, void 0, void 0, function* () {
var _b;
console.warn("Anonymous session strategy is no longer maintained and it won't probably work. It is recommended to use an authenticated session, see documentation at https://github.com/spinlud/linkedin-jobs-scraper#anonymous-vs-authenticated-session.");
let tag = `[${query.query}][${location}]`;
let processed = 0;
logger_1.logger.info(tag, "Opening", url);
yield page.goto(url, {
waitUntil: 'load',
});
// Verify if authentication is required
if ((yield AnonymousStrategy._needsAuthentication(page))) {
logger_1.logger.error(tag, "Scraper failed to run in anonymous mode, authentication may be necessary for this environment. Please check the documentation on how to use an authenticated session.");
return { exit: true };
}
// Linkedin seems to randomly load two different set of selectors:
// the following hack tries to switch between the two sets
// Try to load first set of selectors
try {
Selectors.switchSelectors = false;
logger_1.logger.info(tag, 'Trying to load first selectors set');
logger_1.logger.debug(tag, `Evaluating selectors`, [Selectors.container]);
yield page.waitForSelector(Selectors.container, { timeout: 3000 });
}
catch (err) {
// Try to load second set of selectors
try {
Selectors.switchSelectors = true;
logger_1.logger.info(tag, 'Trying to load second selectors set');
logger_1.logger.debug(tag, `Evaluating selectors`, [Selectors.container]);
yield page.waitForSelector(Selectors.container, { timeout: 3000 });
}
catch (err) {
logger_1.logger.info(tag, 'Failed to load container selector, skip');
return { exit: false };
}
}
logger_1.logger.info(tag, 'OK');
let jobIndex = 0;
// Pagination loop
while (processed < query.options.limit) {
yield AnonymousStrategy._acceptCookies(page, tag);
// Get number of all job links in the page
let jobsTot = yield page.evaluate((selector) => document.querySelectorAll(selector).length, Selectors.jobs);
if (jobsTot === 0) {
logger_1.logger.info(tag, `No jobs found, skip`);
break;
}
logger_1.logger.info(tag, "Jobs fetched: " + jobsTot);
// Jobs loop
while (jobIndex < jobsTot && processed < query.options.limit) {
tag = `[${query.query}][${location}][${processed + 1}]`;
let jobId;
let jobLink;
let jobApplyLink;
let jobTitle;
let jobCompany;
let jobPlace;
let jobDescription;
let jobDescriptionHTML;
let jobDate;
let jobSenorityLevel;
let jobFunction;
let jobEmploymentType;
let jobIndustries;
let loadJobDetailsResult;
try {
// Extract job main fields
logger_1.logger.debug(tag, `Evaluating selectors`, [
Selectors.jobs,
Selectors.links,
Selectors.companies,
Selectors.places,
Selectors.dates,
]);
[jobId, jobLink, jobTitle, jobCompany, jobPlace, jobDate] = yield page.evaluate((jobsSelector, linksSelector, companiesSelector, placesSelector, datesSelector, jobIndex) => {
const job = document.querySelectorAll(jobsSelector)[jobIndex];
const link = job.querySelector(linksSelector);
// Click job link and scroll
link.scrollIntoView();
link.click();
const linkUrl = link.getAttribute("href");
let jobId = '';
// Try first set of selectors
jobId = job.getAttribute('data-id');
// If failed, try second set of selectors
if (!jobId) {
jobId = job.querySelector(linksSelector)
.parentElement.getAttribute('data-entity-urn')
.split(':').splice(-1)[0];
}
return [
jobId,
linkUrl,
job.querySelector(linksSelector).innerText,
job.querySelector(companiesSelector).innerText,
job.querySelector(placesSelector).innerText,
job.querySelector(datesSelector).getAttribute('datetime')
];
}, Selectors.jobs, Selectors.links, Selectors.companies, Selectors.places, Selectors.dates, jobIndex);
// Load job details and extract job link
logger_1.logger.debug(tag, `Evaluating selectors`, [
Selectors.links,
]);
loadJobDetailsResult = yield AnonymousStrategy._loadJobDetails(page, jobId);
// Check if loading job details has failed
if (!loadJobDetailsResult.success) {
logger_1.logger.error(tag, loadJobDetailsResult.error);
this.scraper.emit(events_1.events.scraper.error, `${tag}\t${loadJobDetailsResult.error}`);
jobIndex += 1;
continue;
}
// Use custom description function if available
logger_1.logger.debug(tag, `Evaluating selectors`, [
Selectors.description
]);
if ((_b = query.options) === null || _b === void 0 ? void 0 : _b.descriptionFn) {
[jobDescription, jobDescriptionHTML] = yield Promise.all([
page.evaluate(`(${query.options.descriptionFn.toString()})();`),
page.evaluate((selector) => {
return document.querySelector(selector).outerHTML;
}, Selectors.description)
]);
}
else {
[jobDescription, jobDescriptionHTML] = yield page.evaluate((selector) => {
const el = document.querySelector(selector);
return [el.innerText, el.outerHTML];
}, Selectors.description);
}
// Extract apply link
logger_1.logger.debug(tag, `Evaluating selectors`, [
Selectors.applyLink
]);
jobApplyLink = yield page.evaluate((selector) => {
const applyBtn = document.querySelector(selector);
return applyBtn ? applyBtn.getAttribute("href") : null;
}, Selectors.applyLink);
}
catch (err) {
const errorMessage = `${tag}\t${err.message}`;
this.scraper.emit(events_1.events.scraper.error, errorMessage);
jobIndex += 1;
continue;
}
// Emit data
this.scraper.emit(events_1.events.scraper.data, Object.assign(Object.assign({ query: query.query || "", location: location, jobId: jobId, jobIndex: jobIndex, link: jobLink }, jobApplyLink && { applyLink: jobApplyLink }), { title: jobTitle, company: jobCompany, place: jobPlace, description: jobDescription, descriptionHTML: jobDescriptionHTML, date: jobDate, insights: [] }));
jobIndex += 1;
processed += 1;
logger_1.logger.info(tag, `Processed`);
if (processed < query.options.limit && jobIndex === jobsTot) {
logger_1.logger.info(tag, 'Fecthing new jobs');
jobsTot = yield page.evaluate((selector) => document.querySelectorAll(selector).length, Selectors.jobs);
}
}
// Check if we reached the limit of jobs to process
if (processed === query.options.limit)
break;
// Check if there are more jobs to load
logger_1.logger.info(tag, "Checking for new jobs to load...");
const loadMoreJobsResult = yield AnonymousStrategy._loadMoreJobs(page, jobsTot);
// Check if loading jobs has failed
if (!loadMoreJobsResult.success) {
logger_1.logger.info(tag, "There are no more jobs available for the current query");
break;
}
}
return { exit: false };
});
}
}
exports.AnonymousStrategy = AnonymousStrategy;
_a = AnonymousStrategy;
/**
* Verify if authentication is required
* @param {Page} page
* @returns {Promise<boolean>}
* @static
* @private
*/
AnonymousStrategy._needsAuthentication = (page) => __awaiter(void 0, void 0, void 0, function* () {
const parsed = new URL(yield page.url());
return parsed.pathname.toLowerCase().includes("authwall");
});
/**
* Wait for job details to load
* @param page {Page}
* @param jobId {string}
* @param timeout {number}
* @returns {Promise<ILoadResult>}
* @static
* @private
*/
AnonymousStrategy._loadJobDetails = (page, jobId, timeout = 2000) => __awaiter(void 0, void 0, void 0, function* () {
const waitTime = 50;
let elapsed = 0;
let loaded = false;
while (!loaded) {
loaded = yield page.evaluate((jobId, panelSelector, descriptionSelector) => {
const detailsPanel = document.querySelector(panelSelector);
const description = document.querySelector(descriptionSelector);
return detailsPanel && detailsPanel.innerHTML.includes(jobId) &&
description && description.innerText.length > 0;
}, jobId, Selectors.detailsPanel, Selectors.description);
if (loaded)
return { success: true };
yield (0, utils_1.sleep)(waitTime);
elapsed += waitTime;
if (elapsed >= timeout) {
return {
success: false,
error: `Timeout on loading job details`
};
}
}
return { success: true };
});
/**
* Try to load more jobs
* @param page {Page}
* @param jobLinksTot {number}
* @param timeout {number}
* @returns {Promise<ILoadResult>}
* @private
*/
AnonymousStrategy._loadMoreJobs = (page, jobLinksTot, timeout = 2000) => __awaiter(void 0, void 0, void 0, function* () {
const pollingTime = 100;
let elapsed = 0;
let loaded = false;
let clicked = false;
while (!loaded) {
if (!clicked) {
clicked = yield page.evaluate((selector) => {
const button = document.querySelector(selector);
if (button) {
button.click();
return true;
}
else {
return false;
}
}, Selectors.seeMoreJobs);
}
loaded = yield page.evaluate((selector, jobLinksTot) => {
window.scrollTo(0, document.body.scrollHeight);
return document.querySelectorAll(selector).length > jobLinksTot;
}, Selectors.jobs, jobLinksTot);
if (loaded)
return { success: true };
yield (0, utils_1.sleep)(pollingTime);
elapsed += pollingTime;
if (elapsed >= timeout) {
return {
success: false,
error: `Timeout on loading more jobs`
};
}
}
return { success: true };
});
/**
* Accept cookies
* @param {Page} page
* @param {string} tag
*/
AnonymousStrategy._acceptCookies = (page, tag) => __awaiter(void 0, void 0, void 0, function* () {
try {
yield page.evaluate(() => {
const buttons = Array.from(document.querySelectorAll('button'));
const cookieButton = buttons.find(e => e.innerText.includes('Accept cookies'));
if (cookieButton) {
cookieButton.click();
}
});
}
catch (err) {
logger_1.logger.debug(tag, "Failed to accept cookies");
}
});