@jomari-wp/linkedin-jobs-scraper
Version:
Scrape publicly available jobs on LinkedIn using a headless browser
504 lines (503 loc) • 23.1 kB
JavaScript
;
// Helper that drives a generator function as an asynchronous routine.
// Reuses an existing `this.__awaiter` when one is present; otherwise defines it here.
//   thisArg    - `this` value the generator function is applied with
//   _arguments - argument list the generator function is applied with (defaults to [])
//   P          - promise constructor to use; falls back to the global Promise when falsy
//   generator  - generator function whose yielded values are awaited one by one
// Returns a promise that resolves with the generator's final value, or rejects
// with any error thrown while stepping it.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Wrap a yielded value in a P instance unless it already is one.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with the settled value; a synchronous throw rejects the outer promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Re-throw a rejection inside the generator so its own try/catch can handle it.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// Finish when the generator is done, otherwise wait for the yielded value and continue.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// Instantiate the generator (rebinding `generator` to the iterator) and take the first step.
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
var _a;
Object.defineProperty(exports, "__esModule", { value: true });
exports.AuthenticatedStrategy = exports.selectors = void 0;
const config_1 = require("../../config");
const RunStrategy_1 = require("./RunStrategy");
const events_1 = require("../events");
const utils_1 = require("../../utils/utils");
const string_1 = require("../../utils/string");
const logger_1 = require("../../logger/logger");
const constants_1 = require("../constants");
exports.selectors = {
container: '.jobs-search-results-list',
chatPanel: '.msg-overlay-list-bubble',
jobs: 'div.job-card-container',
link: 'a.job-card-container__link',
applyBtn: 'button.jobs-apply-button[role="link"]',
title: '.artdeco-entity-lockup__title',
company: '.artdeco-entity-lockup__subtitle',
companyLink: 'a.job-card-container__company-name',
place: '.artdeco-entity-lockup__caption',
date: 'time',
description: '.jobs-description',
detailsPanel: '.jobs-search__job-details--container',
detailsTop: '.jobs-details-top-card',
details: '.jobs-details__main-content',
insights: '[class=jobs-unified-top-card__job-insight]',
pagination: '.jobs-search-two-pane__pagination',
privacyAcceptBtn: 'button.artdeco-global-alert__action',
paginationNextBtn: 'li[data-test-pagination-page-btn].selected + li',
paginationBtn: (index) => `li[data-test-pagination-page-btn="${index}"] button`,
};
/**
 * Run strategy that scrapes the jobs search page using an authenticated
 * session (the `li_at` cookie).
 * @class AuthenticatedStrategy
 * @extends RunStrategy
 */
class AuthenticatedStrategy extends RunStrategy_1.RunStrategy {
    constructor() {
        super(...arguments);
        /**
         * Run strategy.
         *
         * Opens the home page, sets the `li_at` session cookie, opens the search
         * url and then walks the result pages. For every job card it emits a
         * `data` event; failures are emitted as `error` events and counted in
         * the metrics, which are emitted after each page.
         *
         * @param browser
         * @param page
         * @param cdpSession
         * @param url
         * @param query
         * @param location
         * @returns {Promise<{exit: boolean}>} `exit` is true only when the initial session check fails
         */
        this.run = (browser, page, cdpSession, url, query, location) => __awaiter(this, void 0, void 0, function* () {
            let tag = `[${query.query}][${location}]`;
            const metrics = {
                processed: 0,
                failed: 0,
                missed: 0,
            };
            let paginationIndex = 0;
            const paginationSize = 25;
            // Navigate to home page
            logger_1.logger.debug(tag, "Opening", constants_1.urls.home);
            yield page.goto(constants_1.urls.home, {
                waitUntil: 'load',
            });
            // Set cookie
            logger_1.logger.info("Setting authentication cookie");
            yield page.setCookie({
                name: "li_at",
                value: this.session || config_1.config.LI_AT_COOKIE,
                domain: ".www.linkedin.com"
            });
            // Open search url
            logger_1.logger.info(tag, "Opening", url);
            yield page.goto(url, {
                waitUntil: 'load',
            });
            // Verify session
            if (!(yield AuthenticatedStrategy._isAuthenticatedSession(page))) {
                logger_1.logger.error("The provided session cookie is invalid. Check the documentation on how to obtain a valid session cookie.");
                this.scraper.emit(events_1.events.scraper.invalidSession);
                return { exit: true };
            }
            // A missing results container is treated as "no jobs" rather than as an error
            try {
                yield page.waitForSelector(exports.selectors.container, { timeout: 5000 });
            }
            catch (err) {
                logger_1.logger.info(tag, `No jobs found, skip`);
                return { exit: false };
            }
            // Pagination loop
            while (metrics.processed < query.options.limit) {
                // Verify session in the loop (only warns: the scraping attempt continues)
                if (!(yield AuthenticatedStrategy._isAuthenticatedSession(page))) {
                    logger_1.logger.warn(tag, "Session is invalid, this may cause the scraper to fail.");
                    this.scraper.emit(events_1.events.scraper.invalidSession);
                }
                else {
                    logger_1.logger.info(tag, "Session is valid");
                }
                yield AuthenticatedStrategy._hideChatPanel(page, tag);
                yield AuthenticatedStrategy._acceptCookies(page, tag);
                yield AuthenticatedStrategy._acceptPrivacy(page, tag);
                let jobIndex = 0;
                // Get number of all job links in the page
                let jobsTot = yield page.evaluate((selector) => document.querySelectorAll(selector).length, exports.selectors.jobs);
                if (jobsTot === 0) {
                    logger_1.logger.info(tag, `No jobs found, skip`);
                    break;
                }
                // Jobs loop
                while (jobIndex < jobsTot && metrics.processed < query.options.limit) {
                    tag = `[${query.query}][${location}][${paginationIndex * paginationSize + jobIndex + 1}]`;
                    let jobId;
                    let jobLink;
                    let jobApplyLink;
                    let jobTitle;
                    let jobCompany;
                    let jobCompanyLink;
                    let jobCompanyImgLink;
                    let jobPlace;
                    let jobDescription;
                    let jobDescriptionHTML;
                    let jobDate;
                    let loadDetailsResult;
                    let jobInsights;
                    try {
                        // Extract job main fields
                        logger_1.logger.debug(tag, 'Evaluating selectors', [
                            exports.selectors.jobs,
                            exports.selectors.link,
                            exports.selectors.company,
                            exports.selectors.place,
                            exports.selectors.date,
                        ]);
                        // NB: this callback runs in the page context, so it must be self-contained
                        const jobFieldsResult = yield page.evaluate((jobsSelector, linkSelector, titleSelector, companyLinkSelector, placeSelector, dateSelector, jobIndex) => {
                            const job = document.querySelectorAll(jobsSelector)[jobIndex];
                            const link = job.querySelector(linkSelector);
                            // Click job link and scroll
                            link.scrollIntoView();
                            link.click();
                            // Extract job link (relative)
                            const protocol = window.location.protocol + "//";
                            const hostname = window.location.hostname;
                            const jobLink = protocol + hostname + link.getAttribute("href");
                            const jobId = job.getAttribute("data-job-id");
                            const title = job.querySelector(titleSelector) ?
                                job.querySelector(titleSelector).innerText : "";
                            let company = "";
                            let companyLink = undefined;
                            if (job.querySelector(companyLinkSelector)) {
                                const companyLinkElem = job.querySelector(companyLinkSelector);
                                company = companyLinkElem.innerText.trim();
                                companyLink = companyLinkElem.getAttribute("href") ?
                                    `${protocol}${hostname}${companyLinkElem.getAttribute("href")}` : undefined;
                            }
                            // First image inside the card, if any; a missing image or attribute yields undefined
                            const imgElem = job.querySelector("img");
                            const imgSrc = imgElem ? imgElem.getAttribute("src") : null;
                            const companyImgLink = imgSrc !== null && imgSrc !== undefined ? imgSrc : undefined;
                            const place = job.querySelector(placeSelector) ?
                                job.querySelector(placeSelector).innerText : "";
                            const date = job.querySelector(dateSelector) ?
                                job.querySelector(dateSelector).getAttribute('datetime') : "";
                            return {
                                jobId,
                                jobLink,
                                title,
                                company,
                                companyLink,
                                companyImgLink,
                                place,
                                date,
                            };
                        }, exports.selectors.jobs, exports.selectors.link, exports.selectors.title, exports.selectors.companyLink, exports.selectors.place, exports.selectors.date, jobIndex);
                        jobId = jobFieldsResult.jobId;
                        jobLink = jobFieldsResult.jobLink;
                        jobTitle = jobFieldsResult.title;
                        jobCompany = jobFieldsResult.company;
                        jobCompanyLink = jobFieldsResult.companyLink;
                        jobCompanyImgLink = jobFieldsResult.companyImgLink;
                        jobPlace = jobFieldsResult.place;
                        jobDate = jobFieldsResult.date;
                        // Try to load job details and extract job link
                        logger_1.logger.debug(tag, 'Evaluating selectors', [
                            exports.selectors.jobs,
                        ]);
                        loadDetailsResult = yield AuthenticatedStrategy._loadJobDetails(page, jobId);
                        // Check if loading job details has failed
                        if (!loadDetailsResult.success) {
                            logger_1.logger.error(tag, loadDetailsResult.error);
                            jobIndex += 1;
                            // Count the skipped job so it is not lost from the metrics
                            metrics.failed += 1;
                            continue;
                        }
                        // Use custom description function if available
                        logger_1.logger.debug(tag, 'Evaluating selectors', [
                            exports.selectors.description,
                        ]);
                        if (query.options && query.options.descriptionFn) {
                            [jobDescription, jobDescriptionHTML] = yield Promise.all([
                                page.evaluate(`(${query.options.descriptionFn.toString()})();`),
                                page.evaluate((selector) => {
                                    return document.querySelector(selector).outerHTML;
                                }, exports.selectors.description)
                            ]);
                        }
                        else {
                            [jobDescription, jobDescriptionHTML] = yield page.evaluate((selector) => {
                                const el = document.querySelector(selector);
                                return [el.innerText, el.outerHTML];
                            }, exports.selectors.description);
                        }
                        // Extract job insights
                        logger_1.logger.debug(tag, 'Evaluating selectors', [
                            exports.selectors.insights,
                        ]);
                        jobInsights = yield page.evaluate((jobInsightsSelector) => {
                            const nodes = document.querySelectorAll(jobInsightsSelector);
                            return Array.from(nodes).map(e => e.textContent
                                .replace(/[\n\r\t ]+/g, ' ').trim());
                        }, exports.selectors.insights);
                        // Apply link (best effort: a failure here only logs a warning)
                        if (query.options && query.options.applyLink) {
                            try {
                                if (yield page.evaluate((applyBtnSelector) => {
                                    const applyBtn = document.querySelector(applyBtnSelector);
                                    if (applyBtn) {
                                        applyBtn.click();
                                        return true;
                                    }
                                    return false;
                                }, exports.selectors.applyBtn)) {
                                    logger_1.logger.debug(tag, 'Try extracting apply link');
                                    const targetsResponse = yield cdpSession.send('Target.getTargets');
                                    // The first not attached target should be the apply page
                                    if (targetsResponse.targetInfos && targetsResponse.targetInfos.length > 1) {
                                        const applyTarget = targetsResponse.targetInfos
                                            .filter(e => e.type === 'page')
                                            .find(e => !e.attached);
                                        if (applyTarget) {
                                            jobApplyLink = applyTarget.url;
                                            yield cdpSession.send('Target.closeTarget', { targetId: applyTarget.targetId });
                                        }
                                    }
                                }
                            }
                            catch (err) {
                                logger_1.logger.warn(tag, 'Failed to extract apply link', err);
                            }
                        }
                    }
                    catch (err) {
                        const errorMessage = `${tag}\t${err.message}`;
                        this.scraper.emit(events_1.events.scraper.error, errorMessage);
                        jobIndex++;
                        metrics.failed++;
                        continue;
                    }
                    // Emit data (NB: should be outside of try/catch block to be properly tested)
                    this.scraper.emit(events_1.events.scraper.data, {
                        query: query.query || "",
                        location: location,
                        jobId: jobId,
                        jobIndex: jobIndex,
                        link: jobLink,
                        applyLink: jobApplyLink,
                        title: (0, string_1.normalizeString)(jobTitle),
                        company: (0, string_1.normalizeString)(jobCompany),
                        companyLink: jobCompanyLink,
                        companyImgLink: jobCompanyImgLink,
                        place: (0, string_1.normalizeString)(jobPlace),
                        description: jobDescription,
                        descriptionHTML: jobDescriptionHTML,
                        date: jobDate,
                        insights: jobInsights,
                    });
                    jobIndex += 1;
                    metrics.processed += 1;
                    logger_1.logger.info(tag, `Processed`);
                    // Last visible card reached on a page that is not full yet: wait for more cards to appear
                    if (metrics.processed < query.options.limit && jobIndex === jobsTot && jobsTot < paginationSize) {
                        const loadJobsResult = yield AuthenticatedStrategy._loadJobs(page, jobsTot);
                        if (loadJobsResult.success) {
                            jobsTot = loadJobsResult.count;
                        }
                    }
                    if (jobIndex === jobsTot) {
                        break;
                    }
                }
                tag = `[${query.query}][${location}]`;
                logger_1.logger.info(tag, 'No more jobs to process in this page');
                // Check if we reached the limit of jobs to process
                if (metrics.processed === query.options.limit) {
                    logger_1.logger.info(tag, 'Query limit reached!');
                    // Emit metrics
                    this.scraper.emit(events_1.events.scraper.metrics, metrics);
                    logger_1.logger.info(tag, 'Metrics:', metrics);
                    break;
                }
                else {
                    metrics.missed += paginationSize - jobIndex;
                }
                // Emit metrics
                this.scraper.emit(events_1.events.scraper.metrics, metrics);
                logger_1.logger.info(tag, 'Metrics:', metrics);
                // Try to paginate
                paginationIndex += 1;
                logger_1.logger.info(tag, `Pagination requested [${paginationIndex}]`);
                // Pass the page size explicitly so the url offset and the job numbering stay in sync
                const paginationResult = yield AuthenticatedStrategy._paginate(page, tag, paginationSize);
                if (!paginationResult.success) {
                    logger_1.logger.info(tag, `Couldn\'t find more jobs for the running query`);
                    break;
                }
            }
            return { exit: false };
        });
    }
}
exports.AuthenticatedStrategy = AuthenticatedStrategy;
_a = AuthenticatedStrategy;
/**
 * Tell whether the page currently holds a session cookie.
 * Only the presence of a cookie named "li_at" is checked.
 * @param {Page} page
 * @returns {Promise<boolean>} true when a "li_at" cookie is present
 * @static
 * @private
 */
AuthenticatedStrategy._isAuthenticatedSession = (page) => __awaiter(void 0, void 0, void 0, function* () {
    const sessionCookieName = "li_at";
    const pageCookies = yield page.cookies();
    const hasSessionCookie = pageCookies.some((cookie) => cookie.name === sessionCookieName);
    return hasSessionCookie;
});
/**
 * Wait for more job cards to appear in the page.
 *
 * Polls the number of elements matching the jobs selector every 50 ms until
 * it exceeds `jobsTot` or the polling budget is exhausted.
 *
 * @param page {Page}
 * @param jobsTot {number} Number of job cards already known
 * @param timeout {number} Polling budget in milliseconds (time spent evaluating is not counted)
 * @returns {Promise<{success: boolean, count?: number, error?: string}>}
 *  `count` is the new number of job cards on success; `error` describes a timeout
 *  or the failure raised while evaluating the page
 * @static
 * @private
 */
AuthenticatedStrategy._loadJobs = (page, jobsTot, timeout = 2000) => __awaiter(void 0, void 0, void 0, function* () {
    const pollingTime = 50;
    let elapsed = 0;
    yield (0, utils_1.sleep)(pollingTime);
    try {
        while (elapsed < timeout) {
            const jobsCount = yield page.evaluate((selector) => {
                return document.querySelectorAll(selector).length;
            }, exports.selectors.jobs);
            if (jobsCount > jobsTot) {
                return { success: true, count: jobsCount };
            }
            yield (0, utils_1.sleep)(pollingTime);
            elapsed += pollingTime;
        }
    }
    catch (err) {
        // Report the real failure instead of disguising it as a timeout
        const reason = err && err.message ? err.message : String(err);
        return {
            success: false,
            error: `Error on loading jobs: ${reason}`
        };
    }
    return {
        success: false,
        error: `Timeout on loading jobs`
    };
});
/**
 * Try to load job details.
 *
 * Polls every 50 ms until the details panel markup contains `jobId` and the
 * description element has non-empty text, or the polling budget is exhausted.
 *
 * @param {Page} page
 * @param {string} jobId Identifier searched for inside the details panel markup
 * @param {number} timeout Polling budget in milliseconds (time spent evaluating is not counted)
 * @returns {Promise<{success: boolean, error?: string}>}
 *  `error` describes a timeout or the failure raised while evaluating the page
 * @static
 * @private
 */
AuthenticatedStrategy._loadJobDetails = (page, jobId, timeout = 2000) => __awaiter(void 0, void 0, void 0, function* () {
    const pollingTime = 50;
    let elapsed = 0;
    yield (0, utils_1.sleep)(pollingTime);
    try {
        while (elapsed < timeout) {
            const loaded = yield page.evaluate((jobId, panelSelector, descriptionSelector) => {
                const detailsPanel = document.querySelector(panelSelector);
                const description = document.querySelector(descriptionSelector);
                // Coerce to a plain boolean so no DOM node is returned from the page
                return Boolean(detailsPanel && detailsPanel.innerHTML.includes(jobId) &&
                    description && description.innerText.length > 0);
            }, jobId, exports.selectors.detailsPanel, exports.selectors.description);
            if (loaded) {
                return { success: true };
            }
            yield (0, utils_1.sleep)(pollingTime);
            elapsed += pollingTime;
        }
    }
    catch (err) {
        // Report the real failure instead of disguising it as a timeout
        const reason = err && err.message ? err.message : String(err);
        return {
            success: false,
            error: `Error on loading job details: ${reason}`
        };
    }
    return {
        success: false,
        error: `Timeout on loading job details`
    };
});
/**
 * Try to paginate.
 *
 * Increases the `start` query parameter of the current page url by
 * `paginationSize`, navigates to the resulting url and then polls every
 * 100 ms until at least one job card is present.
 *
 * @param {Page} page
 * @param {string} tag Prefix used for log messages
 * @param {number} paginationSize Amount added to the `start` offset
 * @param {number} timeout Polling budget in milliseconds (time spent evaluating is not counted)
 * @returns {Promise<ILoadResult>} `{ success: true }` once jobs are found,
 *  otherwise `{ success: false, error }` on timeout
 * @static
 * @private
 */
AuthenticatedStrategy._paginate = (page, tag, paginationSize = 25, timeout = 2000) => __awaiter(void 0, void 0, void 0, function* () {
    const url = new URL(page.url());
    // Extract offset from url; a missing or non-numeric value counts as 0
    const currentOffset = parseInt(url.searchParams.get('start') || "0", 10);
    let offset = Number.isNaN(currentOffset) ? 0 : currentOffset;
    offset += paginationSize;
    // Update offset in url
    url.searchParams.set('start', '' + offset);
    logger_1.logger.info(tag, 'Next offset: ', offset);
    logger_1.logger.info(tag, 'Opening', url.toString());
    // Navigate new url
    yield page.goto(url.toString(), {
        waitUntil: 'load',
    });
    const pollingTime = 100;
    let elapsed = 0;
    logger_1.logger.info(tag, 'Waiting for new jobs to load');
    // Wait for new jobs to load (the page is always checked at least once)
    do {
        const loaded = yield page.evaluate((selector) => {
            return document.querySelectorAll(selector).length > 0;
        }, exports.selectors.jobs);
        if (loaded) {
            return { success: true };
        }
        yield (0, utils_1.sleep)(pollingTime);
        elapsed += pollingTime;
    } while (elapsed < timeout);
    return {
        success: false,
        error: `Timeout on pagination`
    };
});
/**
 * Hide the chat panel, if present, by setting its `display` style to "none".
 * Best effort: a failure is only logged at debug level.
 * @param {Page} page
 * @param {string} tag Prefix used for the debug log message
 */
AuthenticatedStrategy._hideChatPanel = (page, tag) => __awaiter(void 0, void 0, void 0, function* () {
    // Runs in the page context, so it must stay self-contained
    const hideMatchingElement = (panelSelector) => {
        const panel = document.querySelector(panelSelector);
        if (!panel) {
            return;
        }
        panel.style.display = "none";
    };
    try {
        yield page.evaluate(hideMatchingElement, exports.selectors.chatPanel);
    }
    catch (err) {
        logger_1.logger.debug(tag, "Failed to hide chat panel");
    }
});
/**
 * Click the first button whose text contains "Accept cookies", if any.
 * Best effort: a failure is only logged at debug level.
 * @param {Page} page
 * @param {string} tag Prefix used for the debug log message
 */
AuthenticatedStrategy._acceptCookies = (page, tag) => __awaiter(void 0, void 0, void 0, function* () {
    // Runs in the page context, so it must stay self-contained
    const clickCookieButton = () => {
        for (const button of Array.from(document.querySelectorAll('button'))) {
            if (button.innerText.includes('Accept cookies')) {
                button.click();
                break;
            }
        }
    };
    try {
        yield page.evaluate(clickCookieButton);
    }
    catch (err) {
        logger_1.logger.debug(tag, "Failed to accept cookies");
    }
});
/**
 * Click the first privacy banner button whose text is exactly "Accept", if any.
 * Best effort: a failure is only logged at debug level.
 * @param page
 * @param tag Prefix used for the debug log message
 */
AuthenticatedStrategy._acceptPrivacy = (page, tag) => __awaiter(void 0, void 0, void 0, function* () {
    // Runs in the page context, so it must stay self-contained
    const clickAcceptButton = (buttonSelector) => {
        for (const candidate of Array.from(document.querySelectorAll(buttonSelector))) {
            if (candidate.innerText === 'Accept') {
                candidate.click();
                break;
            }
        }
    };
    try {
        yield page.evaluate(clickAcceptButton, exports.selectors.privacyAcceptBtn);
    }
    catch (err) {
        logger_1.logger.debug(tag, "Failed to accept privacy");
    }
});