UNPKG

@jomari-wp/linkedin-jobs-scraper

Version:

Scrape publicly available jobs on LinkedIn using a headless browser

504 lines (503 loc) 23.1 kB
"use strict";
// Async helper (its shape suggests compiler-generated output — not confirmed from this file).
// Invokes `generator` with `thisArg` / `_arguments` and returns a promise (constructed with P,
// defaulting to Promise) that resolves with the generator's final value. Each yielded value is
// adopted into a promise; its fulfilment is fed back through next(), its rejection through throw().
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Pass through values that are already instances of P, wrap anything else in a resolved P.
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        // Resume the generator with the fulfilled value; a synchronous throw rejects the outer promise.
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        // Resume the generator by throwing the rejection reason into it.
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        // Settle when the generator is done, otherwise wait for the yielded value.
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
// Assigned to the AuthenticatedStrategy class further down in this file.
var _a;
Object.defineProperty(exports, "__esModule", { value: true });
exports.AuthenticatedStrategy = exports.selectors = void 0;
const config_1 = require("../../config");
const RunStrategy_1 = require("./RunStrategy");
const events_1 = require("../events");
const utils_1 = require("../../utils/utils");
const string_1 = require("../../utils/string");
const logger_1 = require("../../logger/logger");
const constants_1 = require("../constants");
// CSS selectors for elements of the job search page. The strategy in this file passes them to
// page.waitForSelector and to document.querySelector / querySelectorAll inside page.evaluate.
exports.selectors = {
    container: '.jobs-search-results-list',
    chatPanel: '.msg-overlay-list-bubble',
    jobs: 'div.job-card-container',
    link: 'a.job-card-container__link',
    applyBtn: 'button.jobs-apply-button[role="link"]',
    title: '.artdeco-entity-lockup__title',
    company: '.artdeco-entity-lockup__subtitle',
    companyLink: 'a.job-card-container__company-name',
    place: '.artdeco-entity-lockup__caption',
    date: 'time',
    description: '.jobs-description',
    detailsPanel: '.jobs-search__job-details--container',
    detailsTop: '.jobs-details-top-card',
    details: '.jobs-details__main-content',
    // Attribute selector: matches only when the whole class attribute equals this value.
    insights: '[class=jobs-unified-top-card__job-insight]',
    pagination: '.jobs-search-two-pane__pagination',
    privacyAcceptBtn: 'button.artdeco-global-alert__action',
    paginationNextBtn: 'li[data-test-pagination-page-btn].selected + li',
    // Builds the selector of the pagination button whose data attribute equals `index`.
    paginationBtn: (index) => `li[data-test-pagination-page-btn="${index}"] button`,
};
/**
 *
@class AuthenticatedStrategy * @extends RunStrategy */ class AuthenticatedStrategy extends RunStrategy_1.RunStrategy { constructor() { super(...arguments); /** * Run strategy * @param browser * @param page * @param cdpSession * @param url * @param query * @param location */ this.run = (browser, page, cdpSession, url, query, location) => __awaiter(this, void 0, void 0, function* () { var _b, _c; let tag = `[${query.query}][${location}]`; const metrics = { processed: 0, failed: 0, missed: 0, }; let paginationIndex = 0; let paginationSize = 25; // Navigate to home page logger_1.logger.debug(tag, "Opening", constants_1.urls.home); yield page.goto(constants_1.urls.home, { waitUntil: 'load', }); // Set cookie logger_1.logger.info("Setting authentication cookie"); yield page.setCookie({ name: "li_at", value: this.session || config_1.config.LI_AT_COOKIE, domain: ".www.linkedin.com" }); // Open search url logger_1.logger.info(tag, "Opening", url); yield page.goto(url, { waitUntil: 'load', }); // Verify session if (!(yield AuthenticatedStrategy._isAuthenticatedSession(page))) { logger_1.logger.error("The provided session cookie is invalid. 
Check the documentation on how to obtain a valid session cookie."); this.scraper.emit(events_1.events.scraper.invalidSession); return { exit: true }; } try { yield page.waitForSelector(exports.selectors.container, { timeout: 5000 }); } catch (err) { logger_1.logger.info(tag, `No jobs found, skip`); return { exit: false }; } // Pagination loop while (metrics.processed < query.options.limit) { // Verify session in the loop if (!(yield AuthenticatedStrategy._isAuthenticatedSession(page))) { logger_1.logger.warn(tag, "Session is invalid, this may cause the scraper to fail."); this.scraper.emit(events_1.events.scraper.invalidSession); } else { logger_1.logger.info(tag, "Session is valid"); } yield AuthenticatedStrategy._hideChatPanel(page, tag); yield AuthenticatedStrategy._acceptCookies(page, tag); yield AuthenticatedStrategy._acceptPrivacy(page, tag); let jobIndex = 0; // Get number of all job links in the page let jobsTot = yield page.evaluate((selector) => document.querySelectorAll(selector).length, exports.selectors.jobs); if (jobsTot === 0) { logger_1.logger.info(tag, `No jobs found, skip`); break; } // Jobs loop while (jobIndex < jobsTot && metrics.processed < query.options.limit) { tag = `[${query.query}][${location}][${paginationIndex * paginationSize + jobIndex + 1}]`; let jobId; let jobLink; let jobApplyLink; let jobTitle; let jobCompany; let jobCompanyLink; let jobCompanyImgLink; let jobPlace; let jobDescription; let jobDescriptionHTML; let jobDate; let loadDetailsResult; let jobInsights; try { // Extract job main fields logger_1.logger.debug(tag, 'Evaluating selectors', [ exports.selectors.jobs, exports.selectors.link, exports.selectors.company, exports.selectors.place, exports.selectors.date, ]); const jobFieldsResult = yield page.evaluate((jobsSelector, linkSelector, titleSelector, companyLinkSelector, placeSelector, dateSelector, jobIndex) => { var _b, _c; const job = document.querySelectorAll(jobsSelector)[jobIndex]; const link = 
job.querySelector(linkSelector); // Click job link and scroll link.scrollIntoView(); link.click(); // Extract job link (relative) const protocol = window.location.protocol + "//"; const hostname = window.location.hostname; const jobLink = protocol + hostname + link.getAttribute("href"); const jobId = job.getAttribute("data-job-id"); const title = job.querySelector(titleSelector) ? job.querySelector(titleSelector).innerText : ""; let company = ""; let companyLink = undefined; if (job.querySelector(companyLinkSelector)) { const companyLinkElem = job.querySelector(companyLinkSelector); company = companyLinkElem.innerText.trim(); companyLink = companyLinkElem.getAttribute("href") ? `${protocol}${hostname}${companyLinkElem.getAttribute("href")}` : undefined; } const companyImgLink = (_c = (_b = job.querySelector("img")) === null || _b === void 0 ? void 0 : _b.getAttribute("src")) !== null && _c !== void 0 ? _c : undefined; const place = job.querySelector(placeSelector) ? job.querySelector(placeSelector).innerText : ""; const date = job.querySelector(dateSelector) ? 
job.querySelector(dateSelector).getAttribute('datetime') : ""; return { jobId, jobLink, title, company, companyLink, companyImgLink, place, date, }; }, exports.selectors.jobs, exports.selectors.link, exports.selectors.title, exports.selectors.companyLink, exports.selectors.place, exports.selectors.date, jobIndex); jobId = jobFieldsResult.jobId; jobLink = jobFieldsResult.jobLink; jobTitle = jobFieldsResult.title; jobCompany = jobFieldsResult.company; jobCompanyLink = jobFieldsResult.companyLink; jobCompanyImgLink = jobFieldsResult.companyImgLink; jobPlace = jobFieldsResult.place; jobDate = jobFieldsResult.date; // Try to load job details and extract job link logger_1.logger.debug(tag, 'Evaluating selectors', [ exports.selectors.jobs, ]); loadDetailsResult = yield AuthenticatedStrategy._loadJobDetails(page, jobId); // Check if loading job details has failed if (!loadDetailsResult.success) { logger_1.logger.error(tag, loadDetailsResult.error); jobIndex += 1; continue; } // Use custom description function if available logger_1.logger.debug(tag, 'Evaluating selectors', [ exports.selectors.description, ]); if ((_b = query.options) === null || _b === void 0 ? 
void 0 : _b.descriptionFn) { [jobDescription, jobDescriptionHTML] = yield Promise.all([ page.evaluate(`(${query.options.descriptionFn.toString()})();`), page.evaluate((selector) => { return document.querySelector(selector).outerHTML; }, exports.selectors.description) ]); } else { [jobDescription, jobDescriptionHTML] = yield page.evaluate((selector) => { const el = document.querySelector(selector); return [el.innerText, el.outerHTML]; }, exports.selectors.description); } jobDescription = jobDescription; // Extract job insights logger_1.logger.debug(tag, 'Evaluating selectors', [ exports.selectors.insights, ]); jobInsights = yield page.evaluate((jobInsightsSelector) => { const nodes = document.querySelectorAll(jobInsightsSelector); return Array.from(nodes).map(e => e.textContent .replace(/[\n\r\t ]+/g, ' ').trim()); }, exports.selectors.insights); // Apply link if ((_c = query.options) === null || _c === void 0 ? void 0 : _c.applyLink) { try { if (yield page.evaluate((applyBtnSelector) => { const applyBtn = document.querySelector(applyBtnSelector); if (applyBtn) { applyBtn.click(); return true; } return false; }, exports.selectors.applyBtn)) { logger_1.logger.debug(tag, 'Try extracting apply link'); const targetsResponse = yield cdpSession.send('Target.getTargets'); // The first not attached target should be the apply page if (targetsResponse.targetInfos && targetsResponse.targetInfos.length > 1) { const applyTarget = targetsResponse.targetInfos .filter(e => e.type === 'page') .find(e => !e.attached); if (applyTarget) { jobApplyLink = applyTarget.url; yield cdpSession.send('Target.closeTarget', { targetId: applyTarget.targetId }); } } } } catch (err) { logger_1.logger.warn(tag, 'Failed to extract apply link', err); } } } catch (err) { const errorMessage = `${tag}\t${err.message}`; this.scraper.emit(events_1.events.scraper.error, errorMessage); jobIndex++; metrics.failed++; continue; } // Emit data (NB: should be outside of try/catch block to be properly tested) 
this.scraper.emit(events_1.events.scraper.data, { query: query.query || "", location: location, jobId: jobId, jobIndex: jobIndex, link: jobLink, applyLink: jobApplyLink, title: (0, string_1.normalizeString)(jobTitle), company: (0, string_1.normalizeString)(jobCompany), companyLink: jobCompanyLink, companyImgLink: jobCompanyImgLink, place: (0, string_1.normalizeString)(jobPlace), description: jobDescription, descriptionHTML: jobDescriptionHTML, date: jobDate, insights: jobInsights, }); jobIndex += 1; metrics.processed += 1; logger_1.logger.info(tag, `Processed`); if (metrics.processed < query.options.limit && jobIndex === jobsTot && jobsTot < paginationSize) { const loadJobsResult = yield AuthenticatedStrategy._loadJobs(page, jobsTot); if (loadJobsResult.success) { jobsTot = loadJobsResult.count; } } if (jobIndex === jobsTot) { break; } } tag = `[${query.query}][${location}]`; logger_1.logger.info(tag, 'No more jobs to process in this page'); // Check if we reached the limit of jobs to process if (metrics.processed === query.options.limit) { logger_1.logger.info(tag, 'Query limit reached!'); // Emit metrics this.scraper.emit(events_1.events.scraper.metrics, metrics); logger_1.logger.info(tag, 'Metrics:', metrics); break; } else { metrics.missed += paginationSize - jobIndex; } // Emit metrics this.scraper.emit(events_1.events.scraper.metrics, metrics); logger_1.logger.info(tag, 'Metrics:', metrics); // Try to paginate paginationIndex += 1; logger_1.logger.info(tag, `Pagination requested [${paginationIndex}]`); const paginationResult = yield AuthenticatedStrategy._paginate(page, tag); if (!paginationResult.success) { logger_1.logger.info(tag, `Couldn\'t find more jobs for the running query`); break; } } return { exit: false }; }); } } exports.AuthenticatedStrategy = AuthenticatedStrategy; _a = AuthenticatedStrategy; /** * Check if session is authenticated * @param {Page} page * @returns {Promise<boolean>} * @returns {Promise<ILoadResult>} * @static * @private */ 
AuthenticatedStrategy._isAuthenticatedSession = (page) => __awaiter(void 0, void 0, void 0, function* () { const cookies = yield page.cookies(); return cookies.some(e => e.name === "li_at"); }); /** * Load jobs * @param page {Page} * @param jobsTot {number} * @param timeout {number} * @static * @private */ AuthenticatedStrategy._loadJobs = (page, jobsTot, timeout = 2000) => __awaiter(void 0, void 0, void 0, function* () { const pollingTime = 50; let elapsed = 0; yield (0, utils_1.sleep)(pollingTime); try { while (elapsed < timeout) { const jobsCount = yield page.evaluate((selector) => { return document.querySelectorAll(selector).length; }, exports.selectors.jobs); if (jobsCount > jobsTot) { return { success: true, count: jobsCount }; } yield (0, utils_1.sleep)(pollingTime); elapsed += pollingTime; } } catch (err) { } return { success: false, error: `Timeout on loading jobs` }; }); /** * Try to load job details * @param {Page} page * @param {string} jobId * @param {number} timeout * @static * @private */ AuthenticatedStrategy._loadJobDetails = (page, jobId, timeout = 2000) => __awaiter(void 0, void 0, void 0, function* () { const pollingTime = 50; let elapsed = 0; let loaded = false; yield (0, utils_1.sleep)(pollingTime); try { while (elapsed < timeout) { loaded = yield page.evaluate((jobId, panelSelector, descriptionSelector) => { const detailsPanel = document.querySelector(panelSelector); const description = document.querySelector(descriptionSelector); return detailsPanel && detailsPanel.innerHTML.includes(jobId) && description && description.innerText.length > 0; }, jobId, exports.selectors.detailsPanel, exports.selectors.description); if (loaded) { return { success: true }; } yield (0, utils_1.sleep)(pollingTime); elapsed += pollingTime; } } catch (err) { } return { success: false, error: `Timeout on loading job details` }; }); /** * Try to paginate * @param {Page} page * @param {string} tag * @param {string} paginationSize * @param {number} timeout * @returns 
{Promise<ILoadResult>} * @static * @private */ AuthenticatedStrategy._paginate = (page, tag, paginationSize = 25, timeout = 2000) => __awaiter(void 0, void 0, void 0, function* () { const url = new URL(page.url()); // Extract offset from url let offset = parseInt(url.searchParams.get('start') || "0", 10); offset += paginationSize; // Update offset in url url.searchParams.set('start', '' + offset); logger_1.logger.info(tag, 'Next offset: ', offset); logger_1.logger.info(tag, 'Opening', url.toString()); // Navigate new url yield page.goto(url.toString(), { waitUntil: 'load', }); const pollingTime = 100; let elapsed = 0; let loaded = false; logger_1.logger.info(tag, 'Waiting for new jobs to load'); // Wait for new jobs to load while (!loaded) { loaded = yield page.evaluate((selector) => { return document.querySelectorAll(selector).length > 0; }, exports.selectors.jobs); if (loaded) return { success: true }; yield (0, utils_1.sleep)(pollingTime); elapsed += pollingTime; if (elapsed >= timeout) { return { success: false, error: `Timeout on pagination` }; } } return { success: true }; }); /** * Hide chat panel * @param {Page} page * @param {string} tag */ AuthenticatedStrategy._hideChatPanel = (page, tag) => __awaiter(void 0, void 0, void 0, function* () { try { yield page.evaluate((selector) => { const div = document.querySelector(selector); if (div) { div.style.display = "none"; } }, exports.selectors.chatPanel); } catch (err) { logger_1.logger.debug(tag, "Failed to hide chat panel"); } }); /** * Accept cookies * @param {Page} page * @param {string} tag */ AuthenticatedStrategy._acceptCookies = (page, tag) => __awaiter(void 0, void 0, void 0, function* () { try { yield page.evaluate(() => { const buttons = Array.from(document.querySelectorAll('button')); const cookieButton = buttons.find(e => e.innerText.includes('Accept cookies')); if (cookieButton) { cookieButton.click(); } }); } catch (err) { logger_1.logger.debug(tag, "Failed to accept cookies"); } }); /** * Accept 
privacy * @param page * @param tag */ AuthenticatedStrategy._acceptPrivacy = (page, tag) => __awaiter(void 0, void 0, void 0, function* () { try { yield page.evaluate((selector) => { const privacyButton = Array.from(document.querySelectorAll(selector)) .find(e => e.innerText === 'Accept'); if (privacyButton) { privacyButton.click(); } }, exports.selectors.privacyAcceptBtn); } catch (err) { logger_1.logger.debug(tag, "Failed to accept privacy"); } });