UNPKG

@jomari-wp/linkedin-jobs-scraper

Version:

Scrape publicly available jobs on LinkedIn using a headless browser

472 lines (405 loc) 16.5 kB
import { RunStrategy, IRunStrategyResult, ILoadResult } from "./RunStrategy";
import { BrowserContext, Page, CDPSession } from "puppeteer";
import { events } from "../events";
import { sleep } from "../../utils/utils";
import { IQuery } from "../query";
import { logger } from "../../logger/logger";

/**
 * CSS selectors used by the anonymous strategy.
 *
 * Several getters return one of two alternative selectors depending on
 * `switchSelectors`; `AnonymousStrategy.run` flips the flag when the first
 * container selector cannot be found on the page.
 *
 * NOTE(review): `switchSelectors` is static mutable state, so every run in the
 * same process shares it — confirm strategies are never run concurrently.
 */
export class Selectors {
    static switchSelectors = false;

    static get container() {
        return !this.switchSelectors ?
            '.results__container.results__container--two-pane' :
            '.two-pane-serp-page__results-list';
    }

    static get jobs() {
        // Both selector sets use the same list selector.
        return '.jobs-search__results-list li';
    }

    static get links() {
        return !this.switchSelectors ?
            '.jobs-search__results-list li a.result-card__full-card-link' :
            'a.base-card__full-link';
    }

    static get applyLink() {
        return 'a[data-is-offsite-apply=true]';
    }

    static get dates() {
        return 'time';
    }

    static get companies() {
        return !this.switchSelectors ?
            '.result-card__subtitle.job-result-card__subtitle' :
            '.base-search-card__subtitle';
    }

    static get places() {
        return !this.switchSelectors ?
            '.job-result-card__location' :
            '.job-search-card__location';
    }

    static get detailsPanel() {
        return '.details-pane__content';
    }

    static get description() {
        return '.description__text';
    }

    static get seeMoreJobs() {
        return 'button.infinite-scroller__show-more-button';
    }
}

/**
 * Run strategy that scrapes the job search page without logging in.
 *
 * @class AnonymousStrategy
 * @extends RunStrategy
 */
export class AnonymousStrategy extends RunStrategy {

    /**
     * Verify if authentication is required.
     *
     * Returns true when the path of the current page URL contains "authwall"
     * (case-insensitive).
     *
     * @param {Page} page
     * @returns {Promise<boolean>}
     * @static
     * @private
     */
    private static _needsAuthentication = async (
        page: Page
    ): Promise<boolean> => {
        const parsed = new URL(await page.url());
        return parsed.pathname.toLowerCase().includes("authwall");
    };

    /**
     * Wait for job details to load.
     *
     * Polls the page every 50 ms until the details panel markup contains
     * `jobId` and the description element has non-empty text. At least one
     * check is always performed, whatever the timeout.
     *
     * @param page {Page}
     * @param jobId {string}
     * @param timeout {number} Maximum time to keep polling, in milliseconds
     * @returns {Promise<ILoadResult>} `success: false` with an error message on timeout
     * @static
     * @private
     */
    private static _loadJobDetails = async (
        page: Page,
        jobId: string,
        timeout: number = 2000
    ): Promise<ILoadResult> => {
        const waitTime = 50;
        let elapsed = 0;

        do {
            const loaded = await page.evaluate(
                (
                    jobId: string,
                    panelSelector: string,
                    descriptionSelector: string
                ) => {
                    const detailsPanel = document.querySelector<HTMLElement>(panelSelector);
                    const description = document.querySelector<HTMLElement>(descriptionSelector);

                    // Coerce to a real boolean: a missing element would otherwise
                    // make the expression evaluate to null instead of false.
                    return !!detailsPanel &&
                        detailsPanel.innerHTML.includes(jobId) &&
                        !!description &&
                        description.innerText.length > 0;
                },
                jobId,
                Selectors.detailsPanel,
                Selectors.description
            );

            if (loaded) {
                return { success: true };
            }

            await sleep(waitTime);
            elapsed += waitTime;
        } while (elapsed < timeout);

        return {
            success: false,
            error: `Timeout on loading job details`
        };
    };

    /**
     * Try to load more jobs.
     *
     * Clicks the "see more jobs" button (retrying until a click succeeds),
     * scrolls to the bottom of the page and polls every 100 ms until the
     * number of job elements exceeds `jobLinksTot`.
     *
     * @param page {Page}
     * @param jobLinksTot {number} Number of job elements already on the page
     * @param timeout {number} Maximum time to keep polling, in milliseconds
     * @returns {Promise<ILoadResult>} `success: false` with an error message on timeout
     * @private
     */
    private static _loadMoreJobs = async (
        page: Page,
        jobLinksTot: number,
        timeout: number = 2000
    ): Promise<ILoadResult> => {
        const pollingTime = 100;
        let elapsed = 0;
        let clicked = false;

        do {
            if (!clicked) {
                clicked = await page.evaluate(
                    (selector: string) => {
                        const button = document.querySelector<HTMLElement>(selector);

                        if (!button) {
                            return false;
                        }

                        button.click();
                        return true;
                    },
                    Selectors.seeMoreJobs
                );
            }

            const loaded = await page.evaluate(
                (selector: string, jobLinksTot: number) => {
                    window.scrollTo(0, document.body.scrollHeight);
                    return document.querySelectorAll(selector).length > jobLinksTot;
                },
                Selectors.jobs,
                jobLinksTot
            );

            if (loaded) {
                return { success: true };
            }

            await sleep(pollingTime);
            elapsed += pollingTime;
        } while (elapsed < timeout);

        return {
            success: false,
            error: `Timeout on loading more jobs`
        };
    };

    /**
     * Accept cookies.
     *
     * Clicks the first button whose text includes "Accept cookies", if any.
     * Best effort: a failure is logged at debug level and otherwise ignored.
     *
     * @param {Page} page
     * @param {string} tag Log prefix
     */
    private static _acceptCookies = async (
        page: Page,
        tag: string,
    ): Promise<void> => {
        try {
            await page.evaluate(() => {
                const buttons = Array.from(document.querySelectorAll('button'));
                const cookieButton = buttons.find(e => e.innerText.includes('Accept cookies'));

                if (cookieButton) {
                    cookieButton.click();
                }
            });
        }
        catch (err) {
            logger.debug(tag, "Failed to accept cookies");
        }
    };

    /**
     * Run strategy.
     *
     * Opens `url`, picks whichever selector set matches the page, then walks
     * the job list: each job is clicked, its details are read and a
     * `events.scraper.data` event is emitted. Jobs that fail emit
     * `events.scraper.error` and are skipped. More jobs are loaded until
     * `query.options.limit` jobs are processed or none are left.
     *
     * @param browser
     * @param page
     * @param cdpSession
     * @param url
     * @param query
     * @param location
     * @returns `{ exit: true }` when the page requires authentication,
     *          `{ exit: false }` otherwise
     */
    public run = async (
        browser: BrowserContext,
        page: Page,
        cdpSession: CDPSession,
        url: string,
        query: IQuery,
        location: string,
    ): Promise<IRunStrategyResult> => {
        console.warn("Anonymous session strategy is no longer maintained and it won't probably work. It is recommended to use an authenticated session, see documentation at https://github.com/spinlud/linkedin-jobs-scraper#anonymous-vs-authenticated-session.");

        let tag = `[${query.query}][${location}]`;
        let processed = 0;

        logger.info(tag, "Opening", url);

        await page.goto(url, {
            waitUntil: 'load',
        });

        // Verify if authentication is required
        if (await AnonymousStrategy._needsAuthentication(page)) {
            logger.error(tag, "Scraper failed to run in anonymous mode, authentication may be necessary for this environment. Please check the documentation on how to use an authenticated session.");
            return { exit: true };
        }

        // Linkedin seems to randomly load two different sets of selectors:
        // the following hack tries to switch between the two sets.
        try {
            // Try to load first set of selectors
            Selectors.switchSelectors = false;
            logger.info(tag, 'Trying to load first selectors set');
            logger.debug(tag, `Evaluating selectors`, [Selectors.container]);
            await page.waitForSelector(Selectors.container, { timeout: 3000 });
        }
        catch {
            try {
                // Try to load second set of selectors
                Selectors.switchSelectors = true;
                logger.info(tag, 'Trying to load second selectors set');
                logger.debug(tag, `Evaluating selectors`, [Selectors.container]);
                await page.waitForSelector(Selectors.container, { timeout: 3000 });
            }
            catch {
                logger.info(tag, 'Failed to load container selector, skip');
                return { exit: false };
            }
        }

        logger.info(tag, 'OK');

        let jobIndex = 0;
        const limit = query.options!.limit!;

        // Pagination loop
        while (processed < limit) {
            await AnonymousStrategy._acceptCookies(page, tag);

            // Get number of all job links in the page
            let jobsTot = await page.evaluate(
                (selector: string) => document.querySelectorAll(selector).length,
                Selectors.jobs
            );

            if (jobsTot === 0) {
                logger.info(tag, `No jobs found, skip`);
                break;
            }

            logger.info(tag, "Jobs fetched: " + jobsTot);

            // Jobs loop
            while (jobIndex < jobsTot && processed < limit) {
                tag = `[${query.query}][${location}][${processed + 1}]`;

                let jobId;
                let jobLink;
                let jobApplyLink;
                let jobTitle;
                let jobCompany;
                let jobPlace;
                let jobDescription;
                let jobDescriptionHTML;
                let jobDate;

                try {
                    // Extract job main fields
                    logger.debug(tag, `Evaluating selectors`, [
                        Selectors.jobs,
                        Selectors.links,
                        Selectors.companies,
                        Selectors.places,
                        Selectors.dates,
                    ]);

                    [jobId, jobLink, jobTitle, jobCompany, jobPlace, jobDate] = await page.evaluate(
                        (
                            jobsSelector: string,
                            linksSelector: string,
                            companiesSelector: string,
                            placesSelector: string,
                            datesSelector: string,
                            jobIndex: number
                        ) => {
                            // A missing element throws here; the rejection is
                            // handled by the surrounding try/catch.
                            const job = document.querySelectorAll(jobsSelector)[jobIndex];
                            const link = job.querySelector(linksSelector) as HTMLElement;

                            // Click job link and scroll
                            link.scrollIntoView();
                            link.click();
                            const linkUrl = link.getAttribute("href");

                            // Try first set of selectors
                            let jobId: string | null = job.getAttribute('data-id');

                            // If failed, try second set of selectors: the id is
                            // the last ':'-separated segment of the entity urn
                            if (!jobId) {
                                jobId = link.parentElement!
                                    .getAttribute('data-entity-urn')!
                                    .split(':')
                                    .slice(-1)[0];
                            }

                            return [
                                jobId,
                                linkUrl,
                                link.innerText,
                                (job.querySelector(companiesSelector) as HTMLElement).innerText,
                                (job.querySelector(placesSelector) as HTMLElement).innerText,
                                (job.querySelector(datesSelector) as HTMLElement).getAttribute('datetime')
                            ];
                        },
                        Selectors.jobs,
                        Selectors.links,
                        Selectors.companies,
                        Selectors.places,
                        Selectors.dates,
                        jobIndex
                    );

                    // Load job details and extract job link
                    logger.debug(tag, `Evaluating selectors`, [
                        Selectors.links,
                    ]);

                    const loadJobDetailsResult = await AnonymousStrategy._loadJobDetails(page, jobId!);

                    // Check if loading job details has failed
                    if (!loadJobDetailsResult.success) {
                        logger.error(tag, loadJobDetailsResult.error);
                        this.scraper.emit(events.scraper.error, `${tag}\t${loadJobDetailsResult.error}`);
                        jobIndex += 1;
                        continue;
                    }

                    // Use custom description function if available
                    logger.debug(tag, `Evaluating selectors`, [
                        Selectors.description
                    ]);

                    if (query.options?.descriptionFn) {
                        [jobDescription, jobDescriptionHTML] = await Promise.all([
                            page.evaluate(`(${query.options.descriptionFn.toString()})();`),
                            page.evaluate((selector) => {
                                return (<HTMLElement>document.querySelector(selector)).outerHTML;
                            }, Selectors.description)
                        ]);
                    }
                    else {
                        [jobDescription, jobDescriptionHTML] = await page.evaluate((selector) => {
                                const el = (<HTMLElement>document.querySelector(selector));
                                return [el.innerText, el.outerHTML];
                            },
                            Selectors.description
                        );
                    }

                    // Extract apply link
                    logger.debug(tag, `Evaluating selectors`, [
                        Selectors.applyLink
                    ]);

                    jobApplyLink = await page.evaluate((selector) => {
                        const applyBtn = document.querySelector<HTMLElement>(selector);
                        return applyBtn ? applyBtn.getAttribute("href") : null;
                    }, Selectors.applyLink);
                }
                catch (err: unknown) {
                    // Anything can be thrown, so narrow before reading `.message`
                    const message = err instanceof Error ? err.message : String(err);
                    const errorMessage = `${tag}\t${message}`;

                    // Log as well as emit, like the details-timeout path above
                    logger.error(tag, message);
                    this.scraper.emit(events.scraper.error, errorMessage);
                    jobIndex += 1;
                    continue;
                }

                // Emit data
                this.scraper.emit(events.scraper.data, {
                    query: query.query || "",
                    location: location,
                    jobId: jobId!,
                    jobIndex: jobIndex,
                    link: jobLink!,
                    ...(jobApplyLink ? { applyLink: jobApplyLink } : {}),
                    title: jobTitle!,
                    company: jobCompany!,
                    place: jobPlace!,
                    description: jobDescription! as string,
                    descriptionHTML: jobDescriptionHTML! as string,
                    date: jobDate!,
                    insights: [],
                });

                jobIndex += 1;
                processed += 1;
                logger.info(tag, `Processed`);

                // Reaching the end of the known list: re-count, since more jobs
                // may have been added to the page in the meantime
                if (processed < limit && jobIndex === jobsTot) {
                    logger.info(tag, 'Fecthing new jobs');
                    jobsTot = await page.evaluate(
                        (selector: string) => document.querySelectorAll(selector).length,
                        Selectors.jobs
                    );
                }
            }

            // Check if we reached the limit of jobs to process
            if (processed === limit) break;

            // Check if there are more jobs to load
            logger.info(tag, "Checking for new jobs to load...");

            const loadMoreJobsResult = await AnonymousStrategy._loadMoreJobs(
                page,
                jobsTot
            );

            // Check if loading jobs has failed
            if (!loadMoreJobsResult.success) {
                logger.info(tag, "There are no more jobs available for the current query");
                break;
            }
        }

        return { exit: false };
    }
}