UNPKG

@jomari-wp/linkedin-jobs-scraper

Version:

Scrape public available jobs on Linkedin using headless browser

634 lines (544 loc) 22.5 kB
import { config } from "../../config"; import { RunStrategy, IRunStrategyResult, ILoadResult } from "./RunStrategy"; import { BrowserContext, Page, CDPSession } from "puppeteer"; import { events, IMetrics } from "../events"; import { sleep } from "../../utils/utils"; import { normalizeString } from "../../utils/string"; import { IQuery } from "../query"; import { logger } from "../../logger/logger"; import { urls } from "../constants"; export const selectors = { container: '.jobs-search-results-list', chatPanel: '.msg-overlay-list-bubble', jobs: 'div.job-card-container', link: 'a.job-card-container__link', applyBtn: 'button.jobs-apply-button[role="link"]', title: '.artdeco-entity-lockup__title', company: '.artdeco-entity-lockup__subtitle', companyLink: 'a.job-card-container__company-name', place: '.artdeco-entity-lockup__caption', date: 'time', description: '.jobs-description', detailsPanel: '.jobs-search__job-details--container', detailsTop: '.jobs-details-top-card', details: '.jobs-details__main-content', insights: '[class=jobs-unified-top-card__job-insight]', // only one class pagination: '.jobs-search-two-pane__pagination', privacyAcceptBtn: 'button.artdeco-global-alert__action', paginationNextBtn: 'li[data-test-pagination-page-btn].selected + li', paginationBtn: (index: number) => `li[data-test-pagination-page-btn="${index}"] button`, }; /** * @class AuthenticatedStrategy * @extends RunStrategy */ export class AuthenticatedStrategy extends RunStrategy { /** * Check if session is authenticated * @param {Page} page * @returns {Promise<boolean>} * @returns {Promise<ILoadResult>} * @static * @private */ private static _isAuthenticatedSession = async (page: Page): Promise<boolean> => { const cookies = await page.cookies(); return cookies.some(e => e.name === "li_at"); }; /** * Load jobs * @param page {Page} * @param jobsTot {number} * @param timeout {number} * @static * @private */ private static _loadJobs = async ( page: Page, jobsTot: number, timeout: number = 2000, ): Promise<any> => { const pollingTime = 50; let elapsed = 0; await sleep(pollingTime); try { while (elapsed < timeout) { const jobsCount = await page.evaluate((selector) => { return document.querySelectorAll(selector).length; }, selectors.jobs); if (jobsCount > jobsTot) { return { success: true, count: jobsCount }; } await sleep(pollingTime); elapsed += pollingTime; } } catch (err) {} return { success: false, error: `Timeout on loading jobs` }; }; /** * Try to load job details * @param {Page} page * @param {string} jobId * @param {number} timeout * @static * @private */ private static _loadJobDetails = async ( page: Page, jobId: string, timeout: number = 2000, ): Promise<ILoadResult> => { const pollingTime = 50; let elapsed = 0; let loaded = false; await sleep(pollingTime); try { while (elapsed < timeout) { loaded = await page.evaluate( (jobId, panelSelector, descriptionSelector) => { const detailsPanel = document.querySelector(panelSelector) as HTMLElement; const description = document.querySelector(descriptionSelector) as HTMLElement; return detailsPanel && detailsPanel.innerHTML.includes(jobId) && description && description.innerText.length > 0; }, jobId, selectors.detailsPanel, selectors.description, ); if (loaded) { return { success: true }; } await sleep(pollingTime); elapsed += pollingTime; } } catch (err) {} return { success: false, error: `Timeout on loading job details` }; }; /** * Try to paginate * @param {Page} page * @param {string} tag * @param {string} paginationSize * @param {number} timeout * @returns {Promise<ILoadResult>} * @static * @private */ private static _paginate = async ( page: Page, tag: string, paginationSize: number = 25, timeout: number = 2000, ): Promise<ILoadResult> => { const url = new URL(page.url()); // Extract offset from url let offset = parseInt(url.searchParams.get('start') || "0", 10); offset += paginationSize; // Update offset in url url.searchParams.set('start', '' + offset); logger.info(tag, 'Next offset: ', offset); logger.info(tag, 'Opening', url.toString()); // Navigate new url await page.goto(url.toString(), { waitUntil: 'load', }); const pollingTime = 100; let elapsed = 0; let loaded = false; logger.info(tag, 'Waiting for new jobs to load'); // Wait for new jobs to load while (!loaded) { loaded = await page.evaluate( (selector) => { return document.querySelectorAll(selector).length > 0; }, selectors.jobs, ); if (loaded) return { success: true }; await sleep(pollingTime); elapsed += pollingTime; if (elapsed >= timeout) { return { success: false, error: `Timeout on pagination` }; } } return { success: true }; }; /** * Hide chat panel * @param {Page} page * @param {string} tag */ private static _hideChatPanel = async ( page: Page, tag: string, ): Promise<void> => { try { await page.evaluate((selector) => { const div = document.querySelector(selector) as HTMLElement; if (div) { div.style.display = "none"; } }, selectors.chatPanel); } catch (err) { logger.debug(tag, "Failed to hide chat panel"); } }; /** * Accept cookies * @param {Page} page * @param {string} tag */ private static _acceptCookies = async ( page: Page, tag: string, ): Promise<void> => { try { await page.evaluate(() => { const buttons = Array.from(document.querySelectorAll('button')); const cookieButton = buttons.find(e => e.innerText.includes('Accept cookies')); if (cookieButton) { cookieButton.click(); } }); } catch (err) { logger.debug(tag, "Failed to accept cookies"); } }; /** * Accept privacy * @param page * @param tag */ private static _acceptPrivacy = async ( page: Page, tag: string, ): Promise<void> => { try { await page.evaluate((selector) => { const privacyButton = Array.from(document.querySelectorAll<HTMLElement>(selector)) .find(e => e.innerText === 'Accept'); if (privacyButton) { privacyButton.click(); } }, selectors.privacyAcceptBtn); } catch (err) { logger.debug(tag, "Failed to accept privacy"); } }; /** * Run strategy * @param browser * @param page * @param cdpSession * @param url * @param query * @param location */ public run = async ( browser: BrowserContext, page: Page, cdpSession: CDPSession, url: string, query: IQuery, location: string, ): Promise<IRunStrategyResult> => { let tag = `[${query.query}][${location}]`; const metrics: IMetrics = { processed: 0, failed: 0, missed: 0, }; let paginationIndex = 0; let paginationSize = 25; // Navigate to home page logger.debug(tag, "Opening", urls.home); await page.goto(urls.home, { waitUntil: 'load', }); // Set cookie logger.info("Setting authentication cookie"); await page.setCookie({ name: "li_at", value: this.session || config.LI_AT_COOKIE!, domain: ".www.linkedin.com" }); // Open search url logger.info(tag, "Opening", url); await page.goto(url, { waitUntil: 'load', }); // Verify session if (!(await AuthenticatedStrategy._isAuthenticatedSession(page))) { logger.error("The provided session cookie is invalid. Check the documentation on how to obtain a valid session cookie."); this.scraper.emit(events.scraper.invalidSession); return { exit: true }; } try { await page.waitForSelector(selectors.container, { timeout: 5000 }); } catch(err: any) { logger.info(tag, `No jobs found, skip`); return { exit: false }; } // Pagination loop while (metrics.processed < query.options!.limit!) { // Verify session in the loop if (!(await AuthenticatedStrategy._isAuthenticatedSession(page))) { logger.warn(tag, "Session is invalid, this may cause the scraper to fail."); this.scraper.emit(events.scraper.invalidSession); } else { logger.info(tag, "Session is valid"); } await AuthenticatedStrategy._hideChatPanel(page, tag); await AuthenticatedStrategy._acceptCookies(page, tag); await AuthenticatedStrategy._acceptPrivacy(page, tag); let jobIndex = 0; // Get number of all job links in the page let jobsTot = await page.evaluate( (selector) => document.querySelectorAll(selector).length, selectors.jobs ); if (jobsTot === 0) { logger.info(tag, `No jobs found, skip`); break; } // Jobs loop while (jobIndex < jobsTot && metrics.processed < query.options!.limit!) { tag = `[${query.query}][${location}][${paginationIndex * paginationSize + jobIndex + 1}]`; let jobId; let jobLink; let jobApplyLink; let jobTitle; let jobCompany; let jobCompanyLink; let jobCompanyImgLink; let jobPlace; let jobDescription; let jobDescriptionHTML; let jobDate; let loadDetailsResult; let jobInsights; try { // Extract job main fields logger.debug(tag, 'Evaluating selectors', [ selectors.jobs, selectors.link, selectors.company, selectors.place, selectors.date, ]); const jobFieldsResult = await page.evaluate( ( jobsSelector: string, linkSelector: string, titleSelector: string, companyLinkSelector: string, placeSelector: string, dateSelector: string, jobIndex: number ) => { const job = document.querySelectorAll(jobsSelector)[jobIndex]; const link = job.querySelector(linkSelector) as HTMLElement; // Click job link and scroll link.scrollIntoView(); link.click(); // Extract job link (relative) const protocol = window.location.protocol + "//"; const hostname = window.location.hostname; const jobLink = protocol + hostname + link.getAttribute("href"); const jobId = job.getAttribute("data-job-id"); const title = job.querySelector(titleSelector) ? (<HTMLElement>job.querySelector(titleSelector)).innerText : ""; let company = ""; let companyLink = undefined; if (job.querySelector(companyLinkSelector)) { const companyLinkElem = job.querySelector(companyLinkSelector) as HTMLElement; company = companyLinkElem.innerText.trim(); companyLink = companyLinkElem.getAttribute("href") ? `${protocol}${hostname}${companyLinkElem.getAttribute("href")}` : undefined; } const companyImgLink = (<HTMLElement>job.querySelector("img"))?.getAttribute("src") ?? undefined; const place = job.querySelector(placeSelector) ? (<HTMLElement>job.querySelector(placeSelector)).innerText : ""; const date = job.querySelector(dateSelector) ? (<HTMLElement>job.querySelector(dateSelector)).getAttribute('datetime') : ""; return { jobId, jobLink, title, company, companyLink, companyImgLink, place, date, }; }, selectors.jobs, selectors.link, selectors.title, selectors.companyLink, selectors.place, selectors.date, jobIndex ); jobId = jobFieldsResult.jobId; jobLink = jobFieldsResult.jobLink; jobTitle = jobFieldsResult.title; jobCompany = jobFieldsResult.company; jobCompanyLink = jobFieldsResult.companyLink; jobCompanyImgLink = jobFieldsResult.companyImgLink; jobPlace = jobFieldsResult.place; jobDate = jobFieldsResult.date; // Try to load job details and extract job link logger.debug(tag, 'Evaluating selectors', [ selectors.jobs, ]); loadDetailsResult = await AuthenticatedStrategy._loadJobDetails(page, jobId!); // Check if loading job details has failed if (!loadDetailsResult.success) { logger.error(tag, loadDetailsResult.error); jobIndex += 1; continue; } // Use custom description function if available logger.debug(tag, 'Evaluating selectors', [ selectors.description, ]); if (query.options?.descriptionFn) { [jobDescription, jobDescriptionHTML] = await Promise.all([ page.evaluate(`(${query.options.descriptionFn.toString()})();`), page.evaluate((selector) => { return (<HTMLElement>document.querySelector(selector)).outerHTML; }, selectors.description) ]); } else { [jobDescription, jobDescriptionHTML] = await page.evaluate((selector) => { const el = (<HTMLElement>document.querySelector(selector)); return [el.innerText, el.outerHTML]; }, selectors.description ); } jobDescription = jobDescription as string; // Extract job insights logger.debug(tag, 'Evaluating selectors', [ selectors.insights, ]); jobInsights = await page.evaluate((jobInsightsSelector: string) => { const nodes = document.querySelectorAll(jobInsightsSelector); return Array.from(nodes).map(e => e.textContent! .replace(/[\n\r\t ]+/g, ' ').trim()); }, selectors.insights); // Apply link if (query.options?.applyLink) { try { if (await page.evaluate((applyBtnSelector: string) => { const applyBtn = document.querySelector(applyBtnSelector) as HTMLButtonElement; if (applyBtn) { applyBtn.click(); return true; } return false; }, selectors.applyBtn)) { logger.debug(tag, 'Try extracting apply link'); const targetsResponse = await cdpSession.send('Target.getTargets'); // The first not attached target should be the apply page if (targetsResponse.targetInfos && targetsResponse.targetInfos.length > 1) { const applyTarget = targetsResponse.targetInfos .filter(e => e.type === 'page') .find(e => !e.attached); if (applyTarget) { jobApplyLink = applyTarget.url; await cdpSession.send('Target.closeTarget', { targetId: applyTarget.targetId }); } } } } catch (err) { logger.warn(tag, 'Failed to extract apply link', err); } } } catch(err: any) { const errorMessage = `${tag}\t${err.message}`; this.scraper.emit(events.scraper.error, errorMessage); jobIndex++; metrics.failed++; continue; } // Emit data (NB: should be outside of try/catch block to be properly tested) this.scraper.emit(events.scraper.data, { query: query.query || "", location: location, jobId: jobId!, jobIndex: jobIndex, link: jobLink!, applyLink: jobApplyLink, title: normalizeString(jobTitle!), company: normalizeString(jobCompany!), companyLink: jobCompanyLink, companyImgLink: jobCompanyImgLink, place: normalizeString(jobPlace!), description: jobDescription! as string, descriptionHTML: jobDescriptionHTML! as string, date: jobDate!, insights: jobInsights, }); jobIndex += 1; metrics.processed += 1; logger.info(tag, `Processed`); if (metrics.processed < query.options!.limit! && jobIndex === jobsTot && jobsTot < paginationSize) { const loadJobsResult = await AuthenticatedStrategy._loadJobs(page, jobsTot); if (loadJobsResult.success) { jobsTot = loadJobsResult.count; } } if (jobIndex === jobsTot) { break; } } tag = `[${query.query}][${location}]`; logger.info(tag, 'No more jobs to process in this page'); // Check if we reached the limit of jobs to process if (metrics.processed === query.options!.limit!) { logger.info(tag, 'Query limit reached!') // Emit metrics this.scraper.emit(events.scraper.metrics, metrics); logger.info(tag, 'Metrics:', metrics); break; } else { metrics.missed += paginationSize - jobIndex; } // Emit metrics this.scraper.emit(events.scraper.metrics, metrics); logger.info(tag, 'Metrics:', metrics); // Try to paginate paginationIndex += 1; logger.info(tag, `Pagination requested [${paginationIndex}]`); const paginationResult = await AuthenticatedStrategy._paginate(page, tag); if (!paginationResult.success) { logger.info(tag, `Couldn\'t find more jobs for the running query`); break; } } return { exit: false }; } }