UNPKG

@jomari-wp/linkedin-jobs-scraper

Version:

Scrape public available jobs on Linkedin using headless browser

308 lines (307 loc) 14.8 kB
"use strict"; var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } return new (P || (P = Promise))(function (resolve, reject) { function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } step((generator = generator.apply(thisArg, _arguments || [])).next()); }); }; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.LinkedinScraper = void 0; const deepmerge_1 = __importDefault(require("deepmerge")); const config_1 = require("../config"); const puppeteer_1 = __importDefault(require("puppeteer")); const events_1 = require("./events"); const states_1 = require("./states"); const defaults_1 = require("./defaults"); const utils_1 = require("../utils/utils"); const url_1 = require("../utils/url"); const constants_1 = require("./constants"); const query_1 = require("./query"); const browser_1 = require("../utils/browser"); const Scraper_1 = require("./Scraper"); const strategies_1 = require("./strategies"); const logger_1 = require("../logger/logger"); // puppeteer.use(require('puppeteer-extra-plugin-stealth')()); // TODO: breaks with new target tabs: to investigate /** * Main class * @extends EventEmitter * @param options {ScraperOptions} Puppeteer browser options, for more informations see https://pptr.dev/#?product=Puppeteer&version=v2.0.0&show=api-puppeteerlaunchoptions * @constructor */ class LinkedinScraper extends Scraper_1.Scraper { /** * @constructor * @param {ScraperOptions} options */ constructor(options, session) { super(options, session); this._browser = undefined; this._context = undefined; this._state = states_1.states.notInitialized; /** * Build jobs search url * @param {string} query * @param {string} location * @param {IQueryOptions} options * @returns {string} * @private */ this._buildSearchUrl = (query, location, options) => { const url = new URL(constants_1.urls.jobsSearch); if (query && query.length) { url.searchParams.append("keywords", query); } if (location && location.length) { url.searchParams.append("location", location); } if (options && options.filters) { if (options.filters.companyJobsUrl) { const queryParams = (0, url_1.getQueryParams)(options.filters.companyJobsUrl); url.searchParams.append("f_C", queryParams["f_C"]); } if (options.filters.relevance) { url.searchParams.append("sortBy", options.filters.relevance); } if (options.filters.time && options.filters.time.length) { url.searchParams.append("f_TPR", options.filters.time); } if (options.filters.type) { if (!Array.isArray(options.filters.type)) { options.filters.type = [options.filters.type]; } url.searchParams.append("f_JT", options.filters.type.join(",")); } if (options.filters.experience) { if (!Array.isArray(options.filters.experience)) { options.filters.experience = [options.filters.experience]; } url.searchParams.append("f_E", options.filters.experience.join(",")); } if (options.filters.remote && config_1.config.LI_AT_COOKIE) { url.searchParams.append("f_WRA", options.filters.remote); } } url.searchParams.append("start", "0"); return url.href; }; /** * Scrape linkedin jobs * @param {IQuery | IQuery[]} queries * @param {IQueryOptions} [options] * @return {Promise<void>} * @private */ this._run = (queries, options) => __awaiter(this, void 0, void 0, function* () { var _a, _b; let tag; if (!Array.isArray(queries)) { queries = [queries]; } // Merge options and validate for (const query of queries) { const optionsToMerge = [defaults_1.queryOptionsDefault]; options && optionsToMerge.push(options); query.options && optionsToMerge.push(query.options); query.options = deepmerge_1.default.all(optionsToMerge, { arrayMerge: (destinationArray, sourceArray, options) => sourceArray, }); // Add default location if none provided if (!((_b = (_a = query === null || query === void 0 ? void 0 : query.options) === null || _a === void 0 ? void 0 : _a.locations) === null || _b === void 0 ? void 0 : _b.length)) { query.options.locations = ["Worldwide"]; } const errors = (0, query_1.validateQuery)(query); if (errors.length) { logger_1.logger.error(errors); process.exit(1); } } // Initialize browser if (!this._browser) { yield this._initialize(); } const wsEndpoint = this._browser.wsEndpoint(); if (wsEndpoint) { logger_1.logger.info('Websocket debugger url:', wsEndpoint); } // Queries loop for (const query of queries) { // Locations loop for (const location of query.options.locations) { tag = `[${query.query}][${location}]`; logger_1.logger.info(tag, `Starting new query:`, `query="${query.query}"`, `location="${location}"`); logger_1.logger.info(tag, `Query options`, query.options); // Open new page in incognito context const page = yield this._context.newPage(); // Create Chrome Developer Tools session const cdpSession = yield page.target().createCDPSession(); // Disable Content Security Policy: needed for pagination to work properly in anonymous mode yield page.setBypassCSP(true); // Tricks to speed up page yield cdpSession.send('Page.enable'); yield cdpSession.send('Page.setWebLifecycleState', { state: 'active', }); // Set a random user agent yield page.setUserAgent((0, browser_1.getRandomUserAgent)()); // Enable request interception yield page.setRequestInterception(true); const onRequest = (request) => __awaiter(this, void 0, void 0, function* () { const url = new URL(request.url()); const domain = url.hostname.split(".").slice(-2).join(".").toLowerCase(); // Block tracking and other stuff not useful const toBlock = [ 'li/track', 'realtime.www.linkedin.com/realtime', ]; if (toBlock.some(e => url.pathname.includes(e))) { return request.abort(); } // Block 3rd part domains requests if (!["linkedin.com", "licdn.com"].includes(domain)) { return request.abort(); } // If optimization is enabled, block other resource types if (query.options.optimize) { const resourcesToBlock = [ "image", "stylesheet", "media", "font", "imageset", ]; if (resourcesToBlock.some(r => request.resourceType() === r) || request.url().includes(".jpg") || request.url().includes(".jpeg") || request.url().includes(".png") || request.url().includes(".gif") || request.url().includes(".css")) { return request.abort(); } } yield request.continue(); }); // Add listener page.on("request", onRequest); // Error response and rate limiting check page.on("response", response => { if (response.status() === 429) { logger_1.logger.warn(tag, "Error 429 too many requests. You would probably need to use a higher 'slowMo' value and/or reduce the number of concurrent queries."); } else if (response.status() >= 400) { logger_1.logger.warn(tag, response.status(), `Error for request ${response.request().url()}`); } }); // Build search url const searchUrl = this._buildSearchUrl(query.query || "", location, query.options); // Run strategy const runStrategyResult = yield this._runStrategy.run(this._context, page, cdpSession, searchUrl, query, location); // Check if forced exit is required if (runStrategyResult.exit) { logger_1.logger.warn(tag, "Forced termination"); return; } // Close page page && (yield page.close()); } } // Emit end event this.emit(events_1.events.scraper.end); }); /** * Scrape linkedin jobs * @param {IQuery | IQuery[]} queries * @param {IQueryOptions} [options] * @return {Promise<void>} */ this.run = (queries, options) => __awaiter(this, void 0, void 0, function* () { try { if (this._state === states_1.states.notInitialized) { yield this._initialize(); } else if (this._state === states_1.states.initializing) { const timeout = 10000; const pollingTime = 100; let elapsed = 0; while (this._state !== states_1.states.initialized) { yield (0, utils_1.sleep)(pollingTime); elapsed += pollingTime; if (elapsed >= timeout) { throw new Error(`Initialize timeout exceeded: ${timeout}ms`); } } } yield this._run(queries, options); } catch (err) { // logger.error(err); this.emit(events_1.events.scraper.error, err); yield this.close(); throw err; } }); /** * Close browser instance * @returns {Promise<void>} */ this.close = () => __awaiter(this, void 0, void 0, function* () { try { if (this._browser) { this._browser.removeAllListeners() && (yield this._browser.close()); } } finally { this._browser = undefined; this._state = states_1.states.notInitialized; } }); if (config_1.config.LI_AT_COOKIE || session) { this._runStrategy = new strategies_1.AuthenticatedStrategy(this, session); if (session) { logger_1.logger.info(`Session initialize LI_AT_COOKIE detected. Using ${strategies_1.AuthenticatedStrategy.name}`); } else { logger_1.logger.info(`Env variable LI_AT_COOKIE detected. Using ${strategies_1.AuthenticatedStrategy.name}`); } } else { this._runStrategy = new strategies_1.AnonymousStrategy(this); logger_1.logger.info(`Using ${strategies_1.AnonymousStrategy.name}`); } } /** * Initialize browser * @private */ _initialize() { return __awaiter(this, void 0, void 0, function* () { this._state = states_1.states.initializing; this._browser && this._browser.removeAllListeners(); const launchOptions = deepmerge_1.default.all([defaults_1.browserDefaults, this.options]); logger_1.logger.info('Setting chrome launch options', launchOptions); this._browser = yield puppeteer_1.default.launch(launchOptions); // Close initial browser page yield (yield this._browser.pages())[0].close(); this._context = yield this._browser.createIncognitoBrowserContext(); this._browser.on(events_1.events.puppeteer.browser.disconnected, () => { this.emit(events_1.events.puppeteer.browser.disconnected); }); this._browser.on(events_1.events.puppeteer.browser.targetcreated, () => { this.emit(events_1.events.puppeteer.browser.targetcreated); }); this._browser.on(events_1.events.puppeteer.browser.targetchanged, () => { this.emit(events_1.events.puppeteer.browser.targetchanged); }); this._browser.on(events_1.events.puppeteer.browser.targetdestroyed, () => { this.emit(events_1.events.puppeteer.browser.targetdestroyed); }); this._state = states_1.states.initialized; }); } } exports.LinkedinScraper = LinkedinScraper;