@jomari-wp/linkedin-jobs-scraper
Version:
Scrape publicly available jobs on LinkedIn using a headless browser
372 lines (308 loc) • 12.8 kB
text/typescript
import deepmerge from 'deepmerge';
import { config } from '../config';
import puppeteer from 'puppeteer';
import { Browser, BrowserContext, HTTPRequest } from 'puppeteer';
import { events, IEventListeners } from './events';
import { states } from './states';
import { browserDefaults, queryOptionsDefault } from './defaults';
import { sleep } from '../utils/utils';
import { getQueryParams } from '../utils/url';
import { urls, } from './constants';
import { IQuery, IQueryOptions, validateQuery } from './query';
import { getRandomUserAgent } from '../utils/browser';
import { Scraper, ScraperOptions } from './Scraper';
import { RunStrategy, AuthenticatedStrategy, AnonymousStrategy } from './strategies';
import { logger } from '../logger/logger';
// puppeteer.use(require('puppeteer-extra-plugin-stealth')()); // TODO: breaks with new target tabs: to investigate
/**
 * Main class: runs LinkedIn job search queries in a Puppeteer-controlled browser.
 *
 * Browser lifecycle events (disconnected, targetcreated, targetchanged,
 * targetdestroyed) are re-emitted on the scraper instance, together with the
 * scraper's own `end` and `error` events.
 *
 * @extends Scraper
 * @param options {ScraperOptions} Puppeteer browser options, for more information see https://pptr.dev/#?product=Puppeteer&version=v2.0.0&show=api-puppeteerlaunchoptions
 * @constructor
 */
class LinkedinScraper extends Scraper {
    private _runStrategy: RunStrategy;
    private _browser: Browser | undefined = undefined;
    private _context: BrowserContext | undefined = undefined;
    private _state = states.notInitialized;

    /**
     * Selects the run strategy: authenticated when either the LI_AT_COOKIE
     * config value or an explicit `session` is available, anonymous otherwise.
     *
     * @constructor
     * @param {ScraperOptions} options Merged over the browser defaults at launch time
     * @param {string} [session] Forwarded to the base class and to the authenticated
     *   strategy (presumably the `li_at` cookie value; confirm against AuthenticatedStrategy)
     */
    constructor(options: ScraperOptions, session?: string) {
        super(options, session);

        if (config.LI_AT_COOKIE || session) {
            this._runStrategy = new AuthenticatedStrategy(this, session);

            if (session) {
                logger.info(`Session initialize LI_AT_COOKIE detected. Using ${AuthenticatedStrategy.name}`)
            }
            else {
                logger.info(`Env variable LI_AT_COOKIE detected. Using ${AuthenticatedStrategy.name}`)
            }
        }
        else {
            this._runStrategy = new AnonymousStrategy(this);
            logger.info(`Using ${AnonymousStrategy.name}`)
        }
    }

    /**
     * Initialize browser: launches Puppeteer, closes the initial page, creates
     * an incognito context and wires browser events through to this emitter.
     * @private
     */
    private async _initialize() {
        this._state = states.initializing;

        // Drop listeners of a previous browser instance, if any
        this._browser && this._browser.removeAllListeners();

        const launchOptions = deepmerge.all([browserDefaults, this.options]);
        logger.info('Setting chrome launch options', launchOptions);

        const browser = await puppeteer.launch(launchOptions);
        this._browser = browser;

        // Close initial browser page (guarded: do not crash if the browser started without one)
        const [initialPage] = await browser.pages();
        if (initialPage) {
            await initialPage.close();
        }

        this._context = await browser.createIncognitoBrowserContext();

        browser.on(events.puppeteer.browser.disconnected, () => {
            this.emit(events.puppeteer.browser.disconnected);
        });

        browser.on(events.puppeteer.browser.targetcreated, () => {
            this.emit(events.puppeteer.browser.targetcreated);
        });

        browser.on(events.puppeteer.browser.targetchanged, () => {
            this.emit(events.puppeteer.browser.targetchanged);
        });

        browser.on(events.puppeteer.browser.targetdestroyed, () => {
            this.emit(events.puppeteer.browser.targetdestroyed);
        });

        this._state = states.initialized;
    }

    /**
     * Build jobs search url
     *
     * The `options` argument is only read; array normalisation of the `type`
     * and `experience` filters happens on local copies.
     *
     * @param {string} query Search keywords; omitted from the url when empty
     * @param {string} location Search location; omitted from the url when empty
     * @param {IQueryOptions} options Query options whose `filters` are mapped to url parameters
     * @returns {string} Search url, always ending with `start=0`
     * @private
     */
    private _buildSearchUrl = (query: string, location: string, options: IQueryOptions): string => {
        const url = new URL(urls.jobsSearch);

        if (query && query.length) {
            url.searchParams.append("keywords", query);
        }

        if (location && location.length) {
            url.searchParams.append("location", location);
        }

        const filters = options && options.filters;

        if (filters) {
            if (filters.companyJobsUrl) {
                const companyId = getQueryParams(filters.companyJobsUrl)["f_C"];

                // Skip when the company url carries no f_C parameter, otherwise
                // the literal string "undefined" would end up in the search url
                if (companyId) {
                    url.searchParams.append("f_C", companyId);
                }
            }

            if (filters.relevance) {
                url.searchParams.append("sortBy", filters.relevance);
            }

            if (filters.time && filters.time.length) {
                url.searchParams.append("f_TPR", filters.time);
            }

            if (filters.type) {
                const types = Array.isArray(filters.type) ? filters.type : [filters.type];
                url.searchParams.append("f_JT", types.join(","));
            }

            if (filters.experience) {
                const levels = Array.isArray(filters.experience) ? filters.experience : [filters.experience];
                url.searchParams.append("f_E", levels.join(","));
            }

            // The remote filter is only applied in authenticated mode. Check the strategy
            // actually selected by the constructor, so that a session passed as constructor
            // argument counts as well as the LI_AT_COOKIE config value.
            if (filters.remote && this._runStrategy instanceof AuthenticatedStrategy) {
                url.searchParams.append("f_WRA", filters.remote);
            }
        }

        url.searchParams.append("start", "0");

        return url.href;
    }

    /**
     * Scrape linkedin jobs
     *
     * Merges defaults, global options and per-query options (later sources win,
     * arrays are replaced rather than concatenated), validates every query, then
     * runs the selected strategy once per query/location pair on a fresh page.
     * Emits the scraper `end` event after all pairs completed. When a strategy
     * requests a forced exit the method returns immediately: the current page is
     * left open and `end` is not emitted.
     *
     * NOTE: invalid queries terminate the process via `process.exit(1)`.
     *
     * @param {IQuery | IQuery[]} queries
     * @param {IQueryOptions} [options]
     * @return {Promise<void>}
     * @private
     */
    private _run = async (
        queries: IQuery | IQuery[],
        options?: IQueryOptions
    ): Promise<void> => {
        // Url fragments (host + path) of tracking/realtime endpoints to block
        const blockedUrlFragments: readonly string[] = [
            'li/track',
            'realtime.www.linkedin.com/realtime',
        ];

        // Registrable domains requests are allowed to reach
        const allowedDomains: readonly string[] = ["linkedin.com", "licdn.com"];

        // Resource types and url fragments blocked when the `optimize` option is enabled
        const heavyResourceTypes: readonly string[] = [
            "image",
            "stylesheet",
            "media",
            "font",
            "imageset",
        ];
        const heavyUrlFragments: readonly string[] = [".jpg", ".jpeg", ".png", ".gif", ".css"];

        if (!Array.isArray(queries)) {
            queries = [queries];
        }

        // Merge options and validate
        for (const query of queries) {
            const optionsToMerge = [queryOptionsDefault];
            options && optionsToMerge.push(options);
            query.options && optionsToMerge.push(query.options);
            query.options = deepmerge.all(optionsToMerge, {
                // Arrays from the later source replace earlier ones
                arrayMerge: (_destinationArray, sourceArray) => sourceArray,
            });

            // Add default location if none provided
            if (!query?.options?.locations?.length) {
                query.options.locations = ["Worldwide"];
            }

            const errors = validateQuery(query);

            if (errors.length) {
                logger.error(errors);
                process.exit(1);
            }
        }

        // Initialize browser
        if (!this._browser) {
            await this._initialize();
        }

        const browser = this._browser;
        const context = this._context;

        if (!browser || !context) {
            throw new Error("Browser is not initialized");
        }

        const wsEndpoint = browser.wsEndpoint();

        if (wsEndpoint) {
            logger.info('Websocket debugger url:', wsEndpoint);
        }

        // Queries loop
        for (const query of queries) {
            // Always defined here: assigned by the merge loop above
            const queryOptions = query.options!;

            // Locations loop
            for (const location of queryOptions.locations!) {
                // Per-iteration constant, so listeners of this page always log their own tag
                const tag = `[${query.query}][${location}]`;
                logger.info(tag, `Starting new query:`, `query="${query.query}"`, `location="${location}"`);
                logger.info(tag, `Query options`, query.options);

                // Open new page in incognito context
                const page = await context.newPage();

                // Create Chrome Developer Tools session
                const cdpSession = await page.target().createCDPSession();

                // Disable Content Security Policy: needed for pagination to work properly in anonymous mode
                await page.setBypassCSP(true);

                // Tricks to speed up page
                await cdpSession.send('Page.enable');
                await cdpSession.send('Page.setWebLifecycleState', {
                    state: 'active',
                });

                // Set a random user agent
                await page.setUserAgent(getRandomUserAgent());

                // Enable request interception
                await page.setRequestInterception(true);

                const onRequest = async (request: HTTPRequest) => {
                    const requestUrl = request.url();
                    const url = new URL(requestUrl);
                    const domain = url.hostname.split(".").slice(-2).join(".").toLowerCase();

                    // Block tracking and other stuff not useful. Match against host + path:
                    // the pathname alone never contains a host name, so host-qualified
                    // fragments would otherwise never match.
                    const hostAndPath = url.hostname + url.pathname;

                    if (blockedUrlFragments.some(fragment => hostAndPath.includes(fragment))) {
                        return request.abort();
                    }

                    // Block 3rd party domains requests
                    if (!allowedDomains.includes(domain)) {
                        return request.abort();
                    }

                    // If optimization is enabled, block other resource types
                    if (queryOptions.optimize) {
                        if (
                            heavyResourceTypes.includes(request.resourceType())
                            || heavyUrlFragments.some(fragment => requestUrl.includes(fragment))
                        ) {
                            return request.abort();
                        }
                    }

                    await request.continue();
                }

                // Add listener
                page.on("request", onRequest);

                // Error response and rate limiting check
                page.on("response", response => {
                    if (response.status() === 429) {
                        logger.warn(tag, "Error 429 too many requests. You would probably need to use a higher 'slowMo' value and/or reduce the number of concurrent queries.");
                    }
                    else if (response.status() >= 400) {
                        logger.warn(tag, response.status(), `Error for request ${response.request().url()}`)
                    }
                });

                // Build search url
                const searchUrl = this._buildSearchUrl(query.query || "", location, queryOptions);

                // Run strategy
                const runStrategyResult = await this._runStrategy.run(
                    context,
                    page,
                    cdpSession,
                    searchUrl,
                    query,
                    location
                );

                // Check if forced exit is required
                if (runStrategyResult.exit) {
                    logger.warn(tag, "Forced termination");
                    return;
                }

                // Close page
                await page.close();
            }
        }

        // Emit end event
        this.emit(events.scraper.end);
    };

    /**
     * Scrape linkedin jobs
     *
     * Initializes the browser when needed. If another call is already
     * initializing it, waits (polling every 100ms, up to 10s) for that
     * initialization to finish before running the queries.
     *
     * On any error the scraper `error` event is emitted, the browser is closed
     * and the error is rethrown.
     *
     * @param {IQuery | IQuery[]} queries
     * @param {IQueryOptions} [options]
     * @return {Promise<void>}
     * @throws {Error} When waiting for a concurrent initialization exceeds the timeout
     */
    public run = async (
        queries: IQuery | IQuery[],
        options?: IQueryOptions
    ): Promise<void> => {
        try {
            if (this._state === states.notInitialized) {
                await this._initialize();
            }
            else if (this._state === states.initializing) {
                const timeout = 10000;
                const pollingTime = 100;
                let elapsed = 0;

                while (this._state !== states.initialized) {
                    await sleep(pollingTime);
                    elapsed += pollingTime;

                    if (elapsed >= timeout) {
                        throw new Error(`Initialize timeout exceeded: ${timeout}ms`);
                    }
                }
            }

            await this._run(
                queries,
                options
            );
        }
        catch (err: any) {
            // logger.error(err);
            this.emit(events.scraper.error, err);
            await this.close();
            throw err;
        }
    };

    /**
     * Close browser instance
     *
     * Removes the browser listeners, closes the browser and resets the scraper
     * to the not-initialized state (even when closing throws), so a later
     * `run` launches a fresh browser and context.
     *
     * @returns {Promise<void>}
     */
    public close = async (): Promise<void> => {
        try {
            if (this._browser) {
                // Separate statements: closing must not depend on the return value of removeAllListeners()
                this._browser.removeAllListeners();
                await this._browser.close();
            }
        }
        finally {
            this._browser = undefined;
            // The context belongs to the closed browser: drop the stale reference too
            this._context = undefined;
            this._state = states.notInitialized;
        }
    };
}
export { LinkedinScraper };