UNPKG

@harvestapi/scraper

Version:

HarvestAPI provides LinkedIn data scraping tools for real-time, high-performance scraping at a low cost. The API lets you search for LinkedIn `jobs`, `companies`, `profiles`, and `posts` using a wide range of filters.

1,315 lines (1,171 loc) 44.2 kB
import { randomUUID } from 'crypto';
import fs from 'fs-extra';
import { resolve, dirname } from 'path';
import { styleText } from 'node:util';

/* -------------------------------------------------------------------------- */
/* Vendored: decode-uri-component                                             */
/* -------------------------------------------------------------------------- */

const token = '%[a-f0-9]{2}';
const singleMatcher = new RegExp('(' + token + ')|([^%]+?)', 'gi');
const multiMatcher = new RegExp('(' + token + ')+', 'gi');

/**
 * Decodes a list of tokens, recursively splitting the list in two whenever the
 * joined tokens cannot be decoded as a whole.
 *
 * @param {string[]} components - Tokens produced by `singleMatcher`.
 * @param {number} [split] - Index at which to split the list (defaults to 1).
 * @returns {string[]} Decoded (or, where impossible, original) pieces.
 */
function decodeComponents(components, split) {
  try {
    // Try to decode the entire string first
    return [decodeURIComponent(components.join(''))];
  } catch {
    // Do nothing
  }

  if (components.length === 1) {
    return components;
  }

  split = split || 1;

  // Split the array in 2 parts
  const left = components.slice(0, split);
  const right = components.slice(split);

  return Array.prototype.concat.call([], decodeComponents(left), decodeComponents(right));
}

/**
 * Decodes as much of `input` as possible, leaving undecodable sequences as-is.
 *
 * @param {string} input
 * @returns {string}
 */
function decode$1(input) {
  try {
    return decodeURIComponent(input);
  } catch {
    let tokens = input.match(singleMatcher) || [];

    for (let i = 1; i < tokens.length; i++) {
      input = decodeComponents(tokens, i).join('');
      tokens = input.match(singleMatcher) || [];
    }

    return input;
  }
}

/**
 * Fallback decoder used when the built-in `decodeURIComponent` throws: decodes
 * every run of percent-escapes separately and substitutes the results.
 *
 * @param {string} input
 * @returns {string}
 */
function customDecodeURIComponent(input) {
  // Keep track of all the replacements and prefill the map with the `BOM`
  const replaceMap = {
    '%FE%FF': '\uFFFD\uFFFD',
    '%FF%FE': '\uFFFD\uFFFD',
  };

  // `multiMatcher` is a shared /g regex; the loop runs until `exec` returns
  // null, which also resets its `lastIndex` for the next call.
  let match = multiMatcher.exec(input);
  while (match) {
    try {
      // Decode as big chunks as possible
      replaceMap[match[0]] = decodeURIComponent(match[0]);
    } catch {
      const result = decode$1(match[0]);

      if (result !== match[0]) {
        replaceMap[match[0]] = result;
      }
    }

    match = multiMatcher.exec(input);
  }

  // Add `%C2` at the end of the map to make sure it does not replace the combinator before everything else
  replaceMap['%C2'] = '\uFFFD';

  const entries = Object.keys(replaceMap);

  for (const key of entries) {
    // Replace all decoded components
    input = input.replace(new RegExp(key, 'g'), replaceMap[key]);
  }

  return input;
}

/**
 * Decodes a URI component without throwing on malformed escape sequences.
 *
 * @param {string} encodedURI
 * @returns {string}
 * @throws {TypeError} When `encodedURI` is not a string.
 */
function decodeUriComponent(encodedURI) {
  if (typeof encodedURI !== 'string') {
    throw new TypeError('Expected `encodedURI` to be of type `string`, got `' + typeof encodedURI + '`');
  }

  try {
    // Try the built in decoder first
    return decodeURIComponent(encodedURI);
  } catch {
    // Fallback to a more advanced decoder
    return customDecodeURIComponent(encodedURI);
  }
}

/* -------------------------------------------------------------------------- */
/* Vendored: filter-obj / split-on-first                                      */
/* -------------------------------------------------------------------------- */

/**
 * Returns a new object holding only the enumerable own properties of `object`
 * selected by `predicate`.
 *
 * @param {object} object
 * @param {Array<string|symbol>|((key: string|symbol, value: any, object: object) => boolean)} predicate
 *   Either a list of keys to keep or a function deciding per property.
 * @returns {object}
 */
function includeKeys(object, predicate) {
  const result = {};

  if (Array.isArray(predicate)) {
    for (const key of predicate) {
      const descriptor = Object.getOwnPropertyDescriptor(object, key);
      if (descriptor?.enumerable) {
        Object.defineProperty(result, key, descriptor);
      }
    }
  } else {
    // `Reflect.ownKeys()` is required to retrieve symbol properties
    for (const key of Reflect.ownKeys(object)) {
      const descriptor = Object.getOwnPropertyDescriptor(object, key);
      if (descriptor.enumerable) {
        const value = object[key];
        if (predicate(key, value, object)) {
          Object.defineProperty(result, key, descriptor);
        }
      }
    }
  }

  return result;
}

/**
 * Splits `string` on the first occurrence of `separator`.
 *
 * @param {string} string
 * @param {string} separator
 * @returns {string[]} `[before, after]`, or `[]` when either argument is empty
 *   or the separator does not occur.
 * @throws {TypeError} When either argument is not a string.
 */
function splitOnFirst(string, separator) {
  if (!(typeof string === 'string' && typeof separator === 'string')) {
    throw new TypeError('Expected the arguments to be of type `string`');
  }

  if (string === '' || separator === '') {
    return [];
  }

  const separatorIndex = string.indexOf(separator);

  if (separatorIndex === -1) {
    return [];
  }

  return [
    string.slice(0, separatorIndex),
    string.slice(separatorIndex + separator.length),
  ];
}

/* -------------------------------------------------------------------------- */
/* Vendored: query-string                                                     */
/* -------------------------------------------------------------------------- */

const isNullOrUndefined = value => value === null || value === undefined;

// eslint-disable-next-line unicorn/prefer-code-point
const strictUriEncode = string => encodeURIComponent(string).replaceAll(/[!'()*]/g, x => `%${x.charCodeAt(0).toString(16).toUpperCase()}`);

const encodeFragmentIdentifier = Symbol('encodeFragmentIdentifier');

/**
 * Builds the reducer factory `stringify` uses to serialize array values
 * according to `options.arrayFormat`.
 *
 * @param {object} options - Normalized `stringify` options.
 * @returns {(key: string) => (result: string[], value: any) => string[]}
 */
function encoderForArrayFormat(options) {
  switch (options.arrayFormat) {
    case 'index': {
      return key => (result, value) => {
        const index = result.length;

        if (
          value === undefined
          || (options.skipNull && value === null)
          || (options.skipEmptyString && value === '')
        ) {
          return result;
        }

        if (value === null) {
          return [
            ...result,
            [encode(key, options), '[', index, ']'].join(''),
          ];
        }

        return [
          ...result,
          [encode(key, options), '[', encode(index, options), ']=', encode(value, options)].join(''),
        ];
      };
    }

    case 'bracket': {
      return key => (result, value) => {
        if (
          value === undefined
          || (options.skipNull && value === null)
          || (options.skipEmptyString && value === '')
        ) {
          return result;
        }

        if (value === null) {
          return [
            ...result,
            [encode(key, options), '[]'].join(''),
          ];
        }

        return [
          ...result,
          [encode(key, options), '[]=', encode(value, options)].join(''),
        ];
      };
    }

    case 'colon-list-separator': {
      return key => (result, value) => {
        if (
          value === undefined
          || (options.skipNull && value === null)
          || (options.skipEmptyString && value === '')
        ) {
          return result;
        }

        if (value === null) {
          return [
            ...result,
            [encode(key, options), ':list='].join(''),
          ];
        }

        return [
          ...result,
          [encode(key, options), ':list=', encode(value, options)].join(''),
        ];
      };
    }

    case 'comma':
    case 'separator':
    case 'bracket-separator': {
      const keyValueSeparator = options.arrayFormat === 'bracket-separator'
        ? '[]='
        : '=';

      return key => (result, value) => {
        if (
          value === undefined
          || (options.skipNull && value === null)
          || (options.skipEmptyString && value === '')
        ) {
          return result;
        }

        // Translate null to an empty string so that it doesn't serialize as 'null'
        value = value === null ? '' : value;

        if (result.length === 0) {
          return [[encode(key, options), keyValueSeparator, encode(value, options)].join('')];
        }

        return [[result, encode(value, options)].join(options.arrayFormatSeparator)];
      };
    }

    default: {
      return key => (result, value) => {
        if (
          value === undefined
          || (options.skipNull && value === null)
          || (options.skipEmptyString && value === '')
        ) {
          return result;
        }

        if (value === null) {
          return [
            ...result,
            encode(key, options),
          ];
        }

        return [
          ...result,
          [encode(key, options), '=', encode(value, options)].join(''),
        ];
      };
    }
  }
}

/**
 * Builds the function `parse` uses to store one key/value pair into the
 * accumulator according to `options.arrayFormat`.
 *
 * @param {object} options - Normalized `parse` options.
 * @returns {(key: string, value: string|null, accumulator: object) => void}
 */
function parserForArrayFormat(options) {
  let result;

  switch (options.arrayFormat) {
    case 'index': {
      return (key, value, accumulator) => {
        result = /\[(\d*)]$/.exec(key);

        key = key.replace(/\[\d*]$/, '');

        if (!result) {
          accumulator[key] = value;
          return;
        }

        if (accumulator[key] === undefined) {
          accumulator[key] = {};
        }

        accumulator[key][result[1]] = value;
      };
    }

    case 'bracket': {
      return (key, value, accumulator) => {
        result = /(\[])$/.exec(key);
        key = key.replace(/\[]$/, '');

        if (!result) {
          accumulator[key] = value;
          return;
        }

        if (accumulator[key] === undefined) {
          accumulator[key] = [value];
          return;
        }

        accumulator[key] = [...accumulator[key], value];
      };
    }

    case 'colon-list-separator': {
      return (key, value, accumulator) => {
        result = /(:list)$/.exec(key);
        key = key.replace(/:list$/, '');

        if (!result) {
          accumulator[key] = value;
          return;
        }

        if (accumulator[key] === undefined) {
          accumulator[key] = [value];
          return;
        }

        accumulator[key] = [...accumulator[key], value];
      };
    }

    case 'comma':
    case 'separator': {
      return (key, value, accumulator) => {
        const isArray = typeof value === 'string' && value.includes(options.arrayFormatSeparator);
        const isEncodedArray = (typeof value === 'string' && !isArray && decode(value, options).includes(options.arrayFormatSeparator));
        value = isEncodedArray ? decode(value, options) : value;
        const newValue = isArray || isEncodedArray
          ? value.split(options.arrayFormatSeparator).map(item => decode(item, options))
          : (value === null ? value : decode(value, options));
        accumulator[key] = newValue;
      };
    }

    case 'bracket-separator': {
      return (key, value, accumulator) => {
        const isArray = /(\[])$/.test(key);
        key = key.replace(/\[]$/, '');

        if (!isArray) {
          accumulator[key] = value ? decode(value, options) : value;
          return;
        }

        const arrayValue = value === null
          ? []
          : decode(value, options).split(options.arrayFormatSeparator);

        if (accumulator[key] === undefined) {
          accumulator[key] = arrayValue;
          return;
        }

        accumulator[key] = [...accumulator[key], ...arrayValue];
      };
    }

    default: {
      return (key, value, accumulator) => {
        if (accumulator[key] === undefined) {
          accumulator[key] = value;
          return;
        }

        accumulator[key] = [...[accumulator[key]].flat(), value];
      };
    }
  }
}

/**
 * @param {*} value
 * @throws {TypeError} Unless `value` is a one-character string.
 */
function validateArrayFormatSeparator(value) {
  if (typeof value !== 'string' || value.length !== 1) {
    throw new TypeError('arrayFormatSeparator must be single character string');
  }
}

/** Percent-encodes `value` when `options.encode` is set (strictly when `options.strict`). */
function encode(value, options) {
  if (options.encode) {
    return options.strict ? strictUriEncode(value) : encodeURIComponent(value);
  }

  return value;
}

/** Percent-decodes `value` when `options.decode` is set. */
function decode(value, options) {
  if (options.decode) {
    return decodeUriComponent(value);
  }

  return value;
}

/**
 * Sorts an array in place, or converts an index-keyed object into an array of
 * its values ordered by numeric key; other inputs are returned unchanged.
 */
function keysSorter(input) {
  if (Array.isArray(input)) {
    return input.sort();
  }

  if (typeof input === 'object') {
    return keysSorter(Object.keys(input))
      .sort((a, b) => Number(a) - Number(b))
      .map(key => input[key]);
  }

  return input;
}

/** Returns `input` with everything from the first `#` onwards removed. */
function removeHash(input) {
  const hashStart = input.indexOf('#');
  if (hashStart !== -1) {
    input = input.slice(0, hashStart);
  }

  return input;
}

/** Returns the fragment of `url` including the leading `#`, or `''` if none. */
function getHash(url) {
  let hash = '';
  const hashStart = url.indexOf('#');
  if (hashStart !== -1) {
    hash = url.slice(hashStart);
  }

  return hash;
}

/**
 * Converts a parsed string value according to the requested `type` and the
 * `parseBooleans` / `parseNumbers` options.
 *
 * @param {string|null} value
 * @param {object} options
 * @param {string|Function} [type] - `'string'`, `'number'`, or a converter function.
 * @returns {*}
 */
function parseValue(value, options, type) {
  if (type === 'string' && typeof value === 'string') {
    return value;
  }

  if (typeof type === 'function' && typeof value === 'string') {
    return type(value);
  }

  if (options.parseBooleans && value !== null && (value.toLowerCase() === 'true' || value.toLowerCase() === 'false')) {
    return value.toLowerCase() === 'true';
  }

  if (type === 'number' && !Number.isNaN(Number(value)) && (typeof value === 'string' && value.trim() !== '')) {
    return Number(value);
  }

  if (options.parseNumbers && !Number.isNaN(Number(value)) && (typeof value === 'string' && value.trim() !== '')) {
    return Number(value);
  }

  return value;
}

/** Returns the query part of a URL (without `?` and without the fragment). */
function extract(input) {
  input = removeHash(input);
  const queryStart = input.indexOf('?');
  if (queryStart === -1) {
    return '';
  }

  return input.slice(queryStart + 1);
}

/**
 * Parses a query string into a prototype-less object.
 *
 * @param {string} query
 * @param {object} [options] - `decode`, `sort`, `arrayFormat`,
 *   `arrayFormatSeparator`, `parseNumbers`, `parseBooleans`, `types`.
 * @returns {object}
 * @throws {TypeError} When `arrayFormatSeparator` is not a single character.
 */
function parse(query, options) {
  options = {
    decode: true,
    sort: true,
    arrayFormat: 'none',
    arrayFormatSeparator: ',',
    parseNumbers: false,
    parseBooleans: false,
    types: Object.create(null),
    ...options,
  };

  validateArrayFormatSeparator(options.arrayFormatSeparator);

  const formatter = parserForArrayFormat(options);

  // Create an object with no prototype
  const returnValue = Object.create(null);

  if (typeof query !== 'string') {
    return returnValue;
  }

  query = query.trim().replace(/^[?#&]/, '');

  if (!query) {
    return returnValue;
  }

  for (const parameter of query.split('&')) {
    if (parameter === '') {
      continue;
    }

    const parameter_ = options.decode ? parameter.replaceAll('+', ' ') : parameter;

    let [key, value] = splitOnFirst(parameter_, '=');

    if (key === undefined) {
      key = parameter_;
    }

    // Missing `=` should be `null`:
    // http://w3.org/TR/2012/WD-url-20120524/#collect-url-parameters
    value = value === undefined ? null : (['comma', 'separator', 'bracket-separator'].includes(options.arrayFormat) ? value : decode(value, options));
    formatter(decode(key, options), value, returnValue);
  }

  for (const [key, value] of Object.entries(returnValue)) {
    if (typeof value === 'object' && value !== null && options.types[key] !== 'string') {
      for (const [key2, value2] of Object.entries(value)) {
        const type = options.types[key] ? options.types[key].replace('[]', '') : undefined;
        value[key2] = parseValue(value2, options, type);
      }
    } else if (typeof value === 'object' && value !== null && options.types[key] === 'string') {
      returnValue[key] = Object.values(value).join(options.arrayFormatSeparator);
    } else {
      returnValue[key] = parseValue(value, options, options.types[key]);
    }
  }

  if (options.sort === false) {
    return returnValue;
  }

  // TODO: Remove the use of `reduce`.
  // eslint-disable-next-line unicorn/no-array-reduce
  return (options.sort === true ? Object.keys(returnValue).sort() : Object.keys(returnValue).sort(options.sort)).reduce((result, key) => {
    const value = returnValue[key];
    result[key] = Boolean(value) && typeof value === 'object' && !Array.isArray(value) ? keysSorter(value) : value;
    return result;
  }, Object.create(null));
}

/**
 * Serializes an object into a query string (without a leading `?`).
 *
 * @param {object} object
 * @param {object} [options] - `encode`, `strict`, `arrayFormat`,
 *   `arrayFormatSeparator`, `skipNull`, `skipEmptyString`, `sort`.
 * @returns {string}
 * @throws {TypeError} When `arrayFormatSeparator` is not a single character.
 */
function stringify(object, options) {
  if (!object) {
    return '';
  }

  options = {
    encode: true,
    strict: true,
    arrayFormat: 'none',
    arrayFormatSeparator: ',',
    ...options,
  };

  validateArrayFormatSeparator(options.arrayFormatSeparator);

  const shouldFilter = key => (
    (options.skipNull && isNullOrUndefined(object[key]))
    || (options.skipEmptyString && object[key] === '')
  );

  const formatter = encoderForArrayFormat(options);

  const objectCopy = {};

  for (const [key, value] of Object.entries(object)) {
    if (!shouldFilter(key)) {
      objectCopy[key] = value;
    }
  }

  const keys = Object.keys(objectCopy);

  if (options.sort !== false) {
    keys.sort(options.sort);
  }

  return keys.map(key => {
    const value = object[key];

    if (value === undefined) {
      return '';
    }

    if (value === null) {
      return encode(key, options);
    }

    if (Array.isArray(value)) {
      if (value.length === 0 && options.arrayFormat === 'bracket-separator') {
        return encode(key, options) + '[]';
      }

      return value
        .reduce(formatter(key), [])
        .join('&');
    }

    return encode(key, options) + '=' + encode(value, options);
  }).filter(x => x.length > 0).join('&');
}

/**
 * Splits a URL into its base, parsed query and (optionally) fragment.
 *
 * @param {string} url
 * @param {object} [options] - `parse` options plus `parseFragmentIdentifier`.
 * @returns {{url: string, query: object, fragmentIdentifier?: string}}
 */
function parseUrl(url, options) {
  options = {
    decode: true,
    ...options,
  };

  let [url_, hash] = splitOnFirst(url, '#');

  if (url_ === undefined) {
    url_ = url;
  }

  return {
    url: url_?.split('?')?.[0] ?? '',
    query: parse(extract(url), options),
    ...(options && options.parseFragmentIdentifier && hash ? {fragmentIdentifier: decode(hash, options)} : {}),
  };
}

/**
 * Builds a URL string from `{url, query, fragmentIdentifier}`; query parameters
 * already present in `object.url` are merged with (and overridden by) `object.query`.
 *
 * @param {{url: string, query?: object, fragmentIdentifier?: string}} object
 * @param {object} [options] - `stringify` options.
 * @returns {string}
 */
function stringifyUrl(object, options) {
  options = {
    encode: true,
    strict: true,
    [encodeFragmentIdentifier]: true,
    ...options,
  };

  const url = removeHash(object.url).split('?')[0] || '';
  const queryFromUrl = extract(object.url);

  const query = {
    ...parse(queryFromUrl, {sort: false}),
    ...object.query,
  };

  let queryString = stringify(query, options);
  queryString &&= `?${queryString}`;

  let hash = getHash(object.url);
  if (typeof object.fragmentIdentifier === 'string') {
    const urlObjectForFragmentEncode = new URL(url);
    urlObjectForFragmentEncode.hash = object.fragmentIdentifier;
    hash = options[encodeFragmentIdentifier] ? urlObjectForFragmentEncode.hash : `#${object.fragmentIdentifier}`;
  }

  return `${url}${queryString}${hash}`;
}

/** Returns `input` keeping only the query parameters selected by `filter`. */
function pick(input, filter, options) {
  options = {
    parseFragmentIdentifier: true,
    [encodeFragmentIdentifier]: false,
    ...options,
  };

  const {url, query, fragmentIdentifier} = parseUrl(input, options);

  return stringifyUrl({
    url,
    query: includeKeys(query, filter),
    fragmentIdentifier,
  }, options);
}

/** Returns `input` with the query parameters selected by `filter` removed. */
function exclude(input, filter, options) {
  const exclusionFilter = Array.isArray(filter) ? key => !filter.includes(key) : (key, value) => !filter(key, value);

  return pick(input, exclusionFilter, options);
}

const queryString = /*#__PURE__*/Object.freeze({
  __proto__: null,
  exclude: exclude,
  extract: extract,
  parse: parse,
  parseUrl: parseUrl,
  pick: pick,
  stringify: stringify,
  stringifyUrl: stringifyUrl
});

/* -------------------------------------------------------------------------- */
/* HarvestAPI client                                                          */
/* -------------------------------------------------------------------------- */

/**
 * Thin HTTP client for the HarvestAPI REST endpoints.
 */
class BaseScraper {
  /**
   * @param {object} options
   * @param {string} options.apiKey - Sent as the `X-API-KEY` header.
   * @param {string} [options.baseUrl] - Overrides the default API origin; one
   *   trailing slash is stripped.
   * @param {object} [options.logger] - Object with an `error` method; defaults to `console`.
   * @param {object} [options.addHeaders] - Extra headers sent with every request.
   */
  constructor(options) {
    this.options = options;
    this.apiBaseUrl = options.baseUrl || 'https://api.harvest-api.com';
    if (this.apiBaseUrl.endsWith('/')) {
      this.apiBaseUrl = this.apiBaseUrl.slice(0, -1);
    }
    this.logger = options.logger || console;
  }

  /**
   * Performs one API request and returns the parsed JSON body.
   *
   * Never throws for network or HTTP failures: on a missing key/path, a failed
   * fetch, an unparsable body or a non-2xx response it resolves to
   * `{ error, status }` instead.
   *
   * @param {object} request
   * @param {string} request.path - Endpoint path; a leading `/` is added if missing.
   * @param {object} [request.params] - Query parameters. A truthy
   *   `params.addHeaders` is treated as extra headers, not as a query parameter.
   * @param {object} [request.addHeaders] - Extra headers for this request.
   * @param {string} [request.method='GET']
   * @param {*} [request.body] - JSON-serialized when truthy.
   * @returns {Promise<object>}
   */
  async fetchApi({ path, params, addHeaders, method = 'GET', body }) {
    if (!this.options.apiKey) {
      this.logger.error('API Key is required');
      return {
        error: 'API Key is required to fetch API',
      };
    }
    if (!path) {
      this.logger.error('Path is required');
      return {
        error: 'Path is required to fetch API',
      };
    }
    if (!path.startsWith('/')) {
      path = `/${path}`;
    }

    // `params` is optional (e.g. `LinkedinScraper.test()` passes none), so it
    // must be null-checked before use. The header entry is split off into a
    // copy rather than deleted from the caller's object.
    let queryParams = params ?? {};
    if (queryParams.addHeaders) {
      const { addHeaders: paramHeaders, ...rest } = queryParams;
      addHeaders = { ...addHeaders, ...paramHeaders };
      queryParams = rest;
    }

    if (Object.values(queryParams).filter(Boolean).length > 0) {
      path += `?${queryString.stringify(queryParams, {
        arrayFormat: 'comma',
        skipNull: true,
        skipEmptyString: true,
      })}`;
    }

    const apiUrl = `${this.apiBaseUrl}${path}`;
    let error = null;

    const response = await fetch(apiUrl, {
      method: method || 'GET',
      headers: {
        'Content-Type': 'application/json',
        'X-API-KEY': this.options.apiKey,
        ...this.options.addHeaders,
        ...addHeaders,
      },
      body: body ? JSON.stringify(body) : undefined,
    }).catch((e) => {
      this.logger.error('Error fetching API:', e);
      error = e;
      return null;
    });

    const data = await response?.json()?.catch((e) => {
      this.logger.error('Error parsing response:', e);
      error = e;
      return null;
    });

    if (!response?.ok) {
      return {
        error: data?.error?.error || data?.error || error,
        status: response?.status,
      };
    }
    return data;
  }
}

/**
 * Wraps `fn` so that at most `concurrency` invocations run at the same time;
 * further calls wait in FIFO order.
 *
 * @param {number} concurrency - Maximum number of simultaneous invocations (> 0).
 * @param {(...args: any[]) => any} fn - Task to run; may be async.
 * @param {object} [opts] - Currently unused; kept for interface compatibility.
 * @returns {(...args: any[]) => Promise<any>} Settles with the outcome of `fn(...args)`.
 * @throws {Error} When `concurrency` is not a number greater than zero.
 */
function createConcurrentQueues(concurrency, fn, opts) {
  const limit = Number(concurrency);
  // A non-positive limit would leave every task queued forever.
  if (!(limit > 0)) {
    throw new Error(`createConcurrentQueues: Concurrency must be a number > 0, provided: ${concurrency}`);
  }

  let activePromises = 0;
  const pendingTasks = [];

  return (...args) => new Promise((resolve, reject) => {
    const execute = async () => {
      activePromises++;
      try {
        resolve(await fn(...args));
      } catch (error) {
        reject(error);
      } finally {
        activePromises--;
        const nextTask = pendingTasks.shift();
        if (nextTask) {
          // Start the next task on a fresh macrotask.
          setTimeout(nextTask, 1);
        }
      }
    };

    if (activePromises < limit) {
      execute();
    } else {
      pendingTasks.push(execute);
    }
  });
}

/** Quotes `name` as an SQL identifier, doubling any embedded double quotes. */
function quoteSqlIdentifier(name) {
  return `"${String(name).replaceAll('"', '""')}"`;
}

/**
 * Paginates through a listing endpoint, optionally fetches details for every
 * item, and delivers the results to a JSON file, an SQLite database or a callback.
 */
class ListingScraper {
  /**
   * @param {object} options
   * @param {(args: object) => Promise<object>} options.fetchList - Fetches one listing page.
   * @param {(args: object) => Promise<object>} options.fetchItem - Fetches one item's details.
   * @param {string} options.entityName - Used in log lines and default file/table names.
   * @param {boolean} [options.scrapeDetails] - Call `fetchItem` for every listed item.
   * @param {'json'|'sqlite'|'callback'} [options.outputType='sqlite']
   * @param {object} [options.optionsOverride] - Merged over `options`.
   *
   * Other options read by this class: `tableName`, `outputDir`, `filename`,
   * `startPage`, `takePages`, `maxPageNumber`, `maxItems`, `warnPageLimit`,
   * `overrideConcurrency`, `overridePageConcurrency`, `sessionId`,
   * `addListingHeaders`, `addItemHeaders`, `disableLog`, `disableErrorLog`,
   * `onItemScraped`, `onPageFetched`, `onFirstPageFetched`.
   */
  constructor(options) {
    // Work on a copy so that the defaults applied below do not leak into the
    // caller's object.
    this.options = { ...options, ...options.optionsOverride };
    if (!this.options.outputType) {
      this.options.outputType = 'sqlite';
    }

    this.id = randomUUID();
    this.startTime = new Date();
    this.inMemoryItems = [];
    this.stats = {
      pages: 0,
      pagesSuccess: 0,
      items: 0,
      itemsSuccess: 0,
      requests: 0,
      requestsStartTime: new Date(),
    };
    this.db = null;
    this.sqliteDatabaseOpenPromise = null;
    this.done = false;
    this.scrapePagesDone = false;
    this.error = null;
    this.scrapedItems = {};
    this.paginationToken = null;
    this.undefinedPagination = false;

    /**
     * Delivers one scraped item to the configured output. The user callback is
     * awaited only for `outputType: 'callback'`; for `json` and `sqlite` it is
     * invoked without waiting for it.
     */
    this.onItemScraped = async ({ item, ...apiArgs }) => {
      const logger = {
        log: (...args) => this.log(...args),
        error: (...args) => this.errorLog(...args),
      };
      const { outputType } = this.options;

      if (outputType === 'json') {
        this.inMemoryItems.push(item);
        void this.options.onItemScraped?.({ item, logger, ...apiArgs });
      }
      if (outputType === 'sqlite') {
        await this.insertSqliteItem(item).catch((error) => {
          this.errorLog('Error inserting item to SQLite:', error);
        });
        void this.options.onItemScraped?.({ item, logger, ...apiArgs });
      }
      if (outputType === 'callback') {
        await this.options.onItemScraped?.({ item, logger, ...apiArgs });
      }
    };

    this.tableName = this.options.tableName || `${this.options.entityName}_${this.id}`;
    this.filePath = resolve(
      this.options.outputDir || resolve(process.cwd(), 'output'),
      this.options.filename
        || `${this.startTime.toISOString().replace(/:/g, '-').replace(/\./g, '-')}_${this.options.entityName}_${this.id}`,
    );
  }

  /** Timestamped `console.log`, silenced by `options.disableLog`. */
  log(...args) {
    if (!this.options.disableLog) {
      console.log(`[${new Date().toISOString()}]`, ...args);
    }
  }

  /** Timestamped `console.error`, silenced by `options.disableErrorLog`. */
  errorLog(...args) {
    if (!this.options.disableErrorLog) {
      console.error(`[${new Date().toISOString()}]`, ...args);
    }
  }

  /** Logs `this.error` (a value or an array of values) if one was recorded. */
  reportError() {
    if (this.error) {
      const errors = Array.isArray(this.error) ? this.error : [this.error];
      this.errorLog(...errors);
    }
  }

  /** Marks the run as finished when the `maxItems` budget is used up; returns whether it is. */
  checkMaxItemsReached() {
    if (this.options.maxItems && this.stats.itemsSuccess + 1 > this.options.maxItems) {
      this.done = true;
      this.error = `Max items limit reached: ${this.options.maxItems}`;
      return true;
    }
    return false;
  }

  /**
   * Runs the whole scrape: fetches the first page to learn the pagination and
   * concurrency, then processes the remaining pages through bounded queues.
   *
   * @returns {Promise<object|undefined>} The stats object, or `undefined` when
   *   the first page yielded no elements.
   */
  async scrapeStart() {
    this.stats = {
      pages: 0,
      pagesSuccess: 0,
      items: 0,
      itemsSuccess: 0,
      requests: 0,
      requestsStartTime: new Date(),
    };
    this.paginationToken = null;
    this.scrapePagesDone = false;

    const startPageNumber = this.options.startPage || 1;
    const firstPage = await this.fetchPage({ page: startPageNumber });
    this.options.onFirstPageFetched?.({ data: firstPage });

    const { maxPageNumber } = this.options;
    let totalPages = firstPage?.pagination?.totalPages || 0;
    this.paginationToken = firstPage?.pagination?.paginationToken || null;

    if (maxPageNumber && totalPages > maxPageNumber) {
      totalPages = maxPageNumber;
    }
    if (!totalPages && firstPage?.elements?.length) {
      // The API returned items but no page count: walk up to the page limit.
      totalPages = maxPageNumber;
      this.undefinedPagination = true;
    } else {
      this.undefinedPagination = false;
    }

    if (
      this.options.warnPageLimit
      && totalPages === maxPageNumber
      && firstPage?.pagination?.pageSize
      && firstPage.pagination.totalElements
    ) {
      const totalAllowedItems = maxPageNumber * firstPage.pagination.pageSize;
      const totalItems = firstPage.pagination.totalElements;
      if (totalItems > totalAllowedItems) {
        console.warn('\n'
          + styleText('bgYellow', ' [WARNING] \n')
          + `The search results are limited to ${totalAllowedItems} items (out of total ${totalItems}) because LinkedIn does not allow to scrape more for one query. \n`
          + `Which means you will not be able to extract all data for this exact query. \n`
          + 'Consider splitting your query into multiple queries applying more filters. \n'
          + `For example do multiple runs for locations of specific cities, instead of one run targeting entire country or region. `
          + '\n');
      }
    }

    const concurrency = this.options?.overrideConcurrency || firstPage?.user?.requestsConcurrency || 1;
    this.log(`Scraping ${this.options.entityName} with ${concurrency} concurrent ${concurrency === 1 ? 'worker' : 'workers'}... Total pages: ${totalPages}. Total items: ${firstPage?.pagination?.totalElements || firstPage?.elements?.length || 0}`);

    if (!firstPage?.elements?.length) {
      this.done = true;
      this.reportError();
      this.errorLog('Error fetching first page or no items found. Exiting.', firstPage?.error ? JSON.stringify(firstPage?.error, null, 2) : '');
      return;
    }

    this.scrapePageQueue = createConcurrentQueues(
      this.options?.overridePageConcurrency || 2,
      (args) => this.scrapePage(args),
    );

    this.fetchItemQueue = createConcurrentQueues(concurrency, async ({ item, pagination }) => {
      if (this.checkMaxItemsReached()) {
        return null;
      }
      const result = await this.options
        .fetchItem({
          item,
          addHeaders: this.options.addItemHeaders,
        })?.catch((error) => {
          this.errorLog('Error scraping item', error);
          return null;
        });
      if (!result) {
        return null;
      }
      return {
        ...result,
        pagination,
      };
    });

    // SQLite inserts are funnelled through a single-worker queue.
    this.onItemScrapedQueue = createConcurrentQueues(
      this.options.outputType === 'sqlite' ? 1 : concurrency,
      ({ item, ...rest }) => this.onItemScraped({ item, ...rest }),
    );

    this.stats.requestsStartTime = new Date();
    this.stats.pages = 1;
    this.stats.pagesSuccess = 1;

    if (this.options.outputType === 'sqlite') {
      this.sqliteDatabaseOpenPromise = this.createSqliteDatabase();
    }

    let lastPageNumber = totalPages;
    if (this.options.takePages && this.options.takePages > 0) {
      lastPageNumber = Math.min(startPageNumber + this.options.takePages - 1, totalPages);
    }

    const promises = [];
    for (let page = startPageNumber; page <= lastPageNumber; page++) {
      promises.push(this.scrapePageQueue({
        page,
        // The first page was already fetched above; reuse it.
        scrapedList: page === startPageNumber ? firstPage : undefined,
      }));
    }
    await Promise.all(promises);
    await this.finalize();

    this.log(`Finished scraping ${this.options.entityName}. Scraped pages: ${this.stats.pages}. Scraped items: ${this.stats.itemsSuccess}. Total requests: ${this.stats.requests}.`);
    this.reportError();
    return this.stats;
  }

  /**
   * Processes one listing page: fetches it (unless `scrapedList` is supplied)
   * and scrapes its items. A page without items stops further pages.
   *
   * @param {{page: number, scrapedList?: object}} args
   * @returns {Promise<void>}
   */
  async scrapePage({ page, scrapedList }) {
    if (this.done || this.scrapePagesDone) {
      return;
    }
    const list = scrapedList ? scrapedList : await this.fetchPage({ page });
    if (this.done) {
      return;
    }

    this.paginationToken = list?.pagination?.paginationToken || null;

    let details = [];
    if (list?.elements?.length) {
      details = await this.scrapePageItems({ list });
    } else {
      this.scrapePagesDone = true;
    }
    if (this.done) {
      return;
    }

    this.scrapePagesDone = !details?.length;
    this.log(`Scraped ${this.options.entityName} page ${page}. Items found: ${details.length}. Requests/second: ${(this.stats.requests / ((Date.now() - this.stats.requestsStartTime.getTime()) / 1000)).toFixed(2)}`);
  }

  /**
   * Fetches one listing page through `options.fetchList` and updates the stats.
   *
   * @param {{page: number}} args
   * @returns {Promise<object|null>} The page, or `null` when fetching threw or
   *   the API answered with status 402 (which also ends the run).
   */
  async fetchPage({ page }) {
    this.log(`Scraping page ${page} of ${this.options.entityName}...`);
    const result = await this.options
      .fetchList({
        page,
        paginationToken: this.paginationToken,
        sessionId: this.options.sessionId,
        addHeaders: this.options.addListingHeaders,
      })
      .catch((error) => {
        this.errorLog('Error fetching page', page, error);
        return null;
      });

    // Report the page that was actually fetched.
    this.options.onPageFetched?.({ page, data: result });

    if (result?.status === 402) {
      this.done = true;
      this.error = result.error || 'Request limit exceeded - upgrade your plan';
      return null;
    }

    this.stats.pages++;
    this.stats.requests++;
    if (result?.entityId) {
      this.stats.pagesSuccess++;
    }
    return result;
  }

  /**
   * Scrapes every not-yet-seen item of a listing page and forwards the
   * successful ones to the output queue.
   *
   * @param {{list: object}} args - `list.elements` holds the page's items.
   * @returns {Promise<object[]>} The elements delivered from this page.
   */
  async scrapePageItems({ list }) {
    if (!list?.elements) {
      return [];
    }

    const details = [];
    const itemPromises = list.elements.map(async (item) => {
      let itemDetails = null;
      this.stats.items++;

      // Skip items without an id and items already seen on another page.
      if (!item?.id || this.scrapedItems[item.id]) {
        return null;
      }
      this.scrapedItems[item.id] = { found: true, scraped: false };

      if (this.options.scrapeDetails) {
        itemDetails = await this.fetchItemQueue({ item, pagination: list.pagination });
        if (itemDetails?.status === 402) {
          this.done = true;
          this.error = itemDetails?.error || 'Request limit exceeded - upgrade your plan';
          return null;
        }
      } else {
        itemDetails = {
          entityId: item?.id,
          element: item,
          status: list.status,
          error: list.error,
          query: list.query,
          pagination: list.pagination,
        };
      }

      if (this.options.scrapeDetails && !itemDetails?.skipped) {
        this.stats.requests++;
      }
      if (itemDetails?.done) {
        this.scrapePagesDone = true;
        this.done = true;
      }

      if (itemDetails?.element && itemDetails.entityId) {
        if (this.checkMaxItemsReached()) {
          return null;
        }
        if (!this.scrapedItems[item.id].scraped) {
          this.scrapedItems[item.id].scraped = true;
          this.stats.itemsSuccess++;
          await this.onItemScrapedQueue({ item: itemDetails.element, ...itemDetails });
          details.push(itemDetails.element);
        }
      }
    });

    await Promise.all(itemPromises).catch((error) => {
      this.errorLog('Error scraping items', error);
    });

    this.checkMaxItemsReached();
    return details;
  }

  /**
   * Opens (creating it if needed) the output SQLite database and its table.
   * Failures are recorded in `this.error` and end the run instead of throwing.
   *
   * @returns {Promise<void>}
   */
  async createSqliteDatabase() {
    try {
      // `require` does not exist in native ES modules; use it when the host
      // (bundler / CommonJS wrapper) provides it and fall back to `import()`.
      const hasRequire = typeof require === 'function';
      const sqliteModule = hasRequire ? require('sqlite') : await import('sqlite');
      const sqlite3Module = hasRequire ? require('sqlite3') : await import('sqlite3');
      const open = sqliteModule.open ?? sqliteModule.default?.open;
      const sqlite3 = sqlite3Module.default ?? sqlite3Module;

      await fs.ensureDir(dirname(this.filePath));
      this.db = await open({
        filename: `${this.filePath}.sqlite`,
        driver: sqlite3.Database,
      });
      await this.db.exec(`CREATE TABLE IF NOT EXISTS ${quoteSqlIdentifier(this.tableName)} (db_id INTEGER PRIMARY KEY AUTOINCREMENT)`);
    } catch (error) {
      this.error = ['Error creating SQLite database:', error];
      this.done = true;
    }
  }

  /**
   * Inserts one item as a row, adding a TEXT column for every key the table
   * does not have yet. Object values are stored as JSON, everything else via
   * `String()`.
   *
   * @param {object} item
   * @returns {Promise<void>}
   * @throws {Error} When the database could not be opened.
   */
  async insertSqliteItem(item) {
    await this.sqliteDatabaseOpenPromise;
    if (!this.db) {
      throw new Error('SQLite database is not open');
    }

    const table = quoteSqlIdentifier(this.tableName);
    const existingColumns = await this.db.all(`PRAGMA table_info(${table})`);
    const existingColumnNames = existingColumns.map((col) => col.name);
    const itemKeys = Object.keys(item);

    for (const key of itemKeys) {
      if (!existingColumnNames.includes(key)) {
        await this.db.exec(`ALTER TABLE ${table} ADD COLUMN ${quoteSqlIdentifier(key)} TEXT`);
      }
    }

    if (itemKeys.length === 0) {
      // `INSERT ... () VALUES ()` is not valid SQL.
      await this.db.run(`INSERT INTO ${table} DEFAULT VALUES`);
      return;
    }

    const columns = itemKeys.map(quoteSqlIdentifier).join(', ');
    const placeholders = itemKeys.map(() => '?').join(', ');
    const insertSQL = `INSERT INTO ${table} (${columns}) VALUES (${placeholders})`;
    await this.db.run(
      insertSQL,
      Object.values(item).map((value) => (typeof value === 'object' ? JSON.stringify(value) : String(value))),
    );
  }

  /**
   * Flushes the outputs: writes the JSON file for `outputType: 'json'` and
   * closes the SQLite database if one is open.
   *
   * @returns {Promise<void>}
   */
  async finalize() {
    if (this.options.outputType === 'json') {
      // Awaited so that the run only completes once the file is on disk.
      await fs.outputJson(`${this.filePath}.json`, {
        stats: this.stats,
        list: this.inMemoryItems,
      }, { spaces: 2 });
    }
    if (this.db) {
      await this.db.close();
    }
  }
}

/**
 * `fetchItem` implementation for listings whose elements are already complete:
 * wraps the listed item itself instead of requesting details.
 */
const passThroughItem = async ({ item }) => (item?.id
  ? { entityId: item.id, element: item }
  : { skipped: true });

/**
 * High-level LinkedIn client: `get*` / `search*` methods perform a single API
 * request, `scrape*` methods walk all result pages through a `ListingScraper`.
 */
class LinkedinScraper {
  /**
   * @param {object} options - Passed to `BaseScraper` (`apiKey`, `baseUrl`, `logger`, `addHeaders`).
   */
  constructor(options) {
    this.options = options;
    this.scraper = new BaseScraper(options);
  }

  async getProfile(params) {
    return this.scraper.fetchApi({ path: 'linkedin/profile', params });
  }

  async getProfileId(params) {
    return this.scraper.fetchApi({ path: 'linkedin/profile-id', params });
  }

  /** POSTs `profile` as the request body; the remaining fields become query parameters. */
  async searchProfileEmail({ profile, ...params }) {
    return this.scraper.fetchApi({
      path: 'linkedin/email-search-by-profile',
      params,
      method: 'POST',
      body: {
        profile,
      },
    });
  }

  async searchProfiles(params) {
    return this.scraper.fetchApi({ path: 'linkedin/profile-search', params });
  }

  async getCompany(params) {
    return this.scraper.fetchApi({ path: 'linkedin/company', params });
  }

  async searchCompanies(params) {
    return this.scraper.fetchApi({ path: 'linkedin/company-search', params });
  }

  async getJob(params) {
    return this.scraper.fetchApi({ path: 'linkedin/job', params });
  }

  async searchJobs(params) {
    return this.scraper.fetchApi({ path: 'linkedin/job-search', params });
  }

  async searchPosts(params) {
    return this.scraper.fetchApi({ path: 'linkedin/post-search', params });
  }

  async getPostReactions(params) {
    return this.scraper.fetchApi({ path: 'linkedin/post-reactions', params });
  }

  async getPostComments(params) {
    return this.scraper.fetchApi({ path: 'linkedin/post-comments', params });
  }

  async getProfileComments(params) {
    return this.scraper.fetchApi({ path: 'linkedin/profile-comments', params });
  }

  async getProfileReactions(params) {
    return this.scraper.fetchApi({ path: 'linkedin/profile-reactions', params });
  }

  /**
   * Scrapes job search results, fetching details for each job.
   * The page limit is fixed at 40 and cannot be overridden through `options`.
   *
   * @param {{query: object}} args - `query` plus any `ListingScraper` options.
   * @returns {Promise<object|undefined>} See `ListingScraper#scrapeStart`.
   */
  async scrapeJobs({ query, ...options }) {
    return new ListingScraper({
      fetchList: (listParams) => this.searchJobs({ ...query, ...listParams }),
      fetchItem: async ({ item, ...rest }) => (item?.id
        ? this.getJob({ jobId: item.id, ...rest })
        : { skipped: true }),
      scrapeDetails: true,
      entityName: 'jobs',
      ...options,
      maxPageNumber: 40,
    }).scrapeStart();
  }

  /** Scrapes company search results with details (page limit fixed at 100). */
  async scrapeCompanies({ query, ...options }) {
    return new ListingScraper({
      fetchList: (listParams) => this.searchCompanies({ ...query, ...listParams }),
      fetchItem: async ({ item, ...rest }) => (item?.universalName
        ? this.getCompany({ universalName: item.universalName, ...rest })
        : { skipped: true }),
      scrapeDetails: true,
      entityName: 'companies',
      ...options,
      maxPageNumber: 100,
    }).scrapeStart();
  }

  /** Scrapes profile search results with details (page limit fixed at 100). */
  async scrapeProfiles({ query, findEmail, ...options }) {
    return new ListingScraper({
      fetchList: (listParams) => this.searchProfiles({ ...query, ...listParams }),
      fetchItem: async ({ item, ...rest }) => (item?.publicIdentifier
        ? this.getProfile({ publicIdentifier: item.publicIdentifier, findEmail, ...rest })
        : { skipped: true }),
      scrapeDetails: true,
      entityName: 'profiles',
      ...options,
      maxPageNumber: 100,
    }).scrapeStart();
  }

  /** Scrapes post search results; listed items are emitted as-is. */
  async scrapePosts({ query, ...options }) {
    return new ListingScraper({
      fetchList: (listParams) => this.searchPosts({ ...query, ...listParams }),
      fetchItem: passThroughItem,
      scrapeDetails: false,
      entityName: 'posts',
      ...options,
      maxPageNumber: 100,
    }).scrapeStart();
  }

  /** Scrapes the reactions of a post; listed items are emitted as-is. */
  async scrapePostReactions({ query, ...options }) {
    return new ListingScraper({
      fetchList: (listParams) => this.getPostReactions({ ...query, ...listParams }),
      fetchItem: passThroughItem,
      scrapeDetails: false,
      entityName: 'post-reactions',
      ...options,
      maxPageNumber: 100,
    }).scrapeStart();
  }

  /** Scrapes the comments of a post; listed items are emitted as-is. */
  async scrapePostComments({ query, ...options }) {
    return new ListingScraper({
      fetchList: (fetchArgs) => this.getPostComments({ ...query, ...fetchArgs }),
      fetchItem: passThroughItem,
      scrapeDetails: false,
      entityName: 'post-comments',
      ...options,
      maxPageNumber: 100,
    }).scrapeStart();
  }

  /** Scrapes the comments made by a profile; listed items are emitted as-is. */
  async scrapeProfileComments({ query, ...options }) {
    return new ListingScraper({
      fetchList: (fetchArgs) => this.getProfileComments({ ...query, ...fetchArgs }),
      fetchItem: passThroughItem,
      scrapeDetails: false,
      entityName: 'profile-comments',
      ...options,
      maxPageNumber: 100,
    }).scrapeStart();
  }

  /** Scrapes the reactions made by a profile; listed items are emitted as-is. */
  async scrapeProfileReactions({ query, ...options }) {
    return new ListingScraper({
      fetchList: (fetchArgs) => this.getProfileReactions({ ...query, ...fetchArgs }),
      fetchItem: passThroughItem,
      scrapeDetails: false,
      entityName: 'profile-reactions',
      ...options,
      maxPageNumber: 100,
    }).scrapeStart();
  }

  async searchSalesNavigatorLeads(params) {
    return this.scraper.fetchApi({ path: 'linkedin-sales-nav/lead-search', params });
  }

  /**
   * Scrapes Sales Navigator lead search results, fetching each lead's profile
   * by `profileId`. Enables the page-limit warning by default.
   */
  async scrapeSalesNavigatorLeads({ query, findEmail, ...options }) {
    return new ListingScraper({
      fetchList: (listParams) => this.searchSalesNavigatorLeads({ ...query, ...listParams }),
      fetchItem: async ({ item, ...rest }) => (item?.id
        ? this.getProfile({ profileId: item.id, findEmail, ...rest })
        : { skipped: true }),
      scrapeDetails: true,
      entityName: 'profiles',
      warnPageLimit: true,
      ...options,
      maxPageNumber: 100,
    }).scrapeStart();
  }

  async getGroup(params) {
    return this.scraper.fetchApi({ path: 'linkedin/group', params });
  }

  async searchGroups(params) {
    return this.scraper.fetchApi({ path: 'linkedin/group-search', params });
  }

  /** Calls the `linkedin/test` endpoint without parameters. */
  async test() {
    return this.scraper.fetchApi({ path: 'linkedin/test' });
  }
}

/**
 * Convenience factory for `LinkedinScraper`.
 *
 * @param {object} options - See `LinkedinScraper`.
 * @returns {LinkedinScraper}
 */
function createLinkedinScraper(options) {
  return new LinkedinScraper(options);
}

export { LinkedinScraper, createLinkedinScraper };
//# sourceMappingURL=index.esm.js.map