@harvestapi/scraper
Version:
HarvestAPI provides LinkedIn data scraping tools for real-time, high-performance scraping at a low cost. The API lets you search for LinkedIn `jobs`, `companies`, `profiles`, and `posts` using a wide range of filters.
1,318 lines (1,173 loc) • 44.3 kB
JavaScript
;
var crypto = require('crypto');
var fs = require('fs-extra');
var path = require('path');
var node_util = require('node:util');
// Pattern source for a single percent-encoded byte such as `%2f`; the matchers
// built from it below use the `i` flag, so upper-case hex digits match too.
const token = '%[a-f0-9]{2}';
// Global matcher for either one percent-encoded byte or a lazily matched run of
// characters that are not `%`.
const singleMatcher = new RegExp('(' + token + ')|([^%]+?)', 'gi');
// Global matcher for a run of one or more consecutive percent-encoded bytes.
const multiMatcher = new RegExp('(' + token + ')+', 'gi');
/**
 * Decode a list of percent-encoded tokens, salvaging as much as possible.
 *
 * The tokens are first joined and decoded in one go. When that fails, the list
 * is cut in two and each half is decoded recursively; a lone token that cannot
 * be decoded is returned untouched.
 *
 * @param {string[]} components - Tokens to decode.
 * @param {number} [split] - Index at which to cut the list (defaults to 1).
 * @returns {string[]} Decoded pieces, in order.
 */
function decodeComponents(components, split) {
  let decodedWhole;
  let wholeDecoded = true;
  try {
    // Happy path: the joined tokens form a valid sequence.
    decodedWhole = decodeURIComponent(components.join(''));
  } catch {
    wholeDecoded = false;
  }
  if (wholeDecoded) {
    return [decodedWhole];
  }

  // A single undecodable token cannot be split any further.
  if (components.length === 1) {
    return components;
  }

  const pivot = split || 1;
  const head = decodeComponents(components.slice(0, pivot));
  const tail = decodeComponents(components.slice(pivot));
  return [].concat(head, tail);
}
/**
 * Decode `input`, falling back to piecewise decoding when the built-in decoder throws.
 *
 * In the fallback, `input` is tokenized with `singleMatcher`, and for each split
 * index the tokens are passed to `decodeComponents`, re-joined, and re-tokenized.
 *
 * @param {string} input - Percent-encoded text.
 * @returns {string} The decoded text; parts that cannot be decoded are left as they were.
 */
function decode$1(input) {
try {
return decodeURIComponent(input);
} catch {
let tokens = input.match(singleMatcher) || [];
// `tokens` is recomputed on every pass, so the loop bound follows the
// current (possibly shorter) token list.
for (let i = 1; i < tokens.length; i++) {
input = decodeComponents(tokens, i).join('');
tokens = input.match(singleMatcher) || [];
}
return input;
}
}
/**
 * Fallback decoder used when `decodeURIComponent` rejects the whole input.
 *
 * Every run of percent-encoded bytes found by `multiMatcher` is decoded on its
 * own (via `decodeURIComponent`, then `decode$1`), collected in a replacement
 * map, and finally substituted back into `input`.
 *
 * @param {string} input - Percent-encoded text.
 * @returns {string} `input` with every decodable run replaced.
 */
function customDecodeURIComponent(input) {
// Keep track of all the replacements and prefill the map with the `BOM`
const replaceMap = {
'%FE%FF': '\uFFFD\uFFFD',
'%FF%FE': '\uFFFD\uFFFD',
};
// `multiMatcher` is a shared global-flag regex; the loop runs `exec` until it
// returns no match.
let match = multiMatcher.exec(input);
while (match) {
try {
// Decode as big chunks as possible
replaceMap[match[0]] = decodeURIComponent(match[0]);
} catch {
const result = decode$1(match[0]);
// Only record a replacement when the fallback actually changed something.
if (result !== match[0]) {
replaceMap[match[0]] = result;
}
}
match = multiMatcher.exec(input);
}
// Add `%C2` at the end of the map to make sure it does not replace the combinator before everything else
replaceMap['%C2'] = '\uFFFD';
const entries = Object.keys(replaceMap);
for (const key of entries) {
// Replace all decoded components
input = input.replace(new RegExp(key, 'g'), replaceMap[key]);
}
return input;
}
/**
 * Decode a percent-encoded string without throwing on malformed sequences.
 *
 * @param {string} encodedURI - Text to decode.
 * @returns {string} Decoded text.
 * @throws {TypeError} When `encodedURI` is not a string.
 */
function decodeUriComponent(encodedURI) {
  const actualType = typeof encodedURI;
  if (actualType !== 'string') {
    throw new TypeError('Expected `encodedURI` to be of type `string`, got `' + actualType + '`');
  }

  let decoded;
  try {
    // The built-in decoder handles every well-formed input.
    decoded = decodeURIComponent(encodedURI);
  } catch {
    // Malformed input: decode whatever can be decoded.
    decoded = customDecodeURIComponent(encodedURI);
  }
  return decoded;
}
/**
 * Build a new object holding a subset of `object`'s own enumerable properties.
 *
 * @param {object} object - Source object.
 * @param {Array<string|symbol>|Function} predicate - Either the list of keys to
 *   keep, or a function `(key, value, object) => boolean` selecting them.
 * @returns {object} New object; property descriptors are copied as-is.
 */
function includeKeys(object, predicate) {
  const picked = {};
  const copyProperty = (key, descriptor) => {
    Object.defineProperty(picked, key, descriptor);
  };

  if (Array.isArray(predicate)) {
    for (const key of predicate) {
      const descriptor = Object.getOwnPropertyDescriptor(object, key);
      // Listed keys that are missing or non-enumerable are ignored.
      if (descriptor?.enumerable) {
        copyProperty(key, descriptor);
      }
    }
    return picked;
  }

  // `Reflect.ownKeys()` also yields symbol keys, unlike `Object.keys()`.
  for (const key of Reflect.ownKeys(object)) {
    const descriptor = Object.getOwnPropertyDescriptor(object, key);
    if (!descriptor.enumerable) {
      continue;
    }
    if (predicate(key, object[key], object)) {
      copyProperty(key, descriptor);
    }
  }
  return picked;
}
/**
 * Split `string` around the first occurrence of `separator`.
 *
 * @param {string} string - Text to split.
 * @param {string} separator - Delimiter to look for.
 * @returns {string[]} `[before, after]`, or `[]` when either argument is empty
 *   or the separator does not occur.
 * @throws {TypeError} When either argument is not a string.
 */
function splitOnFirst(string, separator) {
  if (typeof string !== 'string' || typeof separator !== 'string') {
    throw new TypeError('Expected the arguments to be of type `string`');
  }
  if (string === '' || separator === '') {
    return [];
  }

  const cutAt = string.indexOf(separator);
  return cutAt === -1
    ? []
    : [string.slice(0, cutAt), string.slice(cutAt + separator.length)];
}
/** True only for `null` and `undefined`. */
const isNullOrUndefined = value => value === undefined || value === null;

/**
 * `encodeURIComponent` that additionally percent-encodes `!`, `'`, `(`, `)` and `*`.
 */
const strictUriEncode = string => {
  // eslint-disable-next-line unicorn/prefer-code-point
  const toPercentHex = character => `%${character.charCodeAt(0).toString(16).toUpperCase()}`;
  return encodeURIComponent(string).replaceAll(/[!'()*]/g, toPercentHex);
};

// Option key (a symbol, so it cannot collide with string option names) that
// toggles fragment encoding in `stringifyUrl`.
const encodeFragmentIdentifier = Symbol('encodeFragmentIdentifier');
/**
 * Create the reducer factory that serializes array values for `options.arrayFormat`.
 *
 * The returned function takes a key and yields a `(result, value) => result`
 * reducer that appends the serialized `value` to the accumulated list of
 * query-string fragments.
 *
 * @param {object} options - Stringify options (`arrayFormat`, `arrayFormatSeparator`,
 *   `skipNull`, `skipEmptyString`, plus whatever `encode` reads).
 * @returns {Function} `key => (result, value) => string[]`.
 */
function encoderForArrayFormat(options) {
  // Every format ignores the same values: `undefined` always, `null` and ''
  // only when the matching skip option is on.
  const isSkipped = value => (
    value === undefined
    || (options.skipNull && value === null)
    || (options.skipEmptyString && value === '')
  );
  const {arrayFormat} = options;

  if (arrayFormat === 'index') {
    return key => (result, value) => {
      const index = result.length;
      if (isSkipped(value)) {
        return result;
      }
      if (value === null) {
        return [...result, `${encode(key, options)}[${index}]`];
      }
      return [
        ...result,
        `${encode(key, options)}[${encode(index, options)}]=${encode(value, options)}`,
      ];
    };
  }

  if (arrayFormat === 'bracket') {
    return key => (result, value) => {
      if (isSkipped(value)) {
        return result;
      }
      if (value === null) {
        return [...result, `${encode(key, options)}[]`];
      }
      return [...result, `${encode(key, options)}[]=${encode(value, options)}`];
    };
  }

  if (arrayFormat === 'colon-list-separator') {
    return key => (result, value) => {
      if (isSkipped(value)) {
        return result;
      }
      if (value === null) {
        return [...result, `${encode(key, options)}:list=`];
      }
      return [...result, `${encode(key, options)}:list=${encode(value, options)}`];
    };
  }

  if (arrayFormat === 'comma' || arrayFormat === 'separator' || arrayFormat === 'bracket-separator') {
    const keyValueSeparator = arrayFormat === 'bracket-separator' ? '[]=' : '=';
    return key => (result, value) => {
      if (isSkipped(value)) {
        return result;
      }
      // Translate null to an empty string so that it doesn't serialize as 'null'
      const printable = value === null ? '' : value;
      if (result.length === 0) {
        return [`${encode(key, options)}${keyValueSeparator}${encode(printable, options)}`];
      }
      // All values of one key collapse into a single fragment.
      return [[result, encode(printable, options)].join(options.arrayFormatSeparator)];
    };
  }

  // Any other format: repeat the key once per value.
  return key => (result, value) => {
    if (isSkipped(value)) {
      return result;
    }
    if (value === null) {
      return [...result, encode(key, options)];
    }
    return [...result, `${encode(key, options)}=${encode(value, options)}`];
  };
}
/**
 * Create the function that stores one parsed `key`/`value` pair into the
 * accumulator according to `options.arrayFormat`.
 *
 * @param {object} options - Parse options (`arrayFormat`, `arrayFormatSeparator`,
 *   plus whatever `decode` reads).
 * @returns {Function} `(key, value, accumulator) => void`; mutates `accumulator`.
 */
function parserForArrayFormat(options) {
// Shared by the returned closure; overwritten on every call.
let result;
switch (options.arrayFormat) {
case 'index': {
// `key[3]=v` style: values are stored in a plain object keyed by the captured index.
return (key, value, accumulator) => {
result = /\[(\d*)]$/.exec(key);
key = key.replace(/\[\d*]$/, '');
if (!result) {
accumulator[key] = value;
return;
}
if (accumulator[key] === undefined) {
accumulator[key] = {};
}
accumulator[key][result[1]] = value;
};
}
case 'bracket': {
// `key[]=v` style: every occurrence appends to an array.
return (key, value, accumulator) => {
result = /(\[])$/.exec(key);
key = key.replace(/\[]$/, '');
if (!result) {
accumulator[key] = value;
return;
}
if (accumulator[key] === undefined) {
accumulator[key] = [value];
return;
}
accumulator[key] = [...accumulator[key], value];
};
}
case 'colon-list-separator': {
// `key:list=v` style: every occurrence appends to an array.
return (key, value, accumulator) => {
result = /(:list)$/.exec(key);
key = key.replace(/:list$/, '');
if (!result) {
accumulator[key] = value;
return;
}
if (accumulator[key] === undefined) {
accumulator[key] = [value];
return;
}
accumulator[key] = [...accumulator[key], value];
};
}
case 'comma':
case 'separator': {
// `key=a,b` style: a value is treated as an array when it contains the
// separator either literally or after decoding.
return (key, value, accumulator) => {
const isArray = typeof value === 'string' && value.includes(options.arrayFormatSeparator);
const isEncodedArray = (typeof value === 'string' && !isArray && decode(value, options).includes(options.arrayFormatSeparator));
value = isEncodedArray ? decode(value, options) : value;
const newValue = isArray || isEncodedArray ? value.split(options.arrayFormatSeparator).map(item => decode(item, options)) : (value === null ? value : decode(value, options));
accumulator[key] = newValue;
};
}
case 'bracket-separator': {
// `key[]=a,b` style: only keys ending in `[]` become arrays; a `null`
// value for such a key yields an empty array.
return (key, value, accumulator) => {
const isArray = /(\[])$/.test(key);
key = key.replace(/\[]$/, '');
if (!isArray) {
accumulator[key] = value ? decode(value, options) : value;
return;
}
const arrayValue = value === null
? []
: decode(value, options).split(options.arrayFormatSeparator);
if (accumulator[key] === undefined) {
accumulator[key] = arrayValue;
return;
}
accumulator[key] = [...accumulator[key], ...arrayValue];
};
}
default: {
// Repeated keys are collected into one flat array; single keys stay scalar.
return (key, value, accumulator) => {
if (accumulator[key] === undefined) {
accumulator[key] = value;
return;
}
accumulator[key] = [...[accumulator[key]].flat(), value];
};
}
}
}
/**
 * Ensure the array separator is exactly one character long.
 *
 * @param {*} value - Candidate separator.
 * @throws {TypeError} When `value` is not a one-character string.
 */
function validateArrayFormatSeparator(value) {
  const isSingleCharacter = typeof value === 'string' && value.length === 1;
  if (!isSingleCharacter) {
    throw new TypeError('arrayFormatSeparator must be single character string');
  }
}
/**
 * Percent-encode `value` when `options.encode` is on; otherwise pass it through.
 *
 * @param {*} value - Key or value to serialize.
 * @param {object} options - Reads `encode` and `strict`.
 * @returns {*} The encoded string, or `value` itself when encoding is disabled.
 */
function encode(value, options) {
  if (!options.encode) {
    return value;
  }
  return options.strict ? strictUriEncode(value) : encodeURIComponent(value);
}
/**
 * Percent-decode `value` when `options.decode` is on; otherwise pass it through.
 *
 * @param {*} value - Raw key or value.
 * @param {object} options - Reads `decode`.
 * @returns {*} The decoded string, or `value` itself when decoding is disabled.
 */
function decode(value, options) {
  return options.decode ? decodeUriComponent(value) : value;
}
/**
 * Normalize the ordering of a parsed array-like value.
 *
 * Arrays are sorted in place with the default comparator. Objects are turned
 * into an array of their values ordered by numeric key. Anything else is
 * returned unchanged.
 *
 * @param {*} input - Value to order.
 * @returns {*} The sorted array, the ordered values, or `input` itself.
 */
function keysSorter(input) {
  if (Array.isArray(input)) {
    return input.sort();
  }
  if (typeof input !== 'object') {
    return input;
  }

  const byNumericValue = (a, b) => Number(a) - Number(b);
  const orderedKeys = keysSorter(Object.keys(input)).sort(byNumericValue);
  return orderedKeys.map(key => input[key]);
}
/**
 * Drop the fragment from a URL-like string.
 *
 * @param {string} input - URL or query string.
 * @returns {string} Everything before the first `#`, or `input` when there is none.
 */
function removeHash(input) {
  const hashStart = input.indexOf('#');
  return hashStart === -1 ? input : input.slice(0, hashStart);
}
/**
 * Extract the fragment of a URL-like string, including the leading `#`.
 *
 * @param {string} url - URL to inspect.
 * @returns {string} The text from the first `#` onwards, or '' when there is none.
 */
function getHash(url) {
  const hashStart = url.indexOf('#');
  return hashStart === -1 ? '' : url.slice(hashStart);
}
/**
 * Convert one parsed query value according to an explicit type or the global
 * `parseBooleans` / `parseNumbers` options.
 *
 * Precedence: explicit `'string'` type, then a custom parser function, then
 * boolean detection, then number conversion.
 *
 * @param {string|null} value - Raw value.
 * @param {object} options - Reads `parseBooleans` and `parseNumbers`.
 * @param {string|Function} [type] - `'string'`, `'number'`, or a parser function.
 * @returns {*} The converted value, or `value` unchanged.
 */
function parseValue(value, options, type) {
  const isString = typeof value === 'string';
  if (isString && type === 'string') {
    return value;
  }
  if (isString && typeof type === 'function') {
    return type(value);
  }

  if (options.parseBooleans && value !== null) {
    const lowered = value.toLowerCase();
    if (lowered === 'true' || lowered === 'false') {
      return lowered === 'true';
    }
  }

  // Blank strings are excluded so they are not converted to 0.
  const wantsNumber = type === 'number' || options.parseNumbers;
  if (wantsNumber && !Number.isNaN(Number(value)) && isString && value.trim() !== '') {
    return Number(value);
  }
  return value;
}
/**
 * Pull the raw query string out of a URL.
 *
 * @param {string} input - URL, possibly with a fragment.
 * @returns {string} The text after the first `?` with the fragment removed,
 *   or '' when the URL has no query.
 */
function extract(input) {
  const withoutHash = removeHash(input);
  const queryStart = withoutHash.indexOf('?');
  return queryStart === -1 ? '' : withoutHash.slice(queryStart + 1);
}
/**
 * Parse a query string into an object with no prototype.
 *
 * @param {string} query - Query string; one leading `?`, `#` or `&` is ignored.
 *   A non-string yields an empty result.
 * @param {object} [options] - Overrides for the defaults listed at the top of
 *   the function (`decode`, `sort`, `arrayFormat`, `arrayFormatSeparator`,
 *   `parseNumbers`, `parseBooleans`, `types`).
 * @returns {object} Parsed key/value pairs; keys are sorted unless
 *   `options.sort` is `false`.
 * @throws {TypeError} When `arrayFormatSeparator` is not a single character.
 */
function parse(query, options) {
options = {
decode: true,
sort: true,
arrayFormat: 'none',
arrayFormatSeparator: ',',
parseNumbers: false,
parseBooleans: false,
types: Object.create(null),
...options,
};
validateArrayFormatSeparator(options.arrayFormatSeparator);
const formatter = parserForArrayFormat(options);
// Create an object with no prototype
const returnValue = Object.create(null);
if (typeof query !== 'string') {
return returnValue;
}
query = query.trim().replace(/^[?#&]/, '');
if (!query) {
return returnValue;
}
for (const parameter of query.split('&')) {
if (parameter === '') {
continue;
}
// `+` is turned into a space only when decoding is enabled.
const parameter_ = options.decode ? parameter.replaceAll('+', ' ') : parameter;
let [key, value] = splitOnFirst(parameter_, '=');
// `splitOnFirst` returns [] when there is no `=`, so the whole parameter is the key.
if (key === undefined) {
key = parameter_;
}
// Missing `=` should be `null`:
// http://w3.org/TR/2012/WD-url-20120524/#collect-url-parameters
// For the separator-based formats the value is passed on undecoded; the
// formatter decodes it itself.
value = value === undefined ? null : (['comma', 'separator', 'bracket-separator'].includes(options.arrayFormat) ? value : decode(value, options));
formatter(decode(key, options), value, returnValue);
}
// Second pass: apply `types` / `parseNumbers` / `parseBooleans` conversions.
for (const [key, value] of Object.entries(returnValue)) {
if (typeof value === 'object' && value !== null && options.types[key] !== 'string') {
for (const [key2, value2] of Object.entries(value)) {
// An array type such as `number[]` applies its element type to each entry.
const type = options.types[key] ? options.types[key].replace('[]', '') : undefined;
value[key2] = parseValue(value2, options, type);
}
} else if (typeof value === 'object' && value !== null && options.types[key] === 'string') {
// A key typed as `string` is re-joined into one string.
returnValue[key] = Object.values(value).join(options.arrayFormatSeparator);
} else {
returnValue[key] = parseValue(value, options, options.types[key]);
}
}
if (options.sort === false) {
return returnValue;
}
// TODO: Remove the use of `reduce`.
// eslint-disable-next-line unicorn/no-array-reduce
return (options.sort === true ? Object.keys(returnValue).sort() : Object.keys(returnValue).sort(options.sort)).reduce((result, key) => {
const value = returnValue[key];
// Non-array objects are converted to ordered arrays by `keysSorter`.
result[key] = Boolean(value) && typeof value === 'object' && !Array.isArray(value) ? keysSorter(value) : value;
return result;
}, Object.create(null));
}
/**
 * Serialize an object into a query string (without a leading `?`).
 *
 * @param {object} object - Key/value pairs; a falsy `object` yields ''.
 * @param {object} [options] - Overrides for the defaults below (`encode`,
 *   `strict`, `arrayFormat`, `arrayFormatSeparator`); also reads `skipNull`,
 *   `skipEmptyString` and `sort`.
 * @returns {string} The serialized query string.
 * @throws {TypeError} When `arrayFormatSeparator` is not a single character.
 */
function stringify(object, options) {
if (!object) {
return '';
}
options = {
encode: true,
strict: true,
arrayFormat: 'none',
arrayFormatSeparator: ',',
...options,
};
validateArrayFormatSeparator(options.arrayFormatSeparator);
// True for keys that `skipNull` / `skipEmptyString` remove entirely.
const shouldFilter = key => (
(options.skipNull && isNullOrUndefined(object[key]))
|| (options.skipEmptyString && object[key] === '')
);
const formatter = encoderForArrayFormat(options);
const objectCopy = {};
for (const [key, value] of Object.entries(object)) {
if (!shouldFilter(key)) {
objectCopy[key] = value;
}
}
const keys = Object.keys(objectCopy);
// `options.sort` is passed as the comparator; when it is undefined the
// default sort order is used.
if (options.sort !== false) {
keys.sort(options.sort);
}
return keys.map(key => {
const value = object[key];
if (value === undefined) {
return '';
}
// `null` serializes as the bare key with no `=`.
if (value === null) {
return encode(key, options);
}
if (Array.isArray(value)) {
if (value.length === 0 && options.arrayFormat === 'bracket-separator') {
return encode(key, options) + '[]';
}
return value
.reduce(formatter(key), [])
.join('&');
}
return encode(key, options) + '=' + encode(value, options);
}).filter(x => x.length > 0).join('&');
}
/**
 * Split a URL into its base, parsed query and (optionally) fragment.
 *
 * @param {string} url - URL to parse.
 * @param {object} [options] - Passed to `parse`; `parseFragmentIdentifier`
 *   additionally enables the `fragmentIdentifier` field.
 * @returns {{url: string, query: object, fragmentIdentifier?: string}}
 */
function parseUrl(url, options) {
options = {
decode: true,
...options,
};
let [url_, hash] = splitOnFirst(url, '#');
// `splitOnFirst` returns [] when there is no `#`.
if (url_ === undefined) {
url_ = url;
}
return {
url: url_?.split('?')?.[0] ?? '',
query: parse(extract(url), options),
...(options && options.parseFragmentIdentifier && hash ? {fragmentIdentifier: decode(hash, options)} : {}),
};
}
/**
 * Build a URL string from a base URL, a query object and an optional fragment.
 *
 * Query parameters already present in `object.url` are merged with
 * `object.query`; on conflict `object.query` wins.
 *
 * @param {{url: string, query?: object, fragmentIdentifier?: string}} object
 * @param {object} [options] - Passed to `stringify`; the
 *   `encodeFragmentIdentifier` symbol key controls fragment encoding.
 * @returns {string} The assembled URL.
 */
function stringifyUrl(object, options) {
options = {
encode: true,
strict: true,
[encodeFragmentIdentifier]: true,
...options,
};
const url = removeHash(object.url).split('?')[0] || '';
const queryFromUrl = extract(object.url);
const query = {
...parse(queryFromUrl, {sort: false}),
...object.query,
};
let queryString = stringify(query, options);
queryString &&= `?${queryString}`;
let hash = getHash(object.url);
if (typeof object.fragmentIdentifier === 'string') {
// NOTE(review): `new URL(url)` is called without a base, so this branch
// presumably needs `url` to be absolute; confirm against callers.
const urlObjectForFragmentEncode = new URL(url);
urlObjectForFragmentEncode.hash = object.fragmentIdentifier;
hash = options[encodeFragmentIdentifier] ? urlObjectForFragmentEncode.hash : `#${object.fragmentIdentifier}`;
}
return `${url}${queryString}${hash}`;
}
/**
 * Return `input` with only the query parameters selected by `filter`.
 *
 * @param {string} input - URL to filter.
 * @param {Array<string>|Function} filter - Keys to keep, or a
 *   `(key, value) => boolean` predicate.
 * @param {object} [options] - Forwarded to `parseUrl` and `stringifyUrl`.
 * @returns {string} The rebuilt URL.
 */
function pick(input, filter, options) {
  const mergedOptions = {
    parseFragmentIdentifier: true,
    [encodeFragmentIdentifier]: false,
    ...options,
  };
  const parsed = parseUrl(input, mergedOptions);
  const filteredUrl = {
    url: parsed.url,
    query: includeKeys(parsed.query, filter),
    fragmentIdentifier: parsed.fragmentIdentifier,
  };
  return stringifyUrl(filteredUrl, mergedOptions);
}
/**
 * Return `input` without the query parameters selected by `filter`.
 *
 * @param {string} input - URL to filter.
 * @param {Array<string>|Function} filter - Keys to drop, or a
 *   `(key, value) => boolean` predicate matching parameters to drop.
 * @param {object} [options] - Forwarded to `pick`.
 * @returns {string} The rebuilt URL.
 */
function exclude(input, filter, options) {
  // Invert the filter and delegate to `pick`.
  let exclusionFilter;
  if (Array.isArray(filter)) {
    exclusionFilter = key => !filter.includes(key);
  } else {
    exclusionFilter = (key, value) => !filter(key, value);
  }
  return pick(input, exclusionFilter, options);
}
// Frozen, prototype-less namespace object bundling the query-string helpers
// defined above; `BaseScraper.fetchApi` uses `queryString.stringify`.
var queryString = /*#__PURE__*/Object.freeze({
__proto__: null,
exclude: exclude,
extract: extract,
parse: parse,
parseUrl: parseUrl,
pick: pick,
stringify: stringify,
stringifyUrl: stringifyUrl
});
/**
 * Minimal HTTP client for the HarvestAPI REST endpoints.
 *
 * Options read by this class: `apiKey` (sent as `X-API-KEY`), `baseUrl`
 * (defaults to `https://api.harvest-api.com`), `logger` (defaults to `console`)
 * and `addHeaders` (extra headers sent with every request).
 */
class BaseScraper {
  constructor(options) {
    this.options = options;
    const baseUrl = options.baseUrl || 'https://api.harvest-api.com';
    // Drop one trailing slash so `${apiBaseUrl}${path}` never contains `//`.
    this.apiBaseUrl = baseUrl.endsWith('/') ? baseUrl.slice(0, -1) : baseUrl;
    this.logger = options.logger || console;
  }

  /**
   * Perform one API request and return the parsed JSON body.
   *
   * This method does not throw for network, parse or HTTP errors; they are
   * reported through the returned `{ error, status }` object instead.
   *
   * @param {object} request
   * @param {string} request.path - Endpoint path; a leading `/` is added when missing.
   * @param {object} [request.params] - Query parameters. An `addHeaders` entry is
   *   treated as extra request headers and is never sent as a query parameter.
   *   The object is not modified.
   * @param {object} [request.addHeaders] - Extra headers for this request.
   * @param {string} [request.method='GET'] - HTTP method.
   * @param {*} [request.body] - JSON-serializable request body.
   * @returns {Promise<object>} The response JSON when the response is OK,
   *   otherwise `{ error, status }`.
   */
  async fetchApi({ path, params, addHeaders, method = 'GET', body }) {
    if (!this.options.apiKey) {
      this.logger.error('API Key is required');
      return {
        error: 'API Key is required to fetch API',
      };
    }
    if (!path) {
      this.logger.error('Path is required');
      return {
        error: 'Path is required to fetch API',
      };
    }
    if (!path.startsWith('/')) {
      path = `/${path}`;
    }

    // `params` is optional, and the caller's object must stay untouched, so
    // split it into a copy instead of reading and deleting in place.
    const { addHeaders: headersFromParams, ...queryParams } = params ?? {};
    if (headersFromParams) {
      addHeaders = {
        ...addHeaders,
        ...headersFromParams,
      };
    }

    // Only append a query string when at least one parameter is truthy.
    if (Object.values(queryParams).filter(Boolean).length > 0) {
      path += `?${queryString.stringify(queryParams, {
        arrayFormat: 'comma',
        skipNull: true,
        skipEmptyString: true,
      })}`;
    }

    const apiUrl = `${this.apiBaseUrl}${path}`;
    let error = null;
    const response = await fetch(apiUrl, {
      method: method || 'GET',
      headers: {
        'Content-Type': 'application/json',
        'X-API-KEY': this.options.apiKey,
        ...this.options.addHeaders,
        ...addHeaders,
      },
      body: body ? JSON.stringify(body) : undefined,
    }).catch((e) => {
      this.logger.error('Error fetching API:', e);
      error = e;
      return null;
    });

    // `response` is null after a network failure; `data` is then undefined.
    const data = await response?.json()?.catch((e) => {
      this.logger.error('Error parsing response:', e);
      error = e;
      return null;
    });

    if (!response?.ok) {
      return {
        // Prefer the API's own error payload over the local exception.
        error: data?.error?.error || data?.error || error,
        status: response?.status,
      };
    }
    return data;
  }
}
/**
 * Wrap `fn` so that at most `concurrency` invocations run at the same time.
 *
 * Calls beyond the limit are queued in FIFO order and started as running calls
 * settle. Each wrapped call returns a promise that settles with `fn`'s result
 * or rejection.
 *
 * @param {number} concurrency - Maximum number of simultaneous calls; must be > 0.
 * @param {Function} fn - Function to limit; may be sync or async.
 * @param {object} [opts] - Currently unused.
 * @returns {Function} `(...args) => Promise` with the same arguments as `fn`.
 * @throws {Error} When `concurrency` is not a number greater than zero.
 */
function createConcurrentQueues(concurrency, fn, opts) {
  const limit = Number(concurrency);
  // `!(limit > 0)` also rejects NaN; a non-positive limit would otherwise
  // leave every call queued forever.
  if (!(limit > 0)) {
    throw new Error(`createConcurrentQueues: Concurrency must be a number > 0, provided: ${concurrency}`);
  }
  let activePromises = 0;
  const pendingTasks = [];
  return async (...args) => {
    return new Promise((resolve, reject) => {
      // The caller reserves the slot (increments `activePromises`) before
      // invoking or scheduling `execute`.
      const execute = async () => {
        try {
          resolve(await fn(...args));
        } catch (error) {
          reject(error);
        } finally {
          activePromises--;
          const nextTask = pendingTasks.shift();
          if (nextTask) {
            // Hand the freed slot to the queued task right away. Otherwise a
            // call arriving before the timer fires would see a free slot and
            // the limit would be exceeded.
            activePromises++;
            setTimeout(nextTask, 1);
          }
        }
      };
      if (activePromises < limit) {
        activePromises++;
        execute();
      } else {
        pendingTasks.push(execute);
      }
    });
  };
}
/**
 * Runs a paginated scrape: fetches listing pages through `options.fetchList`,
 * optionally fetches details for each listed item through `options.fetchItem`,
 * and hands every scraped item to the configured output.
 *
 * `options.outputType` selects the output:
 * - `'json'`: items are kept in memory and written to `<filePath>.json` by `finalize()`.
 * - `'sqlite'` (default): items are inserted into `<filePath>.sqlite`.
 * - `'callback'`: `options.onItemScraped` is awaited for every item.
 */
class ListingScraper {
  constructor(options) {
    this.options = options;
    this.id = crypto.randomUUID();
    this.startTime = new Date();
    this.inMemoryItems = [];
    this.stats = {
      pages: 0,
      pagesSuccess: 0,
      items: 0,
      itemsSuccess: 0,
      requests: 0,
      requestsStartTime: new Date(),
    };
    this.db = null;
    this.sqliteDatabaseOpenPromise = null;
    this.done = false;
    this.scrapePagesDone = false;
    this.error = null;
    this.scrapedItems = {};
    this.paginationToken = null;
    this.undefinedPagination = false;
    this.onItemScraped = async ({ item, ...apiArgs }) => {
      const logger = {
        log: (...args) => this.log(...args),
        error: (...args) => this.errorLog(...args),
      };
      const notify = () => this.options.onItemScraped?.({ item, logger, ...apiArgs });
      // For 'json' and 'sqlite' the user callback is not awaited. Its
      // rejection is logged so it cannot surface as an unhandled rejection.
      const notifyInBackground = () => {
        Promise.resolve(notify()).catch((error) => {
          this.errorLog('Error in onItemScraped callback:', error);
        });
      };
      if (this.options.outputType === 'json') {
        this.inMemoryItems.push(item);
        notifyInBackground();
      }
      if (this.options.outputType === 'sqlite') {
        await this.insertSqliteItem(item).catch((error) => {
          this.errorLog('Error inserting item to SQLite:', error);
        });
        notifyInBackground();
      }
      if (this.options.outputType === 'callback') {
        await notify();
      }
    };
    if (this.options.optionsOverride) {
      this.options = {
        ...this.options,
        ...this.options.optionsOverride,
      };
    }
    if (!this.options.outputType) {
      this.options.outputType = 'sqlite';
    }
    this.tableName = this.options.tableName || `${this.options.entityName}_${this.id}`;
    // Default file name: ISO start time with `:` and `.` replaced by `-`.
    const timestamp = this.startTime.toISOString().replace(/[:.]/g, '-');
    this.filePath = path.resolve(
      this.options.outputDir || path.resolve(process.cwd(), 'output'),
      this.options.filename || `${timestamp}_${this.options.entityName}_${this.id}`,
    );
  }

  /** Log with a timestamp unless `options.disableLog` is set. */
  log(...args) {
    if (!this.options.disableLog) {
      console.log(`[${new Date().toISOString()}]`, ...args);
    }
  }

  /** Log an error with a timestamp unless `options.disableErrorLog` is set. */
  errorLog(...args) {
    if (!this.options.disableErrorLog) {
      console.error(`[${new Date().toISOString()}]`, ...args);
    }
  }

  /**
   * Run the whole scrape.
   *
   * @returns {Promise<object|undefined>} The final `stats`, or `undefined` when
   *   the first page could not be fetched or contained no items.
   */
  async scrapeStart() {
    this.stats = {
      pages: 0,
      pagesSuccess: 0,
      items: 0,
      itemsSuccess: 0,
      requests: 0,
      requestsStartTime: new Date(),
    };
    this.paginationToken = null;
    this.scrapePagesDone = false;
    const startPageNumber = this.options.startPage || 1;
    const firstPage = await this.fetchPage({ page: startPageNumber });
    this.options.onFirstPageFetched?.({ data: firstPage });

    let totalPages = firstPage?.pagination?.totalPages || 0;
    this.paginationToken = firstPage?.pagination?.paginationToken || null;
    if (this.options.maxPageNumber && totalPages > this.options.maxPageNumber) {
      totalPages = this.options.maxPageNumber;
    }
    // No page count reported but items are present: scan up to the page limit
    // and rely on an empty page to stop.
    if (!totalPages && firstPage?.elements?.length) {
      totalPages = this.options.maxPageNumber;
      this.undefinedPagination = true;
    } else {
      this.undefinedPagination = false;
    }

    if (this.options.warnPageLimit &&
      totalPages === this.options.maxPageNumber &&
      firstPage?.pagination?.pageSize &&
      firstPage?.pagination.totalElements) {
      const totalAllowedItems = this.options.maxPageNumber * firstPage.pagination.pageSize;
      const totalItems = firstPage.pagination.totalElements;
      if (totalItems > totalAllowedItems) {
        console.warn('\n' +
          node_util.styleText('bgYellow', ' [WARNING] \n') +
          `The search results are limited to ${totalAllowedItems} items (out of total ${totalItems}) because LinkedIn does not allow to scrape more for one query. \n` +
          `Which means you will not be able to extract all data for this exact query. \n` +
          'Consider splitting your query into multiple queries applying more filters. \n' +
          `For example do multiple runs for locations of specific cities, instead of one run targeting entire country or region. ` +
          '\n');
      }
    }

    const concurrency = this.options?.overrideConcurrency || firstPage?.user?.requestsConcurrency || 1;
    this.log(`Scraping ${this.options.entityName} with ${concurrency} concurrent ${concurrency === 1 ? 'worker' : 'workers'}... Total pages: ${totalPages}. Total items: ${firstPage?.pagination?.totalElements || firstPage?.elements?.length || 0}`);

    if (!firstPage?.elements?.length) {
      this.done = true;
      if (this.error) {
        const errors = Array.isArray(this.error) ? this.error : [this.error];
        this.errorLog(...errors);
      }
      this.errorLog('Error fetching first page or no items found. Exiting.', firstPage?.error ? JSON.stringify(firstPage?.error, null, 2) : '');
      return;
    }

    this.scrapePageQueue = createConcurrentQueues(this.options?.overridePageConcurrency || 2, (args) => this.scrapePage(args));
    this.fetchItemQueue = createConcurrentQueues(concurrency, async ({ item, pagination }) => {
      if (this.options.maxItems && this.stats.itemsSuccess + 1 > this.options.maxItems) {
        this.done = true;
        this.error = `Max items limit reached: ${this.options.maxItems}`;
        return null;
      }
      const result = await this.options
        .fetchItem({
          item,
          addHeaders: this.options.addItemHeaders,
        })?.catch((error) => {
          this.errorLog('Error scraping item', error);
          return null;
        });
      if (!result) {
        return null;
      }
      return {
        ...result,
        pagination,
      };
    });
    // SQLite inserts run one at a time.
    this.onItemScrapedQueue = createConcurrentQueues(this.options.outputType === 'sqlite' ? 1 : concurrency, ({ item, ...rest }) => this.onItemScraped({ item, ...rest }));

    this.stats.requestsStartTime = new Date();
    this.stats.pages = 1;
    this.stats.pagesSuccess = 1;
    if (this.options.outputType === 'sqlite') {
      this.sqliteDatabaseOpenPromise = this.createSqliteDatabase();
    }

    let lastPageNumber = totalPages;
    if (this.options.takePages && this.options.takePages > 0) {
      lastPageNumber = Math.min(startPageNumber + this.options.takePages - 1, totalPages);
    }
    const promises = [];
    for (let page = startPageNumber; page <= lastPageNumber; page++) {
      promises.push(this.scrapePageQueue({
        page,
        // The first page was already fetched above; reuse it.
        scrapedList: page === startPageNumber ? firstPage : undefined,
      }));
    }
    await Promise.all(promises);
    await this.finalize();

    this.log(`Finished scraping ${this.options.entityName}. Scraped pages: ${this.stats.pages}. Scraped items: ${this.stats.itemsSuccess}. Total requests: ${this.stats.requests}.`);
    if (this.error) {
      const errors = Array.isArray(this.error) ? this.error : [this.error];
      this.errorLog(...errors);
    }
    return this.stats;
  }

  /**
   * Scrape one listing page and all of its items.
   *
   * @param {object} args
   * @param {number} args.page - Page number.
   * @param {object} [args.scrapedList] - Already fetched page to use instead of fetching.
   */
  async scrapePage({ page, scrapedList }) {
    if (this.done || this.scrapePagesDone) {
      return;
    }
    const list = scrapedList ? scrapedList : await this.fetchPage({ page });
    if (this.done) {
      return;
    }
    this.paginationToken = list?.pagination?.paginationToken || null;
    let details = [];
    if (list?.elements?.length) {
      details = await this.scrapePageItems({ list });
    } else {
      this.scrapePagesDone = true;
    }
    if (this.done) {
      return;
    }
    // A page that produced no new items stops the remaining pages.
    this.scrapePagesDone = !details?.length;
    this.log(`Scraped ${this.options.entityName} page ${page}. Items found: ${details.length}. Requests/second: ${(this.stats.requests /
      ((Date.now() - this.stats.requestsStartTime.getTime()) / 1000)).toFixed(2)}`);
  }

  /**
   * Fetch one listing page through `options.fetchList`.
   *
   * @param {object} args
   * @param {number} args.page - Page number to fetch.
   * @returns {Promise<object|null>} The page, or `null` when the fetch failed
   *   or the API answered with status 402.
   */
  async fetchPage({ page }) {
    this.log(`Scraping page ${page} of ${this.options.entityName}...`);
    const result = await this.options
      .fetchList({
        page,
        paginationToken: this.paginationToken,
        sessionId: this.options.sessionId,
        addHeaders: this.options.addListingHeaders,
      })
      .catch((error) => {
        this.errorLog('Error fetching page', page, error);
        return null;
      });
    // Report the page that was actually fetched.
    this.options.onPageFetched?.({ page, data: result });
    if (result?.status === 402) {
      this.done = true;
      this.error = result.error || 'Request limit exceeded - upgrade your plan';
      return null;
    }
    this.stats.pages++;
    this.stats.requests++;
    if (result?.entityId) {
      this.stats.pagesSuccess++;
    }
    return result;
  }

  /**
   * Process every element of a listing page, skipping items without an `id`
   * and items already seen in this run.
   *
   * @param {object} args
   * @param {object} args.list - Listing page with an `elements` array.
   * @returns {Promise<object[]>} The elements that were newly scraped.
   */
  async scrapePageItems({ list }) {
    if (!list?.elements) {
      return [];
    }
    const details = [];
    const itemPromises = list.elements.map(async (item) => {
      let itemDetails = null;
      this.stats.items++;
      if (!item?.id || this.scrapedItems[item.id]) {
        return null;
      }
      this.scrapedItems[item.id] = { found: true, scraped: false };
      if (this.options.scrapeDetails) {
        itemDetails = await this.fetchItemQueue({ item, pagination: list.pagination });
        if (itemDetails?.status === 402) {
          this.done = true;
          this.error = itemDetails?.error || 'Request limit exceeded - upgrade your plan';
          return null;
        }
      } else {
        // Without a details request the listing element itself is the item.
        itemDetails = {
          entityId: item?.id,
          element: item,
          status: list.status,
          error: list.error,
          query: list.query,
          pagination: list.pagination,
        };
      }
      if (this.options.scrapeDetails && !itemDetails?.skipped) {
        this.stats.requests++;
      }
      if (itemDetails?.done) {
        this.scrapePagesDone = true;
        this.done = true;
      }
      if (itemDetails?.element && itemDetails.entityId) {
        if (this.options.maxItems && this.stats.itemsSuccess + 1 > this.options.maxItems) {
          this.done = true;
          this.error = `Max items limit reached: ${this.options.maxItems}`;
          return null;
        }
        if (!this.scrapedItems[item.id].scraped) {
          this.scrapedItems[item.id].scraped = true;
          this.stats.itemsSuccess++;
          await this.onItemScrapedQueue({ item: itemDetails.element, ...itemDetails });
          details.push(itemDetails.element);
        }
      }
    });
    await Promise.all(itemPromises).catch((error) => {
      this.errorLog('Error scraping items', error);
    });
    if (this.options.maxItems && this.stats.itemsSuccess + 1 > this.options.maxItems) {
      this.done = true;
      this.error = `Max items limit reached: ${this.options.maxItems}`;
    }
    return details;
  }

  /**
   * Open `<filePath>.sqlite` and create the output table. A failure is stored
   * in `this.error` and stops the scrape instead of throwing.
   */
  async createSqliteDatabase() {
    try {
      // Loaded lazily so the dependency is only needed for SQLite output.
      const open = require('sqlite').open;
      const sqlite3 = require('sqlite3');
      await fs.ensureDir(path.dirname(this.filePath));
      this.db = await open({
        filename: `${this.filePath}.sqlite`,
        driver: sqlite3.Database,
      });
      await this.db.exec(`CREATE TABLE IF NOT EXISTS ${this.#quoteIdentifier(this.tableName)} (db_id INTEGER PRIMARY KEY AUTOINCREMENT)`);
    } catch (error) {
      this.error = ['Error creating SQLite database:', error];
      this.done = true;
    }
  }

  /**
   * Insert one item as a row, adding a TEXT column for every key the table
   * does not have yet. Object values are stored as JSON, everything else via
   * `String()`.
   *
   * @param {object} item - Scraped item.
   */
  async insertSqliteItem(item) {
    await this.sqliteDatabaseOpenPromise;
    const table = this.#quoteIdentifier(this.tableName);
    const existingColumns = await this.db.all(`PRAGMA table_info(${table})`);
    const existingColumnNames = new Set(existingColumns.map((col) => col.name));
    const itemKeys = Object.keys(item);
    for (const key of itemKeys) {
      if (!existingColumnNames.has(key)) {
        await this.db.exec(`ALTER TABLE ${table} ADD COLUMN ${this.#quoteIdentifier(key)} TEXT`);
      }
    }
    const columns = itemKeys.map((key) => this.#quoteIdentifier(key));
    const placeholders = columns.map(() => '?');
    const insertSQL = `INSERT INTO ${table} (${columns.join(', ')}) VALUES (${placeholders.join(', ')})`;
    await this.db.run(insertSQL, Object.values(item).map((value) => typeof value === 'object' ? JSON.stringify(value) : String(value)));
  }

  /** Write the JSON output file (for `'json'` output) and close the database if open. */
  async finalize() {
    if (this.options.outputType === 'json') {
      // Awaited so the file exists, and write errors surface, before
      // `scrapeStart()` resolves.
      await fs.outputJson(`${this.filePath}.json`, {
        stats: this.stats,
        list: this.inMemoryItems,
      }, { spaces: 2 });
    }
    if (this.db) {
      await this.db.close();
    }
  }

  /** Quote an SQL identifier, doubling embedded `"` (item keys come from API data). */
  #quoteIdentifier(name) {
    return `"${String(name).replaceAll('"', '""')}"`;
  }
}
class LinkedinScraper {
/**
 * Store `options` and create the `BaseScraper` (built from the same options)
 * that the request methods of this class call.
 */
constructor(options) {
this.options = options;
this.scraper = new BaseScraper(options);
}
/**
 * Request the `linkedin/profile` endpoint, forwarding `params` to `fetchApi`.
 * Resolves to whatever `this.scraper.fetchApi` resolves to.
 */
async getProfile(params) {
return this.scraper.fetchApi({ path: 'linkedin/profile', params });
}
/**
 * Request the `linkedin/profile-id` endpoint, forwarding `params` to `fetchApi`.
 * Resolves to whatever `this.scraper.fetchApi` resolves to.
 */
async getProfileId(params) {
return this.scraper.fetchApi({ path: 'linkedin/profile-id', params });
}
/**
 * POST to the `linkedin/email-search-by-profile` endpoint.
 * `profile` is sent in the request body as `{ profile }`; every other property
 * of the argument is forwarded to `fetchApi` as `params`.
 */
async searchProfileEmail({ profile, ...params }) {
return this.scraper.fetchApi({
path: 'linkedin/email-search-by-profile',
params,
method: 'POST',
body: {
profile,
},
});
}
/**
 * Request the `linkedin/profile-search` endpoint, forwarding `params` to `fetchApi`.
 * Resolves to whatever `this.scraper.fetchApi` resolves to.
 */
async searchProfiles(params) {
return this.scraper.fetchApi({ path: 'linkedin/profile-search', params });
}
/**
 * Request the `linkedin/company` endpoint, forwarding `params` to `fetchApi`.
 * Resolves to whatever `this.scraper.fetchApi` resolves to.
 */
async getCompany(params) {
return this.scraper.fetchApi({ path: 'linkedin/company', params });
}
/**
 * Request the `linkedin/company-search` endpoint, forwarding `params` to `fetchApi`.
 * Resolves to whatever `this.scraper.fetchApi` resolves to.
 */
async searchCompanies(params) {
return this.scraper.fetchApi({ path: 'linkedin/company-search', params });
}
/**
 * Request the `linkedin/job` endpoint, forwarding `params` to `fetchApi`.
 * Resolves to whatever `this.scraper.fetchApi` resolves to.
 */
async getJob(params) {
const results = await this.scraper.fetchApi({ path: 'linkedin/job', params });
return results;
}
/**
 * Request the `linkedin/job-search` endpoint, forwarding `params` to `fetchApi`.
 * Resolves to whatever `this.scraper.fetchApi` resolves to.
 */
async searchJobs(params) {
const results = await this.scraper.fetchApi({ path: 'linkedin/job-search', params });
return results;
}
/**
 * Request the `linkedin/post-search` endpoint, forwarding `params` to `fetchApi`.
 * Resolves to whatever `this.scraper.fetchApi` resolves to.
 */
async searchPosts(params) {
return this.scraper.fetchApi({ path: 'linkedin/post-search', params });
}
/**
 * Request the `linkedin/post-reactions` endpoint, forwarding `params` to `fetchApi`.
 * Resolves to whatever `this.scraper.fetchApi` resolves to.
 */
async getPostReactions(params) {
return this.scraper.fetchApi({ path: 'linkedin/post-reactions', params });
}
/**
 * Request the `linkedin/post-comments` endpoint, forwarding `params` to `fetchApi`.
 * Resolves to whatever `this.scraper.fetchApi` resolves to.
 */
async getPostComments(params) {
return this.scraper.fetchApi({ path: 'linkedin/post-comments', params });
}
/**
 * Request the `linkedin/profile-comments` endpoint, forwarding `params` to `fetchApi`.
 * Resolves to whatever `this.scraper.fetchApi` resolves to.
 */
async getProfileComments(params) {
return this.scraper.fetchApi({ path: 'linkedin/profile-comments', params });
}
/**
 * Request the `linkedin/profile-reactions` endpoint, forwarding `params` to `fetchApi`.
 * Resolves to whatever `this.scraper.fetchApi` resolves to.
 */
async getProfileReactions(params) {
return this.scraper.fetchApi({ path: 'linkedin/profile-reactions', params });
}
async scrapeJobs({ query, ...options }) {
return new ListingScraper({
fetchList: (listParams) => this.searchJobs({ ...query, ...listParams }),
fetchItem: async ({ item, ...rest }) => (item === null || item === void 0 ? void 0 : item.id) ? this.getJob({ jobId: item.id, ...rest }) : { skipped: true },
scrapeDetails: true,
entityName: 'jobs',
...options,
maxPageNumber: 40,
}).scrapeStart();
}
async scrapeCompanies({ query, ...options }) {
return new ListingScraper({
fetchList: (listParams) => this.searchCompanies({ ...query, ...listParams }),
fetchItem: async ({ item, ...rest }) => (item === null || item === void 0 ? void 0 : item.universalName)
? this.getCompany({ universalName: item.universalName, ...rest })
: { skipped: true },
scrapeDetails: true,
entityName: 'companies',
...options,
maxPageNumber: 100,
}).scrapeStart();
}
async scrapeProfiles({ query, findEmail, ...options }) {
return new ListingScraper({
fetchList: (listParams) => this.searchProfiles({ ...query, ...listParams }),
fetchItem: async ({ item, ...rest }) => (item === null || item === void 0 ? void 0 : item.publicIdentifier)
? this.getProfile({ publicIdentifier: item.publicIdentifier, findEmail, ...rest })
: { skipped: true },
scrapeDetails: true,
entityName: 'profiles',
...options,
maxPageNumber: 100,
}).scrapeStart();
}
async scrapePosts({ query, ...options }) {
return new ListingScraper({
fetchList: (listParams) => this.searchPosts({ ...query, ...listParams }),
fetchItem: async ({ item }) => (item === null || item === void 0 ? void 0 : item.id)
? { entityId: item === null || item === void 0 ? void 0 : item.id, element: item }
: { skipped: true },
scrapeDetails: false,
entityName: 'posts',
...options,
maxPageNumber: 100,
}).scrapeStart();
}
async scrapePostReactions({ query, ...options }) {
return new ListingScraper({
fetchList: (listParams) => this.getPostReactions({ ...query, ...listParams }),
fetchItem: async ({ item }) => (item === null || item === void 0 ? void 0 : item.id)
? { entityId: item === null || item === void 0 ? void 0 : item.id, element: item }
: { skipped: true },
scrapeDetails: false,
entityName: 'post-reactions',
...options,
maxPageNumber: 100,
}).scrapeStart();
}
async scrapePostComments({ query, ...options }) {
return new ListingScraper({
fetchList: (fetchArgs) => this.getPostComments({ ...query, ...fetchArgs }),
fetchItem: async ({ item }) => (item === null || item === void 0 ? void 0 : item.id)
? { entityId: item === null || item === void 0 ? void 0 : item.id, element: item }
: { skipped: true },
scrapeDetails: false,
entityName: 'post-comments',
...options,
maxPageNumber: 100,
}).scrapeStart();
}
async scrapeProfileComments({ query, ...options }) {
return new ListingScraper({
fetchList: (fetchArgs) => this.getProfileComments({ ...query, ...fetchArgs }),
fetchItem: async ({ item }) => (item === null || item === void 0 ? void 0 : item.id)
? { entityId: item === null || item === void 0 ? void 0 : item.id, element: item }
: { skipped: true },
scrapeDetails: false,
entityName: 'profile-comments',
...options,
maxPageNumber: 100,
}).scrapeStart();
}
async scrapeProfileReactions({ query, ...options }) {
return new ListingScraper({
fetchList: (fetchArgs) => {
return this.getProfileReactions({ ...query, ...fetchArgs });
},
fetchItem: async ({ item }) => (item === null || item === void 0 ? void 0 : item.id)
? { entityId: item === null || item === void 0 ? void 0 : item.id, element: item }
: { skipped: true },
scrapeDetails: false,
entityName: 'profile-reactions',
...options,
maxPageNumber: 100,
}).scrapeStart();
}
async searchSalesNavigatorLeads(params) {
return this.scraper.fetchApi({ path: 'linkedin-sales-nav/lead-search', params });
}
async scrapeSalesNavigatorLeads({ query, findEmail, ...options }) {
return new ListingScraper({
fetchList: (listParams) => this.searchSalesNavigatorLeads({ ...query, ...listParams }),
fetchItem: async ({ item, ...rest }) => {
return (item === null || item === void 0 ? void 0 : item.id)
? this.getProfile({ profileId: item.id, findEmail, ...rest })
: { skipped: true };
},
scrapeDetails: true,
entityName: 'profiles',
warnPageLimit: true,
...options,
maxPageNumber: 100,
}).scrapeStart();
}
async getGroup(params) {
return this.scraper.fetchApi({ path: 'linkedin/group', params });
}
async searchGroups(params) {
return this.scraper.fetchApi({ path: 'linkedin/group-search', params });
}
async test() {
return this.scraper.fetchApi({ path: 'linkedin/test' });
}
}
/**
 * Factory wrapper around the `LinkedinScraper` constructor.
 *
 * @param {object} options - Forwarded unchanged to `new LinkedinScraper(options)`.
 * @returns {LinkedinScraper} The newly constructed scraper.
 */
function createLinkedinScraper(options) {
    const scraper = new LinkedinScraper(options);
    return scraper;
}
exports.LinkedinScraper = LinkedinScraper;
exports.createLinkedinScraper = createLinkedinScraper;
//# sourceMappingURL=index.cjs.js.map