/*
 * @lyuboslavlyubenov/se-scraper
 * Version:
 * A module using puppeteer to scrape several search engines such as Google, Bing and DuckDuckGo.
 * 440 lines (380 loc) • 17 kB
 * JavaScript
 */
'use strict';
const fs = require('fs');
const os = require('os');
const _ = require('lodash');
const { createLogger, format, transports } = require('winston');
const { combine, timestamp, printf } = format;
const debug = require('debug')('se-scraper:ScrapeManager');
const { Cluster } = require('puppeteer-cluster');
const UserAgent = require('user-agents');
const google = require('./modules/google.js');
const bing = require('./modules/bing.js');
const yandex = require('./modules/yandex.js');
const infospace = require('./modules/infospace.js');
const duckduckgo = require('./modules/duckduckgo.js');
const CustomConcurrencyImpl = require('./concurrency-implementation');
// Upper bound on the number of browsers ScrapeManager.start() launches when
// proxies are configured; the proxy count (plus one for the default IP when
// allowed) is capped at this value.
const MAX_ALLOWED_BROWSERS = 6;
/**
 * Synchronously writes the serialized results to disk and logs a confirmation.
 *
 * `fs.writeFileSync` does not accept a callback (a function passed as its
 * third argument is ignored), so the confirmation is logged right after the
 * call returns. Write failures are thrown by `fs.writeFileSync` itself.
 *
 * @param {string} fname - Path of the file to write.
 * @param {string|Buffer} data - Content to write, e.g. a JSON string.
 * @throws {Error} If the file cannot be written.
 */
function write_results(fname, data) {
    fs.writeFileSync(fname, data);
    console.log(`Results written to file ${fname}`);
}
/**
 * Reads a text file with one entry per line (keywords or proxies).
 *
 * Lines are split on any common line ending (\r\n, \n or \r) instead of the
 * current platform's `os.EOL`, so a file written on another operating system
 * does not leave stray carriage returns in the entries. Each entry is trimmed
 * and blank lines are dropped.
 *
 * @param {string} fname - Path of the file to read.
 * @returns {string[]} The non-empty, trimmed lines in file order.
 * @throws {Error} If the file cannot be read.
 */
function read_keywords_from_file(fname) {
    return fs.readFileSync(fname, 'utf8')
        .split(/\r\n?|\n/)
        .map((line) => line.trim())
        .filter((line) => line.length > 0);
}
/**
 * Instantiates the scraper for the requested search engine.
 *
 * @param {string|Function} search_engine - Either the name of a built-in
 *     scraper (e.g. 'google', 'bing_news') or a scraper class to instantiate.
 * @param {Object} args - Passed unchanged as the single constructor argument
 *     of the scraper class.
 * @returns {Object} A new scraper instance.
 * @throws {Error} If `search_engine` is neither a string nor a function, or
 *     if it is a string that does not name a built-in scraper.
 */
function getScraper(search_engine, args) {
    if (typeof search_engine === 'function') {
        return new search_engine(args);
    }
    if (typeof search_engine !== 'string') {
        throw new Error(`search_engine must either be a string or a class (function)`);
    }

    // A switch (rather than an object lookup) means unknown names and
    // inherited keys such as 'constructor' end in the explicit error below
    // instead of `new undefined(...)` or `new Object(...)`.
    let ScraperClass;
    switch (search_engine) {
        case 'google':
            ScraperClass = google.GoogleScraper;
            break;
        case 'google_news_old':
            ScraperClass = google.GoogleNewsOldScraper;
            break;
        case 'google_news':
            ScraperClass = google.GoogleNewsScraper;
            break;
        case 'google_image':
            ScraperClass = google.GoogleImageScraper;
            break;
        case 'bing':
            ScraperClass = bing.BingScraper;
            break;
        case 'yandex':
            ScraperClass = yandex.YandexScraper;
            break;
        case 'bing_news':
            ScraperClass = bing.BingNewsScraper;
            break;
        case 'duckduckgo':
            ScraperClass = duckduckgo.DuckduckgoScraper;
            break;
        case 'infospace':
            ScraperClass = infospace.InfospaceScraper;
            break;
        case 'webcrawler':
            ScraperClass = infospace.WebcrawlerNewsScraper;
            break;
        default:
            throw new Error(`Unknown search engine "${search_engine}"`);
    }
    return new ScraperClass(args);
}
/**
 * Drives one scraping session: merges the user configuration with defaults,
 * launches either a puppeteer-cluster or a browser supplied by a pluggable
 * module, distributes keywords over the browsers and collects the results.
 *
 * Typical usage: `new ScrapeManager(config)`, then `await start()`,
 * one or more `await scrape({...})` calls, and finally `await quit()`.
 */
class ScrapeManager {

    /**
     * @param {Object} config - User configuration. Missing keys are filled in
     *     with the defaults below. Note that lodash `_.defaults` assigns the
     *     defaults onto this very object, so the caller's object is mutated.
     * @param {Object} [context={}] - Arbitrary object handed to the pluggable
     *     module and (in pluggable mode) to the scraper.
     * @throws {Error} If `sleep_range` is malformed, if both `proxies` and
     *     `proxy_file` are given, or if `use_proxies_only` is set without any
     *     proxy.
     */
    constructor(config, context={}) {
        this.cluster = null;
        this.pluggable = null;
        this.scraper = null;
        this.browser = null;
        this.page = null;
        this.numClusters = 0;
        this.context = context;

        this.config = _.defaults(config, {
            // the user agent to scrape with
            user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3835.0 Safari/537.36',
            // if random_user_agent is set to true, a random user agent is chosen
            random_user_agent: false,
            // whether to select manual settings in visible mode
            set_manual_settings: false,
            // log ip address data
            log_ip_address: false,
            // log http headers
            log_http_headers: false,
            // how long to sleep between requests. a random sleep interval within the range [a,b]
            // is drawn before every request. null for no sleeping.
            sleep_range: null,
            /**
             * which search engine to scrape
             */
            search_engine: 'google',
            search_engine_name: 'google',
            logger: createLogger({
                level: 'info',
                format: combine(
                    timestamp(),
                    printf(({ level, message, timestamp }) => {
                        return `${timestamp} [${level}] ${message}`;
                    })
                ),
                transports: [
                    new transports.Console()
                ]
            }),
            /**
             * Combination of strings that will be used in a search
             */
            keywords: [],
            /**
             * path to a file with one keyword per line; when the file exists
             * its content replaces `keywords`
             */
            keyword_file: '',
            /**
             * whether to start the browser in headless mode
             */
            headless: true,
            /**
             * specify flags passed to chrome here
             * About our default values: https://peter.sh/experiments/chromium-command-line-switches/
             */
            chrome_flags: [
                '--disable-infobars',
                '--window-position=0,0',
                '--ignore-certificate-errors',
                '--ignore-certificate-errors-spki-list',
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-accelerated-2d-canvas',
                '--disable-gpu',
                '--window-size=1920,1040',
                '--start-fullscreen',
                '--hide-scrollbars',
                '--disable-notifications',
            ],
            /**
             * the number of pages to scrape for each keyword
             */
            num_pages: 1,
            // path to output file, data will be stored in JSON
            output_file: '',
            // whether to also passthru all the html output of the serp pages
            html_output: false,
            // whether to strip JS and CSS from the html_output
            // has only an effect if `html_output` is true
            clean_html_output: false,
            // remove all data images from the html
            clean_data_images: false,
            // whether to return a screenshot of serp pages as b64 data
            screen_output: false,
            // Scrape url from local file. Mainly used for testing.
            scrape_from_file: '',
            /**
             * whether to prevent images, css, fonts and media from being loaded
             * will speed up scraping a great deal
             */
            block_assets: true,
            /**
             * path to js module that extends functionality
             * this module should export the functions:
             * get_browser, handle_metadata, close_browser
             * custom_func: resolve('examples/pluggable.js'),
             */
            custom_func: null,
            throw_on_detection: false,
            /**
             * List of proxies to use ['socks5://78.94.172.42:1080', 'http://localhost:1080']
             */
            proxies: null,
            /**
             * a file with one proxy per line. Example:
             * socks5://78.94.172.42:1080
             * http://118.174.233.10:48400
             */
            proxy_file: '',
            /**
             * whether to use proxies only
             * when this is set to true, se-scraper will not use
             * your default IP address
             */
            use_proxies_only: false,
            /**
             * check if headless chrome escapes common detection techniques
             * this is a quick test and should be used for debugging
             */
            test_evasion: false,
            apply_evasion_techniques: true,
            /**
             * settings for puppeteer-cluster
             * (start() always launches the cluster with CustomConcurrencyImpl,
             * so `concurrency` below is not what is passed to Cluster.launch)
             */
            puppeteer_cluster_config: {
                timeout: 30 * 60 * 1000, // max timeout set to 30 minutes
                monitor: false,
                concurrency: Cluster.CONCURRENCY_BROWSER,
                maxConcurrency: 1,
            },
            /**
             * Groups scraped result in pages in the output. Setting to false will make it into one array
             */
            paginate: true,
        });

        this.logger = this.config.logger;

        // Read from this.config (not the raw argument) so that a missing
        // config object does not blow up here.
        const sleep_range = this.config.sleep_range;
        if (sleep_range) {
            const is_number_pair = Array.isArray(sleep_range)
                && sleep_range.length === 2
                && sleep_range.every((bound) => typeof bound === 'number');
            if (!is_number_pair) {
                throw new Error('sleep_range is not a valid array of two integers.');
            }
        }

        if (fs.existsSync(this.config.keyword_file)) {
            this.config.keywords = read_keywords_from_file(this.config.keyword_file);
        }

        if (this.config.proxies && this.config.proxy_file) {
            throw new Error('Either use a proxy_file or specify a proxy for all connections. Do not use both options.');
        }

        if (this.config.proxy_file) {
            this.config.proxies = read_keywords_from_file(this.config.proxy_file);
            this.logger.info(`${this.config.proxies.length} proxies read from file.`);
        }

        // An empty proxy list counts as "no proxy": otherwise start() would
        // fall back to the default IP although use_proxies_only forbids it.
        if (this.config.use_proxies_only && _.isEmpty(this.config.proxies)) {
            throw new Error('Must provide at least one proxy in proxies if you enable use_proxies_only');
        }

        debug('this.config=%O', this.config);
    }

    /**
     * Launches the puppeteer cluster or the browser of the pluggable module.
     *
     * @returns {Promise<boolean>} true if the browser/cluster was launched,
     *     false if the `custom_func` module is missing or failed to load.
     */
    async start() {
        if (this.config.custom_func) {
            if (!fs.existsSync(this.config.custom_func)) {
                console.error(`File "${this.config.custom_func}" does not exist!`);
                return false;
            }
            try {
                const PluggableClass = require(this.config.custom_func);
                this.pluggable = new PluggableClass({
                    config: this.config,
                    context: this.context
                });
            } catch (exception) {
                console.error(exception);
                return false;
            }
        }

        if (this.pluggable && this.pluggable.start_browser) {
            // The pluggable module brings its own browser; a single page of
            // it is used for all scraping.
            this.browser = await this.pluggable.start_browser({
                config: this.config,
            });
            this.page = await this.browser.newPage();
            return true;
        }

        // No custom start_browser functionality was given:
        // use puppeteer-cluster for scraping.
        const chrome_flags = _.clone(this.config.chrome_flags);
        const use_default_ip = !this.config.use_proxies_only;
        let proxies;

        // if we have at least one proxy, use one browser per proxy (plus one
        // for the default IP unless use_proxies_only is set),
        // else use whatever maxConcurrency was configured
        if (this.config.proxies && this.config.proxies.length > 0) {
            proxies = _.clone(this.config.proxies);
            // Insert a first entry without proxy if the default IP may be used
            if (use_default_ip) {
                proxies.unshift(null);
            }
            // because we use real browsers, we ran out of memory on normal laptops
            // when using more than maybe 5 or 6 browsers.
            // therefore hardcode a limit here
            // TODO not sure this what we want
            this.numClusters = Math.min(proxies.length, MAX_ALLOWED_BROWSERS);
        } else {
            // Fall back to one browser if a user-supplied
            // puppeteer_cluster_config omits maxConcurrency.
            this.numClusters = this.config.puppeteer_cluster_config.maxConcurrency || 1;
            // _.times(n, null) would yield [0, 1, 2, ...] and the truthy
            // indices would end up as bogus --proxy-server values.
            proxies = _.times(this.numClusters, _.constant(null));
        }

        this.logger.info(`Using ${this.numClusters} clusters.`);

        // Build the launch options of every browser
        const perBrowserOptions = _.map(proxies, (proxy) => {
            const userAgent = (this.config.random_user_agent)
                ? (new UserAgent({deviceCategory: 'desktop'})).toString()
                : this.config.user_agent;
            let args = chrome_flags.concat([`--user-agent=${userAgent}`]);

            if (proxy) {
                args = args.concat([`--proxy-server=${proxy}`]);
            }

            return {
                headless: this.config.headless,
                ignoreHTTPSErrors: true,
                args
            };
        });

        debug('perBrowserOptions=%O', perBrowserOptions);

        this.cluster = await Cluster.launch({
            monitor: this.config.puppeteer_cluster_config.monitor,
            timeout: this.config.puppeteer_cluster_config.timeout, // max timeout set to 30 minutes
            concurrency: CustomConcurrencyImpl,
            maxConcurrency: this.numClusters,
            puppeteerOptions: {
                perBrowserOptions: perBrowserOptions
            }
        });

        return true;
    }

    /**
     * Scrapes the keywords specified by the config.
     *
     * @param {Object} [scrape_config={}] - Settings merged into the manager's
     *     config for this and all later calls. Must contain `keywords` or
     *     `keyword_file`; when only `keyword_file` is given the keywords are
     *     read from that file.
     * @returns {Promise<{results: Object, metadata: Object}>} The merged
     *     results and metadata of all scrapers.
     * @throws {Error} If neither keywords nor a keyword file were supplied,
     *     if the keyword file does not exist, or if start() has not
     *     successfully run before.
     */
    async scrape(scrape_config = {}) {
        if (!scrape_config.keywords && !scrape_config.keyword_file) {
            throw new Error('Either keywords or keyword_file must be supplied to scrape()');
        }

        Object.assign(this.config, scrape_config);

        // The constructor only reads the keyword file it was given itself, so
        // a file passed here has to be loaded explicitly.
        if (!scrape_config.keywords) {
            if (!fs.existsSync(scrape_config.keyword_file)) {
                throw new Error(`keyword_file "${scrape_config.keyword_file}" does not exist`);
            }
            this.config.keywords = read_keywords_from_file(scrape_config.keyword_file);
        }

        const uses_pluggable_browser = Boolean(this.pluggable && this.pluggable.start_browser);
        const is_started = uses_pluggable_browser ? Boolean(this.page) : Boolean(this.cluster);
        if (!is_started) {
            throw new Error('start() must complete successfully before scrape() is called');
        }

        let results = {};
        let num_requests = 0;
        let metadata = {};
        const startTime = Date.now();

        this.config.search_engine_name = typeof this.config.search_engine === 'function'
            ? this.config.search_engine.name
            : this.config.search_engine;

        this.logger.info(`scrapes ${this.config.search_engine_name} with ${this.config.keywords.length} keywords on ${this.config.num_pages} pages each.`);

        if (uses_pluggable_browser) {
            this.scraper = getScraper(this.config.search_engine, {
                config: this.config,
                context: this.context,
                pluggable: this.pluggable,
                page: this.page,
            });

            ({ results, metadata, num_requests } = await this.scraper.run(this.page));
            metadata = metadata || {};
        } else {
            // Each browser will get N/(K+1) keywords and will issue N/(K+1) * M total requests to the search engine.
            // https://github.com/GoogleChrome/puppeteer/issues/678
            // The question is: Is it possible to set proxies per Page? Per Browser?
            // as far as I can see, puppeteer cluster uses the same puppeteerOptions
            // for every browser instance. We will use our custom puppeteer-cluster version.
            // https://www.npmjs.com/package/proxy-chain
            // this answer looks nice: https://github.com/GoogleChrome/puppeteer/issues/678#issuecomment-389096077

            // Deal the keywords round-robin over the browsers.
            const chunks = _.times(this.numClusters, () => []);
            this.config.keywords.forEach((keyword, index) => {
                chunks[index % this.numClusters].push(keyword);
            });

            debug('chunks=%o', chunks);

            const execPromises = chunks.map((chunk) => {
                // Shallow copy so every scraper sees only its own keywords.
                const config = _.clone(this.config);
                config.keywords = chunk;

                const scraper = getScraper(this.config.search_engine, {
                    config: config,
                    context: {},
                    pluggable: this.pluggable,
                });

                return this.cluster.execute({}, scraper.run.bind(scraper));
            });

            const promiseReturns = await Promise.all(execPromises);

            // Merge results and metadata per keyword
            for (const promiseReturn of promiseReturns) {
                Object.assign(results, promiseReturn.results);
                Object.assign(metadata, promiseReturn.metadata);
                num_requests += promiseReturn.num_requests;
            }
        }

        const timeDelta = Date.now() - startTime;
        // Avoid NaN/Infinity in the report when no request was made.
        const ms_per_request = num_requests > 0 ? timeDelta / num_requests : 0;

        this.logger.info(`Scraper took ${timeDelta}ms to perform ${num_requests} requests.`);
        this.logger.info(`On average ms/request: ${ms_per_request}ms/request`);

        if (this.pluggable && this.pluggable.handle_results) {
            await this.pluggable.handle_results(results);
        }

        metadata.elapsed_time = timeDelta.toString();
        metadata.ms_per_keyword = ms_per_request.toString();
        metadata.num_requests = num_requests;

        debug('metadata=%O', metadata);

        if (this.pluggable && this.pluggable.handle_metadata) {
            await this.pluggable.handle_metadata(metadata);
        }

        if (this.config.output_file) {
            this.logger.info(`Writing results to ${this.config.output_file}`);
            write_results(this.config.output_file, JSON.stringify(results, null, 4));
        }

        return {
            results: results,
            metadata: metadata,
        };
    }

    /**
     * Quits the puppeteer cluster/browser.
     *
     * Safe to call when nothing was started and safe to call repeatedly.
     * The pluggable module's `close_browser` is called when it exists; a
     * cluster launched by start() is always closed as well, and a pluggable
     * browser is closed directly when the module offers no `close_browser`.
     *
     * @returns {Promise<void>}
     */
    async quit() {
        const pluggable_closes = Boolean(this.pluggable && this.pluggable.close_browser);

        if (pluggable_closes) {
            await this.pluggable.close_browser();
        }

        if (this.cluster) {
            await this.cluster.idle();
            await this.cluster.close();
            this.cluster = null;
        }

        if (!pluggable_closes && this.browser) {
            await this.browser.close();
        }
        this.browser = null;
        this.page = null;
    }
}
// Public API of this module: only the ScrapeManager class is exported.
module.exports = { ScrapeManager };