UNPKG

@lyuboslavlyubenov/se-scraper

Version:

A module using puppeteer to scrape several search engines such as Google, Bing and Duckduckgo

scrapeulous.com

NikolaiT/se-scraper

440 lines (380 loc) • 17 kB

JavaScript

'use strict'; const fs = require('fs'); const os = require('os'); const _ = require('lodash'); const { createLogger, format, transports } = require('winston'); const { combine, timestamp, printf } = format; const debug = require('debug')('se-scraper:ScrapeManager'); const { Cluster } = require('puppeteer-cluster'); const UserAgent = require('user-agents'); const google = require('./modules/google.js'); const bing = require('./modules/bing.js'); const yandex = require('./modules/yandex.js'); const infospace = require('./modules/infospace.js'); const duckduckgo = require('./modules/duckduckgo.js'); const CustomConcurrencyImpl = require('./concurrency-implementation'); const MAX_ALLOWED_BROWSERS = 6; function write_results(fname, data) { fs.writeFileSync(fname, data, (err) => { if (err) throw err; console.log(`Results written to file ${fname}`); }); } function read_keywords_from_file(fname) { let kws = fs.readFileSync(fname).toString().split(os.EOL); // clean keywords kws = kws.filter((kw) => { return kw.trim().length > 0; }); return kws; } function getScraper(search_engine, args) { if (typeof search_engine === 'string') { return new { google: google.GoogleScraper, google_news_old: google.GoogleNewsOldScraper, google_news: google.GoogleNewsScraper, google_image: google.GoogleImageScraper, bing: bing.BingScraper, yandex: yandex.YandexScraper, bing_news: bing.BingNewsScraper, duckduckgo: duckduckgo.DuckduckgoScraper, infospace: infospace.InfospaceScraper, webcrawler: infospace.WebcrawlerNewsScraper, }[search_engine](args); } else if (typeof search_engine === 'function') { return new search_engine(args); } else { throw new Error(`search_engine must either be a string of class (function)`); } } class ScrapeManager { constructor(config, context={}) { this.cluster = null; this.pluggable = null; this.scraper = null; this.context = context; this.config = _.defaults(config, { // the user agent to scrape with user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3835.0 Safari/537.36', // if random_user_agent is set to True, a random user agent is chosen random_user_agent: false, // whether to select manual settings in visible mode set_manual_settings: false, // log ip address data log_ip_address: false, // log http headers log_http_headers: false, // how long to sleep between requests. a random sleep interval within the range [a,b] // is drawn before every request. empty string for no sleeping. sleep_range: null, /** * which search engine to scrape */ search_engine: 'google', search_engine_name: 'google', logger: createLogger({ level: 'info', format: combine( timestamp(), printf(({ level, message, timestamp }) => { return `${timestamp} [${level}] ${message}`; }) ), transports: [ new transports.Console() ] }), /** * Combination of string that will be used in a search */ keywords: [], /** * whether to start the browser in headless mode */ headless: true, /** * specify flags passed to chrome here About our defaults values https://peter.sh/experiments/chromium-command-line-switches/ */ chrome_flags: [ '--disable-infobars', '--window-position=0,0', '--ignore-certifcate-errors', '--ignore-certifcate-errors-spki-list', '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-accelerated-2d-canvas', '--disable-gpu', '--window-size=1920,1040', '--start-fullscreen', '--hide-scrollbars', '--disable-notifications', ], /** * the number of pages to scrape for each keyword */ num_pages: 1, // path to output file, data will be stored in JSON output_file: '', // whether to also passthru all the html output of the serp pages html_output: false, // whether to strip JS and CSS from the html_output // has only an effect if `html_output` is true clean_html_output: false, // remove all data images from the html clean_data_images: false, // whether to return a screenshot of serp pages as b64 data screen_output: false, // Scrape url from local file. Mainly used for testing. scrape_from_file: '', /** * whether to prevent images, css, fonts and media from being loaded * will speed up scraping a great deal */ block_assets: true, /** * path to js module that extends functionality * this module should export the functions: * get_browser, handle_metadata, close_browser * custom_func: resolve('examples/pluggable.js'), */ custom_func: null, throw_on_detection: false, /** List of proxies to use ['socks5://78.94.172.42:1080', 'http://localhost:1080'] * */ proxies: null, /** * a file with one proxy per line. Example: socks5://78.94.172.42:1080 http://118.174.233.10:48400 */ proxy_file: '', /** whether to use proxies only when this is set to true, se-scraper will not use your default IP address */ use_proxies_only: false, /** * check if headless chrome escapes common detection techniques * this is a quick test and should be used for debugging */ test_evasion: false, apply_evasion_techniques: true, /** * settings for puppeteer-cluster */ puppeteer_cluster_config: { timeout: 30 * 60 * 1000, // max timeout set to 30 minutes monitor: false, concurrency: Cluster.CONCURRENCY_BROWSER, maxConcurrency: 1, }, /** * Groups scraped result in pages in the output. Setting to false will make it into one array */ paginate: true, }); this.logger = this.config.logger; if (config.sleep_range) { // parse an array if (config.sleep_range.length !== 2 && typeof i[0] !== 'number' && typeof i[1] !== 'number') { throw "sleep_range is not a valid array of two integers."; } } if (fs.existsSync(this.config.keyword_file)) { this.config.keywords = read_keywords_from_file(this.config.keyword_file); } if (this.config.proxies && this.config.proxy_file) { throw new Error('Either use a proxy_file or specify a proxy for all connections. Do not use both options.'); } if (this.config.proxy_file) { this.config.proxies = read_keywords_from_file(this.config.proxy_file); this.logger.info(`${this.config.proxies.length} proxies read from file.`); } if (!this.config.proxies && this.config.use_proxies_only) { throw new Error('Must provide at least one proxy in proxies if you enable use_proxies_only'); } debug('this.config=%O', this.config); } /* * Launches the puppeteer cluster or browser. * * Returns true if the browser was successfully launched. Otherwise will return false. */ async start() { if (this.config.custom_func) { if (fs.existsSync(this.config.custom_func)) { try { const PluggableClass = require(this.config.custom_func); this.pluggable = new PluggableClass({ config: this.config, context: this.context }); } catch (exception) { console.error(exception); return false; } } else { console.error(`File "${this.config.custom_func}" does not exist!`); return false; } } const chrome_flags = _.clone(this.config.chrome_flags); if (this.pluggable && this.pluggable.start_browser) { launch_args.config = this.config; this.browser = await this.pluggable.start_browser({ config: this.config, }); this.page = await this.browser.newPage(); } else { // if no custom start_browser functionality was given // use puppeteer-cluster for scraping let proxies; // if we have at least one proxy, always use CONCURRENCY_BROWSER // and set maxConcurrency to this.config.proxies.length + 1 // else use whatever this.configuration was passed if (this.config.proxies && this.config.proxies.length > 0) { // because we use real browsers, we ran out of memory on normal laptops // when using more than maybe 5 or 6 browsers. // therefore hardcode a limit here // TODO not sure this what we want this.numClusters = Math.min( this.config.proxies.length + (this.config.use_proxies_only ? 0 : 1), MAX_ALLOWED_BROWSERS ); proxies = _.clone(this.config.proxies); // Insert a first config without proxy if use_proxy_only is false if (this.config.use_proxies_only === false) { proxies.unshift(null); } } else { this.numClusters = this.config.puppeteer_cluster_config.maxConcurrency; proxies = _.times(this.numClusters, null); } this.logger.info(`Using ${this.numClusters} clusters.`); // Give the per browser options const perBrowserOptions = _.map(proxies, (proxy) => { const userAgent = (this.config.random_user_agent) ? (new UserAgent({deviceCategory: 'desktop'})).toString() : this.config.user_agent; let args = chrome_flags.concat([`--user-agent=${userAgent}`]); if (proxy) { args = args.concat([`--proxy-server=${proxy}`]); } return { headless: this.config.headless, ignoreHTTPSErrors: true, args }; }); debug('perBrowserOptions=%O', perBrowserOptions) this.cluster = await Cluster.launch({ monitor: this.config.puppeteer_cluster_config.monitor, timeout: this.config.puppeteer_cluster_config.timeout, // max timeout set to 30 minutes concurrency: CustomConcurrencyImpl, maxConcurrency: this.numClusters, puppeteerOptions: { perBrowserOptions: perBrowserOptions } }); } } /* * Scrapes the keywords specified by the config. */ async scrape(scrape_config = {}) { if (!scrape_config.keywords && !scrape_config.keyword_file) { throw new Error('Either keywords or keyword_file must be supplied to scrape()'); } Object.assign(this.config, scrape_config); var results = {}; var num_requests = 0; var metadata = {}; var startTime = Date.now(); this.config.search_engine_name = typeof this.config.search_engine === 'function' ? this.config.search_engine.name : this.config.search_engine; this.logger.info(`scrapes ${this.config.search_engine_name} with ${this.config.keywords.length} keywords on ${this.config.num_pages} pages each.`); if (this.pluggable && this.pluggable.start_browser) { this.scraper = getScraper(this.config.search_engine, { config: this.config, context: this.context, pluggable: this.pluggable, page: this.page, }); var {results, metadata, num_requests} = await this.scraper.run(this.page); } else { // Each browser will get N/(K+1) keywords and will issue N/(K+1) * M total requests to the search engine. // https://github.com/GoogleChrome/puppeteer/issues/678 // The question is: Is it possible to set proxies per Page? Per Browser? // as far as I can see, puppeteer cluster uses the same puppeteerOptions // for every browser instance. We will use our custom puppeteer-cluster version. // https://www.npmjs.com/package/proxy-chain // this answer looks nice: https://github.com/GoogleChrome/puppeteer/issues/678#issuecomment-389096077 let chunks = []; for (var n = 0; n < this.numClusters; n++) { chunks.push([]); } for (var k = 0; k < this.config.keywords.length; k++) { chunks[k % this.numClusters].push(this.config.keywords[k]); } debug('chunks=%o', chunks); let execPromises = []; for (var c = 0; c < chunks.length; c++) { const config = _.clone(this.config); config.keywords = chunks[c]; var obj = getScraper(this.config.search_engine, { config: config, context: {}, pluggable: this.pluggable, }); var boundMethod = obj.run.bind(obj); execPromises.push(this.cluster.execute({}, boundMethod)); } let promiseReturns = await Promise.all(execPromises); // Merge results and metadata per keyword for (let promiseReturn of promiseReturns) { Object.assign(results, promiseReturn.results); Object.assign(metadata, promiseReturn.metadata); num_requests += promiseReturn.num_requests; } } let timeDelta = Date.now() - startTime; let ms_per_request = timeDelta/num_requests; this.logger.info(`Scraper took ${timeDelta}ms to perform ${num_requests} requests.`); this.logger.info(`On average ms/request: ${ms_per_request}ms/request`); if (this.pluggable && this.pluggable.handle_results) { await this.pluggable.handle_results(results); } metadata.elapsed_time = timeDelta.toString(); metadata.ms_per_keyword = ms_per_request.toString(); metadata.num_requests = num_requests; debug('metadata=%O', metadata); if (this.pluggable && this.pluggable.handle_metadata) { await this.pluggable.handle_metadata(metadata); } if (this.config.output_file) { this.logger.info(`Writing results to ${this.config.output_file}`); write_results(this.config.output_file, JSON.stringify(results, null, 4)); } return { results: results, metadata: metadata || {}, }; } /* * Quit the puppeteer cluster/browser. */ async quit() { if (this.pluggable && this.pluggable.close_browser) { await this.pluggable.close_browser(); } else { await this.cluster.idle(); await this.cluster.close(); } } } module.exports = { ScrapeManager: ScrapeManager, };