UNPKG

@lyuboslavlyubenov/se-scraper

Version:

A module using Puppeteer to scrape several search engines such as Google, Bing, and DuckDuckGo

255 lines (198 loc) 8.04 kB
const cheerio = require('cheerio'); const Scraper = require('./se_scraper'); class BingScraper extends Scraper { async parse_async(html) { let results = await this.page.evaluate(() => { let _text = (el, s) => { let n = el.querySelector(s); if (n) { return n.innerText; } else { return ''; } }; let _attr = (el, s, attr) => { let n = el.querySelector(s); if (n) { return n.getAttribute(attr); } else { return null; } }; let results = { num_results: '', no_results: false, effective_query: '', results: [], ads: [], right_side_ads: [], }; let num_results_el = document.querySelector('#b_content .sb_count'); if (num_results_el) { results.num_results = num_results_el.innerText; } let organic_results = document.querySelectorAll('#b_content #b_results .b_algo'); organic_results.forEach((el) => { let serp_obj = { link: _attr(el, 'h2 a', 'href'), title: _text(el, 'h2'), snippet: _text(el, '.b_caption p'), visible_link: _text(el, 'cite'), }; results.results.push(serp_obj); }); // check if no results results.no_results = (results.results.length === 0); // parse bing ads let ads = document.querySelectorAll('#b_results .b_ad .sb_add'); ads.forEach((el) => { let ad_obj = { title: _text(el, 'h2 a'), snippet: _text(el, '.b_caption p'), visible_link: _text(el, '.b_adurl cite'), tracking_link: _attr(el, 'h2 a', 'href'), }; results.ads.push(ad_obj); }); // right side ads let right_side_ads = document.querySelectorAll('#b_context .b_ad .sb_add'); right_side_ads.forEach((el) => { let ad_obj = { title: _text(el, 'h2 a'), snippet: _text(el, '.b_caption p'), visible_link: _text(el, '.b_adurl cite'), tracking_link: _attr(el, 'h2 a', 'href'), }; results.right_side_ads.push(ad_obj); }); let effective_query_el = document.querySelector('#sp_requery a'); if (effective_query_el) { results.effective_query = effective_query_el.innerText; } return results; }); results.results = this.clean_results(results.results, ['title', 'link']); results.ads = this.clean_results(results.ads, ['title', 'visible_link', 
'tracking_link']); results.time = (new Date()).toUTCString(); return results; } async load_start_page() { let startUrl = this.build_start_url('https://www.bing.com/search?') || 'https://www.bing.com/'; if (this.config.bing_settings) { startUrl = `https://www.${this.config.bing_settings.bing_domain}/search?`; if (this.config.bing_settings.bing_domain) { startUrl = `https://www.${this.config.bing_settings.bing_domain}/search?`; } else { startUrl = `https://www.bing.com/search?`; } for (var key in this.config.bing_settings) { if (key !== 'bing_domain') { startUrl += `${key}=${this.config.bing_settings[key]}&` } } } await this.page.goto(startUrl); await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT }); return true; } async search_keyword(keyword) { const input = await this.page.$('input[name="q"]'); await this.set_input_value(`input[name="q"]`, keyword); await this.sleep(50); await input.focus(); await this.page.keyboard.press("Enter"); } async next_page() { let next_page_link = await this.page.$('.sb_pagN', {timeout: 1000}); if (!next_page_link) { return false; } this.last_response = await Promise.all([ next_page_link.click(), // The promise resolves after navigation has finished this.page.waitForNavigation(), // Clicking the link will indirectly cause a navigation ]); return true; } async wait_for_results() { await this.page.waitForSelector('#b_content', { timeout: this.STANDARD_TIMEOUT }); } async detected() { // TODO: I was actually never detected by bing. those are good boys. 
} } class BingNewsScraper extends Scraper { parse(html) { // load the page source into cheerio const $ = cheerio.load(html); // perform queries const results = []; $('#news .news-card').each((i, link) => { const relativeImageUrl = $(link).find('.image img').attr('src'); const resultObj = { link: $(link).attr('url'), title: $(link).find('a.title').text(), snippet: $(link).find('.snippet').text(), date: $(link).find('.source span').last().text() }; if (relativeImageUrl) { resultObj['image'] = 'https://bing.com' + relativeImageUrl; } results.push(resultObj); }); const cleaned = this.clean_results(results, ['title', 'link']); return { time: (new Date()).toUTCString(), results: cleaned, } } async load_start_page() { let startUrl = 'https://www.bing.com/news/search?'; try { await this.page.goto(startUrl); if (this.config.set_manual_settings === true) { console.log('Sleeping 30 seconds. Set your settings now.'); await this.sleep(30000); } await this.page.waitForSelector('input[name="q"]', { timeout: this.STANDARD_TIMEOUT }); } catch (e) { return false; } return true; } async search_keyword(keyword) { const input = await this.page.$('input[name="q"]'); await this.set_input_value(`input[name="q"]`, keyword); await this.sleep(50); await input.focus(); await this.page.keyboard.press("Enter"); } async next_page() { await this.page.evaluate('window.scrollBy(0, 99999)'); return true; } async wait_for_results() { await this.page.waitForSelector('#news .news-card', { timeout: this.STANDARD_TIMEOUT }); await this.page.evaluate(() => { window.scrollBy({ behavior: 'smooth', top: 9999 }); }); await this.page.waitForFunction((pageNum) => { const pageNews = document.querySelectorAll('#news .news-card'); const finishedWaiting = (pageNews.length - (pageNum * 5)) >= 5; if (!finishedWaiting) { window.scrollBy({ behavior: 'smooth', top: 9999 }); } else { return true; } return ; }, { }, this.page_num); } } module.exports = { BingNewsScraper: BingNewsScraper, BingScraper: BingScraper, };