UNPKG

serp-parser

Version:
585 lines (584 loc) 22.6 kB
"use strict"; var _GoogleMobileSERP_DEF_OPTIONS; Object.defineProperty(exports, "__esModule", { value: true }); exports.GoogleMobileSERP = void 0; const tslib_1 = require("tslib"); const cheerio = tslib_1.__importStar(require("cheerio")); const models_1 = require("./models"); const utils = tslib_1.__importStar(require("./utils")); class GoogleMobileSERP { constructor(html, options) { this.serp = { keyword: '', organic: [], pagination: [], relatedKeywords: [], }; _GoogleMobileSERP_DEF_OPTIONS.set(this, { organic: true, related: true, ads: true, hotels: false, videos: false, thumbnails: false, shop: false, stories: false, locals: false, }); this.$ = cheerio.load(html, { normalizeWhitespace: true, xmlMode: false, }); this.parse(options); } parse(opt) { const $ = this.$; const serp = this.serp; const options = opt ? opt : tslib_1.__classPrivateFieldGet(this, _GoogleMobileSERP_DEF_OPTIONS, "f"); const CONFIG = { keyword: 'input[aria-label="Search"]', noResults: '#topstuff .card-section p:contains(" - did not match any documents.")', }; if ($(CONFIG.noResults).length === 1) { this.serp.error = 'No results page'; // No need to parse anything for no results page return; } if ($('body').hasClass('srp')) { serp.keyword = $(CONFIG.keyword).val(); if (options.organic) { this.getFeatured(); this.getOrganic(); } if (options.ads) { this.getAdwords(); } if (options.related) { this.getRelatedKeywords(); } // if (options.hotels) { // this.getHotels(); // } // if (options.videos) { // this.getVideos(); // } // if (options.thumbnails) { // this.getThumbnails(); // } // if (options.shop) { // this.getShopResults(); // } // if (options.stories) { // this.getTopStories(); // } // if (options.locals) { // this.getLocals(); // } // this.getAvailableOn(); } } getOrganic() { const $ = this.$; const CONFIG = { results: '.mnr-c.xpd a.C8nzq', }; $(CONFIG.results).each((index, element) => { const position = this.serp.organic.length + 1; const url = $(element).prop('href'); const domain = utils.getDomain(url); const title = this.elementText(element, 'div[role="heading"] div'); const snippet = this.getSnippet(element); const linkType = utils.getLinkType(url); const result = { domain, linkType, position, snippet, title, url, }; this.parseSitelinks(element, result); this.serp.organic.push(result); }); } getFeatured() { const $ = this.$; const CONFIG = { results: '#rso .xpdopen .ifM9O .mnr-c a', }; $(CONFIG.results).each((index, element) => { const position = this.serp.organic.length + 1; const url = $(element).prop('href'); const domain = utils.getDomain(url); const title = $(element).text(); const snippet = this.$(element).closest('.mnr-c').prev().text(); const linkType = utils.getLinkType(url); const featured = true; const result = { domain, linkType, position, snippet, title, url, featured, }; this.serp.organic.push(result); }); } getSnippet(element) { const text = this.$(element).closest('.mnr-c').find('div.MUxGbd.yDYNvb').text().replace(/\s+/g, ' ').trim(); return text; } parseSitelinks(element, result) { const $ = this.$; const CONFIG = { closestInline: '.mnr-c.xpd', inline: '[jsname="m7irxf"] a', }; const sitelinks = []; const links = $(element).closest(CONFIG.closestInline).find(CONFIG.inline); const type = models_1.SitelinkType.inline; links.each((i, el) => { const sitelink = { href: $(el).attr('href'), title: $(el).text(), type, }; sitelinks.push(sitelink); }); if (sitelinks.length > 0) { result.sitelinks = sitelinks; } } getRelatedKeywords() { const relatedKeywords = []; const query = 'a.F3dFTe'; this.$(query).each((i, elem) => { relatedKeywords.push({ keyword: this.$(elem).text(), path: this.$(elem).prop('href'), }); }); this.serp.relatedKeywords = relatedKeywords; } // private getVideos() { // const $ = this.$; // const serp = this.serp; // const CONFIG = { // channel: '.GlPvmc.YnLDzf', // date: '.rjmdhd', // sitelink: 'a', // source: '.hDeAhf', // title: '.fJiQld.oz3cqf.p5AXld', // videoDuration: '.J2i9Hd', // videosCards: '.VibNM', // }; // const videosCards = $(CONFIG.videosCards); // if (videosCards.length === 0) { // return; // } // const videos: VideoCard[] = []; // videosCards.each((index, element) => { // const videoCard = { // channel: this.elementText(element, CONFIG.channel).substr(3), // date: new Date(this.elementText(element, CONFIG.date)), // sitelink: this.elementHref(element, CONFIG.sitelink), // source: this.elementText(element, CONFIG.source), // title: this.elementText(element, CONFIG.title), // videoDuration: this.elementText(element, CONFIG.videoDuration), // }; // videos.push(videoCard); // }); // serp.videos = videos; // } // private getThumbnails() { // const $ = this.$; // const serp = this.serp; // const CONFIG = { // heading: '.sV2QOc.Ss2Faf.zbA8Me.mfMhoc[role="heading"]', // headingMore: '.sV2QOc.Ss2Faf.zbA8Me.mfMhoc[role="heading"] .VLkRKc', // relatedGroup: '#bres .xpdopen', // relatedThumbnail: '.zVvuGd > div', // sitelink: 'a', // title: '.fl', // }; // const relatedGroup = $(CONFIG.relatedGroup); // if (relatedGroup.length === 0) { // return; // } // const thumbnailGroups: ThumbnailGroup[] = []; // relatedGroup.each((index, element) => { // let heading = ''; // if ($(element).find(CONFIG.headingMore).length === 1) { // heading = $(element).find(CONFIG.headingMore).text(); // } else { // heading = $(element).find(CONFIG.heading).text(); // } // // const heading = this.elementText(element, CONFIG.heading); // const thumbnailGroup: ThumbnailGroup = { // heading, // thumbnails: [], // }; // const relatedThumbnail = $(element).find(CONFIG.relatedThumbnail); // relatedThumbnail.each((ind, el) => { // thumbnailGroup.thumbnails.push({ // sitelink: this.elementHref(el, CONFIG.sitelink), // title: this.elementText(el, CONFIG.title), // }); // }); // thumbnailGroups.push(thumbnailGroup); // }); // serp.thumbnailGroups = thumbnailGroups; // } // private getHotels() { // const $ = this.$; // const hotelsFeature = $('.zd2Jbb'); // if (!hotelsFeature.length) { // return; // } // const CONFIG = { // moreHotelsRegex: /(\d+,?)+/, // moreHotelsText: '.wUrVib', // }; // // FILTERS // const searchFilters: HotelsSearchFilters = this.getHotelSearchFilters(hotelsFeature); // // HOTELS (HOTEL CARDS) // const hotels: Hotel[] = this.getHotelOffers(hotelsFeature); // // MORE HOTELS // // const moreHotelsText = hotelsFeature.find(CONFIG.moreHotelsText).text(); // const moreHotelsText = hotelsFeature.find(CONFIG.moreHotelsText).text(); // const moreHotels = parseInt(utils.getFirstMatch(moreHotelsText, CONFIG.moreHotelsRegex).replace(',', ''), 10); // this.serp.hotels = { // hotels, // moreHotels, // searchFilters, // }; // } // private getHotelSearchFilters(hotelsFeature: cheerio.Cheerio<cheerio.Element>): HotelsSearchFilters { // const $ = this.$; // const CONFIG = { // activeFilter: '.CWGqFd', // checkInString: '.vpggTd.ed5F6c span', // checkOutString: '.vpggTd:not(.ed5F6c) span', // filterGroupsTitles: '.d2IDkc', // guests: '.viupMc', // hotelFiltersSection: '.x3UtIe', // searchTitle: '.gsmmde', // }; // const hotelFiltersSection = hotelsFeature.find(CONFIG.hotelFiltersSection); // const searchTitle = hotelFiltersSection.find(CONFIG.searchTitle).text(); // const checkInString = `${hotelFiltersSection.find(CONFIG.checkInString).text()} ${new Date().getFullYear()}`; // const checkIn = new Date(checkInString); // const checkOutString = `${hotelFiltersSection.find(CONFIG.checkOutString).text()} ${new Date().getFullYear()}`; // const checkOut = new Date(checkOutString); // const guests = parseInt(hotelFiltersSection.find(CONFIG.guests).text(), 10); // const filters: HotelFilters[] = []; // const filterGroupsTitles = hotelFiltersSection.find(CONFIG.filterGroupsTitles); // filterGroupsTitles.each((ind, el) => { // const hotelFilters: HotelFilters = { // explanation: $(el).next().text(), // title: $(el).text(), // }; // if ($(el).closest(CONFIG.activeFilter).length) { // hotelFilters.isActive = true; // } // filters.push(hotelFilters); // }); // return { // checkIn, // checkOut, // filters, // guests, // searchTitle, // }; // } // private getHotelOffers(hotelsFeature: cheerio.Cheerio<cheerio.Element>): Hotel[] { // const $ = this.$; // const CONFIG = { // amenities: '.I9B2He', // currency: '.dv1Q3e', // currencyRegex: /\D+/, // dealDetails: '.kOTJue.jj25pf', // dealType: '.NNPnSe', // featuredReview: '.DabgJ .gisIHb', // hotelCards: '.ntKMYc .hmHBZd', // name: '.BTPx6e', // originalPrice: '.AfCRQd', // originalPriceRegex: /\d+/, // price: '.dv1Q3e', // priceRegex: /\d+/, // rating: 'g-review-stars span', // ratingRegex: /\d\.\d/, // votes: 'g-review-stars+span', // }; // const hotels: Hotel[] = []; // const hotelCards = hotelsFeature.find(CONFIG.hotelCards); // hotelCards.each((ind, el) => { // const name = this.elementText(el, CONFIG.name); // const price = parseInt(utils.getFirstMatch(this.elementText(el, CONFIG.price), CONFIG.priceRegex), 10); // const originalPrice = parseInt( // utils.getFirstMatch(this.elementText(el, CONFIG.originalPrice), CONFIG.originalPriceRegex), // 10, // ); // const currency = utils.getFirstMatch(this.elementText(el, CONFIG.currency), CONFIG.currencyRegex); // const ratingString = $(el).find(CONFIG.rating).attr('aria-label') as string; // const rating = parseFloat(utils.getFirstMatch(ratingString, CONFIG.ratingRegex)); // const votes = parseInt(this.elementText(el, CONFIG.votes).slice(1, -1).replace(',', ''), 10); // Getting rid of parentheses with slice() // // Make this better, maybe something instead of slice ? // const dealType = this.elementText(el, CONFIG.dealType); // const dealDetails = this.elementText(el, CONFIG.dealDetails); // const amenities = this.elementText(el, CONFIG.amenities); // const featuredReview = this.elementText(el, CONFIG.featuredReview).trim().slice(1, -1); // Getting rid of quotes with slice() // // Make this better, maybe something instead of slice ? // const hotelDeal: HotelDeal = { // dealType, // }; // if (dealDetails) { // hotelDeal.dealDetails = dealDetails; // } // if (originalPrice) { // hotelDeal.originalPrice = originalPrice; // } // const hotel: Hotel = { // currency, // name, // price, // rating, // votes, // }; // if (dealType) { // hotel.deal = hotelDeal; // } // if (amenities) { // hotel.amenities = amenities; // } // if (featuredReview) { // hotel.featuredReview = featuredReview; // } // hotels.push(hotel); // }); // return hotels; // } getAdwords() { const $ = this.$; const serp = this.serp; const CONFIG = { bottom: '#tadsb', top: '#tads', }; const adwords = {}; // TODO: refactor this if ($(CONFIG.top).length) { adwords.adwordsTop = []; this.getAds(CONFIG.top, adwords.adwordsTop); } if ($(CONFIG.bottom).length) { adwords.adwordsBottom = []; this.getAds(CONFIG.bottom, adwords.adwordsBottom); } serp.adwords = adwords.adwordsTop || adwords.adwordsBottom ? adwords : undefined; } getAds(search, adsList) { const $ = this.$; const CONFIG = { ads: '.uEierd', snippet: '.MUxGbd.yDYNvb.lEBKkf', title: '[role="heading"]', url: 'a.C8nzq.d5oMvf.BmP5tf', }; $(search) .find(CONFIG.ads) .each((i, e) => { const title = this.elementText(e, CONFIG.title); const url = this.elementHref(e, CONFIG.url); const domain = utils.getDomain(url); const linkType = utils.getLinkType(url); const snippet = $(e).find(CONFIG.snippet).text(); const sitelinks = this.getAdSitelinks(e); const position = i + 1; const ad = { domain, linkType, position, sitelinks, snippet, title, url, }; adsList.push(ad); }); } getAdSitelinks(ad) { const $ = this.$; const CONFIG = { inline: '.MUxGbd.v0nnCb.lyLwlc a,.Uq7H1 a', }; const sitelinks = []; const inlineSiteLinks = $(ad).find(CONFIG.inline); inlineSiteLinks.each((i, e) => { const sitelink = { href: $(e).attr('href'), title: $(e).text(), type: models_1.SitelinkType.inline, }; sitelinks.push(sitelink); }); return sitelinks; } // Moved to knowledge graph // private getAvailableOn() { // const $ = this.$; // const serp = this.serp; // const CONFIG = { // price: '.V8xno span', // query: 'a.JkUS4b', // service: '.i3LlFf', // }; // const list = $(CONFIG.query); // const availableOn: AvailableOn[] = []; // if (list.length) { // list.each((i, e) => { // const url = $(e).attr('href') as string; // const service = this.elementText(e, CONFIG.service); // const price = this.elementText(e, CONFIG.price); // availableOn.push({ url, service, price }); // }); // serp.availableOn = availableOn; // } // } // private getLocals() { // const $ = this.$; // const serp = this.serp; // const CONFIG = { // name: '.dbg0pd', // rating: '.BTtC6e', // reviews: '.rllt__details.lqhpac div:nth-child(1) span:nth-child(3)', // reviewsRegex: /[0-9]+/, // expensiveness: '.rllt__details.lqhpac div:nth-child(1)', // expensivenessRegex: /·([^]+)·/, // type: '.rllt__details.lqhpac div:nth-child(1)', // typeRegex: /\w+\s\w+/, // distance: '.rllt__details.lqhpac div:nth-child(2) > span:nth-child(1)', // address: '.rllt__details.lqhpac div:nth-child(2)', // description: 'div.rllt__wrapped > span', // localsFeature: '.AEprdc', // local: '.C8TUKc', // }; // const localsFeature = $(CONFIG.localsFeature); // if (!localsFeature.length) { // return; // } // const locals: Local[] = []; // const local = localsFeature.find(CONFIG.local); // local.each((ind, el) => { // const name = this.elementText(el, CONFIG.name); // const rating = this.elementText(el, CONFIG.rating); // const reviews = utils.getFirstMatch($(el).find(CONFIG.reviews).text(), CONFIG.reviewsRegex); // const expensiveness = utils // .getFirstMatch($(el).find(CONFIG.expensiveness).text(), CONFIG.expensivenessRegex) // .slice(1, -1) // .trim().length; // const type = utils.getFirstMatch($(el).find(CONFIG.type).text(), CONFIG.typeRegex); // const distance = ''; // const address = this.elementText(el, CONFIG.address); // const description = ''; // locals.push({ name, rating, reviews, expensiveness, type, distance, address, description }); // }); // serp.locals = locals; // } // private getTopStories() { // const $ = this.$; // const serp = this.serp; // const CONFIG = { // published: '.K4LhXb', // publisher: '.wqg8ad', // title: 'div[role="heading"]', // topStoriesFeature: 'g-section-with-header [data-hveid=CA0QAQ]', // topStory: 'a[data-jsarwt="1"]', // }; // const topStoriesFeature = $(CONFIG.topStoriesFeature); // if (!topStoriesFeature.length) { // return; // } // const topStories: TopStory[] = []; // const topStory = topStoriesFeature.find(CONFIG.topStory); // topStory.each((ind, el) => { // const url = $(el).attr('href') as string; // const title = this.elementText(el, CONFIG.title); // const publisher = this.elementText(el, CONFIG.publisher); // const published = this.elementText(el, CONFIG.published); // topStories.push({ url, title, publisher, published }); // }); // serp.topStories = topStories; // } // private getShopResults() { // const $ = this.$; // const serp = this.serp; // const CONFIG = { // commodity: '.cYBBsb', // currency: '.e10twf', // currencyRegex: /\D+/, // imgLink: 'a.pla-unit-img-container-link', // price: '.e10twf', // priceRegex: /[\d,.]+/, // ratingRegex: /\d\.\d/, // ratingString: 'a > span > g-review-stars > span', // shopFeature: '.top-pla-group-inner', // shopOffer: '.pla-unit:not(.view-all-unit)', // shoppingSite: '.LbUacb', // specialOffer: '.gyXcee', // title: 'a > .pymv4e', // votes: '.nbd1Bd .QhqGkb.RnJeZd', // }; // const shopFeature = $(CONFIG.shopFeature); // if (shopFeature.length) { // const shopResults: ShopResult[] = []; // const shopOffer = shopFeature.find(CONFIG.shopOffer); // shopOffer.each((ind, el) => { // const imgLink = this.elementHref(el, CONFIG.imgLink); // const title = this.elementText(el, CONFIG.title); // const price = parseFloat( // utils.getFirstMatch(this.elementText(el, CONFIG.price), CONFIG.priceRegex).replace(/,/g, ''), // ); // const currency = utils.getFirstMatch(this.elementText(el, CONFIG.currency), CONFIG.currencyRegex); // const shoppingSite = this.elementText(el, CONFIG.shoppingSite); // const shopResult: ShopResult = { // currency, // imgLink, // price, // shoppingSite, // title, // }; // const specialOffer = $(el).find(CONFIG.specialOffer).first().text(); // if (specialOffer) { // shopResult.specialOffer = specialOffer; // } // const ratingString = $(el).find(CONFIG.ratingString).attr('aria-label'); // if (ratingString) { // const rating = parseFloat(utils.getFirstMatch(ratingString, CONFIG.ratingRegex)); // shopResult.rating = rating; // } // const votes = this.elementText(el, CONFIG.votes).trim().slice(1, -1); // if (votes) { // shopResult.votes = votes; // } // const commodity = this.elementText(el, CONFIG.commodity); // if (commodity) { // shopResult.commodity = commodity; // } // shopResults.push(shopResult); // }); // serp.shopResults = shopResults; // } // } // Helper methods elementText(el, query) { return this.$(el).find(query).text(); } elementHref(el, query) { return this.$(el).find(query).attr('href'); } } exports.GoogleMobileSERP = GoogleMobileSERP; _GoogleMobileSERP_DEF_OPTIONS = new WeakMap();