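/*
 * Non-code header text that preceded the source (kept verbatim, commented out
 * so that the file parses as JavaScript):
 *
 *   serp-parser
 *   Version:
 *   SERP Parser for Google
 *   653 lines (652 loc) • 25.4 kB
 *   JavaScript
 */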
"use strict";
// Forward declaration of the WeakMap that backs the class's private default
// options; it is assigned after the class definition at the end of the file.
var _GoogleSERP_DEF_OPTIONS;
// Flag the module as a transpiled ES module for CommonJS interop.
Object.defineProperty(exports, "__esModule", { value: true });
exports.GoogleSERP = void 0;
// tslib supplies the interop helpers used below (__importStar, __classPrivateFieldGet).
const tslib_1 = require("tslib");
// cheerio provides the jQuery-like document API used for every selector query.
const cheerio = tslib_1.__importStar(require("cheerio"));
// models_1 is read for the SitelinkType enum (card / inline).
const models_1 = require("./models");
// utils holds the text/URL helpers (getDomain, getLinkType, getFirstMatch, ...).
const utils = tslib_1.__importStar(require("./utils"));
/**
 * Parser for a Google search results page (SERP).
 *
 * The constructor loads the markup with cheerio and immediately runs
 * `parse`; the extracted data is then available on the `serp` property.
 */
class GoogleSERP {
    /**
     * @param {string} html - Markup of the results page, handed to `cheerio.load`.
     * @param {Object} [options] - Feature flags read by `parse` (`organic`,
     *   `related`, `pagination`, `ads`, `hotels`, `videos`, `shop`, `stories`,
     *   `locals`). When omitted, the defaults (everything enabled) are used.
     *   When provided, the object is used as-is: it is NOT merged with the
     *   defaults, so any flag left out is treated as disabled.
     */
    constructor(html, options) {
        this.serp = {
            currentPage: 1,
            keyword: '',
            organic: [],
            pagination: [],
            relatedKeywords: [],
        };
        // Per-instance default flags (compiled form of a private class field).
        _GoogleSERP_DEF_OPTIONS.set(this, {
            organic: true,
            related: true,
            pagination: true,
            ads: true,
            hotels: true,
            videos: true,
            thumbnails: true,
            shop: true,
            stories: true,
            locals: true,
        });
        this.$ = cheerio.load(html, {
            normalizeWhitespace: true,
            xmlMode: false,
        });
        this.parse(options);
    }

    /**
     * Fills `this.serp` from the loaded document.
     *
     * Sets `serp.error` and stops when the "no results" notice is present.
     * Does nothing further unless `<body>` carries the `srp` class.
     *
     * @param {Object} [opt] - Feature flags; falls back to the instance defaults when falsy.
     */
    parse(opt) {
        const $ = this.$;
        const serp = this.serp;
        const options = opt ? opt : tslib_1.__classPrivateFieldGet(this, _GoogleSERP_DEF_OPTIONS, "f");
        const CONFIG = {
            currentPage: 'table.AaVjTc td.YyVfkd',
            keyword: 'input[aria-label="Search"]',
            noResults: "#topstuff .card-section p:contains(' - did not match any documents.')",
            resultText: '#result-stats',
        };
        if ($(CONFIG.noResults).length === 1) {
            // No need to parse anything for a "no results" page.
            serp.error = 'No results page';
            return;
        }
        if (!$('body').hasClass('srp')) {
            return;
        }
        const resultText = $(CONFIG.resultText).text();
        serp.keyword = $(CONFIG.keyword).val();
        serp.totalResults = utils.getTotalResults(resultText);
        serp.timeTaken = utils.getTimeTaken(resultText);
        // When the pagination cell is absent its text is '' and parseInt yields
        // NaN; keep the documented default of page 1 in that case.
        const currentPage = parseInt($(CONFIG.currentPage).text(), 10);
        serp.currentPage = Number.isNaN(currentPage) ? 1 : currentPage;

        // Order matters: featured results must be pushed before organic ones
        // because both derive `position` from the current list length.
        // Note: the `thumbnails` flag is accepted but has no effect (the
        // thumbnail parser is disabled).
        const steps = [
            ['organic', () => {
                this.getFeatured();
                this.getOrganic();
            }],
            ['related', () => this.getRelatedKeywords()],
            ['pagination', () => this.getPagination()],
            ['ads', () => this.getAdwords()],
            ['hotels', () => this.getHotels()],
            ['videos', () => this.getVideos()],
            ['shop', () => this.getShopResults()],
            ['stories', () => this.getTopStories()],
            ['locals', () => this.getLocals()],
        ];
        for (const [flag, run] of steps) {
            if (options[flag]) {
                run();
            }
        }
    }

    /**
     * Appends one entry to `serp.organic` per matched result link, including
     * sitelinks and cached/similar URLs when present.
     */
    getOrganic() {
        const $ = this.$;
        const CONFIG = {
            results: '#search #rso > .g div .yuRUbf > a, #search #rso > .g.tF2Cxc .yuRUbf > a, #search #rso > .hlcw0c div .yuRUbf > a, #search #rso .kp-wholepage .g div .yuRUbf > a, #search #rso > div .g.jNVrwc.Y4pkMc div .yuRUbf > a',
        };
        $(CONFIG.results).each((index, element) => {
            const url = $(element).prop('href');
            const result = {
                domain: utils.getDomain(url),
                linkType: utils.getLinkType(url),
                // Positions continue after anything already pushed (e.g. featured results).
                position: this.serp.organic.length + 1,
                snippet: this.getSnippet(element),
                title: this.elementText(element, 'h3'),
                url,
            };
            this.parseSitelinks(element, result);
            this.parseCachedAndSimilarUrls(element, result);
            this.serp.organic.push(result);
        });
    }

    /**
     * Appends featured results to `serp.organic`, each flagged with
     * `featured: true`. The snippet is the text of the element preceding the
     * closest `.g` ancestor of the link.
     */
    getFeatured() {
        const $ = this.$;
        const CONFIG = {
            results: '#search #rso>.ULSxyf>.g.mnr-c .c2xzTb div .yuRUbf > a',
        };
        $(CONFIG.results).each((index, element) => {
            const url = $(element).prop('href');
            this.serp.organic.push({
                domain: utils.getDomain(url),
                linkType: utils.getLinkType(url),
                position: this.serp.organic.length + 1,
                snippet: $(element).closest('.g').prev().text(),
                title: this.elementText(element, 'h3'),
                url,
                featured: true,
            });
        });
    }

    /**
     * @param {*} element - Result link element.
     * @returns {string} Text of the sibling that follows the link's parent.
     */
    getSnippet(element) {
        return this.$(element).parent().next().text();
    }

    /**
     * Attaches `result.sitelinks` when the result has card-style or inline
     * sitelinks; card-style takes precedence. Leaves `result` untouched when
     * neither kind is found.
     *
     * @param {*} element - Result link element.
     * @param {Object} result - Organic result object, mutated in place.
     */
    parseSitelinks(element, result) {
        const $ = this.$;
        const CONFIG = {
            cards: '.usJj9c',
            closestCards: 'div.g',
            closestInline: '.tF2Cxc',
            href: 'a',
            inline: '.HiHjCd a',
            snippet: '.st',
            title: 'h3 a',
        };
        let type = models_1.SitelinkType.card;
        let links = $(element).closest(CONFIG.closestCards).find(CONFIG.cards);
        if (links.length === 0) {
            type = models_1.SitelinkType.inline;
            links = $(element).closest(CONFIG.closestInline).find(CONFIG.inline);
        }
        if (links.length === 0) {
            return;
        }
        const isCard = type === models_1.SitelinkType.card;
        const sitelinks = [];
        links.each((i, el) => {
            sitelinks.push({
                href: isCard ? this.elementHref(el, CONFIG.href) : $(el).attr('href'),
                // Inline sitelinks carry no snippet; the key is kept with an undefined value.
                snippet: isCard ? this.elementText(el, CONFIG.snippet) : undefined,
                title: isCard
                    ? this.elementText(el, CONFIG.title)
                    : $(el).text().replace(/\s+/g, ' ').trim(),
                type,
            });
        });
        if (sitelinks.length > 0) {
            result.sitelinks = sitelinks;
        }
    }

    /**
     * Replaces `serp.relatedKeywords` with `{ keyword, path }` entries read
     * from every `.k8XOCe` element.
     */
    getRelatedKeywords() {
        const query = '.k8XOCe';
        const relatedKeywords = [];
        this.$(query).each((i, elem) => {
            const node = this.$(elem);
            relatedKeywords.push({
                keyword: node.text(),
                path: node.prop('href'),
            });
        });
        this.serp.relatedKeywords = relatedKeywords;
    }

    /**
     * Sets `result.cachedUrl` / `result.similarUrl` from the action-menu
     * links whose text is exactly "Cached" / "Similar".
     *
     * @param {*} element - Result link element.
     * @param {Object} result - Organic result object, mutated in place.
     */
    parseCachedAndSimilarUrls(element, result) {
        const $ = this.$;
        const CONFIG = {
            closest: '.yuRUbf',
            find: 'span ol > li.action-menu-item > a',
        };
        $(element)
            .closest(CONFIG.closest)
            .find(CONFIG.find)
            .each((i, el) => {
                const label = $(el).text();
                if (label === 'Cached') {
                    result.cachedUrl = $(el).prop('href');
                }
                else if (label === 'Similar') {
                    result.similarUrl = $(el).prop('href');
                }
            });
    }

    /**
     * Appends to `serp.pagination`: first the current page (with an empty
     * path), then one entry per linked page found in the pagination table.
     */
    getPagination() {
        const $ = this.$;
        const serp = this.serp;
        const CONFIG = {
            pages: 'td:not(.b) a.fl',
            pagination: 'table.AaVjTc',
        };
        serp.pagination.push({
            page: serp.currentPage || 1,
            path: '',
        });
        $(CONFIG.pagination)
            .find(CONFIG.pages)
            .each((index, element) => {
                const link = $(element);
                serp.pagination.push({
                    page: parseInt(link.text(), 10),
                    path: link.prop('href'),
                });
            });
    }

    /**
     * Sets `serp.videos` to the list of video cards; leaves it unset when no
     * card matches.
     */
    getVideos() {
        const $ = this.$;
        const CONFIG = {
            channel: '.pcJO7e span',
            date: '.hMJ0yc span',
            sitelink: 'a',
            source: '.pcJO7e cite',
            title: '.fc9yUc.oz3cqf.p5AXld',
            videoDuration: '.J1mWY',
            videosCards: '.RzdJxc',
        };
        const videosCards = $(CONFIG.videosCards);
        if (videosCards.length === 0) {
            return;
        }
        const videos = [];
        videosCards.each((index, element) => {
            videos.push({
                // Drops the first three characters of the channel text
                // (presumably a leading separator - confirm against live markup).
                channel: this.elementText(element, CONFIG.channel).slice(3),
                // NOTE: relies on Date parsing of a free-form string, which
                // yields an Invalid Date when the text is not parseable.
                date: new Date(this.elementText(element, CONFIG.date)),
                sitelink: this.elementHref(element, CONFIG.sitelink),
                source: this.elementText(element, CONFIG.source),
                title: this.elementText(element, CONFIG.title),
                videoDuration: this.elementText(element, CONFIG.videoDuration),
            });
        });
        this.serp.videos = videos;
    }

    /**
     * Sets `serp.hotels` to `{ hotels, moreHotels, searchFilters }` when the
     * hotels feature container (`.zd2Jbb`) is present.
     */
    getHotels() {
        const $ = this.$;
        const hotelsFeature = $('.zd2Jbb');
        if (!hotelsFeature.length) {
            return;
        }
        const CONFIG = {
            moreHotelsRegex: /(\d+,?)+/,
            moreHotelsText: '.wUrVib',
        };
        const searchFilters = this.getHotelSearchFilters(hotelsFeature);
        const hotels = this.getHotelOffers(hotelsFeature);
        const moreHotelsText = hotelsFeature.find(CONFIG.moreHotelsText).text();
        // Strip EVERY thousands separator: the regex can match several comma
        // groups ("1,234,567"), and a string pattern would drop only the first.
        const moreHotelsDigits = utils
            .getFirstMatch(moreHotelsText, CONFIG.moreHotelsRegex)
            .replace(/,/g, '');
        const moreHotels = parseInt(moreHotelsDigits, 10);
        this.serp.hotels = {
            hotels,
            moreHotels,
            searchFilters,
        };
    }

    /**
     * Reads the hotel search form: title, check-in/check-out dates, guest
     * count and the filter list.
     *
     * The date strings are suffixed with the current year before being passed
     * to `new Date`, so dates that fall in the next year are not handled.
     *
     * @param {*} hotelsFeature - cheerio selection of the hotels feature.
     * @returns {{checkIn: Date, checkOut: Date, filters: Object[], guests: number, searchTitle: string}}
     */
    getHotelSearchFilters(hotelsFeature) {
        const $ = this.$;
        const CONFIG = {
            activeFilter: '.CWGqFd',
            checkInString: '.vpggTd.ed5F6c span',
            checkOutString: '.vpggTd:not(.ed5F6c) span',
            filterGroupsTitles: '.d2IDkc',
            guests: '.viupMc',
            hotelFiltersSection: '.x3UtIe',
            searchTitle: '.gsmmde',
        };
        const section = hotelsFeature.find(CONFIG.hotelFiltersSection);
        const textOf = (selector) => section.find(selector).text();
        const searchTitle = textOf(CONFIG.searchTitle);
        const checkIn = new Date(`${textOf(CONFIG.checkInString)} ${new Date().getFullYear()}`);
        const checkOut = new Date(`${textOf(CONFIG.checkOutString)} ${new Date().getFullYear()}`);
        const guests = parseInt(textOf(CONFIG.guests), 10);
        const filters = [];
        section.find(CONFIG.filterGroupsTitles).each((ind, el) => {
            const titleNode = $(el);
            const hotelFilters = {
                explanation: titleNode.next().text(),
                title: titleNode.text(),
            };
            // `isActive` is only present (and true) on the active filter.
            if (titleNode.closest(CONFIG.activeFilter).length) {
                hotelFilters.isActive = true;
            }
            filters.push(hotelFilters);
        });
        return {
            checkIn,
            checkOut,
            filters,
            guests,
            searchTitle,
        };
    }

    /**
     * Builds one record per hotel card.
     *
     * `deal` is attached only when a deal type is present, `amenities` only
     * when non-empty.
     *
     * @param {*} hotelsFeature - cheerio selection of the hotels feature.
     * @returns {Object[]} Hotels with `currency`, `name`, `price`, `rating`,
     *   `votes` and optional `deal` / `amenities`.
     */
    getHotelOffers(hotelsFeature) {
        const $ = this.$;
        const CONFIG = {
            amenities: '.I9B2He',
            currency: '.dv1Q3e',
            currencyRegex: /\D+/,
            dealDetails: '.kOTJue.jj25pf',
            dealType: '.NNPnSe',
            hotelCards: '.ntKMYc .hmHBZd',
            name: '.BTPx6e',
            originalPrice: '.AfCRQd',
            originalPriceRegex: /\d+/,
            price: '.dv1Q3e',
            priceRegex: /\d+/,
            rating: '.YDIN4c.YrbPuc',
            ratingRegex: /\d\.\d/,
            votes: '.HypWnf.YrbPuc',
        };
        const hotels = [];
        hotelsFeature.find(CONFIG.hotelCards).each((ind, el) => {
            const name = this.elementText(el, CONFIG.name);
            const price = parseInt(utils.getFirstMatch(this.elementText(el, CONFIG.price), CONFIG.priceRegex), 10);
            const originalPrice = parseInt(utils.getFirstMatch(this.elementText(el, CONFIG.originalPrice), CONFIG.originalPriceRegex), 10);
            const currency = utils.getFirstMatch(this.elementText(el, CONFIG.currency), CONFIG.currencyRegex);
            const rating = parseFloat(utils.getFirstMatch($(el).find(CONFIG.rating).text(), CONFIG.ratingRegex));
            // slice() removes the surrounding parentheses; all thousands
            // separators are then stripped so "(1,234,567)" parses in full.
            const votes = parseInt(this.elementText(el, CONFIG.votes).slice(1, -1).replace(/,/g, ''), 10);
            const dealType = this.elementText(el, CONFIG.dealType);
            const dealDetails = this.elementText(el, CONFIG.dealDetails);
            const amenities = this.elementText(el, CONFIG.amenities);
            const hotelDeal = {
                dealType,
            };
            if (dealDetails) {
                hotelDeal.dealDetails = dealDetails;
            }
            // NaN (no original price found) is falsy and therefore skipped.
            if (originalPrice) {
                hotelDeal.originalPrice = originalPrice;
            }
            const hotel = {
                currency,
                name,
                price,
                rating,
                votes,
            };
            if (dealType) {
                hotel.deal = hotelDeal;
            }
            if (amenities) {
                hotel.amenities = amenities;
            }
            hotels.push(hotel);
        });
        return hotels;
    }

    /**
     * Sets `serp.adwords` to `{ adwordsTop?, adwordsBottom? }`, or to
     * `undefined` when neither ad container (`#tads`, `#tadsb`) exists.
     */
    getAdwords() {
        const $ = this.$;
        const CONFIG = {
            bottom: '#tadsb',
            top: '#tads',
        };
        const adwords = {};
        if ($(CONFIG.top).length) {
            adwords.adwordsTop = [];
            this.getAds(CONFIG.top, adwords.adwordsTop);
        }
        if ($(CONFIG.bottom).length) {
            adwords.adwordsBottom = [];
            this.getAds(CONFIG.bottom, adwords.adwordsBottom);
        }
        const hasAds = Boolean(adwords.adwordsTop || adwords.adwordsBottom);
        this.serp.adwords = hasAds ? adwords : undefined;
    }

    /**
     * Appends every ad found under `search` to `adsList`; `position` is the
     * 1-based index within that container.
     *
     * @param {string} search - Selector of the ad container.
     * @param {Object[]} adsList - Target array, mutated in place.
     */
    getAds(search, adsList) {
        const $ = this.$;
        const CONFIG = {
            ads: '.uEierd',
            snippet: '.MUxGbd.yDYNvb.lyLwlc:not(.fCBnFe .MUxGbd.yDYNvb.lyLwlc):not(.qjtaSd.MUxGbd.yDYNvb.lyLwlc)',
            title: '[role="heading"]',
            url: 'a.sVXRqc',
        };
        $(search)
            .find(CONFIG.ads)
            .each((i, e) => {
                const title = this.elementText(e, CONFIG.title);
                const url = this.elementHref(e, CONFIG.url);
                adsList.push({
                    domain: utils.getDomain(url),
                    linkType: utils.getLinkType(url),
                    position: i + 1,
                    sitelinks: this.getAdSitelinks(e),
                    snippet: $(e).find(CONFIG.snippet).text(),
                    title,
                    url,
                });
            });
    }

    /**
     * Collects an ad's sitelinks: card-style ones first, then inline ones.
     *
     * @param {*} ad - Ad element.
     * @returns {Object[]} Possibly empty list of sitelinks.
     */
    getAdSitelinks(ad) {
        const $ = this.$;
        const CONFIG = {
            card: '.fCBnFe,.MhgNwc',
            cardHref: 'h3 a',
            cardSnippet: ':not(h3)',
            cardTitle: 'h3',
            inline: '.bOeY0b a',
        };
        const sitelinks = [];
        $(ad)
            .find(CONFIG.card)
            .each((ind, e) => {
                sitelinks.push({
                    href: this.elementHref(e, CONFIG.cardHref),
                    snippet: $(e).children(CONFIG.cardSnippet).text(),
                    title: this.elementText(e, CONFIG.cardTitle),
                    type: models_1.SitelinkType.card,
                });
            });
        $(ad)
            .find(CONFIG.inline)
            .each((i, e) => {
                sitelinks.push({
                    href: $(e).attr('href'),
                    title: $(e).text(),
                    type: models_1.SitelinkType.inline,
                });
            });
        return sitelinks;
    }

    // Note: "available on" parsing is intentionally absent from this class
    // (it was moved to the knowledge graph).

    /**
     * Sets `serp.locals` to the list of local results when the locals feature
     * container is present.
     *
     * `expensiveness` is the length of the trimmed text of the `[role="img"]`
     * element. `distance` and `address` are the leading and trailing
     * "·"-separated parts of the same details line.
     */
    getLocals() {
        const $ = this.$;
        const CONFIG = {
            name: '.dbg0pd',
            rating: '.YDIN4c.YrbPuc',
            reviews: '.HypWnf.YrbPuc',
            reviewsRegex: /[0-9]+/,
            expensiveness: '[role="img"]',
            type: '.rllt__details div:nth-child(1)',
            typeRegex: /\w+\s\w+/,
            address: '.rllt__details div:nth-child(2)',
            addressRegex: /[^·]*$/,
            localsFeature: '[data-hveid="CBYQAQ"]',
            local: '.C8TUKc',
            distance: '.rllt__details div:nth-child(2)',
            distanceRegex: /^([^·])+/,
            description: 'div.rllt__wrapped > span',
        };
        const localsFeature = $(CONFIG.localsFeature);
        if (!localsFeature.length) {
            return;
        }
        const locals = [];
        localsFeature.find(CONFIG.local).each((ind, el) => {
            const card = $(el);
            const name = this.elementText(el, CONFIG.name);
            const rating = this.elementText(el, CONFIG.rating);
            const reviews = utils.getFirstMatch(card.find(CONFIG.reviews).text(), CONFIG.reviewsRegex);
            const expensiveness = this.elementText(el, CONFIG.expensiveness).trim().length;
            const type = utils.getFirstMatch(card.find(CONFIG.type).text(), CONFIG.typeRegex);
            const distance = utils.getFirstMatch(card.find(CONFIG.distance).text(), CONFIG.distanceRegex).trim();
            const address = utils.getFirstMatch(card.find(CONFIG.address).text(), CONFIG.addressRegex).trim();
            const description = this.elementText(el, CONFIG.description);
            locals.push({ name, rating, reviews, expensiveness, type, address, distance, description });
        });
        this.serp.locals = locals;
    }

    /**
     * Sets `serp.topStories` to `{ url, title, publisher, published }`
     * entries when the top-stories container is present.
     */
    getTopStories() {
        const $ = this.$;
        const CONFIG = {
            published: '.S1FAPd',
            publisher: '.CEMjEf span',
            title: '[role="heading"]',
            topStoriesFeature: '.F8yfEe',
            topStory: '.WlydOe',
        };
        const topStoriesFeature = $(CONFIG.topStoriesFeature);
        if (!topStoriesFeature.length) {
            return;
        }
        const topStories = [];
        topStoriesFeature.find(CONFIG.topStory).each((ind, el) => {
            topStories.push({
                url: $(el).attr('href'),
                title: this.elementText(el, CONFIG.title),
                publisher: this.elementText(el, CONFIG.publisher),
                published: this.elementText(el, CONFIG.published),
            });
        });
        this.serp.topStories = topStories;
    }

    /**
     * Sets `serp.shopResults` to the list of shopping offers when the
     * shopping container is present. `rating`, `votes` and `commodity` are
     * attached only when found.
     */
    getShopResults() {
        const $ = this.$;
        const CONFIG = {
            commodity: '.cYBBsb',
            currency: '.e10twf',
            currencyRegex: /\D+/,
            imgLink: 'a.pla-unit-img-container-link',
            price: '.e10twf',
            priceRegex: /[\d,.]+/,
            ratingRegex: /\d\.\d/,
            ratingString: 'a > span > g-review-stars > span',
            shopFeature: '.top-pla-group-inner',
            shopOffer: '.pla-unit:not(.view-all-unit)',
            shoppingSite: '.LbUacb',
            title: 'a > .hCK2Zc',
            votes: '.nbd1Bd .QhqGkb.RnJeZd',
        };
        const shopFeature = $(CONFIG.shopFeature);
        if (!shopFeature.length) {
            return;
        }
        const shopResults = [];
        shopFeature.find(CONFIG.shopOffer).each((ind, el) => {
            const imgLink = this.elementHref(el, CONFIG.imgLink);
            const title = this.elementText(el, CONFIG.title);
            const price = parseFloat(utils.getFirstMatch(this.elementText(el, CONFIG.price), CONFIG.priceRegex).replace(/,/g, ''));
            const currency = utils.getFirstMatch(this.elementText(el, CONFIG.currency), CONFIG.currencyRegex);
            const shoppingSite = this.elementText(el, CONFIG.shoppingSite);
            const shopResult = {
                currency,
                imgLink,
                price,
                shoppingSite,
                title,
            };
            // The rating is read from the star widget's aria-label.
            const ratingString = $(el).find(CONFIG.ratingString).attr('aria-label');
            if (ratingString) {
                shopResult.rating = parseFloat(utils.getFirstMatch(ratingString, CONFIG.ratingRegex));
            }
            // slice() drops the first and last character of the trimmed text
            // (presumably surrounding parentheses); votes stays a string.
            const votes = this.elementText(el, CONFIG.votes).trim().slice(1, -1);
            if (votes) {
                shopResult.votes = votes;
            }
            const commodity = this.elementText(el, CONFIG.commodity);
            if (commodity) {
                shopResult.commodity = commodity;
            }
            shopResults.push(shopResult);
        });
        this.serp.shopResults = shopResults;
    }

    // Helper methods

    /**
     * @param {*} el - Context element.
     * @param {string} query - Selector evaluated beneath `el`.
     * @returns {string} Combined text of the matching descendants.
     */
    elementText(el, query) {
        return this.$(el).find(query).text();
    }

    /**
     * @param {*} el - Context element.
     * @param {string} query - Selector evaluated beneath `el`.
     * @returns {string|undefined} `href` attribute of the first match, if any.
     */
    elementHref(el, query) {
        return this.$(el).find(query).attr('href');
    }
}
// Public CommonJS export of the parser class.
exports.GoogleSERP = GoogleSERP;
// Backing store for the class's private default options; the constructor
// registers one entry per instance.
_GoogleSERP_DEF_OPTIONS = new WeakMap();