UNPKG

gm-review-scraper

Version:

A tool to scrape Google Maps reviews

245 lines (230 loc) 7.12 kB
import { hexToDec } from "hex2dec";

// Marker that precedes the JSON payload in the review RPC response text.
const JSON_GUARD_PREFIX = ")]}'";

// Pause between two consecutive page requests, in milliseconds.
const PAGE_DELAY_MS = 1000;

/** Resolves after `ms` milliseconds. */
const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/** Builds the placeholder statistics object used when no place data is available. */
const emptyStatistical = () => ({
  rating: 0,
  total_reviews: 0,
  location: {
    lat: 0,
    lng: 0,
  },
  name: "",
  address: "",
});

/**
 * Scrapes reviews for a Google Maps place by calling the `listugcposts` RPC
 * endpoint page by page, optionally reshaping the raw positional arrays into
 * named objects and optionally looking up place statistics through the
 * Places "find place from text" API.
 */
export class GMRScraper {
  /**
   * @param {object} [options]
   * @param {string} [options.url=""] Place URL used when `scrape()` is called without an argument.
   * @param {string} [options.sort_type="relevent"] One of the keys of `SortEnum`.
   * @param {string} [options.search_query=""] Text inserted verbatim into the request URL.
   * @param {number|string} [options.pages="max"] Page limit, or `"max"` for no limit.
   * @param {boolean} [options.clean=false] When true, reviews are passed through `parseReviews`.
   * @param {string|null} [options.key=null] API key; statistics are only fetched when it is set.
   */
  constructor(options = {}) {
    const {
      sort_type = "relevent",
      search_query = "",
      pages = "max",
      clean = false,
    } = options;
    this.url = options.url || "";
    this.sort_type = sort_type;
    this.search_query = search_query;
    this.pages = pages;
    this.clean = clean;
    this.SortEnum = {
      // "relevent" is the historical (misspelled) key and remains the default;
      // "relevant" is an alias for the same sort code.
      relevent: 1,
      relevant: 1,
      newest: 2,
      highest_rating: 3,
      lowest_rating: 4,
    };
    this.key = options.key || null;
  }

  /**
   * Checks `url`, `sort_type`, `pages` and `clean` on this instance.
   *
   * @throws {Error} When the URL is malformed or is not a
   *   `www.google.com/maps/place/` URL, when `sort_type` is not a key of
   *   `SortEnum`, when `pages` is neither `"max"` nor numeric, or when
   *   `clean` is not a boolean.
   */
  validateParams() {
    let parsedUrl;
    try {
      parsedUrl = new URL(this.url);
    } catch (cause) {
      // Report malformed input with the same message as a wrong host/path.
      throw new Error(`Invalid URL: ${this.url}`, { cause });
    }
    const isPlaceUrl =
      parsedUrl.host === "www.google.com" &&
      parsedUrl.pathname.startsWith("/maps/place/");
    if (!isPlaceUrl) {
      throw new Error(`Invalid URL: ${this.url}`);
    }
    // Own-property check so inherited names such as "constructor" are rejected.
    if (!Object.prototype.hasOwnProperty.call(this.SortEnum, this.sort_type)) {
      throw new Error(`Invalid sort type: ${this.sort_type}`);
    }
    if (this.pages !== "max" && Number.isNaN(Number(this.pages))) {
      throw new Error(`Invalid pages value: ${this.pages}`);
    }
    if (typeof this.clean !== "boolean") {
      throw new Error(`Invalid clean value: ${this.clean}`);
    }
  }

  /**
   * Requests `url` without following redirects and returns the `location`
   * response header, or `url` itself when that header is absent.
   *
   * @param {string} url
   * @returns {Promise<string>}
   */
  async getUrlFromShortUrl(url) {
    const resp = await fetch(url, {
      redirect: "manual",
    });
    const locationHeader = resp.headers.get("location");
    return locationHeader || url;
  }

  /**
   * Builds a `listentitiesreviews` URL from the first `!1s<a>:<b>!` token in
   * `url`; both colon-separated parts are converted with `hexToDec`.
   *
   * @param {string} url Place URL containing a `!1s…!` token.
   * @param {string} [p=""] Optional page token.
   * @returns {string}
   * @throws {Error} "Invalid URL" when no token is found.
   */
  parseReviewURL(url, p = "") {
    const match = url.match(/!1s([a-zA-Z0-9_:]+)!/);
    if (!match || !match[1]) throw new Error("Invalid URL");
    // Explicit arrow so map's extra (index, array) arguments never reach hexToDec.
    const [h1, h2] = match[1].split(":").map((part) => hexToDec(part));
    const pS = p ? `!2m2!2i10!3s${p}` : `!2m1!2i10`;
    return `https://www.google.com/maps/preview/review/listentitiesreviews?authuser=0&hl=en&gl=in&pb=!1m2!1y${h1}!2y${h2}${pS}!3e1!4m5!3b1!4b1!5b1!6b1!7b1!5m2!1sdzvaXrvAMImImAXHsLPICA!7e81`;
  }

  /**
   * Builds the `listugcposts` RPC URL for a place.
   *
   * The place id is the second `!1s…!` token in `url` when there is one,
   * otherwise the first.
   *
   * @param {string} url Place URL containing at least one `!1s…!` token.
   * @param {number} so Sort code (a value of `SortEnum`).
   * @param {string} [pg=""] Page token; empty for the first page.
   * @param {string} [sq=""] Search query; inserted into the URL as given, without encoding.
   * @returns {string}
   * @throws {Error} "Invalid URL" when `url` contains no token.
   */
  buildListUGCURL(url, so, pg = "", sq = "") {
    const matches = [...url.matchAll(/!1s([a-zA-Z0-9_:]+)!/g)];
    // matchAll yields nothing for a URL without a token, so guard both lookups.
    const placeId = matches[1]?.[1] || matches[0]?.[1];
    if (!placeId) throw new Error("Invalid URL");
    return `https://www.google.com/maps/rpc/listugcposts?authuser=0&hl=en&gl=in&pb=!1m7!1s${placeId}!3s${sq}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s${pg}!5m2!1sBnOwZvzePPfF4-EPy7LK0Ak!7e81!8m5!1b1!2b1!3b1!5b1!7b1!11m6!1e3!2e1!3sen!4slk!6m1!1i2!13m1!1e${so}`;
  }

  /**
   * Fetches one page of reviews for `this.url` and returns the parsed JSON.
   *
   * @param {string} [pg=""] Page token; empty for the first page.
   * @param {number} so Sort code.
   * @param {string} [sq=""] Search query.
   * @returns {Promise<any>} Parsed response; callers read index 1 (next page
   *   token) and index 2 (review list).
   * @throws {Error} When the response status is not OK.
   * @throws {SyntaxError} When the payload is not valid JSON.
   */
  async fetchReviewsPage(pg = "", so, sq = "") {
    const apiUrl = this.buildListUGCURL(this.url, so, pg, sq);
    const resp = await fetch(apiUrl);
    if (!resp.ok) {
      // statusText can be empty, so the numeric status is included as well.
      const status = `${resp.status} ${resp.statusText}`.trim();
      throw new Error(`Failed to fetch reviews: ${status}`);
    }
    const text = await resp.text();
    // Drop everything up to and including the first marker. Slicing (rather
    // than splitting) keeps the payload intact if the marker text reappears
    // inside it, and falls back to the whole text when the marker is missing.
    const markerAt = text.indexOf(JSON_GUARD_PREFIX);
    const raw =
      markerAt === -1 ? text : text.slice(markerAt + JSON_GUARD_PREFIX.length);
    return JSON.parse(raw);
  }

  /**
   * Converts raw review entries into objects with named fields.
   *
   * Each entry is expected to be an array whose first element is the review's
   * positional data; the indices below mirror the response layout this class
   * was written against (not otherwise documented here).
   *
   * @param {Array<Array<any>>} reviews Raw entries (index 2 of a page response).
   * @returns {Promise<Array<object>>}
   */
  async parseReviews(reviews) {
    return reviews.map(([review]) => {
      // review[3] may be missing entirely when there is no reply.
      const replyText = review[3]?.[14]?.[0]?.[0];
      const author = review[1][4][5];
      const images =
        review[2][2]?.map((img) => {
          const media = img[1];
          return {
            id: img[0],
            url: media[6][0],
            size: {
              width: media[6][2][0],
              height: media[6][2][1],
            },
            location: {
              friendly: media[21][3][7]?.[0],
              lat: media[8][0][2],
              long: media[8][0][1],
            },
            caption: media[21][3][5]?.[0] || null,
          };
        }) || null;

      return {
        review_id: review[0],
        time: {
          published: review[1][2],
          last_edited: review[1][3],
        },
        author: {
          name: author[0],
          profile_url: author[1],
          url: author[2][0],
          id: author[3],
        },
        review: {
          rating: review[2][0][0],
          text: review[2][15]?.[0]?.[0] || null,
          language: review[2][14]?.[0] || null,
        },
        images,
        source: review[1][13][0],
        response: replyText
          ? {
              text: replyText,
              time: {
                published: review[3]?.[1] || null,
                last_edited: review[3]?.[2] || null,
              },
            }
          : null,
      };
    });
  }

  /**
   * Follows next-page tokens starting from the first page's response and
   * returns all collected raw review entries.
   *
   * @param {any} initialData Parsed response of the first page.
   * @param {number} so Sort code.
   * @returns {Promise<Array<any>>}
   */
  async paginate(initialData, so) {
    let reviews = initialData[2] ?? [];
    let nextPage = initialData[1]?.replace(/"/g, "");
    // initialData is page 1, so the next request is page 2.
    let pageNum = 2;
    const withinLimit = () => this.pages === "max" || pageNum <= +this.pages;

    while (nextPage && withinLimit()) {
      const data = await this.fetchReviewsPage(nextPage, so, this.search_query);
      // A page without a review list must not append an `undefined` entry.
      reviews = reviews.concat(data?.[2] ?? []);
      nextPage = data?.[1]?.replace(/"/g, "") || "";
      pageNum++;
      // Only wait when another request will actually follow.
      if (nextPage && withinLimit()) {
        await delay(PAGE_DELAY_MS);
      }
    }
    return reviews;
  }

  /**
   * Looks up rating, review count, name and address for the place in `url`
   * using the Places "find place from text" endpoint and `this.key`.
   *
   * Best-effort: any failure (URL without `/@lat,lng`, network error, non-OK
   * response, no candidate) yields the zeroed placeholder object instead of
   * throwing.
   *
   * @param {string} url Place URL containing `/@<lat>,<lng>` and `place/<name>/`.
   * @returns {Promise<{rating: number, total_reviews: number,
   *   location: {lat: (string|number), lng: (string|number)},
   *   name: string, address: string}>} `lat`/`lng` are the strings taken from
   *   the URL on success and the number 0 in the placeholder.
   */
  async fetchStatistical(url) {
    try {
      const [lat, lng] = url.split("/@")[1].split("/")[0].split(",");
      const input = url.split("place/")[1].split("/")[0];
      // `input` comes straight from the URL path and is inserted as-is.
      const apiUrl = `https://maps.googleapis.com/maps/api/place/findplacefromtext/json?input=${input}&inputtype=textquery&locationbias=point:${lat},${lng}&fields=place_id,name,rating,user_ratings_total,formatted_address&key=${this.key}`;
      const resp = await fetch(apiUrl);
      if (!resp.ok) {
        return emptyStatistical();
      }
      const data = await resp.json();
      const result = data.candidates?.[0];
      if (!result) {
        return emptyStatistical();
      }
      return {
        rating: result.rating || 0,
        total_reviews: result.user_ratings_total || 0,
        location: {
          lat,
          lng,
        },
        name: result.name || input || "",
        address: result.formatted_address || "",
      };
    } catch {
      return emptyStatistical();
    }
  }

  /**
   * Scrapes reviews for a place.
   *
   * @param {string} [url=this.url] Place URL or `maps.app.goo.gl` short link;
   *   defaults to the URL given to the constructor.
   * @returns {Promise<{reviews: Array<any>, statistical: object}|0>} The
   *   reviews (parsed when `clean` is true, raw otherwise) with statistics, or
   *   the number `0` when the first page contains no reviews.
   * @throws {Error} When parameter validation or a review request fails.
   */
  async scrape(url = this.url) {
    this.url = url;
    if (this.url.includes("maps.app.goo.gl")) {
      this.url = await this.getUrlFromShortUrl(this.url);
    }
    this.validateParams();

    const so = this.SortEnum[this.sort_type];
    const statistical = this.key
      ? await this.fetchStatistical(this.url)
      : emptyStatistical();

    if (this.pages === 0) {
      return {
        reviews: [],
        statistical,
      };
    }

    const initial = await this.fetchReviewsPage("", so, this.search_query);
    // Kept as-is for existing callers: an empty first page yields 0, not an object.
    if (!initial?.[2]?.length) return 0;

    const isSinglePage = !initial[1] || this.pages === 1;
    const rawReviews = isSinglePage
      ? initial[2]
      : await this.paginate(initial, so);
    const reviews = this.clean
      ? await this.parseReviews(rawReviews)
      : rawReviews;
    return {
      reviews,
      statistical,
    };
  }
}