/*
 * gm-review-scraper
 * Version: (not specified)
 * A tool to scrape Google Maps reviews.
 * 245 lines (230 loc) • 7.12 kB
 * JavaScript
 */
import { hexToDec } from "hex2dec";
/**
 * Scrapes the reviews of a Google Maps place through the `listugcposts`
 * RPC endpoint and, when an API key is supplied, adds place statistics
 * fetched from the Places "Find Place" web service.
 */
export class GMRScraper {
  /**
   * @param {object} [options]
   * @param {string} [options.url=""] - Place URL used when `scrape()` is
   *   called without an argument.
   * @param {string} [options.sort_type="relevent"] - A key of `SortEnum`.
   *   The historical spelling "relevent" is kept; "relevant" is an alias.
   * @param {string} [options.search_query=""] - Text inserted into the
   *   request as the review search filter.
   * @param {number|string} [options.pages="max"] - Number of pages to fetch,
   *   or "max" to follow next-page tokens until none is returned.
   * @param {boolean} [options.clean=false] - When true, raw review arrays are
   *   converted to plain objects by `parseReviews`.
   * @param {string|null} [options.key=null] - API key; statistics are only
   *   fetched when it is set.
   * @param {number} [options.page_delay_ms=1000] - Pause between consecutive
   *   page requests made by `paginate`.
   */
  constructor(options = {}) {
    const {
      sort_type = "relevent",
      search_query = "",
      pages = "max",
      clean = false,
      page_delay_ms = 1000,
    } = options;
    this.url = options.url || "";
    this.sort_type = sort_type;
    this.search_query = search_query;
    this.pages = pages;
    this.clean = clean;
    this.page_delay_ms = page_delay_ms;
    this.SortEnum = {
      relevent: 1,
      relevant: 1, // correctly spelled alias of the original key
      newest: 2,
      highest_rating: 3,
      lowest_rating: 4,
    };
    this.key = options.key || null;
  }

  /**
   * Placeholder statistics returned when no key is configured or the
   * statistics lookup fails.
   * @returns {{rating: number, total_reviews: number, name: string, address: string, location: {lat: number, lng: number}}}
   */
  static emptyStatistical() {
    return {
      rating: 0,
      total_reviews: 0,
      name: "",
      address: "",
      location: {
        lat: 0,
        lng: 0,
      },
    };
  }

  /**
   * Checks `url`, `sort_type`, `pages` and `clean` on this instance.
   * @throws {TypeError} When `this.url` cannot be parsed by `URL`.
   * @throws {Error} When any setting is not acceptable.
   */
  validateParams() {
    const parsedUrl = new URL(this.url);
    const isPlaceUrl =
      parsedUrl.host === "www.google.com" &&
      parsedUrl.pathname.startsWith("/maps/place/");
    if (!isPlaceUrl) {
      throw new Error(`Invalid URL: ${this.url}`);
    }
    // Own-property check: a plain lookup would accept inherited names such
    // as "toString" or "constructor".
    if (!Object.hasOwn(this.SortEnum, this.sort_type)) {
      throw new Error(`Invalid sort type: ${this.sort_type}`);
    }
    if (this.pages !== "max" && Number.isNaN(Number(this.pages))) {
      throw new Error(`Invalid pages value: ${this.pages}`);
    }
    if (typeof this.clean !== "boolean") {
      throw new Error(`Invalid clean value: ${this.clean}`);
    }
  }

  /**
   * Resolves one redirect hop of a short link.
   * @param {string} url - Short URL to resolve.
   * @returns {Promise<string>} The `location` header of the response, or
   *   `url` itself when the header is absent.
   */
  async getUrlFromShortUrl(url) {
    const resp = await fetch(url, {
      redirect: "manual",
    });
    const locationHeader = resp.headers.get("location");
    return locationHeader || url;
  }

  /**
   * Builds a `listentitiesreviews` URL from a place URL.
   * @param {string} url - URL containing a `!1s<hex>:<hex>!` segment.
   * @param {string} [p=""] - Optional value inserted as the `!3s` field
   *   (presumably a pagination token — confirm against callers).
   * @returns {string} The request URL.
   * @throws {Error} When the `!1s…!` segment is missing.
   */
  parseReviewURL(url, p = "") {
    const m = url.match(/!1s([a-zA-Z0-9_:]+)!/);
    if (!m || !m[1]) throw new Error("Invalid URL");
    // Wrap the converter so map's index/array arguments are not forwarded.
    const [h1, h2] = m[1].split(":").map((part) => hexToDec(part));
    const pS = p ? `!2m2!2i10!3s${p}` : `!2m1!2i10`;
    return `https://www.google.com/maps/preview/review/listentitiesreviews?authuser=0&hl=en&gl=in&pb=!1m2!1y${h1}!2y${h2}${pS}!3e1!4m5!3b1!4b1!5b1!6b1!7b1!5m2!1sdzvaXrvAMImImAXHsLPICA!7e81`;
  }

  /**
   * Builds the `listugcposts` request URL for one page of reviews.
   * @param {string} url - Place URL containing at least one `!1s…!` segment.
   * @param {number} so - Sort code (a value of `SortEnum`).
   * @param {string} [pg=""] - Next-page token; empty for the first page.
   * @param {string} [sq=""] - Search text.
   * @returns {string} The request URL.
   * @throws {Error} "Invalid URL" when no `!1s…!` segment is present.
   */
  buildListUGCURL(url, so, pg = "", sq = "") {
    const matches = [...url.matchAll(/!1s([a-zA-Z0-9_:]+)!/g)];
    // The second segment is preferred when present, otherwise the first.
    const placeId = matches[1]?.[1] || matches[0]?.[1];
    if (!placeId) throw new Error("Invalid URL");
    // NOTE(review): `sq` is inserted verbatim; characters such as `&` or `#`
    // would alter the query string. Left unencoded so callers that already
    // pass an encoded query keep working — confirm before changing.
    return `https://www.google.com/maps/rpc/listugcposts?authuser=0&hl=en&gl=in&pb=!1m7!1s${placeId}!3s${sq}!6m4!4m1!1e1!4m1!1e3!2m2!1i10!2s${pg}!5m2!1sBnOwZvzePPfF4-EPy7LK0Ak!7e81!8m5!1b1!2b1!3b1!5b1!7b1!11m6!1e3!2e1!3sen!4slk!6m1!1i2!13m1!1e${so}`;
  }

  /**
   * Fetches and decodes one page of reviews for `this.url`.
   * @param {string} [pg=""] - Next-page token; empty for the first page.
   * @param {number} so - Sort code.
   * @param {string} [sq=""] - Search text.
   * @returns {Promise<any>} The decoded JSON payload.
   * @throws {Error} When the HTTP response is not OK.
   * @throws {SyntaxError} When the body is not valid JSON.
   */
  async fetchReviewsPage(pg = "", so, sq = "") {
    const apiUrl = this.buildListUGCURL(this.url, so, pg, sq);
    const resp = await fetch(apiUrl);
    if (!resp.ok) {
      throw new Error(`Failed to fetch reviews: ${resp.statusText}`);
    }
    const text = (await resp.text()).trimStart();
    // Strip only the leading guard prefix. Splitting on it would cut the
    // payload short if the same characters appeared inside a review.
    const guardPrefix = ")]}'";
    const raw = text.startsWith(guardPrefix)
      ? text.slice(guardPrefix.length)
      : text;
    return JSON.parse(raw);
  }

  /**
   * Converts raw review entries into plain objects.
   * @param {Array<Array>} reviews - Raw entries; the first element of each
   *   entry is the positional review array that is read here.
   * @returns {Promise<object[]>} One object per entry with `review_id`,
   *   `time`, `author`, `review`, `images`, `source` and `response`.
   */
  async parseReviews(reviews) {
    return reviews.map(([review]) => {
      // `review[3]` is guarded: entries without it get `response: null`.
      const responseText = review[3]?.[14]?.[0]?.[0];
      const authorInfo = review[1][4][5];
      const images =
        review[2][2]?.map((img) => ({
          id: img[0],
          url: img[1][6][0],
          size: {
            width: img[1][6][2][0],
            height: img[1][6][2][1],
          },
          location: {
            friendly: img[1][21][3][7]?.[0],
            lat: img[1][8][0][2],
            long: img[1][8][0][1],
          },
          caption: img[1][21][3][5]?.[0] || null,
        })) || null;
      return {
        review_id: review[0],
        time: {
          published: review[1][2],
          last_edited: review[1][3],
        },
        author: {
          name: authorInfo[0],
          profile_url: authorInfo[1],
          url: authorInfo[2][0],
          id: authorInfo[3],
        },
        review: {
          rating: review[2][0][0],
          text: review[2][15]?.[0]?.[0] || null,
          language: review[2][14]?.[0] || null,
        },
        images,
        source: review[1][13][0],
        response: responseText
          ? {
              text: responseText,
              time: {
                published: review[3]?.[1] || null,
                last_edited: review[3]?.[2] || null,
              },
            }
          : null,
      };
    });
  }

  /**
   * Follows next-page tokens starting from an already fetched first page.
   * @param {Array} initialData - First page payload; index 1 holds the
   *   next-page token and index 2 the list of raw reviews.
   * @param {number} so - Sort code passed to every page request.
   * @returns {Promise<Array>} Raw reviews of the first page followed by those
   *   of every additional page fetched.
   */
  async paginate(initialData, so) {
    const reviews = [...(initialData[2] ?? [])];
    const pageLimit = this.pages === "max" ? Infinity : Number(this.pages);
    let nextPage = initialData[1]?.replace(/"/g, "");
    // The first page was fetched by the caller, so counting starts at 2.
    for (let pageNum = 2; nextPage && pageNum <= pageLimit; pageNum++) {
      // Pause between consecutive requests only; no trailing pause after the
      // final page.
      if (pageNum > 2 && this.page_delay_ms > 0) {
        await new Promise((r) => setTimeout(r, this.page_delay_ms));
      }
      const data = await this.fetchReviewsPage(nextPage, so, this.search_query);
      // A page without a review list contributes nothing (concat(undefined)
      // would have appended an `undefined` element).
      reviews.push(...(data?.[2] ?? []));
      nextPage = data?.[1]?.replace(/"/g, "") || "";
    }
    return reviews;
  }

  /**
   * Best-effort lookup of place statistics through the Find Place endpoint.
   * Any failure (URL without `/@lat,lng`, network error, non-OK response,
   * no candidate) yields the empty statistics object.
   * @param {string} url - Place URL containing `place/<name>/@<lat>,<lng>`.
   * @returns {Promise<object>} `{rating, total_reviews, location, name, address}`.
   */
  async fetchStatistical(url) {
    try {
      const [lat, lng] = url.split("/@")[1].split("/")[0].split(",");
      const input = url.split("place/")[1].split("/")[0];
      const apiUrl = `https://maps.googleapis.com/maps/api/place/findplacefromtext/json?input=${input}&inputtype=textquery&locationbias=point:${lat},${lng}&fields=place_id,name,rating,user_ratings_total,formatted_address&key=${this.key}`;
      const resp = await fetch(apiUrl);
      if (!resp.ok) return GMRScraper.emptyStatistical();
      const data = await resp.json();
      const result = data.candidates?.[0];
      if (!result) return GMRScraper.emptyStatistical();
      return {
        rating: result.rating || 0,
        total_reviews: result.user_ratings_total || 0,
        location: {
          lat,
          lng,
        },
        name: result.name || input || "",
        address: result.formatted_address || "",
      };
    } catch {
      // Statistics are optional; failures must not abort the scrape.
      return GMRScraper.emptyStatistical();
    }
  }

  /**
   * Runs the full scrape for a place.
   * @param {string} [url=this.url] - Place URL or `maps.app.goo.gl` short
   *   link; defaults to the URL given to the constructor.
   * @returns {Promise<{reviews: Array, statistical: object}|0>} Reviews (raw
   *   or parsed, depending on `clean`) plus statistics. Returns `0` when the
   *   first page holds no reviews — kept for backward compatibility.
   * @throws {Error} When validation fails or a review request fails.
   */
  async scrape(url = this.url) {
    this.url = url;
    if (this.url.includes("maps.app.goo.gl")) {
      this.url = await this.getUrlFromShortUrl(this.url);
    }
    this.validateParams();
    const so = this.SortEnum[this.sort_type];
    const statistical = this.key
      ? await this.fetchStatistical(this.url)
      : GMRScraper.emptyStatistical();
    if (this.pages === 0) {
      return {
        reviews: [],
        statistical,
      };
    }
    const initial = await this.fetchReviewsPage("", so, this.search_query);
    if (!initial?.[2]?.length) return 0;
    const isSinglePage = !initial[1] || this.pages === 1;
    const rawReviews = isSinglePage
      ? initial[2]
      : await this.paginate(initial, so);
    const reviews = this.clean
      ? await this.parseReviews(rawReviews)
      : rawReviews;
    return {
      reviews,
      statistical,
    };
  }
}