UNPKG

google-sr

Version:

Fast and efficient package for scraping Google search results without the need for an API key

374 lines (366 loc) 15.2 kB
"use strict";

// ---------------------------------------------------------------------------
// Bundler-generated CommonJS/ESM interop helpers (kept functionally as-is).
// ---------------------------------------------------------------------------
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;
// Assigns a stable `name` property to a function (used for arrow functions below).
var __name = (target, value) => __defProp(target, "name", { value, configurable: true });
// Defines every key of `all` on `target` as an enumerable getter.
var __export = (target, all) => {
  for (var name in all)
    __defProp(target, name, { get: all[name], enumerable: true });
};
// Copies the own properties of `from` onto `to` as getters, skipping `except`
// and anything `to` already owns.
var __copyProps = (to, from, except, desc) => {
  if (from && typeof from === "object" || typeof from === "function") {
    for (let key of __getOwnPropNames(from))
      if (!__hasOwnProp.call(to, key) && key !== except)
        __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
  }
  return to;
};
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
  // If the importer is in node compatibility mode or this is not an ESM
  // file that has been converted to a CommonJS file using a Babel-
  // compatible transform (i.e. "__esModule" has not been set), then set
  // "default" to the CommonJS "module.exports" for node compatibility.
  isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
  mod
));
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);

// src/index.ts
var src_exports = {};
__export(src_exports, {
  CurrencyResult: () => CurrencyResult,
  DictionaryResult: () => DictionaryResult,
  KnowledgePanelResult: () => KnowledgePanelResult,
  OrganicResult: () => OrganicResult,
  ResultTypes: () => ResultTypes,
  TimeResult: () => TimeResult,
  TranslateResult: () => TranslateResult,
  TranslateSourceTextRegex: () => TranslateSourceTextRegex,
  search: () => search,
  searchWithPages: () => searchWithPages
});
module.exports = __toCommonJS(src_exports);

// src/search.ts
var import_axios = __toESM(require("axios"));
var import_cheerio = require("cheerio");

// src/results.ts
var import_google_sr_selectors = require("google-sr-selectors");

// src/constants.ts
/** Discriminator values stored in the `type` field of every parsed result. */
var ResultTypes = {
  OrganicResult: "ORGANIC",
  TranslateResult: "TRANSLATE",
  DictionaryResult: "DICTIONARY",
  TimeResult: "TIME",
  CurrencyResult: "CURRENCY",
  KnowledgePanelResult: "KNOWLEDGE_PANEL"
};
/** Captures the text between the first pair of double quotes. */
var TranslateSourceTextRegex = /"(.+?)"/;

// src/utils.ts
/** Default request headers; caller-supplied headers are merged on top of these. */
var baseHeaders = {
  Accept: "text/html",
  "Accept-Encoding": "gzip, deflate",
  "Accept-Language": "en-US,en",
  Referer: "https://www.google.com/",
  "upgrade-insecure-requests": 1,
  // This user agent identifies the Links text-mode browser (not Chrome).
  // NOTE(review): presumably chosen so the basic HTML result page is served — confirm.
  "User-Agent": "Links (2.29; Linux 6.11.0-13-generic x86_64; GNU C 13.2; text)"
};

/**
 * Extracts and URL-decodes the `q` or `imgurl` query parameter of a link.
 *
 * @param {string | null | undefined} googleLink - href to inspect.
 * @returns {string | null} The decoded parameter value, or null when the link
 *   is empty, has no such parameter, or the value cannot be decoded.
 */
function extractUrlFromGoogleLink(googleLink) {
  if (!googleLink) return null;
  const match = googleLink.match(/[?&](q|imgurl)=([^&]+)/);
  if (!match?.[2]) return null;
  try {
    return decodeURIComponent(match[2]);
  } catch {
    // Malformed percent-encoding: treat the link as unusable.
    return null;
  }
}
__name(extractUrlFromGoogleLink, "extractUrlFromGoogleLink");

/**
 * Builds the axios request config for a search.
 *
 * The caller's `opts.requestConfig` (and any headers / params inside it) is
 * never modified: a fresh config object, header object and URLSearchParams
 * are created on every call.
 *
 * @param {{ query: string, requestConfig?: object }} opts - Search options.
 * @returns {object} Config with `headers`, `url`, `params` (URLSearchParams
 *   carrying `q` and `gbv=1`) and `responseType: "text"` set.
 * @throws {TypeError} When `opts.query` is not a string or `requestConfig`
 *   is not an object.
 */
function prepareRequestConfig(opts) {
  const userConfig = opts.requestConfig ?? {};
  if (typeof opts.query !== "string")
    throw new TypeError(
      `Search query must be a string, received ${typeof opts.query} instead.`
    );
  if (typeof userConfig !== "object")
    throw new TypeError(
      `Request config must be an object if specified, received ${typeof userConfig}.`
    );
  // Shallow-copy so none of the assignments below leak into the caller's object.
  const requestConfig = { ...userConfig };
  // Always copy the defaults so the shared module-level object is never handed out.
  requestConfig.headers = { ...baseHeaders, ...userConfig.headers };
  requestConfig.url = userConfig.url ?? "https://www.google.com/search";
  // A new instance also protects a caller-supplied URLSearchParams from the
  // `set` calls below (and from `start` being set by searchWithPages).
  const params = new URLSearchParams(userConfig.params);
  params.set("q", opts.query);
  params.set("gbv", "1");
  requestConfig.params = params;
  requestConfig.responseType = "text";
  return requestConfig;
}
__name(prepareRequestConfig, "prepareRequestConfig");

/**
 * Throws the error raised when a result parser is invoked without a CheerioAPI.
 *
 * @param {string} resultParserName - Name of the parser, used in the message.
 * @throws {TypeError} Always.
 */
function throwNoCheerioError(resultParserName) {
  throw new TypeError(
    `CheerioAPI instance is missing, if using as a selector make sure to pass the raw function and not the result of calling it. (ex: [${resultParserName}] instead of [${resultParserName}()])`
  );
}
__name(throwNoCheerioError, "throwNoCheerioError");

/**
 * Reports whether the given values count as "empty".
 *
 * @param {boolean} strictSelector - When true, a single blank value is enough;
 *   when false, every value must be blank.
 * @param {...*} values - Values to test; blank means "", undefined or null.
 * @returns {boolean}
 */
function isEmpty(strictSelector, ...values) {
  const isBlank = (value) => value === "" || value === void 0 || value === null;
  return strictSelector ? values.some(isBlank) : values.every(isBlank);
}
__name(isEmpty, "isEmpty");

// src/results.ts
/**
 * Parses organic (regular link) results from a loaded page.
 *
 * @param {Function} $ - CheerioAPI for the page.
 * @param {boolean} strictSelector - Forwarded to isEmpty for title/description.
 * @returns {object[]} One entry per block that has a decodable link and passes
 *   the emptiness check.
 * @throws {TypeError} When `$` is missing.
 */
var OrganicResult = /* @__PURE__ */ __name(($, strictSelector) => {
  if (!$) throwNoCheerioError("OrganicResult");
  const parsedResults = [];
  const organicSearchBlocks = $(import_google_sr_selectors.GeneralSelector.block).toArray();
  for (const element of organicSearchBlocks) {
    const block = $(element);
    const rawLink = block.find(import_google_sr_selectors.OrganicSearchSelector.link).attr("href") ?? null;
    const description = block.find(import_google_sr_selectors.OrganicSearchSelector.description).text();
    const title = block.find(import_google_sr_selectors.OrganicSearchSelector.title).text();
    const link = extractUrlFromGoogleLink(rawLink);
    if (typeof link !== "string") continue;
    if (isEmpty(strictSelector, description, title)) continue;
    parsedResults.push({
      type: ResultTypes.OrganicResult,
      link,
      description,
      title
    });
  }
  return parsedResults;
}, "OrganicResult");

/**
 * Parses a translation result from the first general block of the page.
 *
 * @param {Function} $ - CheerioAPI for the page.
 * @param {boolean} strictSelector - Forwarded to isEmpty for all four fields.
 * @returns {object | null} The translation, or null when the block is absent,
 *   the "X to Y" language text does not split into two parts, or the fields
 *   are considered empty.
 * @throws {TypeError} When `$` is missing.
 */
var TranslateResult = /* @__PURE__ */ __name(($, strictSelector) => {
  if (!$) throwNoCheerioError("TranslateResult");
  const translateBlock = $(import_google_sr_selectors.GeneralSelector.block).first();
  // A Cheerio selection is always truthy; emptiness must be tested via length.
  if (translateBlock.length === 0) return null;
  const translatedFromTo = translateBlock.find(import_google_sr_selectors.TranslateSearchSelector.translateFromTo).text();
  const fromTo = translatedFromTo.split(" to ");
  if (fromTo.length !== 2) return null;
  const sourceLanguage = fromTo[0].trim();
  const translationLanguage = fromTo[1].trim();
  const sourceTextBlock = translateBlock.find(import_google_sr_selectors.TranslateSearchSelector.sourceText).text().trim();
  const sourceText = sourceTextBlock.match(TranslateSourceTextRegex)?.[1] ?? "";
  const translatedText = translateBlock.find(import_google_sr_selectors.TranslateSearchSelector.translatedText).text().trim();
  if (isEmpty(
    strictSelector,
    sourceLanguage,
    translationLanguage,
    sourceText,
    translatedText
  ))
    return null;
  return {
    type: ResultTypes.TranslateResult,
    sourceLanguage,
    translationLanguage,
    sourceText,
    translatedText
  };
}, "TranslateResult");

/**
 * Parses one dictionary definition from a Cheerio selection.
 *
 * The matched text blocks are read positionally: index 0 is the definition,
 * index 1 the example, index 2 the comma-separated synonym list.
 *
 * @param {object} definitionBlock - Cheerio selection wrapping the definition.
 * @returns {object | null} `{ definition, example?, synonyms? }`, or null when
 *   there is no definition text.
 */
var parseDefinitionBlock = /* @__PURE__ */ __name((definitionBlock) => {
  const definitionTextBlock = definitionBlock.find(
    import_google_sr_selectors.DictionarySearchSelector.definitionTextBlock
  );
  const definitionText = definitionTextBlock.eq(0).text().trim();
  if (!definitionText) return null;
  const example = definitionTextBlock.eq(1).text().trim();
  const synonyms = definitionTextBlock.eq(2).text().trim().replace("synonyms: ", "").split(", ").filter((s) => s !== "");
  const definition = {
    definition: definitionText
  };
  if (example !== "") definition.example = example;
  if (synonyms.length > 0) definition.synonyms = synonyms;
  return definition;
}, "parseDefinitionBlock");

/**
 * Parses a dictionary result from the first general block of the page.
 *
 * @param {Function} $ - CheerioAPI for the page.
 * @param {boolean} strictSelector - Forwarded to isEmpty for phonetic/word.
 * @returns {object | null} `{ type, phonetic, word, meanings }`, or null when
 *   the block is absent or phonetic/word are considered empty.
 * @throws {TypeError} When `$` is missing.
 */
var DictionaryResult = /* @__PURE__ */ __name(($, strictSelector) => {
  if (!$) throwNoCheerioError("DictionaryResult");
  const dictionaryBlock = $(import_google_sr_selectors.GeneralSelector.block).first();
  // A Cheerio selection is always truthy; emptiness must be tested via length.
  if (dictionaryBlock.length === 0) return null;
  const phonetic = dictionaryBlock.find(import_google_sr_selectors.DictionarySearchSelector.phonetic).first().text().trim();
  const word = dictionaryBlock.find(import_google_sr_selectors.DictionarySearchSelector.word).text().trim();
  const meanings = [];
  // An absent container simply yields zero definition blocks, so `meanings`
  // stays empty and the result is still returned when phonetic/word are present.
  const definitionContainer = dictionaryBlock.find(import_google_sr_selectors.DictionarySearchSelector.definitionsContainer).first();
  const definitionBlocks = definitionContainer.find(import_google_sr_selectors.DictionarySearchSelector.definitionsBlock).toArray();
  // Blocks are consumed in alternation: while no part of speech is pending the
  // block is read as a part-of-speech label; otherwise it is read as the
  // definitions belonging to the pending label, which is then cleared.
  let partOfSpeech = null;
  for (const definitionBlock of definitionBlocks) {
    if (!partOfSpeech) {
      partOfSpeech = $(definitionBlock).find(import_google_sr_selectors.DictionarySearchSelector.definitionPartOfSpeech).first().text().trim();
      continue;
    }
    const definitionLists = $(definitionBlock).find(import_google_sr_selectors.DictionarySearchSelector.definitionList).toArray();
    let definitions;
    if (definitionLists.length > 0) {
      definitions = definitionLists.map((item) => parseDefinitionBlock($(item))).filter((d) => d !== null);
    } else {
      // No list items: the block itself is treated as a single definition.
      const definition = parseDefinitionBlock($(definitionBlock));
      definitions = definition ? [definition] : [];
    }
    if (definitions.length > 0) {
      meanings.push({
        partOfSpeech,
        definitions
      });
    }
    partOfSpeech = null;
  }
  if (isEmpty(strictSelector, phonetic, word)) return null;
  return {
    type: ResultTypes.DictionaryResult,
    phonetic,
    word,
    meanings
  };
}, "DictionaryResult");

/**
 * Parses a time result from the first time block of the page.
 *
 * @param {Function} $ - CheerioAPI for the page.
 * @param {boolean} strictSelector - Forwarded to isEmpty for time/timeInWords.
 * @returns {object | null} `{ type, location, time, timeInWords }`, or null
 *   when the location is empty, the layout table is absent, or the time fields
 *   are considered empty.
 * @throws {TypeError} When `$` is missing.
 */
var TimeResult = /* @__PURE__ */ __name(($, strictSelector) => {
  if (!$) throwNoCheerioError("TimeResult");
  const block = $(import_google_sr_selectors.TimeSearchSelector.block).first();
  const location = block.find(import_google_sr_selectors.TimeSearchSelector.location).text();
  if (location === "") return null;
  const layoutTable = block.find(import_google_sr_selectors.TimeSearchSelector.timeLayoutTable).first();
  // A Cheerio selection is always truthy; emptiness must be tested via length.
  if (layoutTable.length === 0) return null;
  const time = layoutTable.find(import_google_sr_selectors.TimeSearchSelector.time).text();
  const timeInWords = layoutTable.find(import_google_sr_selectors.TimeSearchSelector.timeInWords).text();
  if (isEmpty(strictSelector, time, timeInWords)) return null;
  return {
    type: ResultTypes.TimeResult,
    location,
    time,
    timeInWords
  };
}, "TimeResult");

/**
 * Parses a currency conversion result from the first general block of the page.
 *
 * @param {Function} $ - CheerioAPI for the page.
 * @param {boolean} strictSelector - Forwarded to isEmpty for from/to.
 * @returns {object | null} `{ type, from, to }`, or null when the fields are
 *   considered empty.
 * @throws {TypeError} When `$` is missing.
 */
var CurrencyResult = /* @__PURE__ */ __name(($, strictSelector) => {
  if (!$) throwNoCheerioError("CurrencyResult");
  const block = $(import_google_sr_selectors.GeneralSelector.block).first();
  // The first "=" is stripped from the "from" text before trimming.
  const from = block.find(import_google_sr_selectors.CurrencyConvertSelector.from).text().replace("=", "").trim();
  const to = block.find(import_google_sr_selectors.CurrencyConvertSelector.to).text().trim();
  if (isEmpty(strictSelector, from, to)) return null;
  return {
    type: ResultTypes.CurrencyResult,
    from,
    to
  };
}, "CurrencyResult");

/**
 * Parses a knowledge panel from the first few general blocks of the page.
 *
 * Only blocks at index 0 through 5 are inspected. Scanning stops at the first
 * block that has both a title and a label, whether or not that block ends up
 * producing a result.
 *
 * @param {Function} $ - CheerioAPI for the page.
 * @param {boolean} strictSelector - Forwarded to isEmpty for
 *   title/description/label.
 * @returns {object | null} The knowledge panel, or null when none was found.
 * @throws {TypeError} When `$` is missing.
 */
var KnowledgePanelResult = /* @__PURE__ */ __name(($, strictSelector) => {
  if (!$) throwNoCheerioError("KnowledgePanelResult");
  const blocks = $(import_google_sr_selectors.GeneralSelector.block);
  let knowledgePanel = null;
  blocks.each((index, element) => {
    // Returning false from the callback stops the iteration.
    if (index > 5) return false;
    const block = $(element);
    const headerBlock = block.find(import_google_sr_selectors.KnowledgePanelSelector.headerBlock).first();
    // A Cheerio selection is always truthy; emptiness must be tested via length.
    if (headerBlock.length === 0) return;
    const title = headerBlock.find(import_google_sr_selectors.KnowledgePanelSelector.title).text().trim();
    const label = headerBlock.find(import_google_sr_selectors.KnowledgePanelSelector.label).text().trim();
    if (title === "" || label === "") return;
    // The image is looked up in the sibling that follows the header block.
    const imageLink = headerBlock.next().find(import_google_sr_selectors.KnowledgePanelSelector.imageUrl).attr("src");
    const descriptionBlock = block.find(
      import_google_sr_selectors.KnowledgePanelSelector.descriptionBlock
    );
    const description = descriptionBlock.find("span").first().text().trim();
    const sourceLink = descriptionBlock.find("a").attr("href");
    const cleanSourceLink = extractUrlFromGoogleLink(sourceLink ?? null);
    const metadata = [];
    const metadataBlocks = block.find(import_google_sr_selectors.KnowledgePanelSelector.metadataBlock).toArray();
    for (const metadataContainerElement of metadataBlocks) {
      const metadataContainer = $(metadataContainerElement);
      const metadataLabel = metadataContainer.find(import_google_sr_selectors.KnowledgePanelSelector.metadataLabel).first().text().trim();
      if (metadataLabel === "") continue;
      const value = metadataContainer.find(import_google_sr_selectors.KnowledgePanelSelector.metadataValue).text().trim();
      if (value === "") continue;
      metadata.push({ label: metadataLabel, value });
    }
    if (!isEmpty(strictSelector, title, description, label))
      knowledgePanel = {
        type: ResultTypes.KnowledgePanelResult,
        title,
        label,
        description,
        sourceLink: cleanSourceLink,
        imageLink: imageLink ?? null,
        metadata
      };
    return false;
  });
  return knowledgePanel;
}, "KnowledgePanelResult");

// src/search.ts
/**
 * Runs every selector against a loaded page and merges their output.
 *
 * @param {Function} cheerioApi - CheerioAPI for the page.
 * @param {Function[]} selectors - Result parsers to run, in order.
 * @param {boolean} strictSelector - Passed to every parser.
 * @returns {object[]} Flat list of results; array results are concatenated,
 *   single results are appended, falsy results are skipped.
 */
function collectResults(cheerioApi, selectors, strictSelector) {
  let results = [];
  for (const selector of selectors) {
    const result = selector(cheerioApi, strictSelector);
    // concat (not spread) so that both arrays and single objects are accepted.
    if (result) results = results.concat(result);
  }
  return results;
}
__name(collectResults, "collectResults");

/**
 * Performs a single search request and parses the response.
 *
 * @param {object} options - `query` (string, required), plus optional
 *   `requestConfig`, `resultTypes` (defaults to `[OrganicResult]`) and
 *   `strictSelector`.
 * @returns {Promise<object[]>} Parsed results from every selector.
 * @throws {TypeError} When `options` is missing or fails validation in
 *   prepareRequestConfig.
 */
async function search(options) {
  if (!options)
    throw new TypeError(
      `Search options must be provided. Received ${typeof options}`
    );
  const requestConfig = prepareRequestConfig(options);
  const { data } = await (0, import_axios.default)(requestConfig);
  const cheerioApi = (0, import_cheerio.load)(data);
  const selectors = options.resultTypes || [OrganicResult];
  return collectResults(cheerioApi, selectors, Boolean(options.strictSelector));
}
__name(search, "search");

/**
 * Performs one search request per page, sequentially, and parses each response.
 *
 * @param {object} options - Same as `search`, plus `pages`: either a number N
 *   (requests `start` offsets 0, 10, ..., (N-1)*10) or an array of explicit
 *   `start` offsets.
 * @returns {Promise<object[][]>} One array of parsed results per page, in
 *   request order.
 * @throws {TypeError} When `options` is missing, `pages` is neither a number
 *   nor an array, or validation in prepareRequestConfig fails.
 */
async function searchWithPages(options) {
  if (!options)
    throw new TypeError(
      `Search options must be provided. Received ${typeof options}`
    );
  if (typeof options.pages !== "number" && !Array.isArray(options.pages))
    throw new TypeError(
      `Page must be a number or an array of numbers. Received ${typeof options.pages}`
    );
  const pages = Array.isArray(options.pages) ? options.pages : Array.from({ length: options.pages }, (_, i) => i * 10);
  // The prepared config is private to this call, so updating its `start`
  // parameter between pages cannot affect the caller's objects.
  const baseRequestConfig = prepareRequestConfig(options);
  const selectors = options.resultTypes || [OrganicResult];
  const strictSelector = Boolean(options.strictSelector);
  const searchResults = [];
  for (const page of pages) {
    baseRequestConfig.params.set("start", String(page));
    const { data } = await (0, import_axios.default)(baseRequestConfig);
    const cheerioApi = (0, import_cheerio.load)(data);
    searchResults.push(collectResults(cheerioApi, selectors, strictSelector));
  }
  return searchResults;
}
__name(searchWithPages, "searchWithPages");

// Annotate the CommonJS export names for ESM import in node:
0 && (module.exports = {
  CurrencyResult,
  DictionaryResult,
  KnowledgePanelResult,
  OrganicResult,
  ResultTypes,
  TimeResult,
  TranslateResult,
  TranslateSourceTextRegex,
  search,
  searchWithPages
});