UNPKG

algolia-crawl

Version:

Crawl your site and sync your Algolia search index

90 lines 2.77 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.algoliaCrawl = exports.crawl = exports.getUrls = exports.indexObjects = void 0;
const cosmic_1 = require("@anandchowdhary/cosmic");
const crypto_1 = require("crypto");
const puppeteer_1 = require("puppeteer");
const algolia_1 = require("./algolia");

/**
 * Index objects in Algolia search.
 *
 * @param {object[]} objects - Records to hand to `index.saveObjects`.
 * @returns {Promise<*>} Whatever `index.saveObjects` resolves to.
 */
const indexObjects = async (objects) => algolia_1.index.saveObjects(objects);
exports.indexObjects = indexObjects;

/** Records collected so far. Module-level, so it is shared by every crawl in this process. */
const items = new Set();

/** URLs (fragment stripped) that have already been visited; a Set keeps the lookup O(1). */
const done = new Set();

/**
 * Run a best-effort page read, returning `fallback` when the read throws or
 * yields `null`/`undefined` (e.g. `$eval` throws when no element matches).
 *
 * @param {() => Promise<*>} read - The page query to attempt.
 * @param {*} fallback - Value to use when the query fails or is nullish.
 * @returns {Promise<*>} The query result or `fallback`.
 */
const readOrFallback = async (read, fallback) => {
    try {
        return (await read()) ?? fallback;
    }
    catch (error) {
        // Deliberately tolerated: a missing element must not abort the crawl.
        return fallback;
    }
};

/**
 * Visit `_url`, record its title/description/text in `items`, then recurse
 * into every link found on the page.
 *
 * @param {object} page - Puppeteer page used for all navigation (reused across calls).
 * @param {string} _url - URL to visit; anything after `#` is ignored.
 * @param {string} [baseUrl] - When set, only links starting with this prefix are followed.
 * @returns {Promise<void>}
 */
const getUrls = async (page, _url, baseUrl) => {
    const url = _url.split("#")[0];
    if (done.has(url))
        return;
    done.add(url);
    console.log("Fetching", url);
    try {
        await page.goto(url);
    }
    catch (error) {
        // Skip this URL: the tab may still hold the previously visited document,
        // and scraping it would store that content under this URL's objectID.
        console.error("Failed to fetch", url, error);
        return;
    }
    const description = await readOrFallback(() => page.$eval("head > meta[name='description']", (element) => element.getAttribute("content")), undefined);
    const text = await readOrFallback(() => page.$eval("main, body, html", (element) => element.innerText), undefined);
    const title = await readOrFallback(() => page.title(), "");
    items.add({
        // Hash of the URL, so the same URL always maps to the same objectID.
        objectID: crypto_1.createHash("md5").update(url).digest("hex"),
        url,
        title,
        description,
        text,
    });
    // Collect links before recursing: the recursive calls navigate this same page away.
    const hrefs = await readOrFallback(() => page.$$eval("a", (as) => as.map((a) => a.href)), []);
    for (const href of hrefs) {
        if (!href)
            continue;
        if (baseUrl && !href.startsWith(baseUrl))
            continue;
        await exports.getUrls(page, href, baseUrl);
    }
};
exports.getUrls = getUrls;

/**
 * Launch a browser and crawl from the configured `algoliaCrawlStartUrl`
 * (a single URL or an array of URLs), restricted to `algoliaCrawlBaseUrl`
 * when that is configured.
 *
 * @returns {Promise<Set<object>>} The module-level set of collected records.
 */
const crawl = async () => {
    const browser = await puppeteer_1.launch();
    try {
        const page = await browser.newPage();
        const startUrl = cosmic_1.config("algoliaCrawlStartUrl");
        const baseUrl = cosmic_1.config("algoliaCrawlBaseUrl");
        const startUrls = Array.isArray(startUrl) ? startUrl : [startUrl];
        for (const url of startUrls) {
            await exports.getUrls(page, url, baseUrl);
        }
    }
    finally {
        // Always release the browser, even when the crawl throws part-way.
        await browser.close();
    }
    return items;
};
exports.crawl = crawl;

/**
 * Crawl the site and push every collected record to Algolia.
 * Errors are logged rather than rethrown, so the returned promise never rejects.
 *
 * @returns {Promise<void>}
 */
const algoliaCrawl = async () => {
    try {
        const items = await exports.crawl();
        await exports.indexObjects(Array.from(items));
        console.log("Done!");
    }
    catch (error) {
        console.log(error);
    }
};
exports.algoliaCrawl = algoliaCrawl;
//# sourceMappingURL=crawler.js.map