algolia-crawl
Version:
Crawl your site and sync your Algolia search index
90 lines • 2.77 kB
JavaScript
;
// Set the `__esModule` flag on the CommonJS exports object (the marker
// conventionally emitted when ES-module syntax is compiled to CommonJS).
Object.defineProperty(exports, "__esModule", { value: true });
// Declare every named export up front as `undefined`; each one is assigned
// its real value right after the corresponding definition further down.
exports.algoliaCrawl = exports.crawl = exports.getUrls = exports.indexObjects = void 0;
const cosmic_1 = require("@anandchowdhary/cosmic");
const crypto_1 = require("crypto");
const puppeteer_1 = require("puppeteer");
const algolia_1 = require("./algolia");
/**
 * Save a batch of records to the Algolia index exported by `./algolia`.
 *
 * @param objects Records to pass to `index.saveObjects`.
 * @returns A promise resolving to whatever `index.saveObjects` resolves to.
 */
const indexObjects = async (objects) => {
    const saveResult = await algolia_1.index.saveObjects(objects);
    return saveResult;
};
exports.indexObjects = indexObjects;
/** Page records collected so far; `crawl` returns this set for indexing. */
const items = new Set();
/** URLs (fragment removed) that were already visited. A Set keeps the lookup O(1). */
const done = new Set();
/**
 * Run one best-effort scraping step. A failure is reported on stderr and the
 * fallback is used, so a single broken page cannot abort the whole crawl.
 *
 * @param what Short label of the value being read, used in the warning.
 * @param url URL of the page being scraped, used in the warning.
 * @param read Async function that produces the value.
 * @param fallback Value used when `read` throws or yields null/undefined.
 */
const scrapeOrDefault = async (what, url, read, fallback) => {
    try {
        return (await read()) ?? fallback;
    }
    catch (error) {
        console.warn(`Could not read ${what} of ${url}:`, error);
        return fallback;
    }
};
/**
 * Visit a URL, record its title, description and text in `items`, then
 * recursively visit every link found on the page.
 *
 * @param page Puppeteer page used for all navigation (shared across the recursion).
 * @param _url URL to visit; anything after `#` is ignored.
 * @param baseUrl When truthy, only links starting with this prefix are followed;
 *   when falsy, every non-empty link is followed.
 * @returns A promise that resolves once this URL and everything reachable
 *   from it has been processed.
 */
const getUrls = async (page, _url, baseUrl) => {
    // Fragments address the same document, so they must not trigger a re-crawl.
    const url = _url.split("#")[0];
    if (done.has(url))
        return;
    done.add(url);
    console.log("Fetching", url);
    try {
        await page.goto(url);
    }
    catch (error) {
        // After a failed navigation the tab may still hold the previously
        // visited document; scraping it would index that page's content
        // under this URL, so skip the URL instead.
        console.warn(`Skipping ${url}: navigation failed:`, error);
        return;
    }
    // $$eval is used because, unlike $eval, it does not throw when nothing
    // matches; a page without a meta description is not an error.
    const description = await scrapeOrDefault("description", url, () => page.$$eval("head > meta[name='description']", (elements) => elements.length > 0 ? elements[0].getAttribute("content") : null), undefined);
    const text = await scrapeOrDefault("text", url, () => page.$eval("main, body, html", (element) => element.innerText), undefined);
    const title = await scrapeOrDefault("title", url, () => page.title(), "");
    items.add({
        // The MD5 of the URL gives the same object ID every time a page is crawled.
        objectID: crypto_1.createHash("md5").update(url).digest("hex"),
        url,
        title,
        description,
        text,
    });
    // Collect the links before recursing: the recursion navigates the shared page away.
    const hrefs = await scrapeOrDefault("links", url, () => page.$$eval("a", (anchors) => anchors.map((anchor) => anchor.href)), []);
    for (const href of hrefs) {
        if (!href)
            continue;
        if (baseUrl && !href.startsWith(baseUrl))
            continue;
        // Called through `exports` exactly as the original did.
        await exports.getUrls(page, href, baseUrl);
    }
};
exports.getUrls = getUrls;
/**
 * Launch a browser, crawl from the configured start URL(s) and return the
 * collected page records.
 *
 * Reads the `algoliaCrawlStartUrl` (a single URL or an array of URLs) and
 * `algoliaCrawlBaseUrl` configuration keys.
 *
 * @returns The module-level `items` set filled by `getUrls`.
 * @throws Rethrows any error from launching the browser, opening the page or
 *   crawling; the browser is closed first.
 */
const crawl = async () => {
    const startUrl = cosmic_1.config("algoliaCrawlStartUrl");
    const baseUrl = cosmic_1.config("algoliaCrawlBaseUrl");
    // Normalise to a list so the single-URL and multi-URL cases share one path.
    const startUrls = Array.isArray(startUrl) ? startUrl : [startUrl];
    const browser = await puppeteer_1.launch();
    try {
        const page = await browser.newPage();
        for (const url of startUrls) {
            await exports.getUrls(page, url, baseUrl);
        }
    }
    finally {
        // Always release the browser process, even when the crawl throws.
        await browser.close();
    }
    return items;
};
exports.crawl = crawl;
/**
 * Crawl the site and push every collected record to Algolia.
 *
 * Prints "Done!" on success. Any error from crawling or indexing is printed
 * with `console.log` and not rethrown, so the returned promise always resolves.
 */
const algoliaCrawl = async () => {
    try {
        const crawledItems = await exports.crawl();
        const records = Array.from(crawledItems);
        await exports.indexObjects(records);
        console.log("Done!");
    }
    catch (failure) {
        console.log(failure);
    }
};
exports.algoliaCrawl = algoliaCrawl;
//# sourceMappingURL=crawler.js.map