rag-crawler

Crawl a website to generate a knowledge file for RAG.

import { Octokit } from "@octokit/rest";
import * as cheerio from "cheerio";
import { URL } from "node:url";
import TurndownService from "turndown";
import fetch from "node-fetch";

// HTML-to-Markdown converter; <script> contents are stripped from the output.
const turndownService = new TurndownService();
turndownService.remove("script");

// Matches GitHub "tree" URLs such as https://github.com/<owner>/<repo>/tree/<branch>/...
const IS_GITHUB_REPO = /^https:\/\/github\.com\/([^/]+)\/([^/]+)\/tree\/([^/]+)/;

/**
 * Crawl a website (or a GitHub repo tree) starting from `startUrl` and yield
 * one `{ path, text }` record per page, where `text` is Markdown.
 */
export async function* crawlWebsite(startUrl, options_) {
    const options = {
        maxConnections: 5,
        exclude: [],
        fetchOptions: {},
        breakOnError: true,
        logEnabled: true,
        ...(options_ || {}),
    };
    const startUrlObj = new URL(startUrl);
    let paths = [startUrlObj.pathname];
    startUrl = normalizeStartUrl(startUrl);
    // GitHub repos are crawled via the Git trees API instead of following links.
    if (IS_GITHUB_REPO.test(startUrl)) {
        paths = await crawlGHTree(startUrlObj, options.exclude);
    }
    // Breadth-first crawl: fetch pages in batches of `maxConnections`,
    // appending newly discovered links to the work list as we go.
    let index = 0;
    while (index < paths.length) {
        const batch = paths.slice(index, index + options.maxConnections);
        const promises = batch.map((path) => crawlPage(startUrl, path, options));
        const results = await Promise.all(promises);
        for (const { links, text, path } of results) {
            if (text !== "") {
                yield {
                    path: new URL(path, startUrlObj).toString(),
                    text,
                };
            }
            for (let link of links) {
                if (!paths.some((path) => matchLink(path, link))) {
                    paths.push(link);
                }
            }
        }
        index += batch.length;
    }
    if (options.logEnabled) {
        console.log("✨ Crawl completed");
    }
}

// List the raw.githubusercontent.com URLs of all Markdown files under the
// given GitHub tree URL, honoring the `exclude` list.
async function crawlGHTree(startUrl, exclude) {
    const octokit = new Octokit({
        auth: undefined,
    });
    let [_, owner, repo, _scope, branch, ...parts] = startUrl.pathname.split("/");
    const rootPath = parts.join("/");
    const tree = await octokit.request("GET /repos/{owner}/{repo}/git/trees/{tree_sha}", {
        owner,
        repo,
        tree_sha: branch,
        headers: {
            "X-GitHub-Api-Version": "2022-11-28",
        },
        recursive: "true",
    });
    const paths = tree.data.tree
        .filter((file) => file.type === "blob" &&
            (file.path?.endsWith(".md") || file.path?.endsWith(".MD")) &&
            file.path.startsWith(rootPath) &&
            !shouldExcludeLink(file.path, exclude))
        .map((file) => `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${file.path}`);
    return paths;
}

// Fetch a single page, collect its same-origin links, and convert its HTML
// (optionally narrowed to `options.extract`) to Markdown.
async function crawlPage(startUrl, path, options) {
    const location = new URL(path, startUrl).toString();
    let html = "";
    try {
        const response = await fetch(location, options.fetchOptions);
        html = await response.text();
        if (options.logEnabled) {
            console.log(`🚀 Crawled ${location}`);
        }
    } catch (err) {
        if (options.breakOnError) {
            throw err;
        } else if (options.logEnabled) {
            console.error(err);
        }
    }
    let links = [];
    // GitHub sources are raw Markdown already; return them untouched.
    if (IS_GITHUB_REPO.test(startUrl)) {
        return {
            path,
            text: html,
            links,
        };
    }
    const $ = cheerio.load(html);
    $("a").each((_, element) => {
        try {
            const href = $(element).attr("href");
            if (!href || href.startsWith("#")) {
                return;
            }
            const parsedUrl = new URL(href, location);
            // Only follow links that stay under the normalized start URL.
            if (parsedUrl.toString().startsWith(startUrl)) {
                const link = parsedUrl.pathname;
                if (!shouldExcludeLink(link, options.exclude)) {
                    links.push(link);
                }
            }
        } catch { }
    });
    let text = html;
    if (options.extract) {
        text = $(options.extract)?.html();
    }
    text = turndownService.turndown(text);
    return {
        path,
        text,
        links: [...new Set(links)],
    };
}

// A link is excluded if it contains a fragment or if its last path segment
// matches one of the `exclude` names (with or without a file extension).
function shouldExcludeLink(link, exclude) {
    if (link.includes("#")) {
        return true;
    }
    const parts = link.replace(/\/$/, "").split("/");
    let name = (parts[parts.length - 1] || "").toLowerCase();
    for (const excludeName of exclude) {
        let cond = false;
        if (/\.[^.]+$/.test(excludeName)) {
            cond = excludeName.toLowerCase() === name.toLowerCase();
        } else {
            cond = excludeName.toLowerCase() === name.toLowerCase().replace(/\.[^.]+$/, "");
        }
        if (cond) {
            return true;
        }
    }
    return false;
}

// Strip the query string, fragment, and trailing path segment so the crawl
// root ends at the last "/" of the start URL.
function normalizeStartUrl(startUrl) {
    const parsedUrl = new URL(startUrl);
    parsedUrl.search = "";
    parsedUrl.hash = "";
    let lastSlashIndex = parsedUrl.pathname.lastIndexOf("/");
    if (lastSlashIndex !== -1) {
        parsedUrl.pathname = parsedUrl.pathname.substring(0, lastSlashIndex + 1);
    }
    return parsedUrl.toString();
}

// Treat "/dir/" and "/dir/index.html" as the same page when de-duplicating.
function matchLink(path, link) {
    return path === link || path === link.replace(/\/index\.(html|htm)$/, "/");
}
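For reference, a minimal sketch of how the exported generator might be consumed, assuming this file is the package entry point. The start URL, the "main" extract selector, the "changelog" exclude entry, and the knowledge.md output path are illustrative values, not part of the package.

import { writeFile } from "node:fs/promises";
import { crawlWebsite } from "rag-crawler";

// Hypothetical usage: crawl a docs site, keep only each page's <main> element,
// and concatenate the resulting Markdown into one knowledge file.
const pages = [];
for await (const { path, text } of crawlWebsite("https://example.com/docs/", {
    maxConnections: 5,      // pages fetched in parallel per batch
    exclude: ["changelog"], // skip links whose last path segment matches
    extract: "main",        // CSS selector applied before the Markdown conversion
    breakOnError: false,    // log fetch errors instead of aborting the crawl
})) {
    pages.push(`# ${path}\n\n${text}`);
}
await writeFile("knowledge.md", pages.join("\n\n"));

Each yielded record carries the absolute page URL in path and the page converted to Markdown in text, so joining the records is enough to produce a single knowledge file for RAG.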