/**
 * rag-crawler
 * Crawl a website to generate a knowledge file for RAG.
 */
import { program } from "commander";
import { HttpsProxyAgent } from "https-proxy-agent";
import path from "node:path";
import { mkdirSync, readFileSync, statSync, writeFileSync } from "node:fs";
import { crawlWebsite } from "./index.js";
import PRESET_LIST from "./preset.js";
// Entry point: parse CLI options, crawl all pages reachable from startUrl,
// then write the result to stdout, a single JSON file, or a directory of
// per-page markdown files (when outPath ends with a slash or is an
// existing directory).
async function main() {
// CLI flags (see program definition below); positional args follow.
const { preset, maxConnections, exclude = [], extract, log } = program.opts();
const [startUrl, outPath] = program.args;
// Default fetch options: a desktop-Chrome user-agent (some sites reject
// unknown agents) and up to 3 followed redirects.
const fetchOptions = {
headers: {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
},
follow: 3,
};
// Route HTTPS requests through a proxy when HTTPS_PROXY is set.
if (startUrl.startsWith("https://") && process.env["HTTPS_PROXY"]) {
fetchOptions.agent = new HttpsProxyAgent(process.env["HTTPS_PROXY"]);
}
let options = {
maxConnections,
exclude,
extract,
logEnabled: !!log,
fetchOptions,
};
// Preset values only fill in settings the CLI did not supply explicitly.
applyPreset(preset, startUrl, options);
if (!options.maxConnections) {
options.maxConnections = 5;
}
// When printing to stdout, suppress progress logs so the output stays
// machine-readable.
if (!outPath) {
options.logEnabled = false;
}
if (options.logEnabled) {
console.log(`⚙️ maxConnections=${options.maxConnections} exclude='${options.exclude.join(",")}' extract='${options.extract || ""}'`);
}
// Drain the async crawl generator into an array of { path, text } pages.
const pages = [];
for await (const page of crawlWebsite(startUrl, options)) {
pages.push(page);
}
const data = JSON.stringify(pages, null, 2);
if (outPath) {
// Directory output: one markdown file per page, mirroring the URL path.
if (/(\/|\\)$/.test(outPath) || isDir(outPath)) {
for (const page of pages) {
// Strip a trailing "/" or ".html" so the URL maps to a clean file name.
let filePath = page.path.replace(/(\/|\.html)$/, "");
// assumes page.path is an absolute URL — new URL() throws otherwise.
filePath = path.join(outPath, new URL(filePath).pathname + ".md");
mkdirSync(path.dirname(filePath), { recursive: true });
writeFileSync(filePath, page.text);
}
}
else {
// Single-file output: write the whole crawl as pretty-printed JSON.
mkdirSync(path.dirname(outPath), { recursive: true });
writeFileSync(outPath, data);
}
}
else {
console.log(data);
}
}
/**
 * Merge options from a matching preset into `options` (mutated in place).
 * Values already supplied on the CLI always win over preset values;
 * preset headers are layered on top of the default fetch headers.
 * @param {string} preset - Preset name, or "auto" to match by start URL.
 * @param {string} startUrl - URL used for "auto" preset matching.
 * @param {object} options - Crawl options object to update.
 */
function applyPreset(preset, startUrl, options) {
  const presets = loadPresets();
  // "auto" selects by regex match against the start URL; otherwise by name.
  const matched = preset === "auto"
    ? presets.find((p) => new RegExp(p.test).test(startUrl))
    : preset
      ? presets.find((p) => p.name === preset)
      : undefined;
  const presetOptions = matched?.options;
  if (!presetOptions) {
    return;
  }
  if (!options.maxConnections && presetOptions.maxConnections) {
    options.maxConnections = presetOptions.maxConnections;
  }
  if (!options.exclude?.length && presetOptions.exclude?.length) {
    options.exclude = presetOptions.exclude;
  }
  if (!options.extract && presetOptions.extract) {
    options.extract = presetOptions.extract;
  }
  // Only merge headers that are a plain object (guards against arrays,
  // class instances, and other unexpected JSON shapes).
  const presetHeaders = presetOptions.headers;
  if (presetHeaders &&
    Object.getPrototypeOf(presetHeaders) === Object.prototype) {
    options.fetchOptions.headers = {
      ...options.fetchOptions.headers,
      ...presetHeaders,
    };
  }
}
/**
 * Load crawl presets: user presets from ~/.rag-crawler.json (when the file
 * exists and contains a JSON array) prepended to the built-in PRESET_LIST,
 * so user presets take precedence in `find`-based lookups.
 * @returns {Array<object>} Always an array of preset definitions.
 */
function loadPresets() {
  // HOME is unset on Windows; fall back to USERPROFILE (then "") so
  // path.join does not throw a TypeError outside the try/catch below.
  const homeDir = process.env.HOME ?? process.env.USERPROFILE ?? "";
  const filePath = path.join(homeDir, ".rag-crawler.json");
  try {
    const data = readFileSync(filePath, "utf-8");
    const jsonData = JSON.parse(data);
    if (Array.isArray(jsonData)) {
      return [...jsonData, ...PRESET_LIST];
    }
  }
  catch {
    // Missing or unparsable config file: best-effort, use built-ins only.
  }
  // Fix: previously this fell through and returned undefined when the
  // config file parsed but was not a JSON array, crashing callers that
  // invoke .find() on the result.
  return PRESET_LIST;
}
/**
 * Check whether a filesystem path exists and is a directory.
 * @param {string} target - Path to test. (Renamed from `path`, which
 *   shadowed the imported `node:path` module inside this function.)
 * @returns {boolean} true if the path exists and is a directory;
 *   false if it is a file or does not exist / is inaccessible.
 */
function isDir(target) {
  try {
    return statSync(target).isDirectory();
  }
  catch {
    // stat failure (ENOENT, EACCES, ...) counts as "not a directory".
    return false;
  }
}
// CLI definition. Examples in the description double as usage docs:
// crawling to stdout, to a JSON file, or to a directory of markdown files.
program
.name("rag-crawler")
.description(`Crawl a website to generate knowledge file for RAG
Examples:
rag-crawler https://sigoden.github.io/mynotes/languages/
rag-crawler https://sigoden.github.io/mynotes/languages/ data.json
rag-crawler https://sigoden.github.io/mynotes/languages/ pages/
rag-crawler https://github.com/sigoden/mynotes/tree/main/src/languages/`)
.argument("<startUrl>", "The URL to start crawling from. Don't forget trailing slash. [required]")
.argument("[outPath]", "The output path. If omitted, output to stdout")
.option("--preset <value>", "Use predefined crawl options", "auto")
.option("-c, --max-connections <int>", "Maximum concurrent connections when crawling the pages", parseInt)
.option("-e, --exclude <values>", "Comma-separated list of path names to exclude from crawling", (value) => value.split(","))
.option("--extract <css-selector>", "Extract specific content using a CSS selector, If omitted, extract all content")
.option("--no-log", "Disable logging")
.version("1.6.0");
program.parse();
// Top-level launch: report any crawl/IO error and exit non-zero.
main().catch((err) => {
console.error(err);
process.exit(1);
});