/**
 * rag-crawler
 * Crawl a website to generate a knowledge file for RAG.
 */
import { program } from "commander";
import { HttpsProxyAgent } from "https-proxy-agent";
import path from "node:path";
import { mkdirSync, readFileSync, statSync, writeFileSync } from "node:fs";
import { crawlWebsite } from "./index.js";
import PRESET_LIST from "./preset.js";
// Entry point: parse CLI options, crawl all pages reachable from startUrl,
// then write the result to stdout, a single JSON file, or a directory of
// per-page markdown files (when outPath ends with a slash or is an
// existing directory).
async function main() {
// CLI flags (see program definition below); positional args follow.
const { preset, maxConnections, exclude = [], extract, log } = program.opts();
const [startUrl, outPath] = program.args;
// Default fetch options: a desktop-Chrome user-agent (some sites reject
// unknown agents) and up to 3 followed redirects.
const fetchOptions = {
headers: {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
},
follow: 3,
};
// Route HTTPS requests through a proxy when HTTPS_PROXY is set.
if (startUrl.startsWith("https://") && process.env["HTTPS_PROXY"]) {
fetchOptions.agent = new HttpsProxyAgent(process.env["HTTPS_PROXY"]);
}
let options = {
maxConnections,
exclude,
extract,
logEnabled: !!log,
fetchOptions,
};
// Preset values only fill in settings the CLI did not supply explicitly.
applyPreset(preset, startUrl, options);
if (!options.maxConnections) {
options.maxConnections = 5;
}
// When printing to stdout, suppress progress logs so the output stays
// machine-readable.
if (!outPath) {
options.logEnabled = false;
}
if (options.logEnabled) {
console.log(`⚙️ maxConnections=${options.maxConnections} exclude='${options.exclude.join(",")}' extract='${options.extract || ""}'`);
}
// Drain the async crawl generator into an array of { path, text } pages.
const pages = [];
for await (const page of crawlWebsite(startUrl, options)) {
pages.push(page);
}
const data = JSON.stringify(pages, null, 2);
if (outPath) {
// Directory output: one markdown file per page, mirroring the URL path.
if (/(\/|\\)$/.test(outPath) || isDir(outPath)) {
for (const page of pages) {
// Strip a trailing "/" or ".html" so the URL maps to a clean file name.
let filePath = page.path.replace(/(\/|\.html)$/, "");
// assumes page.path is an absolute URL — new URL() throws otherwise.
filePath = path.join(outPath, new URL(filePath).pathname + ".md");
mkdirSync(path.dirname(filePath), { recursive: true });
writeFileSync(filePath, page.text);
}
}
else {
// Single-file output: write the whole crawl as pretty-printed JSON.
mkdirSync(path.dirname(outPath), { recursive: true });
writeFileSync(outPath, data);
}
}
else {
console.log(data);
}
}
/**
 * Merge options from a matching preset into `options` (mutated in place).
 * Values already supplied on the CLI always win over preset values;
 * preset headers are layered on top of the default fetch headers.
 * @param {string} preset - Preset name, or "auto" to match by start URL.
 * @param {string} startUrl - URL used for "auto" preset matching.
 * @param {object} options - Crawl options object to update.
 */
function applyPreset(preset, startUrl, options) {
  const presets = loadPresets();
  // "auto" selects by regex match against the start URL; otherwise by name.
  const matched = preset === "auto"
    ? presets.find((p) => new RegExp(p.test).test(startUrl))
    : preset
      ? presets.find((p) => p.name === preset)
      : undefined;
  const presetOptions = matched?.options;
  if (!presetOptions) {
    return;
  }
  if (!options.maxConnections && presetOptions.maxConnections) {
    options.maxConnections = presetOptions.maxConnections;
  }
  if (!options.exclude?.length && presetOptions.exclude?.length) {
    options.exclude = presetOptions.exclude;
  }
  if (!options.extract && presetOptions.extract) {
    options.extract = presetOptions.extract;
  }
  // Only merge headers that are a plain object (guards against arrays,
  // class instances, and other unexpected JSON shapes).
  const presetHeaders = presetOptions.headers;
  if (presetHeaders &&
    Object.getPrototypeOf(presetHeaders) === Object.prototype) {
    options.fetchOptions.headers = {
      ...options.fetchOptions.headers,
      ...presetHeaders,
    };
  }
}
/**
 * Load crawl presets: user presets from ~/.rag-crawler.json (when the file
 * exists and contains a JSON array) prepended to the built-in PRESET_LIST,
 * so user presets take precedence in `find`-based lookups.
 * @returns {Array<object>} Always an array of preset definitions.
 */
function loadPresets() {
  // HOME is unset on Windows; fall back to USERPROFILE (then "") so
  // path.join does not throw a TypeError outside the try/catch below.
  const homeDir = process.env.HOME ?? process.env.USERPROFILE ?? "";
  const filePath = path.join(homeDir, ".rag-crawler.json");
  try {
    const data = readFileSync(filePath, "utf-8");
    const jsonData = JSON.parse(data);
    if (Array.isArray(jsonData)) {
      return [...jsonData, ...PRESET_LIST];
    }
  }
  catch {
    // Missing or unparsable config file: best-effort, use built-ins only.
  }
  // Fix: previously this fell through and returned undefined when the
  // config file parsed but was not a JSON array, crashing callers that
  // invoke .find() on the result.
  return PRESET_LIST;
}
/**
 * Check whether a filesystem path exists and is a directory.
 * @param {string} target - Path to test. (Renamed from `path`, which
 *   shadowed the imported `node:path` module inside this function.)
 * @returns {boolean} true if the path exists and is a directory;
 *   false if it is a file or does not exist / is inaccessible.
 */
function isDir(target) {
  try {
    return statSync(target).isDirectory();
  }
  catch {
    // stat failure (ENOENT, EACCES, ...) counts as "not a directory".
    return false;
  }
}
// CLI definition. Examples in the description double as usage docs:
// crawling to stdout, to a JSON file, or to a directory of markdown files.
program
.name("rag-crawler")
.description(`Crawl a website to generate knowledge file for RAG
Examples:
rag-crawler https://sigoden.github.io/mynotes/languages/
rag-crawler https://sigoden.github.io/mynotes/languages/ data.json
rag-crawler https://sigoden.github.io/mynotes/languages/ pages/
rag-crawler https://github.com/sigoden/mynotes/tree/main/src/languages/`)
.argument("<startUrl>", "The URL to start crawling from. Don't forget trailing slash. [required]")
.argument("[outPath]", "The output path. If omitted, output to stdout")
.option("--preset <value>", "Use predefined crawl options", "auto")
.option("-c, --max-connections <int>", "Maximum concurrent connections when crawling the pages", parseInt)
.option("-e, --exclude <values>", "Comma-separated list of path names to exclude from crawling", (value) => value.split(","))
.option("--extract <css-selector>", "Extract specific content using a CSS selector, If omitted, extract all content")
.option("--no-log", "Disable logging")
.version("1.6.0");
program.parse();
// Top-level launch: report any crawl/IO error and exit non-zero.
main().catch((err) => {
console.error(err);
process.exit(1);
});