UNPKG

crawldown

Version:

Crawl websites and convert their content into clean, readable Markdown using Mozilla's Readability and Turndown

180 lines (175 loc) 4.89 kB
#!/usr/bin/env node
import {
  BrowserManager,
  DEFAULT_OPTIONS,
  crawl
} from "./chunk-MHYQT2UX.js";

// src/cli.ts
import { defineCommand, runMain } from "citty";
import consola from "consola";
import { writeFile, mkdir } from "fs/promises";
import { dirname, join } from "path";

/**
 * Command definition for the `crawldown` CLI.
 *
 * Numeric options (`depth`, `concurrency`, `timeout`) are declared as strings
 * and converted to integers inside `run`; their defaults are the stringified
 * values from DEFAULT_OPTIONS.
 */
var main = defineCommand({
  meta: {
    name: "crawldown",
    description: "Crawl websites and convert their content into clean, readable Markdown using Mozilla's Readability and Turndown"
  },
  args: {
    url: {
      type: "positional",
      description: "URL to scrape",
      valueHint: "https://example.com",
      required: true
    },
    depth: {
      alias: "d",
      type: "string",
      default: DEFAULT_OPTIONS.depth.toString(),
      description: "Number of levels to crawl",
      required: false
    },
    verbose: {
      alias: "v",
      type: "boolean",
      default: false,
      description: "Verbose logging",
      required: false
    },
    "browser-path": {
      type: "string",
      description: "Path to browser executable. Will use playwright default if not provided",
      required: false
    },
    output: {
      alias: "o",
      type: "string",
      default: "output",
      description: "Output directory",
      required: false
    },
    "single-file": {
      type: "boolean",
      default: false,
      description: "Output all results to a single markdown file",
      required: false
    },
    concurrency: {
      alias: "c",
      type: "string",
      default: DEFAULT_OPTIONS.concurrency.toString(),
      description: "Number of concurrent pages to use",
      required: false
    },
    "scope-url": {
      type: "string",
      description: "URL that defines the crawling scope. Links outside this scope will be ignored",
      required: false
    },
    headless: {
      type: "boolean",
      // The flag is stored as `headless`, while crawl() takes the inverse
      // (`noHeadless`); the default is derived from DEFAULT_OPTIONS accordingly.
      default: !DEFAULT_OPTIONS.noHeadless,
      description: "Disable headless mode - will show browser UI. Useful for debugging.",
      required: false
    },
    force: {
      alias: "f",
      type: "boolean",
      default: DEFAULT_OPTIONS.force,
      description: "Force scraping content even if page hasn't fully loaded, 1 second before timeout",
      required: false
    },
    timeout: {
      alias: "t",
      type: "string",
      default: DEFAULT_OPTIONS.timeout.toString(),
      description: "Navigation timeout in milliseconds",
      required: false
    }
  },
  /**
   * Crawls the requested URL and writes the results either to one combined
   * Markdown file (`--single-file`) or to one file per page under
   * `<output>/<hostname>/...`.
   *
   * @throws {Error} If `depth`, `concurrency` or `timeout` is not an integer.
   */
  run: async ({ args }) => {
    const {
      url,
      verbose,
      "browser-path": browserPath,
      output,
      "single-file": singleFile,
      "scope-url": scopeUrl,
      headless,
      force
    } = args;

    // Validate up front so a typo such as `--depth two` fails with a clear
    // message instead of handing NaN to the crawler.
    const depth = parseIntegerOption("depth", args.depth);
    const concurrency = parseIntegerOption("concurrency", args.concurrency);
    const timeout = parseIntegerOption("timeout", args.timeout);

    if (verbose) {
      // Raise consola's verbosity to level 4.
      consola.level = 4;
    }

    const results = await crawl({
      url,
      depth,
      browserPath,
      concurrency,
      scopeUrl,
      noHeadless: !headless,
      force,
      timeout
    });

    if (singleFile) {
      const outputFile = output.endsWith(".md") ? output : `${output}.md`;
      // The target may live in a directory that does not exist yet
      // (e.g. `-o out/site`); create it so writeFile does not fail with ENOENT.
      await mkdir(dirname(outputFile), { recursive: true });
      await writeToSingleFile(outputFile, results);
      return;
    }

    await mkdir(output, { recursive: true });
    for (const result of results) {
      // A failure on one page is logged and must not abort the remaining pages.
      try {
        const { dirPath, fileName } = resolveOutputLocation(output, result.url);
        await mkdir(dirPath, { recursive: true });
        await writeMarkdownFile(join(dirPath, fileName), result);
      } catch (error) {
        consola.error(`Failed to write file for ${result.url}:`, error);
      }
    }
  }
});

/**
 * Parses a numeric CLI option given as a string.
 *
 * Uses base-10 `parseInt`, so input with a numeric prefix (e.g. "10px") is
 * still accepted as before; only input that yields NaN is rejected.
 *
 * @param {string} name - Option name, used in the error message.
 * @param {string} rawValue - Raw option value from the command line.
 * @returns {number} The parsed integer.
 * @throws {Error} If the value cannot be parsed as an integer.
 */
function parseIntegerOption(name, rawValue) {
  const parsed = Number.parseInt(rawValue, 10);
  if (Number.isNaN(parsed)) {
    throw new Error(
      `Invalid value for --${name}: expected an integer, got ${JSON.stringify(rawValue)}`
    );
  }
  return parsed;
}

/**
 * Maps a page URL to the directory and file name its Markdown is written to.
 *
 * The URL pathname loses one trailing and one leading slash, and every
 * character other than ASCII letters, digits and "/" becomes "_". The last
 * path segment becomes the file name; the preceding segments become
 * sub-directories under `<outputDir>/<hostname>`. A pathname without any
 * usable segment maps to `index.md`.
 *
 * @param {string} outputDir - Root output directory.
 * @param {string} pageUrl - Absolute URL of the crawled page.
 * @returns {{ dirPath: string, fileName: string }} Target directory and file name.
 * @throws {TypeError} If `pageUrl` is not a valid absolute URL.
 */
function resolveOutputLocation(outputDir, pageUrl) {
  const urlObj = new URL(pageUrl);
  const sanitizedPath = urlObj.pathname
    .replace(/\/$/, "")
    .replace(/^\//, "")
    .replace(/[^a-z0-9/]/gi, "_");

  // Drop empty segments (from "//" in the path) so the file is never named
  // just ".md".
  const segments = sanitizedPath.split("/").filter((segment) => segment.length > 0);
  const lastSegment = segments.pop();
  if (lastSegment === undefined) {
    return { dirPath: join(outputDir, urlObj.hostname), fileName: "index.md" };
  }
  return {
    dirPath: join(outputDir, urlObj.hostname, segments.join("/")),
    fileName: `${lastSegment}.md`
  };
}

/**
 * Writes one crawl result as a Markdown file with a front-matter header
 * containing the JSON-encoded title and the source URL.
 *
 * @param {string} filePath - Destination file path.
 * @param {{ title: string, url: string, markdown: string }} result - Crawl result to write.
 * @returns {Promise<void>}
 */
async function writeMarkdownFile(filePath, result) {
  const content = [
    "---",
    // JSON.stringify quotes and escapes the title for the front matter.
    `title: ${JSON.stringify(result.title)}`,
    `url: ${result.url}`,
    "---",
    "",
    result.markdown
  ].join("\n");
  await writeFile(filePath, content);
  consola.success(`Written: ${filePath}`);
}

/**
 * Writes all crawl results into one Markdown file. Each page is emitted as a
 * heading with its title, a "Source:" line, the page Markdown and a `---`
 * separator.
 *
 * @param {string} outputPath - Destination file path.
 * @param {Array<{ title: string, url: string, markdown: string }>} results - Crawl results in output order.
 * @returns {Promise<void>}
 */
async function writeToSingleFile(outputPath, results) {
  const content = results
    .map((result) =>
      // Each part sits on its own line so the heading, source line and
      // separator render as distinct Markdown elements.
      [
        `# ${result.title}`,
        "",
        `Source: ${result.url}`,
        "",
        result.markdown,
        "",
        "---",
        ""
      ].join("\n")
    )
    .join("\n");
  await writeFile(outputPath, content);
  consola.success(`Written all content to: ${outputPath}`);
}

// Once runMain settles, clean up the shared BrowserManager instance and exit
// explicitly with status 0.
void runMain(main).finally(async () => {
  const browserManager = BrowserManager.getInstance();
  await browserManager.cleanup();
  process.exit(0);
});
//# sourceMappingURL=cli.js.map