@promptordie/siphon-knowledge
AI-powered documentation generation system for AI Coding Agents.
import { chromium, type Browser, type Page } from "playwright";
import { writeFile } from "node:fs/promises";
import { logger } from "../logger.ts";
// Configurable parameters - can be set via environment variables
const START_URL: string = process.env.START_URL ?? "https://example.com";
const MAX_PAGES: number = Number(process.env.MAX_PAGES ?? 1000); // hard stop
const CONCURRENCY: number = Number(process.env.CONCURRENCY ?? 10); // workers
const ORIGIN: string = new URL(START_URL).origin;
const PATH_PREFIX: string = new URL(START_URL).pathname.replace(/\/+$/, "") || "/";
// Output file name based on domain
const DOMAIN_NAME = new URL(START_URL).hostname.replace(/\./g, "_");
const OUTPUT_FILE = `${DOMAIN_NAME}_links.txt`;
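
// Example run (assumes a TS-capable runtime such as Bun or tsx, since the
// logger import uses a .ts extension; "crawl.ts" is an illustrative file name):
//   START_URL="https://example.com/docs" MAX_PAGES=200 CONCURRENCY=5 bun crawl.ts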
type QItem = { url: string, depth: number };
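// Resolve a raw href (possibly relative) against START_URL and decide whether
// it is crawlable; returns a canonical absolute URL, or null to skip it.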
const normalize = (raw: string): string | null => {
  try {
    const u = new URL(raw, START_URL);
    u.hash = ""; // Remove hash fragments
    // Only crawl same-origin URLs under the start URL's path prefix
    if (u.origin !== ORIGIN) return null;
    if (!u.pathname.startsWith(PATH_PREFIX)) return null;
    // Avoid binary or obvious non-HTML routes
    if (/\.(png|jpe?g|gif|webp|svg|mp4|mp3|pdf|zip|gz|tgz|bz2|7z|ico|css|js|woff|woff2|ttf|otf)$/i.test(u.pathname)) return null;
    return u.toString();
  } catch {
    return null; // Malformed URL
  }
};
const sleep = (ms: number) => new Promise(res => setTimeout(res, ms));
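
// NOTE: the $$eval callback executes inside the browser page; the DOM has
// already resolved each anchor's `href` to an absolute URL.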
async function extractLinks(page: Page): Promise<string[]> {
  // Grab every anchor href on the rendered SPA page
  const hrefs: string[] = await page.$$eval("a[href]", (as: HTMLAnchorElement[]) =>
    as.map((a: HTMLAnchorElement) => a.href as string).filter(Boolean)
  );
  return hrefs;
}
async function crawl(): Promise<void> {
  logger.info(`🚀 Starting crawl of: ${START_URL}`);
  logger.info(`📊 Max pages: ${MAX_PAGES}`);
  logger.info(`⚡ Concurrency: ${CONCURRENCY}`);
  logger.info(`📁 Output file: ${OUTPUT_FILE}`);

  const browser: Browser = await chromium.launch({ headless: true });
  try {
    const visited = new Set<string>();
    const results = new Set<string>();
    const queue: QItem[] = [{ url: START_URL, depth: 0 }];
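
    // Single glob handed to page.route() so each worker aborts requests for
    // heavy assets (Playwright route globs support {a,b} alternatives).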
    const routeBlock = [
      // .js is deliberately not blocked: the SPA needs its scripts to render links
      "**/*.{png,jpg,jpeg,gif,webp,mp4,mp3,woff,woff2,ttf,otf,zip,svg,css}"
    ];

    // Create a pool of browser contexts for better performance
    const contexts = await Promise.all(
      Array.from({ length: CONCURRENCY }, () => browser.newContext())
    );
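
    // Visit one URL with this worker's context, block asset requests, give the
    // SPA time to settle, then enqueue any newly discovered links at depth + 1.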
    const processPage = async (url: string, depth: number, contextIndex: number) => {
      const context = contexts[contextIndex % contexts.length];
      if (!context) {
        logger.warn(`⚠️ No context available for index ${contextIndex}`);
        return;
      }
      const page = await context.newPage();
      try {
        const routePattern = routeBlock[0];
        if (routePattern) {
          await page.route(routePattern, route => route.abort());
        }
        await page.goto(url, {
          waitUntil: "domcontentloaded",
          timeout: 60000
        });
        // Let the SPA fetch data and paint
        try {
          await page.waitForLoadState("networkidle", { timeout: 8000 });
        } catch {
          // networkidle may never fire on chatty pages; carry on regardless
        }
        await sleep(800); // tiny settle
        const hrefs = await extractLinks(page);
        for (const h of hrefs) {
          const n = normalize(h);
          if (!n) continue;
          // Linear scan of the queue is O(n) per link, acceptable at this scale
          if (!visited.has(n) && !queue.some(q => q.url === n)) {
            queue.push({ url: n, depth: depth + 1 });
          }
        }
      } catch (error) {
        logger.warn(`⚠️ Error processing ${url}: ${(error as Error).message}`);
      } finally {
        await page.close().catch(() => {});
      }
    };
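
    // Crawl one batch of queue items in parallel; worker i reuses context i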
    const processBatch = async (items: QItem[]) => {
      const promises = items.map(async (item, index) => {
        if (visited.has(item.url)) return;
        visited.add(item.url);
        results.add(item.url);
        logger.info(`🔍 Crawling: ${item.url} (${visited.size}/${MAX_PAGES}) [Active: ${items.length}]`);
        await processPage(item.url, item.depth, index);
      });
      await Promise.all(promises);
    };

    // Main processing loop with true concurrency; whole queue items are passed
    // through so every page keeps its own depth
    while (visited.size < MAX_PAGES && queue.length > 0) {
      const batch: QItem[] = [];
      const batchSize = Math.min(CONCURRENCY, queue.length, MAX_PAGES - visited.size);
      for (let i = 0; i < batchSize; i++) {
        const item = queue.shift();
        if (item && !visited.has(item.url)) {
          batch.push(item);
        }
      }
      if (batch.length === 0) break;
      await processBatch(batch);
    }

    // Close all contexts
    await Promise.all(contexts.map(context => context.close()));

    const sorted = Array.from(results).sort();
    await writeFile(OUTPUT_FILE, sorted.join("\n") + "\n", "utf8");

    logger.success(`✅ Crawled pages: ${visited.size}`);
    logger.info(`📄 Saved: ${OUTPUT_FILE}`);
    logger.info(`🌐 Domain: ${DOMAIN_NAME}`);
  } finally {
    await browser.close();
  }
}

crawl().catch(err => {
  logger.error((err as Error).message);
  process.exit(1);
});