@promptordie/siphon-knowledge
AI-powered documentation generation system for AI Coding Agents.
import { chromium, type Browser, type Page } from "playwright";
import { writeFile } from "node:fs/promises";
import { logger } from "../logger.ts";
// Configurable parameters - can be set via environment variables
const START_URL: string = process.env.START_URL ?? "https://example.com";
const MAX_PAGES: number = Number(process.env.MAX_PAGES ?? 1000); // hard stop
const CONCURRENCY: number = Number(process.env.CONCURRENCY ?? 10); // workers
const ORIGIN: string = new URL(START_URL).origin;
const PATH_PREFIX: string = new URL(START_URL).pathname.replace(/\/+$/, "") || "/";
// Output file name based on domain
const DOMAIN_NAME = new URL(START_URL).hostname.replace(/\./g, "_");
const OUTPUT_FILE = `${DOMAIN_NAME}_links.txt`;
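
// Example run (assumes a TS-capable runtime such as Bun or tsx, since the
// logger import uses a .ts extension; "crawl.ts" is an illustrative file name):
//   START_URL="https://example.com/docs" MAX_PAGES=200 CONCURRENCY=5 bun crawl.ts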
type QItem = { url: string, depth: number };
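// Resolve a raw href (possibly relative) against START_URL and decide whether
// it is crawlable; returns a canonical absolute URL, or null to skip it.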
const normalize = (raw: string): string | null => {
  try {
    const u = new URL(raw, START_URL);
    u.hash = ""; // Remove hash fragments
    // Only crawl same-origin URLs under the start URL's path prefix
    if (u.origin !== ORIGIN) return null;
    if (!u.pathname.startsWith(PATH_PREFIX)) return null;
    // Avoid binary or obvious non-HTML routes
    if (/\.(png|jpe?g|gif|webp|svg|mp4|mp3|pdf|zip|gz|tgz|bz2|7z|ico|css|js|woff|woff2|ttf|otf)$/i.test(u.pathname)) return null;
    return u.toString();
  } catch {
    return null; // Malformed URL
  }
};
const sleep = (ms: number) => new Promise(res => setTimeout(res, ms));
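
// NOTE: the $$eval callback executes inside the browser page; the DOM has
// already resolved each anchor's `href` to an absolute URL.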
async function extractLinks(page: Page): Promise<string[]> {
  // Grab every anchor href on the rendered SPA page
  const hrefs: string[] = await page.$$eval("a[href]", (as: HTMLAnchorElement[]) =>
    as.map((a: HTMLAnchorElement) => a.href as string).filter(Boolean)
  );
  return hrefs;
}
async function crawl(): Promise<void> {
  logger.info(`🚀 Starting crawl of: ${START_URL}`);
  logger.info(`📊 Max pages: ${MAX_PAGES}`);
  logger.info(`⚡ Concurrency: ${CONCURRENCY}`);
  logger.info(`📁 Output file: ${OUTPUT_FILE}`);

  const browser: Browser = await chromium.launch({ headless: true });
  try {
    const visited = new Set<string>();
    const results = new Set<string>();
    const queue: QItem[] = [{ url: START_URL, depth: 0 }];
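
    // Single glob handed to page.route() so each worker aborts requests for
    // heavy assets (Playwright route globs support {a,b} alternatives).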
    const routeBlock = [
      // .js is deliberately not blocked: the SPA needs its scripts to render links
      "**/*.{png,jpg,jpeg,gif,webp,mp4,mp3,woff,woff2,ttf,otf,zip,svg,css}"
    ];

    // Create a pool of browser contexts for better performance
    const contexts = await Promise.all(
      Array.from({ length: CONCURRENCY }, () => browser.newContext())
    );
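
    // Visit one URL with this worker's context, block asset requests, give the
    // SPA time to settle, then enqueue any newly discovered links at depth + 1.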
    const processPage = async (url: string, depth: number, contextIndex: number) => {
      const context = contexts[contextIndex % contexts.length];
      if (!context) {
        logger.warn(`⚠️ No context available for index ${contextIndex}`);
        return;
      }
      const page = await context.newPage();
      try {
        const routePattern = routeBlock[0];
        if (routePattern) {
          await page.route(routePattern, route => route.abort());
        }
        await page.goto(url, {
          waitUntil: "domcontentloaded",
          timeout: 60000
        });
        // Let the SPA fetch data and paint
        try {
          await page.waitForLoadState("networkidle", { timeout: 8000 });
        } catch {
          // networkidle may never fire on chatty pages; carry on regardless
        }
        await sleep(800); // tiny settle
        const hrefs = await extractLinks(page);
        for (const h of hrefs) {
          const n = normalize(h);
          if (!n) continue;
          // Linear scan of the queue is O(n) per link, acceptable at this scale
          if (!visited.has(n) && !queue.some(q => q.url === n)) {
            queue.push({ url: n, depth: depth + 1 });
          }
        }
      } catch (error) {
        logger.warn(`⚠️ Error processing ${url}: ${(error as Error).message}`);
      } finally {
        await page.close().catch(() => {});
      }
    };
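
    // Crawl one batch of queue items in parallel; worker i reuses context i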
    const processBatch = async (items: QItem[]) => {
      const promises = items.map(async (item, index) => {
        if (visited.has(item.url)) return;
        visited.add(item.url);
        results.add(item.url);
        logger.info(`🔍 Crawling: ${item.url} (${visited.size}/${MAX_PAGES}) [Active: ${items.length}]`);
        await processPage(item.url, item.depth, index);
      });
      await Promise.all(promises);
    };

    // Main processing loop with true concurrency; whole queue items are passed
    // through so every page keeps its own depth
    while (visited.size < MAX_PAGES && queue.length > 0) {
      const batch: QItem[] = [];
      const batchSize = Math.min(CONCURRENCY, queue.length, MAX_PAGES - visited.size);
      for (let i = 0; i < batchSize; i++) {
        const item = queue.shift();
        if (item && !visited.has(item.url)) {
          batch.push(item);
        }
      }
      if (batch.length === 0) break;
      await processBatch(batch);
    }

    // Close all contexts
    await Promise.all(contexts.map(context => context.close()));

    const sorted = Array.from(results).sort();
    await writeFile(OUTPUT_FILE, sorted.join("\n") + "\n", "utf8");

    logger.success(`✅ Crawled pages: ${visited.size}`);
    logger.info(`📄 Saved: ${OUTPUT_FILE}`);
    logger.info(`🌐 Domain: ${DOMAIN_NAME}`);
  } finally {
    await browser.close();
  }
}

crawl().catch(err => {
  logger.error((err as Error).message);
  process.exit(1);
});