UNPKG

@promptordie/siphon-knowledge

Version:

AI-powered documentation generation system for AI Coding Agents.

629 lines (532 loc) 19.9 kB
/**
 * Enhanced content scraper.
 *
 * Reads a `<domain>_links.txt` file from the current working directory, buckets
 * every URL into a context/category by keyword, renders each page in headless
 * Chromium, extracts text plus structural metadata, and writes one Markdown file
 * per page (and one summary per category) under `scraped-content-enhanced/`.
 */
import { chromium, type Browser, type Page } from "playwright";
import { readFile, readdir, writeFile, mkdir } from "node:fs/promises";
import { existsSync } from "node:fs";
import path from "node:path";
import { logger } from "../logger.ts";

interface Category {
  name: string;
  description: string;
  patterns: string[];
  urls: string[];
}

interface Context {
  name: string;
  description: string;
  categories: Category[];
}

interface EnhancedScrapedContent {
  url: string;
  title: string;
  content: string;
  timestamp: string;
  category: string;
  context: string;
  metadata: {
    sourceFiles: string[];
    codeReferences: string[];
    diagrams: string[];
    crossReferences: string[];
    navigation: string[];
    breadcrumbs: string[];
    tags: string[];
    lastModified?: string;
    contributors?: string[];
  };
  rawHtml?: string;
  screenshots?: string[];
}

type PageMetadata = EnhancedScrapedContent["metadata"];

/** Upper bound on pages rendered at the same time within one category. */
const MAX_CONCURRENT_PAGES = 5;

/** Number of raw-HTML characters embedded in each page's Markdown file. */
const RAW_HTML_PREVIEW_LENGTH = 2000;

/** Base name (without extension) of the per-category summary file. */
const SUMMARY_BASENAME = "_enhanced_summary";

/** Suffix that identifies a crawler output file in the working directory. */
const LINKS_FILE_SUFFIX = "_links.txt";

/** Extracts a printable message from anything that was thrown. */
const errorMessage = (error: unknown): string =>
  error instanceof Error ? error.message : String(error);

/** Metadata recorded for a page that could not be scraped. */
const createEmptyMetadata = (): PageMetadata => ({
  sourceFiles: [],
  codeReferences: [],
  diagrams: [],
  crossReferences: [],
  navigation: [],
  breadcrumbs: [],
  tags: []
});

/**
 * Maps `items` through an async `worker`, running at most `limit` calls at once.
 *
 * @returns Results in the same order as `items`.
 */
const mapWithConcurrency = async <T, R>(
  items: readonly T[],
  limit: number,
  worker: (item: T, index: number) => Promise<R>
): Promise<R[]> => {
  const results = new Array<R>(items.length);
  let nextIndex = 0;

  // Each runner repeatedly claims the next unprocessed index until none remain.
  const runner = async (): Promise<void> => {
    while (nextIndex < items.length) {
      const index = nextIndex++;
      results[index] = await worker(items[index] as T, index);
    }
  };

  const poolSize = Math.max(1, Math.min(limit, items.length));
  await Promise.all(Array.from({ length: poolSize }, runner));
  return results;
};

/**
 * Pulls the title, main text, structural metadata and an HTML snapshot out of
 * an already-navigated page.
 *
 * @param page - Playwright page that has been navigated to the target URL.
 * @returns The document title, trimmed text content, collected metadata and the
 *   inner HTML of the element the text was taken from.
 */
const extractEnhancedPageContent = async (page: Page): Promise<{
  title: string;
  content: string;
  metadata: EnhancedScrapedContent["metadata"];
  rawHtml?: string;
}> => {
  await page.waitForLoadState("domcontentloaded");

  // The callback runs inside the browser: it must not reference anything from
  // this module's scope, so every constant it needs is declared within it.
  return page.evaluate(() => {
    const minMainContentLength = 100;

    const metadata = {
      sourceFiles: [] as string[],
      codeReferences: [] as string[],
      diagrams: [] as string[],
      crossReferences: [] as string[],
      navigation: [] as string[],
      breadcrumbs: [] as string[],
      tags: [] as string[],
      lastModified: undefined as string | undefined,
      contributors: [] as string[]
    };

    // Source file references: any code-like element mentioning a file path.
    document.querySelectorAll("code, pre, .source, .file, [data-source]").forEach((el) => {
      const text = el.textContent?.trim();
      if (
        text &&
        (text.includes(".ts") ||
          text.includes(".js") ||
          text.includes(".json") ||
          text.includes("packages/"))
      ) {
        metadata.sourceFiles.push(text);
      }
    });

    // Code references. `.code-ref` / `[data-line]` may match non-anchor
    // elements, which have no `href`; only print the parts that exist.
    document
      .querySelectorAll('a[href*="packages/"], a[href*="src/"], .code-ref, [data-line]')
      .forEach((el) => {
        const href = el instanceof HTMLAnchorElement ? el.href : "";
        const text = el.textContent?.trim() ?? "";
        if (href && text) {
          metadata.codeReferences.push(`${text} (${href})`);
        } else if (href || text) {
          metadata.codeReferences.push(href || text);
        }
      });

    // Diagrams and images. Only <img> has `src`/`alt`; inline SVGs and diagram
    // containers are described by their accessible label or <title>, if any.
    document.querySelectorAll("img, svg, .diagram, .chart").forEach((el) => {
      const isImage = el instanceof HTMLImageElement;
      const src = isImage ? el.src : "";
      const label = (
        isImage
          ? el.alt
          : el.getAttribute("aria-label") ?? el.querySelector("title")?.textContent ?? ""
      ).trim();
      if (src && label) {
        metadata.diagrams.push(`${label} (${src})`);
      } else if (src || label) {
        metadata.diagrams.push(src || label);
      }
    });

    // Cross-references to other wiki pages (in-page anchors are skipped).
    document
      .querySelectorAll<HTMLAnchorElement>('a[href*="/elizaOS/eliza/"]')
      .forEach((link) => {
        const href = link.href;
        const text = link.textContent?.trim();
        if (href && text && !href.includes("#")) {
          metadata.crossReferences.push(`${text} (${href})`);
        }
      });

    // Navigation links.
    document.querySelectorAll("nav, .nav, .navigation, .sidebar, .menu").forEach((nav) => {
      nav.querySelectorAll("a").forEach((link) => {
        const text = link.textContent?.trim();
        if (text && link.href) {
          metadata.navigation.push(`${text} (${link.href})`);
        }
      });
    });

    // Breadcrumb trail, kept in document order.
    document
      .querySelectorAll('.breadcrumb, .breadcrumbs, [aria-label*="breadcrumb"]')
      .forEach((trail) => {
        trail.querySelectorAll("a, span").forEach((crumb) => {
          const text = crumb.textContent?.trim();
          if (text) {
            metadata.breadcrumbs.push(text);
          }
        });
      });

    // Tags, labels and badges.
    document.querySelectorAll(".tag, .label, .badge, [data-tag]").forEach((tag) => {
      const text = tag.textContent?.trim();
      if (text) {
        metadata.tags.push(text);
      }
    });

    // Last-modified hint; the last matching element in the document wins.
    // The attribute is read directly because only <time> exposes `dateTime`.
    document.querySelectorAll("time, .date, .modified, [datetime]").forEach((el) => {
      const stamp = el.getAttribute("datetime")?.trim() || el.textContent?.trim();
      if (stamp) {
        metadata.lastModified = stamp;
      }
    });

    // Main content: the first selector whose element holds a meaningful
    // amount of text is used for both `content` and `rawHtml`.
    const contentSelectors = [
      "main",
      '[role="main"]',
      ".content",
      ".main-content",
      "article",
      ".article",
      "#content",
      ".markdown-body",
      ".prose",
      ".documentation",
      ".wiki-content"
    ];

    let content = "";
    let rawHtml = "";
    let foundMain = false;

    for (const selector of contentSelectors) {
      const candidate = document.querySelector(selector);
      const candidateText = candidate?.textContent ?? "";
      if (candidate && candidateText.trim().length > minMainContentLength) {
        content = candidateText;
        rawHtml = candidate.innerHTML;
        foundMain = true;
        break;
      }
    }

    if (!foundMain) {
      // Fall back to the body without chrome. Work on a clone so the live page
      // (and therefore the later screenshot) is left intact.
      const bodyClone = document.body.cloneNode(true) as HTMLElement;
      bodyClone
        .querySelectorAll("script, style, nav, header, footer, .nav, .header, .footer")
        .forEach((el) => el.remove());
      content = bodyClone.textContent ?? "";
      rawHtml = bodyClone.innerHTML;
    }

    return {
      title: document.title,
      content: content.trim(),
      // Overlapping selectors (e.g. <pre><code>, nav inside .sidebar) match the
      // same item more than once, so unordered collections are de-duplicated.
      metadata: {
        ...metadata,
        sourceFiles: Array.from(new Set(metadata.sourceFiles)),
        codeReferences: Array.from(new Set(metadata.codeReferences)),
        diagrams: Array.from(new Set(metadata.diagrams)),
        crossReferences: Array.from(new Set(metadata.crossReferences)),
        navigation: Array.from(new Set(metadata.navigation)),
        tags: Array.from(new Set(metadata.tags))
      },
      rawHtml
    };
  });
};

/**
 * Renders one URL in an isolated browser context and extracts its content.
 * The context is always closed, whether or not scraping succeeds.
 *
 * @throws Whatever navigation or extraction throws; the caller records it.
 */
const scrapePage = async (
  browser: Browser,
  url: string
): Promise<Pick<EnhancedScrapedContent, "title" | "content" | "metadata" | "rawHtml" | "screenshots">> => {
  const context = await browser.newContext({
    userAgent:
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124 Safari/537.36",
    viewport: { width: 1920, height: 1080 }
  });

  try {
    const page = await context.newPage();

    // Block heavy resources that add nothing to the text; images (including
    // SVG) stay enabled because diagrams are part of the captured context.
    await page.route("**/*.{mp4,mp3,woff,woff2,ttf,otf,zip}", (route) => route.abort());

    await page.goto(url, { waitUntil: "domcontentloaded", timeout: 30000 });

    try {
      await page.waitForLoadState("networkidle", { timeout: 5000 });
    } catch {
      // Best effort: pages that keep connections open never become idle.
    }

    // Small delay to let client-side rendering settle.
    await page.waitForTimeout(2000);

    const { title, content, metadata, rawHtml } = await extractEnhancedPageContent(page);

    const screenshots: string[] = [];
    try {
      const image = await page.screenshot({ type: "png", fullPage: true });
      screenshots.push(image.toString("base64"));
    } catch {
      logger.warn(` ⚠️ Could not capture screenshot for ${url}`);
    }

    return { title, content, metadata, rawHtml, screenshots };
  } finally {
    await context.close();
  }
};

/**
 * Scrapes every URL of a category, at most {@link MAX_CONCURRENT_PAGES} at a time.
 *
 * A page that fails to load does not abort the category: it is returned as a
 * placeholder entry whose `content` carries the error message.
 *
 * @param browser - Shared browser instance; one context is opened per URL.
 * @param category - Category whose `urls` are scraped.
 * @param contextName - Context name stamped on every result.
 * @param categoryName - Category name stamped on every result.
 * @returns One entry per URL, in the order of `category.urls`.
 */
const scrapeCategoryEnhanced = async (
  browser: Browser,
  category: Category,
  contextName: string,
  categoryName: string
): Promise<EnhancedScrapedContent[]> => {
  const total = category.urls.length;
  logger.info(`🔍 Enhanced scraping ${total} URLs for ${categoryName}...`);

  // One timestamp per category so all of its pages share the same scrape time.
  const timestamp = new Date().toISOString();

  const results = await mapWithConcurrency(
    category.urls,
    MAX_CONCURRENT_PAGES,
    async (url, index): Promise<EnhancedScrapedContent> => {
      logger.info(` [${index + 1}/${total}] Enhanced scraping: ${url}`);
      try {
        const scraped = await scrapePage(browser, url);
        return { url, ...scraped, timestamp, category: categoryName, context: contextName };
      } catch (error: unknown) {
        const message = errorMessage(error);
        logger.error(` ❌ Error enhanced scraping ${url}: ${message}`);
        return {
          url,
          title: "Error loading page",
          content: `Error: ${message}`,
          timestamp,
          category: categoryName,
          context: contextName,
          metadata: createEmptyMetadata()
        };
      }
    }
  );

  logger.success(`✅ Enhanced scraping completed for ${categoryName}: ${results.length} pages`);
  return results;
};

/** Lower-cases a display name and replaces whitespace runs with dashes. */
const toDirectoryName = (name: string): string => name.replace(/\s+/g, "-").toLowerCase();

/**
 * Returns `baseName`, or `baseName-2`, `baseName-3`, … if it is already taken,
 * and records the result in `usedNames`. Comparison is case-insensitive so the
 * names stay distinct on case-insensitive file systems.
 */
const claimUniqueName = (baseName: string, usedNames: Set<string>): string => {
  let candidate = baseName;
  for (let suffix = 2; usedNames.has(candidate.toLowerCase()); suffix++) {
    candidate = `${baseName}-${suffix}`;
  }
  usedNames.add(candidate.toLowerCase());
  return candidate;
};

/** Renders a Markdown bullet list, or a placeholder bullet when it is empty. */
const bulletList = (
  items: readonly string[],
  format: (item: string) => string = (item) => item
): string =>
  items.length > 0 ? items.map((item) => `- ${format(item)}`).join("\n") : "- None found";

/** Renders the Markdown document for a single scraped page. */
const renderPageMarkdown = (item: EnhancedScrapedContent): string => {
  const { metadata } = item;

  const sections: string[] = [
    `# ${item.title}`,
    [
      `**URL:** ${item.url}`,
      `**Category:** ${item.category}`,
      `**Context:** ${item.context}`,
      `**Scraped:** ${item.timestamp}`
    ].join("\n"),
    "## Metadata",
    `### Source Files\n${bulletList(metadata.sourceFiles, (file) => `\`${file}\``)}`,
    `### Code References\n${bulletList(metadata.codeReferences)}`,
    `### Diagrams & Images\n${bulletList(metadata.diagrams)}`,
    `### Cross-References\n${bulletList(metadata.crossReferences)}`,
    `### Navigation\n${bulletList(metadata.navigation)}`,
    `### Breadcrumbs\n${
      metadata.breadcrumbs.length > 0 ? metadata.breadcrumbs.join(" > ") : "- None found"
    }`,
    `### Tags\n${bulletList(metadata.tags)}`
  ];

  if (metadata.lastModified) {
    sections.push(`### Last Modified\n${metadata.lastModified}`);
  }
  if (metadata.contributors && metadata.contributors.length > 0) {
    sections.push(`### Contributors\n${bulletList(metadata.contributors)}`);
  }

  sections.push(`## Content\n\n${item.content}`);

  if (item.rawHtml) {
    // Only mark the preview as truncated when something was actually cut off.
    const preview =
      item.rawHtml.length > RAW_HTML_PREVIEW_LENGTH
        ? `${item.rawHtml.substring(0, RAW_HTML_PREVIEW_LENGTH)}...`
        : item.rawHtml;
    sections.push(`## Raw HTML Context\n\n\`\`\`html\n${preview}\n\`\`\``);
  }

  const screenshot = item.screenshots?.[0];
  if (screenshot) {
    sections.push(`## Screenshot\n\n![Page Screenshot](data:image/png;base64,${screenshot})`);
  }

  return `${sections.join("\n\n")}\n`;
};

/** Renders the per-category summary listing every page and aggregate counts. */
const renderCategorySummary = (content: readonly EnhancedScrapedContent[]): string => {
  const first = content[0];
  const totalOf = (pick: (metadata: PageMetadata) => readonly string[]): number =>
    content.reduce((sum, item) => sum + pick(item.metadata).length, 0);

  const pages = content
    .map((item) =>
      [
        `### [${item.title}](${item.url})`,
        `- **Source Files:** ${item.metadata.sourceFiles.length}`,
        `- **Code References:** ${item.metadata.codeReferences.length}`,
        `- **Diagrams:** ${item.metadata.diagrams.length}`,
        `- **Cross-References:** ${item.metadata.crossReferences.length}`,
        `- **Tags:** ${item.metadata.tags.join(", ") || "None"}`
      ].join("\n")
    )
    .join("\n\n");

  const sections = [
    `# ${first?.category || "Unknown Category"} - Enhanced Summary`,
    [
      `**Context:** ${first?.context || "Unknown Context"}`,
      `**Total Pages:** ${content.length}`,
      `**Scraped:** ${first?.timestamp || new Date().toISOString()}`
    ].join("\n"),
    "## Pages with Rich Context",
    pages,
    [
      "## Context Statistics",
      `- **Total Source Files Referenced:** ${totalOf((m) => m.sourceFiles)}`,
      `- **Total Code References:** ${totalOf((m) => m.codeReferences)}`,
      `- **Total Diagrams:** ${totalOf((m) => m.diagrams)}`,
      `- **Total Cross-References:** ${totalOf((m) => m.crossReferences)}`,
      `- **Total Tags:** ${totalOf((m) => m.tags)}`
    ].join("\n")
  ];

  return `${sections.join("\n\n")}\n`;
};

/**
 * Writes one Markdown file per page plus `_enhanced_summary.md` into
 * `<outputDir>/<context>/<category>/`. The directory is derived from the first
 * item, so all items are expected to share one context and category.
 *
 * @param content - Scraped pages of a single category; must not be empty.
 * @param outputDir - Root directory of the scrape output.
 * @throws Error if `content` is empty or undefined.
 */
const saveEnhancedContent = async (
  content: EnhancedScrapedContent[],
  outputDir: string
): Promise<void> => {
  if (!content || content.length === 0) {
    throw new Error("Content array is empty or undefined");
  }

  const firstItem = content[0];
  const contextName = firstItem?.context || "unknown-context";
  const categoryName = firstItem?.category || "unknown-category";
  const categoryDir = path.join(
    outputDir,
    toDirectoryName(contextName),
    toDirectoryName(categoryName)
  );

  // Recursive mkdir creates the context directory too and is a no-op if present.
  await mkdir(categoryDir, { recursive: true });

  // Different URLs can share a last path segment (or end in "/"); without
  // de-duplication the later page would silently overwrite the earlier one.
  // The summary's own name is reserved so a page can never clobber it.
  const usedNames = new Set<string>([SUMMARY_BASENAME.toLowerCase()]);

  for (const item of content) {
    const lastSegment = item.url.split("/").pop() || "index";
    const baseName = lastSegment.replace(/[^a-zA-Z0-9_-]/g, "_");
    const fileName = `${claimUniqueName(baseName, usedNames)}.md`;
    await writeFile(path.join(categoryDir, fileName), renderPageMarkdown(item), "utf8");
  }

  await writeFile(
    path.join(categoryDir, `${SUMMARY_BASENAME}.md`),
    renderCategorySummary(content),
    "utf8"
  );
};

/**
 * Distributes URLs over a fixed set of contexts and categories.
 *
 * A URL goes to the first category (in declaration order) with a pattern that
 * occurs in the URL's lower-cased path. URLs matching nothing go to the first
 * category of the first context. Entries that are not valid absolute URLs are
 * skipped with a warning.
 *
 * @param urls - Absolute URLs to categorize.
 * @returns The contexts with each category's `urls` filled in.
 */
const categorizeUrls = (urls: string[]): Context[] => {
  const contexts: Context[] = [
    {
      name: "developer-context",
      description: "Technical documentation for developers",
      categories: [
        {
          name: "architecture-&-core-concepts",
          description: "System architecture, core concepts, and fundamental design",
          patterns: ["architecture", "core", "system", "design", "overview", "concepts", "fundamentals"],
          urls: []
        },
        {
          name: "plugin-development",
          description: "Plugin system, creation, and management",
          patterns: ["plugin", "extension", "addon", "module", "component"],
          urls: []
        },
        {
          name: "api-reference",
          description: "API documentation and reference materials",
          patterns: ["api", "reference", "endpoint", "interface", "sdk"],
          urls: []
        },
        {
          name: "development-workflow",
          description: "Development processes, building, testing, and CI/CD",
          patterns: ["development", "workflow", "build", "test", "ci", "cd", "pipeline", "deploy"],
          urls: []
        },
        {
          name: "server-&-infrastructure",
          description: "Server setup, deployment, and infrastructure",
          patterns: ["server", "infrastructure", "deployment", "production", "hosting", "config"],
          urls: []
        },
        {
          name: "data-&-storage",
          description: "Database, storage, and data management",
          patterns: ["data", "storage", "database", "memory", "cache", "persistence"],
          urls: []
        },
        {
          name: "advanced-development",
          description: "Advanced features, integrations, and complex scenarios",
          patterns: ["advanced", "integration", "scenario", "complex", "enterprise", "scaling"],
          urls: []
        }
      ]
    },
    {
      name: "user-context",
      description: "User-facing documentation and guides",
      categories: [
        {
          name: "getting-started",
          description: "Quick start guides and onboarding",
          patterns: ["start", "quick", "guide", "tutorial", "onboarding", "setup", "install"],
          urls: []
        },
        {
          name: "cli-usage",
          description: "Command line interface and tools",
          patterns: ["cli", "command", "terminal", "console", "tool"],
          urls: []
        },
        {
          name: "user-interface",
          description: "Web interface, client apps, and UI",
          patterns: ["ui", "interface", "web", "client", "app", "dashboard", "console"],
          urls: []
        },
        {
          name: "real-time-features",
          description: "Real-time communication and live features",
          patterns: ["real-time", "live", "communication", "chat", "message", "stream"],
          urls: []
        }
      ]
    }
  ];

  // Flattened in declaration order so "first match wins" spans all contexts.
  const allCategories = contexts.flatMap((context) => context.categories);
  const fallbackCategory = allCategories[0];

  for (const url of urls) {
    let urlPath: string;
    try {
      urlPath = new URL(url).pathname.toLowerCase();
    } catch {
      // One malformed line must not abort categorization of the whole list.
      logger.warn(`⚠️ Skipping invalid URL: ${url}`);
      continue;
    }

    // NOTE(review): matching is by plain substring, so short patterns such as
    // "ci", "cd" or "ui" also hit unrelated words ("guide" contains "ui").
    // Left as-is because changing it would re-bucket existing output.
    const match = allCategories.find((category) =>
      category.patterns.some((pattern) => urlPath.includes(pattern.toLowerCase()))
    );

    (match ?? fallbackCategory)?.urls.push(url);
  }

  return contexts;
};

/**
 * Script entry point: locates the links file, categorizes its URLs, scrapes
 * each non-empty category and writes the results. Exits with status 1 when no
 * links file exists or when any step throws.
 */
async function main(): Promise<void> {
  try {
    logger.info("🚀 Starting enhanced ElizaOS content scraping...");

    // Directory listing order is unspecified, so sort to make the pick stable.
    const entries = await readdir(".", { withFileTypes: true });
    const linkFiles = entries
      .filter((entry) => entry.isFile() && entry.name.endsWith(LINKS_FILE_SUFFIX))
      .map((entry) => entry.name)
      .sort();

    const linksFile = linkFiles[0];
    if (!linksFile) {
      logger.error("❌ No links file found. Please run the crawler first.");
      process.exit(1);
    }
    if (linkFiles.length > 1) {
      logger.warn(`⚠️ Found ${linkFiles.length} links files; using ${linksFile}`);
    }

    const domainName = linksFile.slice(0, -LINKS_FILE_SUFFIX.length).replace(/_/g, ".");
    logger.info(`📁 Reading links from: ${linksFile}`);
    logger.info(`🌐 Domain: ${domainName}`);

    logger.info("Reading crawled URLs...");
    const fileText = await readFile(linksFile, "utf8");
    // Accept both LF and CRLF files and ignore blank or whitespace-only lines.
    const urls = fileText
      .split(/\r?\n/)
      .map((line) => line.trim())
      .filter(Boolean);
    logger.info(`Processing ${urls.length} URLs...`);

    const contexts = categorizeUrls(urls);

    logger.info("Launching browser...");
    const browser = await chromium.launch({ headless: true });

    try {
      const outputDir = "scraped-content-enhanced";
      logger.info(`📁 Output directory: ${outputDir}`);
      if (!existsSync(outputDir)) {
        await mkdir(outputDir, { recursive: true });
      }

      for (const context of contexts) {
        logger.info(`📚 Processing ${context.name}...`);

        for (const category of context.categories) {
          if (category.urls.length === 0) {
            continue;
          }
          logger.info(` 📂 Category: ${category.name} (${category.urls.length} URLs)`);

          const results = await scrapeCategoryEnhanced(
            browser,
            category,
            context.name,
            category.name
          );
          await saveEnhancedContent(results, outputDir);
        }
      }

      logger.success("✅ Enhanced content scraping completed!");
      logger.info(`📁 All content saved to: ${outputDir}/`);
    } finally {
      await browser.close();
    }
  } catch (error: unknown) {
    logger.error(`❌ Error during enhanced scraping: ${errorMessage(error)}`);
    process.exit(1);
  }
}

// main() handles its own errors, so the returned promise is intentionally not awaited.
void main();