// @promptordie/siphon-knowledge

import { chromium, Browser, Page } from "playwright";
import { readFile, writeFile, mkdir, readdir, unlink, rmdir } from "node:fs/promises";
import { existsSync, statSync } from "node:fs";
import path from "node:path";
import { logger } from "../logger.ts";
import { UserPreferenceManager, UserPreferences } from "./user-preferences.ts";

// Enhanced interfaces for better data organization

/** One scraped page plus the metadata and quality metrics derived from it. */
interface EnhancedScrapedContent {
  url: string;
  title: string;
  content: string;
  timestamp: string;
  category: string;
  context: string;
  metadata: {
    sourceFiles: string[];
    codeReferences: string[];
    diagrams: string[];
    crossReferences: string[];
    navigation: string[];
    breadcrumbs: string[];
    tags: string[];
    lastModified?: string;
    contributors?: string[];
    wordCount: number;
    readingTime: number;
    complexity: 'basic' | 'intermediate' | 'advanced';
  };
  rawHtml?: string;
  screenshots?: string[];
  quality: {
    contentLength: number;
    hasCode: boolean;
    hasImages: boolean;
    hasLinks: boolean;
    completeness: number; // 0-100
  };
}

/** A named bucket of URLs; a URL belongs here when it contains one of `patterns`. */
interface Category {
  name: string;
  description: string;
  patterns: string[];
  urls: string[];
}

/** A top-level grouping of categories (e.g. developer vs. user documentation). */
interface Context {
  name: string;
  description: string;
  categories: Category[];
}

/**
 * Options controlling a processing run.
 *
 * Within this file `includeMetadata` and `compressOutput` are only echoed into
 * the generated summaries; they do not alter what is written.
 */
interface DataProcessingOptions {
  level: 'full' | 'categorized' | 'polished';
  includeMetadata: boolean;
  includeRawHtml: boolean;
  includeScreenshots: boolean;
  cleanupOldData: boolean;
  organizeByDate: boolean;
  compressOutput: boolean;
}

/** Running counters for a processing run. */
interface ProcessingStats {
  totalUrls: number;
  successfulScrapes: number;
  failedScrapes: number;
  totalContentSize: number;
  processingTime: number;
  outputSize: number;
}

/** What `extractEnhancedPageContent` pulls out of a loaded page. */
interface ExtractedPage {
  title: string;
  content: string;
  metadata: EnhancedScrapedContent['metadata'];
  rawHtml?: string;
}

/** Upper bound on pages (browser contexts) open at the same time. */
const MAX_CONCURRENT_PAGES = 5;
/** Navigation timeout for `page.goto`, in milliseconds. */
const NAVIGATION_TIMEOUT_MS = 30000;
/** How long to wait for the network to go idle before giving up and continuing. */
const NETWORK_IDLE_TIMEOUT_MS = 5000;
/** Fixed settle delay after load, in milliseconds. */
const SETTLE_DELAY_MS = 2000;
/** Directories under the output dir older than this many days are removed by cleanup. */
const MAX_DATA_AGE_DAYS = 7;
const MS_PER_DAY = 1000 * 60 * 60 * 24;
/** Number of raw-HTML characters embedded in a "full" page file. */
const RAW_HTML_PREVIEW_CHARS = 2000;
/** Number of content characters shown in a "categorized" page file. */
const SUMMARY_PREVIEW_CHARS = 500;

/** Extracts a printable message from an unknown thrown value. */
function errorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Scrapes the URLs listed in a `*_links.txt` file, sorts them into
 * contexts/categories, and writes one markdown file per page plus per-category
 * and overall summaries.
 */
class DataOrganizer {
  private browser: Browser | null = null;
  private stats: ProcessingStats = {
    totalUrls: 0,
    successfulScrapes: 0,
    failedScrapes: 0,
    totalContentSize: 0,
    processingTime: 0,
    outputSize: 0
  };

  constructor(private options: DataProcessingOptions) {}

  /** Launches the headless Chromium instance used for all scraping. */
  async initialize(): Promise<void> {
    logger.info("🚀 Initializing Data Organizer...");
    this.browser = await chromium.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    });
    logger.success("✅ Browser initialized");
  }

  /** Closes the browser if one is open. Safe to call more than once. */
  async cleanup(): Promise<void> {
    const browser = this.browser;
    if (!browser) return;
    // Clear the handle first so a repeated call does not close twice.
    this.browser = null;
    await browser.close();
    logger.info("🔒 Browser closed");
  }

  /**
   * Extracts title, main text, metadata and raw HTML from a loaded page.
   *
   * The callback passed to `page.evaluate` is serialized and executed inside
   * the page, so it must not reference any variable from this scope. The title
   * is therefore read separately and merged in afterwards.
   */
  private async extractEnhancedPageContent(page: Page): Promise<ExtractedPage> {
    await page.waitForLoadState("domcontentloaded");
    const title = await page.title();

    const pageData = await page.evaluate(() => {
      const metadata = {
        sourceFiles: [] as string[],
        codeReferences: [] as string[],
        diagrams: [] as string[],
        crossReferences: [] as string[],
        navigation: [] as string[],
        breadcrumbs: [] as string[],
        tags: [] as string[],
        lastModified: undefined as string | undefined,
        contributors: [] as string[],
        wordCount: 0,
        readingTime: 0,
        // Typed as the full union so the assignments below type-check.
        complexity: 'basic' as 'basic' | 'intermediate' | 'advanced'
      };

      // Extract source file references
      document.querySelectorAll('code, pre, .source, .file, [data-source]').forEach(el => {
        const text = el.textContent?.trim();
        if (text && (text.includes('.ts') || text.includes('.js') || text.includes('.json') || text.includes('packages/'))) {
          metadata.sourceFiles.push(text);
        }
      });

      // Extract code references (non-anchor matches have no href)
      document.querySelectorAll('a[href*="packages/"], a[href*="src/"], .code-ref, [data-line]').forEach(el => {
        const href = (el as HTMLAnchorElement).href;
        const text = el.textContent?.trim();
        if (href || text) {
          metadata.codeReferences.push(`${text ?? ''} (${href ?? ''})`);
        }
      });

      // Extract diagrams and images (only elements exposing src/alt are recorded)
      document.querySelectorAll('img, svg, .diagram, .chart').forEach(img => {
        const src = (img as HTMLImageElement).src;
        const alt = (img as HTMLImageElement).alt;
        if (src || alt) {
          metadata.diagrams.push(`${alt ?? ''} (${src ?? ''})`);
        }
      });

      // Extract cross-references and links
      document.querySelectorAll('a[href*="/elizaOS/eliza/"]').forEach(link => {
        const href = (link as HTMLAnchorElement).href;
        const text = (link as HTMLAnchorElement).textContent?.trim();
        if (href && text) {
          metadata.crossReferences.push(`${text} (${href})`);
        }
      });

      // Extract navigation elements
      document.querySelectorAll('nav, .navigation, .menu, .sidebar').forEach(nav => {
        const text = nav.textContent?.trim();
        if (text) metadata.navigation.push(text);
      });

      // Extract breadcrumbs
      document.querySelectorAll('.breadcrumb, .breadcrumbs, [aria-label*="breadcrumb"]').forEach(bc => {
        const text = bc.textContent?.trim();
        if (text) metadata.breadcrumbs.push(text);
      });

      // Extract tags
      document.querySelectorAll('.tag, .tags, [data-tag]').forEach(tag => {
        const text = tag.textContent?.trim();
        if (text) metadata.tags.push(text);
      });

      // Extract main content: the first selector whose text exceeds 100
      // characters wins; otherwise the text of the last matching selector is kept.
      const contentSelectors = [
        'main', '[role="main"]', '.content', '.main-content',
        'article', '.article', '#content', '.markdown-body', '.prose'
      ];
      let content = '';
      let mainElement: Element | null = null;
      for (const selector of contentSelectors) {
        const element = document.querySelector(selector);
        if (element) {
          content = element.textContent || '';
          if (content.trim().length > 100) {
            mainElement = element;
            break;
          }
        }
      }

      // Nothing usable found: strip page chrome and fall back to the whole body.
      if (!content.trim()) {
        document
          .querySelectorAll('script, style, nav, header, footer, .nav, .header, .footer')
          .forEach(el => el.remove());
        content = document.body.textContent || '';
      }

      // Calculate metadata. An empty page has zero words ("".split() yields one entry).
      const trimmedContent = content.trim();
      const words = trimmedContent ? trimmedContent.split(/\s+/).length : 0;
      metadata.wordCount = words;
      metadata.readingTime = Math.ceil(words / 200); // 200 words per minute

      // Determine complexity based on content analysis
      const lowered = content.toLowerCase();
      const hasCode = metadata.sourceFiles.length > 0 || metadata.codeReferences.length > 0;
      const hasTechnicalTerms =
        lowered.includes('api') || lowered.includes('architecture') || lowered.includes('integration');
      if (hasCode && hasTechnicalTerms) {
        metadata.complexity = 'advanced';
      } else if (hasCode || hasTechnicalTerms) {
        metadata.complexity = 'intermediate';
      } else {
        metadata.complexity = 'basic';
      }

      const rawHtml = mainElement ? mainElement.innerHTML : document.body.innerHTML;
      return { content: trimmedContent, metadata, rawHtml };
    });

    return { title, ...pageData };
  }

  /**
   * Scrapes every URL of a category and returns one result per URL, in URL order.
   * Failed pages yield a placeholder entry instead of rejecting.
   *
   * @throws Error when `initialize()` has not been called.
   */
  private async scrapeCategoryEnhanced(
    category: Category,
    contextName: string,
    categoryName: string
  ): Promise<EnhancedScrapedContent[]> {
    const browser = this.browser;
    if (!browser) throw new Error("Browser not initialized");

    logger.info(`🔍 Enhanced scraping ${category.urls.length} URLs for ${categoryName}...`);

    const timestamp = new Date().toISOString();
    const total = category.urls.length;
    const results: EnhancedScrapedContent[] = [];

    // Work in bounded batches so a large category does not open one browser
    // context per URL all at once.
    for (let start = 0; start < total; start += MAX_CONCURRENT_PAGES) {
      const batch = category.urls.slice(start, start + MAX_CONCURRENT_PAGES);
      const batchResults = await Promise.all(
        batch.map((url, offset) =>
          this.scrapeSinglePage(browser, url, start + offset, total, contextName, categoryName, timestamp)
        )
      );
      results.push(...batchResults);
    }

    logger.success(`✅ Enhanced scraping completed for ${categoryName}: ${results.length} pages`);
    return results;
  }

  /**
   * Scrapes one URL in its own browser context. Never rejects: any error is
   * logged, counted in `stats.failedScrapes`, and turned into a placeholder result.
   */
  private async scrapeSinglePage(
    browser: Browser,
    url: string,
    index: number,
    total: number,
    contextName: string,
    categoryName: string,
    timestamp: string
  ): Promise<EnhancedScrapedContent> {
    try {
      const browserContext = await browser.newContext({
        userAgent: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124 Safari/537.36",
        viewport: { width: 1920, height: 1080 }
      });

      let extracted: ExtractedPage;
      let screenshot = '';
      try {
        const page = await browserContext.newPage();

        // Block unnecessary resources based on processing level
        // (the 'polished' level lets raster images through; other levels block them too).
        if (this.options.level === 'polished') {
          await page.route("**/*.{mp4,mp3,woff,woff2,ttf,otf,zip,svg}", route => route.abort());
        } else {
          await page.route("**/*.{png,jpg,jpeg,gif,webp,mp4,mp3,woff,woff2,ttf,otf,zip,svg}", route => route.abort());
        }

        logger.info(` [${index + 1}/${total}] Scraping: ${url}`);
        await page.goto(url, { waitUntil: "domcontentloaded", timeout: NAVIGATION_TIMEOUT_MS });

        try {
          await page.waitForLoadState("networkidle", { timeout: NETWORK_IDLE_TIMEOUT_MS });
        } catch {
          // Continue if network doesn't become idle
        }
        await page.waitForTimeout(SETTLE_DELAY_MS);

        extracted = await this.extractEnhancedPageContent(page);

        // Take screenshot if requested
        if (this.options.includeScreenshots) {
          try {
            const screenshotBuffer = await page.screenshot({ type: 'png', fullPage: true });
            screenshot = screenshotBuffer.toString('base64');
          } catch {
            logger.warn(` ⚠️ Could not capture screenshot for ${url}`);
          }
        }
      } finally {
        // Always release the context, including when navigation or extraction fails.
        await browserContext.close();
      }

      const { title, content, metadata, rawHtml } = extracted;
      const codeSignals = metadata.sourceFiles.length + metadata.codeReferences.length;

      // Calculate quality metrics
      const quality = {
        contentLength: content.length,
        hasCode: codeSignals > 0,
        hasImages: metadata.diagrams.length > 0,
        hasLinks: metadata.crossReferences.length > 0,
        completeness: Math.min(100, Math.max(0,
          (content.length / 1000) * 30 +          // Content length factor
          codeSignals * 10 +                       // Code factor
          metadata.diagrams.length * 5 +           // Images factor
          metadata.crossReferences.length * 3      // Links factor
        ))
      };

      const result: EnhancedScrapedContent = {
        url,
        title,
        content,
        timestamp,
        category: categoryName,
        context: contextName,
        metadata,
        quality,
        rawHtml: this.options.includeRawHtml ? rawHtml : undefined,
        screenshots: screenshot ? [screenshot] : undefined
      };

      this.stats.successfulScrapes++;
      this.stats.totalContentSize += content.length;
      return result;
    } catch (error) {
      const message = errorMessage(error);
      logger.error(` ❌ Error scraping ${url}: ${message}`);
      this.stats.failedScrapes++;
      return {
        url,
        title: 'Error loading page',
        content: `Error: ${message}`,
        timestamp,
        category: categoryName,
        context: contextName,
        metadata: {
          sourceFiles: [],
          codeReferences: [],
          diagrams: [],
          crossReferences: [],
          navigation: [],
          breadcrumbs: [],
          tags: [],
          wordCount: 0,
          readingTime: 0,
          complexity: 'basic'
        },
        quality: {
          contentLength: 0,
          hasCode: false,
          hasImages: false,
          hasLinks: false,
          completeness: 0
        }
      };
    }
  }

  /**
   * Writes one markdown file per page, plus `_summary.md` and `_metadata.json`,
   * under `<outputDir>/[<date>/]<context>/<category>/`. Context and category are
   * taken from the first item.
   *
   * @throws Error when `content` is empty.
   */
  private async saveOrganizedContent(
    content: EnhancedScrapedContent[],
    outputDir: string
  ): Promise<void> {
    if (!content || content.length === 0) {
      throw new Error('Content array is empty or undefined');
    }

    const firstItem = content[0];
    const contextName = firstItem?.context || 'unknown-context';
    const categoryName = firstItem?.category || 'unknown-category';

    // Create organized directory structure
    let baseDir = outputDir;
    if (this.options.organizeByDate) {
      const date = new Date().toISOString().split('T')[0] ?? '';
      baseDir = path.join(outputDir, date);
    }
    const toDirName = (name: string): string => name.replace(/\s+/g, '-').toLowerCase();
    const categoryDir = path.join(baseDir, toDirName(contextName), toDirName(categoryName));
    // A recursive mkdir creates every missing ancestor and is a no-op when present.
    await mkdir(categoryDir, { recursive: true });

    // Save each page based on processing level. Names are de-duplicated so two
    // URLs with the same last path segment do not overwrite each other.
    const usedNames = new Map<string, number>();
    for (const item of content) {
      const filename = item.url.split('/').pop() || 'index';
      const safeFilename = filename.replace(/[^a-zA-Z0-9-_]/g, '_');
      const timesSeen = usedNames.get(safeFilename) ?? 0;
      usedNames.set(safeFilename, timesSeen + 1);
      const uniqueName = timesSeen === 0 ? safeFilename : `${safeFilename}_${timesSeen + 1}`;

      const { suffix, body } = this.renderForLevel(item);
      await writeFile(path.join(categoryDir, `${uniqueName}${suffix}.md`), body, 'utf8');
    }

    // Save enhanced summary file
    const summaryPath = path.join(categoryDir, '_summary.md');
    await writeFile(summaryPath, this.generateCategorySummary(content, categoryName, contextName), 'utf8');

    // Save metadata index
    const sumOf = (pick: (item: EnhancedScrapedContent) => number): number =>
      content.reduce((sum, item) => sum + pick(item), 0);
    const metadataContent = {
      category: categoryName,
      context: contextName,
      totalPages: content.length,
      timestamp: new Date().toISOString(),
      processingLevel: this.options.level,
      qualityMetrics: {
        averageCompleteness: sumOf(item => item.quality.completeness) / content.length,
        totalCodeReferences: sumOf(item => item.metadata.codeReferences.length),
        totalImages: sumOf(item => item.metadata.diagrams.length),
        totalLinks: sumOf(item => item.metadata.crossReferences.length)
      }
    };
    const metadataPath = path.join(categoryDir, '_metadata.json');
    await writeFile(metadataPath, JSON.stringify(metadataContent, null, 2), 'utf8');
  }

  /** Picks the filename suffix and markdown body for the configured level. */
  private renderForLevel(item: EnhancedScrapedContent): { suffix: string; body: string } {
    switch (this.options.level) {
      case 'full':
        return { suffix: '_full', body: this.generateFullContent(item) };
      case 'categorized':
        return { suffix: '_categorized', body: this.generateCategorizedContent(item) };
      case 'polished':
        return { suffix: '_polished', body: this.generatePolishedContent(item) };
      default:
        // Unknown level at runtime: fall back to the full rendering, no suffix.
        return { suffix: '', body: this.generateFullContent(item) };
    }
  }

  /** Renders a page with every metadata section, the content, and optional raw HTML / screenshot. */
  private generateFullContent(item: EnhancedScrapedContent): string {
    const { metadata } = item;
    const listOrNone = (
      entries: string[],
      render: (entry: string) => string = entry => `- ${entry}`
    ): string => (entries.length > 0 ? entries.map(render).join('\n') : '- None found');

    const lines: string[] = [
      `# ${item.title}`,
      '',
      `**URL:** ${item.url}`,
      `**Category:** ${item.category}`,
      `**Context:** ${item.context}`,
      `**Scraped:** ${item.timestamp}`,
      `**Quality Score:** ${item.quality.completeness}/100`,
      '',
      '## Metadata',
      '',
      '### Content Statistics',
      `- **Word Count:** ${metadata.wordCount}`,
      `- **Reading Time:** ${metadata.readingTime} minutes`,
      `- **Complexity Level:** ${metadata.complexity}`,
      `- **Content Length:** ${item.content.length} characters`,
      '',
      '### Source Files',
      listOrNone(metadata.sourceFiles, file => `- \`${file}\``),
      '',
      '### Code References',
      listOrNone(metadata.codeReferences),
      '',
      '### Diagrams & Images',
      listOrNone(metadata.diagrams),
      '',
      '### Cross-References',
      listOrNone(metadata.crossReferences),
      '',
      '### Navigation',
      listOrNone(metadata.navigation),
      '',
      '### Breadcrumbs',
      metadata.breadcrumbs.length > 0 ? metadata.breadcrumbs.join(' > ') : '- None found',
      '',
      '### Tags',
      listOrNone(metadata.tags),
      ''
    ];

    if (metadata.lastModified) {
      lines.push('### Last Modified', metadata.lastModified, '');
    }
    if (metadata.contributors && metadata.contributors.length > 0) {
      lines.push('### Contributors', listOrNone(metadata.contributors), '');
    }

    lines.push('## Content', '', item.content, '');

    if (item.rawHtml) {
      // Only mark the preview as elided when something was actually cut off.
      const ellipsis = item.rawHtml.length > RAW_HTML_PREVIEW_CHARS ? '...' : '';
      lines.push(
        '## Raw HTML Context',
        '```html',
        `${item.rawHtml.substring(0, RAW_HTML_PREVIEW_CHARS)}${ellipsis}`,
        '```',
        ''
      );
    }

    const screenshot = item.screenshots?.[0];
    if (screenshot) {
      lines.push('## Screenshot', `![Page Screenshot](data:image/png;base64,${screenshot})`, '');
    }

    return lines.join('\n');
  }

  /** Renders a page as quick stats, a few key references, and a truncated content preview. */
  private generateCategorizedContent(item: EnhancedScrapedContent): string {
    const { metadata, quality } = item;
    const yesNo = (flag: boolean): string => (flag ? 'Yes' : 'No');
    const bullets = (entries: string[], limit: number): string =>
      entries.slice(0, limit).map(entry => `- ${entry}`).join('\n');

    const lines: string[] = [
      `# ${item.title}`,
      '',
      `**URL:** ${item.url}`,
      `**Category:** ${item.category}`,
      `**Context:** ${item.context}`,
      `**Scraped:** ${item.timestamp}`,
      `**Quality Score:** ${quality.completeness}/100`,
      '',
      '## Quick Stats',
      `- **Word Count:** ${metadata.wordCount}`,
      `- **Reading Time:** ${metadata.readingTime} minutes`,
      `- **Complexity:** ${metadata.complexity}`,
      `- **Has Code:** ${yesNo(quality.hasCode)}`,
      `- **Has Images:** ${yesNo(quality.hasImages)}`,
      `- **Has Links:** ${yesNo(quality.hasLinks)}`,
      '',
      '## Key References',
      ''
    ];

    if (metadata.codeReferences.length > 0) {
      lines.push('### Code Files', bullets(metadata.codeReferences, 5), '');
    }
    if (metadata.diagrams.length > 0) {
      lines.push('### Images & Diagrams', bullets(metadata.diagrams, 3), '');
    }
    if (metadata.crossReferences.length > 0) {
      lines.push('### Related Links', bullets(metadata.crossReferences, 5), '');
    }

    const isTruncated = item.content.length > SUMMARY_PREVIEW_CHARS;
    lines.push(
      '## Content Summary',
      '',
      isTruncated ? item.content.substring(0, SUMMARY_PREVIEW_CHARS) + '...' : item.content,
      ''
    );
    if (isTruncated) {
      lines.push('*[Content truncated for categorized view. Full content available in full data mode.]*', '');
    }

    return lines.join('\n');
  }

  /** Renders a page as whitespace-normalized content with a short overview and a few related links. */
  private generatePolishedContent(item: EnhancedScrapedContent): string {
    // Clean and polish the content. Horizontal whitespace is collapsed first so
    // that newlines survive and paragraph breaks can then be normalized.
    const polishedContent = item.content
      .replace(/[^\S\n]+/g, ' ')  // Normalize spaces/tabs, keep newlines
      .replace(/ ?\n ?/g, '\n')   // Drop spaces hugging a newline
      .replace(/\n{2,}/g, '\n\n') // Clean up multiple newlines
      .trim();
    const bullets = (entries: string[]): string =>
      entries.slice(0, 3).map(entry => `- ${entry}`).join('\n');

    const lines: string[] = [
      `# ${item.title}`,
      '',
      `**URL:** ${item.url}`,
      `**Category:** ${item.category}`,
      `**Context:** ${item.context}`,
      `**Scraped:** ${item.timestamp}`,
      '',
      '## Overview',
      `- **Complexity:** ${item.metadata.complexity}`,
      `- **Reading Time:** ${item.metadata.readingTime} minutes`,
      `- **Quality Score:** ${item.quality.completeness}/100`,
      '',
      '## Content',
      '',
      polishedContent,
      ''
    ];

    if (item.metadata.codeReferences.length > 0) {
      lines.push('## Related Code', bullets(item.metadata.codeReferences), '');
    }
    if (item.metadata.crossReferences.length > 0) {
      lines.push('## Related Documentation', bullets(item.metadata.crossReferences), '');
    }

    return lines.join('\n');
  }

  /**
   * Renders the `_summary.md` for one category. The scrape counters come from
   * `this.stats` and are therefore running totals for the whole run so far,
   * not per-category figures.
   */
  private generateCategorySummary(
    content: EnhancedScrapedContent[],
    categoryName: string,
    contextName: string
  ): string {
    const totalContentSize = content.reduce((sum, item) => sum + item.content.length, 0);
    const avgQuality = content.reduce((sum, item) => sum + item.quality.completeness, 0) / content.length;

    return [
      `# ${categoryName} Summary`,
      '',
      `**Context:** ${contextName}`,
      `**Total Pages:** ${content.length}`,
      `**Scraped:** ${new Date().toISOString()}`,
      `**Processing Level:** ${this.options.level}`,
      '',
      '## Statistics',
      `- **Total Content Size:** ${(totalContentSize / 1024).toFixed(2)} KB`,
      `- **Average Quality Score:** ${avgQuality.toFixed(1)}/100`,
      `- **Successful Scrapes:** ${this.stats.successfulScrapes}`,
      `- **Failed Scrapes:** ${this.stats.failedScrapes}`,
      '',
      '## Quality Distribution',
      this.generateQualityDistribution(content),
      '',
      '## Pages by Complexity',
      this.generateComplexityDistribution(content),
      '',
      '## Pages',
      content.map(item => `- [${item.title}](${item.url}) - ${item.quality.completeness}/100`).join('\n'),
      '',
      '## Processing Options Used',
      `- **Level:** ${this.options.level}`,
      `- **Include Metadata:** ${this.options.includeMetadata}`,
      `- **Include Raw HTML:** ${this.options.includeRawHtml}`,
      `- **Include Screenshots:** ${this.options.includeScreenshots}`,
      `- **Organize by Date:** ${this.options.organizeByDate}`,
      ''
    ].join('\n');
  }

  /** Counts pages per completeness band (80+, 60-79, 40-59, below 40) as a markdown list. */
  private generateQualityDistribution(content: EnhancedScrapedContent[]): string {
    const countBetween = (min: number, maxExclusive: number): number =>
      content.filter(item => item.quality.completeness >= min && item.quality.completeness < maxExclusive).length;

    return [
      `- **Excellent (80-100):** ${countBetween(80, Infinity)} pages`,
      `- **Good (60-79):** ${countBetween(60, 80)} pages`,
      `- **Fair (40-59):** ${countBetween(40, 60)} pages`,
      `- **Poor (0-39):** ${countBetween(-Infinity, 40)} pages`
    ].join('\n');
  }

  /** Counts pages per complexity level as a markdown list. */
  private generateComplexityDistribution(content: EnhancedScrapedContent[]): string {
    const countOf = (level: EnhancedScrapedContent['metadata']['complexity']): number =>
      content.filter(item => item.metadata.complexity === level).length;

    return [
      `- **Basic:** ${countOf('basic')} pages`,
      `- **Intermediate:** ${countOf('intermediate')} pages`,
      `- **Advanced:** ${countOf('advanced')} pages`
    ].join('\n');
  }

  /**
   * When enabled in the options, removes direct sub-directories of `outputDir`
   * whose mtime is more than seven days old. Failures are logged as warnings
   * and never propagate.
   */
  private async cleanupOldData(outputDir: string): Promise<void> {
    if (!this.options.cleanupOldData) return;

    logger.info("🧹 Cleaning up old data...");
    try {
      const items = await readdir(outputDir, { withFileTypes: true });
      for (const item of items) {
        if (!item.isDirectory()) continue;

        // Check if directory is older than 7 days
        const fullPath = path.join(outputDir, item.name);
        const daysOld = (Date.now() - statSync(fullPath).mtime.getTime()) / MS_PER_DAY;
        if (daysOld > MAX_DATA_AGE_DAYS) {
          await this.removeDirectoryRecursively(fullPath);
          logger.info(`🗑️ Removed old directory: ${item.name}`);
        }
      }
      logger.success("✅ Cleanup completed");
    } catch (error) {
      logger.warn(`⚠️ Cleanup warning: ${errorMessage(error)}`);
    }
  }

  /** Deletes a directory tree depth-first: files, then sub-directories, then the directory itself. */
  private async removeDirectoryRecursively(dirPath: string): Promise<void> {
    const items = await readdir(dirPath, { withFileTypes: true });
    for (const item of items) {
      const fullPath = path.join(dirPath, item.name);
      if (item.isDirectory()) {
        await this.removeDirectoryRecursively(fullPath);
      } else {
        await unlink(fullPath);
      }
    }
    await rmdir(dirPath);
  }

  /**
   * Runs the whole pipeline: reads the first `*_links.txt` file in the current
   * directory, categorizes and scrapes the URLs, writes the output tree and the
   * overall README, then optionally cleans up old data.
   *
   * @throws Error when no links file exists; any other failure is logged and rethrown.
   */
  async processData(): Promise<void> {
    const startTime = Date.now();

    try {
      logger.info("🚀 Starting data organization process...");

      // Find the links file
      const entries = await readdir(".", { withFileTypes: true });
      const linksFile = entries.find(entry => entry.isFile() && entry.name.endsWith('_links.txt'))?.name;
      if (!linksFile) {
        throw new Error("No links file found. Please run the crawler first.");
      }
      const domainName = linksFile.replace('_links.txt', '').replace(/_/g, '.');

      logger.info(`📁 Reading links from: ${linksFile}`);
      logger.info(`🌐 Domain: ${domainName}`);

      // Read URLs; tolerate CRLF line endings and stray whitespace around a URL.
      const fileText = await readFile(linksFile, "utf8");
      const urls = fileText.split(/\r?\n/).map(line => line.trim()).filter(Boolean);
      this.stats.totalUrls = urls.length;

      logger.info(`Processing ${urls.length} URLs...`);

      const contexts = this.categorizeUrls(urls);

      // Create output directory
      const outputDir = `organized-data-${this.options.level}`;
      if (!existsSync(outputDir)) {
        await mkdir(outputDir, { recursive: true });
      }
      logger.info(`📁 Output directory: ${outputDir}`);

      // Process each context and category
      for (const context of contexts) {
        logger.info(`📚 Processing ${context.name}...`);
        for (const category of context.categories) {
          if (category.urls.length === 0) continue;

          logger.info(` 📂 Category: ${category.name} (${category.urls.length} URLs)`);
          const scrapedContent = await this.scrapeCategoryEnhanced(category, context.name, category.name);
          await this.saveOrganizedContent(scrapedContent, outputDir);
        }
      }

      // Record the elapsed time before writing the README so it does not report 0ms.
      this.stats.processingTime = Date.now() - startTime;

      // Generate overall summary
      await this.generateOverallSummary(contexts, outputDir);

      // Cleanup old data if requested
      await this.cleanupOldData(outputDir);

      this.stats.processingTime = Date.now() - startTime;

      logger.success("✅ Data organization completed!");
      logger.info(`📁 All content saved to: ${outputDir}/`);
      this.printFinalStats();
    } catch (error) {
      logger.error(`❌ Error during data organization: ${errorMessage(error)}`);
      throw error;
    }
  }

  /**
   * Assigns each URL to the first category (in declaration order) with a
   * pattern that occurs anywhere in the lower-cased URL. URLs matching nothing
   * go into the very first category.
   */
  private categorizeUrls(urls: string[]): Context[] {
    const category = (name: string, description: string, patterns: string[]): Category => ({
      name,
      description,
      patterns,
      urls: []
    });

    const contexts: Context[] = [
      {
        name: "Developer Context",
        description: "Technical documentation for developers building with ElizaOS",
        categories: [
          category("Architecture & Core Concepts", "Fundamental system architecture and core concepts",
            ["architecture", "core-system", "overview"]),
          category("Plugin Development", "Creating and managing plugins for ElizaOS",
            ["plugin", "creating-plugins", "plugin-architecture"]),
          category("API Reference", "Complete API documentation for developers",
            ["api-reference", "core-api", "client-api", "cli-api"]),
          category("Development Workflow", "Development tools, testing, and CI/CD",
            ["development", "building", "testing", "cicd", "contributing"]),
          category("Server & Infrastructure", "Server architecture, deployment, and configuration",
            ["server", "deployment", "configuration"]),
          category("Data & Storage", "Database integration and data management",
            ["data", "database", "memory-management", "data-models"]),
          category("Advanced Development", "Advanced features and integrations",
            ["advanced-features", "tee-integration", "scenario-testing"])
        ]
      },
      {
        name: "User Context",
        description: "User-facing documentation and guides",
        categories: [
          category("Getting Started", "Quick start guides and tutorials",
            ["getting-started", "quickstart", "tutorial", "guide"]),
          category("User Interface", "UI components and user experience",
            ["ui", "interface", "components", "ux"]),
          category("Features & Capabilities", "End-user features and functionality",
            ["features", "capabilities", "functionality"])
        ]
      }
    ];

    const allCategories = contexts.flatMap(context => context.categories);
    const fallback = allCategories[0];

    // Categorize URLs
    for (const url of urls) {
      const urlLower = url.toLowerCase();
      const match = allCategories.find(candidate =>
        candidate.patterns.some(pattern => urlLower.includes(pattern))
      );
      // If not categorized, add to first available category
      (match ?? fallback)?.urls.push(url);
    }

    return contexts;
  }

  /** Writes `<outputDir>/README.md` describing options, structure and run statistics. */
  private async generateOverallSummary(contexts: Context[], outputDir: string): Promise<void> {
    const summaryPath = path.join(outputDir, "README.md");
    const countUrls = (context: Context): number =>
      context.categories.reduce((sum, cat) => sum + cat.urls.length, 0);
    const totalUrls = contexts.reduce((sum, context) => sum + countUrls(context), 0);

    const structure = contexts
      .map(context => {
        const populated = context.categories
          .filter(cat => cat.urls.length > 0)
          .map(cat => `- **${cat.name}**: ${cat.urls.length} pages`)
          .join('\n');
        return `### ${context.name} (${countUrls(context)} pages)\n${populated}`;
      })
      .join('\n\n');

    const summaryContent = [
      '# ElizaOS Organized Data',
      '',
      `This directory contains organized scraped content from the ElizaOS documentation, processed at the **${this.options.level}** level.`,
      '',
      '## Processing Options',
      `- **Level:** ${this.options.level}`,
      `- **Include Metadata:** ${this.options.includeMetadata}`,
      `- **Include Raw HTML:** ${this.options.includeRawHtml}`,
      `- **Include Screenshots:** ${this.options.includeScreenshots}`,
      `- **Cleanup Old Data:** ${this.options.cleanupOldData}`,
      `- **Organize by Date:** ${this.options.organizeByDate}`,
      `- **Compress Output:** ${this.options.compressOutput}`,
      '',
      '## Structure',
      '',
      structure,
      '',
      '## Statistics',
      `- **Total URLs:** ${totalUrls}`,
      `- **Processing Level:** ${this.options.level}`,
      `- **Generated:** ${new Date().toISOString()}`,
      `- **Processing Time:** ${this.stats.processingTime}ms`,
      `- **Successful Scrapes:** ${this.stats.successfulScrapes}`,
      `- **Failed Scrapes:** ${this.stats.failedScrapes}`,
      `- **Total Content Size:** ${(this.stats.totalContentSize / 1024).toFixed(2)} KB`,
      '',
      '## Data Levels Explained',
      '',
      '### Full Data',
      'Complete scraped content with all metadata, raw HTML, and screenshots. Best for comprehensive analysis and development purposes.',
      '',
      '### Categorized Data',
      'Organized content with key metadata and summaries. Ideal for quick reference and content discovery.',
      '',
      '### Polished Data',
      'Cleaned and refined content optimized for readability. Perfect for end-user consumption and documentation.',
      '',
      '## File Organization',
      '',
      'Each category contains:',
      '- Individual page files (processed according to selected level)',
      '- `_summary.md` - Category overview and statistics',
      '- `_metadata.json` - Detailed metadata and quality metrics',
      '',
      '## Quality Metrics',
      '',
      'Content is automatically scored based on:',
      '- Content completeness and length',
      '- Presence of code references and examples',
      '- Image and diagram content',
      '- Cross-reference links',
      '- Overall information density',
      ''
    ].join('\n');

    await writeFile(summaryPath, summaryContent, 'utf8');
    logger.info(`📄 Overall summary saved to: ${summaryPath}`);
  }

  /** Logs the run counters. */
  private printFinalStats(): void {
    logger.info("\n📊 Final Statistics:");
    logger.info(` Total URLs: ${this.stats.totalUrls}`);
    logger.info(` Successful: ${this.stats.successfulScrapes}`);
    logger.info(` Failed: ${this.stats.failedScrapes}`);
    logger.info(` Processing Time: ${this.stats.processingTime}ms`);
    logger.info(` Total Content: ${(this.stats.totalContentSize / 1024).toFixed(2)} KB`);
    logger.info(` Output Level: ${this.options.level}`);
  }
}

// User preference selection and main execution

/**
 * CLI entry point: loads user preferences, applies `--full` / `--polished` /
 * `--categorized` overrides (persisting a changed level), runs the organizer,
 * and exits with code 1 on any failure.
 */
async function main(): Promise<void> {
  try {
    logger.info("🎯 ElizaOS Data Organizer");

    // Load user preferences
    const preferenceManager = new UserPreferenceManager();
    const preferences = await preferenceManager.loadPreferences();

    // Check command line arguments for overrides
    const args = process.argv.slice(2);
    let level = preferences.dataProcessingLevel;
    if (args.includes('--full')) level = 'full';
    else if (args.includes('--polished')) level = 'polished';
    else if (args.includes('--categorized')) level = 'categorized';

    // Update preferences if changed via command line
    if (level !== preferences.dataProcessingLevel) {
      await preferenceManager.updatePreferences({ dataProcessingLevel: level });
    }

    const options: DataProcessingOptions = {
      level,
      includeMetadata: preferences.includeMetadata,
      includeRawHtml: preferences.includeRawHtml,
      includeScreenshots: preferences.includeScreenshots,
      cleanupOldData: preferences.cleanupOldData,
      organizeByDate: preferences.organizeByDate,
      compressOutput: preferences.compressOutput
    };

    logger.info(`\n🔧 Processing Options (from user preferences):`);
    logger.info(` Level: ${options.level}`);
    logger.info(` Include Metadata: ${options.includeMetadata}`);
    logger.info(` Include Raw HTML: ${options.includeRawHtml}`);
    logger.info(` Include Screenshots: ${options.includeScreenshots}`);
    logger.info(` Cleanup Old Data: ${options.cleanupOldData}`);
    logger.info(` Organize by Date: ${options.organizeByDate}`);
    logger.info(` Output Directory: ${preferences.outputDirectory}`);

    const organizer = new DataOrganizer(options);
    try {
      await organizer.initialize();
      await organizer.processData();
    } finally {
      // Close the browser even when processing fails, so no Chromium process is left behind.
      await organizer.cleanup();
    }

    logger.success("🎉 Data organization completed successfully!");
    logger.info("💡 Use 'bun run user-preferences.ts show' to view your current preferences");
    logger.info("💡 Use 'bun run user-preferences.ts setup' to configure your preferences");
  } catch (error) {
    logger.error(`❌ Fatal error: ${errorMessage(error)}`);
    process.exit(1);
  }
}

if (import.meta.main) {
  void main();
}