@promptordie/siphon-knowledge
Version:
AI-powered documentation generation system for AI Coding Agents.
1,024 lines (847 loc) • 34.9 kB
text/typescript
import { chromium, Browser, Page } from "playwright";
import { readFile, writeFile, mkdir, readdir, unlink, rmdir } from "node:fs/promises";
import { existsSync, statSync } from "node:fs";
import path from "node:path";
import { logger } from "../logger.ts";
import { UserPreferenceManager, UserPreferences } from "./user-preferences.ts";
// Enhanced interfaces for better data organization
/**
 * One scraped page together with the metadata and quality metrics derived
 * from it.
 */
interface EnhancedScrapedContent {
  /** Address the page was loaded from. */
  url: string;
  /** Page title. */
  title: string;
  /** Extracted text of the page. */
  content: string;
  /** When the scrape happened. */
  timestamp: string;
  /** Name of the category the URL was sorted into. */
  category: string;
  /** Name of the context that owns the category. */
  context: string;
  /** Lists and figures collected from the page while extracting it. */
  metadata: {
    sourceFiles: Array<string>;
    codeReferences: Array<string>;
    diagrams: Array<string>;
    crossReferences: Array<string>;
    navigation: Array<string>;
    breadcrumbs: Array<string>;
    tags: Array<string>;
    lastModified?: string;
    contributors?: Array<string>;
    /** Number of whitespace-separated words in the content. */
    wordCount: number;
    /** Estimated reading time in minutes. */
    readingTime: number;
    complexity: 'basic' | 'intermediate' | 'advanced';
  };
  /** Markup of the extracted region; only kept when requested. */
  rawHtml?: string;
  /** Captured screenshots; only present when requested. */
  screenshots?: Array<string>;
  /** Heuristic quality indicators for the page. */
  quality: {
    contentLength: number;
    hasCode: boolean;
    hasImages: boolean;
    hasLinks: boolean;
    /** Score in the range 0-100. */
    completeness: number;
  };
}
/** A named bucket of URLs, selected by substring patterns. */
interface Category {
  /** Display name; also used to build the output directory name. */
  name: string;
  /** Human-readable summary of what belongs in the category. */
  description: string;
  /** Substrings that are looked for in a lower-cased URL. */
  patterns: Array<string>;
  /** URLs assigned to this category. */
  urls: Array<string>;
}
/** A top-level grouping of categories. */
interface Context {
  /** Display name; also used to build the output directory name. */
  name: string;
  /** Human-readable summary of the grouping. */
  description: string;
  /** Categories that belong to this context, in matching order. */
  categories: Array<Category>;
}
/** Switches that control how scraped data is captured and written out. */
interface DataProcessingOptions {
  /** Output flavour; selects the page renderer and names the output directory. */
  level: 'full' | 'categorized' | 'polished';
  /** NOTE(review): only echoed into logs/summaries by the code visible here. */
  includeMetadata: boolean;
  /** Keep the extracted region's markup on each scraped item. */
  includeRawHtml: boolean;
  /** Capture a full-page screenshot for every page. */
  includeScreenshots: boolean;
  /** Remove stale directories from the output directory after a run. */
  cleanupOldData: boolean;
  /** Nest the output under a directory named after the current date. */
  organizeByDate: boolean;
  /** NOTE(review): only echoed into the README by the code visible here. */
  compressOutput: boolean;
}
/** Counters accumulated over one organizer run. */
interface ProcessingStats {
  /** Number of URLs read from the links file. */
  totalUrls: number;
  /** Pages that were scraped without an error. */
  successfulScrapes: number;
  /** Pages whose scrape threw. */
  failedScrapes: number;
  /** Sum of the extracted content lengths, in characters. */
  totalContentSize: number;
  /** Elapsed wall-clock time in milliseconds. */
  processingTime: number;
  /** NOTE(review): never written by the code visible here. */
  outputSize: number;
}
/**
 * Scrapes categorized documentation URLs with a headless Chromium browser and
 * writes the results to disk as markdown at the configured processing level.
 *
 * Expected call order: `initialize()`, then `processData()`, then `cleanup()`.
 */
class DataOrganizer {
  /** Upper bound on the number of pages scraped at the same time. */
  private static readonly MAX_CONCURRENT_PAGES = 5;
  /** Output directories last modified more than this many days ago are removed by cleanup. */
  private static readonly MAX_DATA_AGE_DAYS = 7;

  private browser: Browser | null = null;
  private stats: ProcessingStats = {
    totalUrls: 0,
    successfulScrapes: 0,
    failedScrapes: 0,
    totalContentSize: 0,
    processingTime: 0,
    outputSize: 0
  };
  /** Top-level directory names (inside the output directory) written during this run. */
  private readonly freshTopLevelDirs = new Set<string>();

  constructor(private options: DataProcessingOptions) {}

  /** Launches the headless browser. Calling it again while a browser is open is a no-op. */
  async initialize(): Promise<void> {
    // A second launch would overwrite the handle and orphan the first browser process.
    if (this.browser) return;
    logger.info("๐ Initializing Data Organizer...");
    this.browser = await chromium.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    });
    logger.success("✅ Browser initialized");
  }

  /** Closes the browser if one is open and forgets the handle so it cannot be reused. */
  async cleanup(): Promise<void> {
    if (!this.browser) return;
    const browser = this.browser;
    this.browser = null;
    await browser.close();
    logger.info("๐ Browser closed");
  }

  /** Returns a printable message for any thrown value. */
  private describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  /**
   * Extracts the title, main text, metadata lists and markup from a loaded page.
   *
   * @param page Page that has already been navigated.
   * @returns The page title plus the data gathered inside the page.
   */
  private async extractEnhancedPageContent(page: Page): Promise<{
    title: string;
    content: string;
    metadata: EnhancedScrapedContent['metadata'];
    rawHtml?: string;
  }> {
    await page.waitForLoadState("domcontentloaded");
    const title = await page.title();

    // The callback is executed inside the page, so it must not reference any
    // variable from this (Node-side) scope; everything it needs is DOM-global.
    const pageData = await page.evaluate(() => {
      const metadata: EnhancedScrapedContent['metadata'] = {
        sourceFiles: [],
        codeReferences: [],
        diagrams: [],
        crossReferences: [],
        navigation: [],
        breadcrumbs: [],
        tags: [],
        lastModified: undefined,
        contributors: [],
        wordCount: 0,
        readingTime: 0,
        complexity: 'basic'
      };

      // Pushes the trimmed, non-empty text of every match onto `target`.
      const collectText = (selector: string, target: string[]): void => {
        document.querySelectorAll(selector).forEach(node => {
          const text = node.textContent?.trim();
          if (text) target.push(text);
        });
      };

      // Source file references: code-like elements whose text looks like a path.
      document.querySelectorAll('code, pre, .source, .file, [data-source]').forEach(node => {
        const text = node.textContent?.trim();
        if (text && (text.includes('.ts') || text.includes('.js') || text.includes('.json') || text.includes('packages/'))) {
          metadata.sourceFiles.push(text);
        }
      });

      // Code references. Only anchors carry an href; other matches contribute their text alone.
      document.querySelectorAll('a[href*="packages/"], a[href*="src/"], .code-ref, [data-line]').forEach(node => {
        const href = node instanceof HTMLAnchorElement ? node.href : '';
        const text = node.textContent?.trim() ?? '';
        if (href || text) {
          metadata.codeReferences.push(href ? `${text} (${href})` : text);
        }
      });

      // Diagrams and images; matches exposing neither `src` nor `alt` are skipped.
      document.querySelectorAll('img, svg, .diagram, .chart').forEach(node => {
        const image = node as HTMLImageElement;
        const src = image.src || '';
        const alt = image.alt || '';
        if (src || alt) {
          metadata.diagrams.push(`${alt} (${src})`);
        }
      });

      // Cross-references within the documented project.
      document.querySelectorAll('a[href*="/elizaOS/eliza/"]').forEach(node => {
        const href = (node as HTMLAnchorElement).href;
        const text = node.textContent?.trim();
        if (href && text) {
          metadata.crossReferences.push(`${text} (${href})`);
        }
      });

      collectText('nav, .navigation, .menu, .sidebar', metadata.navigation);
      collectText('.breadcrumb, .breadcrumbs, [aria-label*="breadcrumb"]', metadata.breadcrumbs);
      collectText('.tag, .tags, [data-tag]', metadata.tags);

      // Main content: the first candidate container holding more than 100 characters.
      const contentSelectors = [
        'main', '[role="main"]', '.content', '.main-content',
        'article', '.article', '#content', '.markdown-body', '.prose'
      ];
      let content = '';
      let mainElement: Element | null = null;
      for (const selector of contentSelectors) {
        const candidate = document.querySelector(selector);
        if (candidate) {
          content = candidate.textContent || '';
          if (content.trim().length > 100) {
            mainElement = candidate;
            break;
          }
        }
      }

      // Fallback: body text without chrome. Work on a clone so the live page
      // (and a screenshot taken afterwards) keeps its header, nav and footer.
      let strippedBody: HTMLElement | null = null;
      if (!content.trim()) {
        strippedBody = document.body.cloneNode(true) as HTMLElement;
        strippedBody
          .querySelectorAll('script, style, nav, header, footer, .nav, .header, .footer')
          .forEach(node => node.remove());
        content = strippedBody.textContent || '';
      }

      const trimmedContent = content.trim();
      // ''.split(/\s+/) yields [''], so empty content must be special-cased to count 0 words.
      const words = trimmedContent ? trimmedContent.split(/\s+/).length : 0;
      metadata.wordCount = words;
      metadata.readingTime = Math.ceil(words / 200); // 200 words per minute

      // Complexity: code presence and a few technical keywords.
      const hasCode = metadata.sourceFiles.length > 0 || metadata.codeReferences.length > 0;
      const lowered = content.toLowerCase();
      const hasTechnicalTerms = lowered.includes('api') || lowered.includes('architecture') || lowered.includes('integration');
      if (hasCode && hasTechnicalTerms) {
        metadata.complexity = 'advanced';
      } else if (hasCode || hasTechnicalTerms) {
        metadata.complexity = 'intermediate';
      } else {
        metadata.complexity = 'basic';
      }

      const rawHtml = mainElement ? mainElement.innerHTML : (strippedBody ?? document.body).innerHTML;
      return {
        content: trimmedContent,
        metadata,
        rawHtml
      };
    });

    return { title, ...pageData };
  }

  /** Derives the heuristic quality metrics for a page; `completeness` is a whole number in 0-100. */
  private computeQuality(
    content: string,
    metadata: EnhancedScrapedContent['metadata']
  ): EnhancedScrapedContent['quality'] {
    const codeCount = metadata.sourceFiles.length + metadata.codeReferences.length;
    const rawScore =
      (content.length / 1000) * 30 + // Content length factor
      codeCount * 10 + // Code factor
      metadata.diagrams.length * 5 + // Images factor
      metadata.crossReferences.length * 3; // Links factor
    return {
      contentLength: content.length,
      hasCode: codeCount > 0,
      hasImages: metadata.diagrams.length > 0,
      hasLinks: metadata.crossReferences.length > 0,
      // Rounded so the score prints cleanly as "N/100".
      completeness: Math.round(Math.min(100, Math.max(0, rawScore)))
    };
  }

  /**
   * Loads one URL in a fresh browser context and extracts its data.
   * The context is closed whether or not loading succeeds.
   */
  private async capturePage(browser: Browser, url: string, progressLabel: string): Promise<{
    title: string;
    content: string;
    metadata: EnhancedScrapedContent['metadata'];
    rawHtml?: string;
    screenshot: string;
  }> {
    const context = await browser.newContext({
      userAgent: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124 Safari/537.36",
      viewport: { width: 1920, height: 1080 }
    });
    try {
      const page = await context.newPage();
      // Abort requests for heavy resources; raster images are only let through at the 'polished' level.
      if (this.options.level === 'polished') {
        await page.route("**/*.{mp4,mp3,woff,woff2,ttf,otf,zip,svg}", route => route.abort());
      } else {
        await page.route("**/*.{png,jpg,jpeg,gif,webp,mp4,mp3,woff,woff2,ttf,otf,zip,svg}", route => route.abort());
      }
      logger.info(` ${progressLabel} Scraping: ${url}`);
      await page.goto(url, {
        waitUntil: "domcontentloaded",
        timeout: 30000
      });
      try {
        await page.waitForLoadState("networkidle", { timeout: 5000 });
      } catch {
        // Best effort: continue if the network doesn't become idle.
      }
      await page.waitForTimeout(2000);

      const extracted = await this.extractEnhancedPageContent(page);

      let screenshot = '';
      if (this.options.includeScreenshots) {
        try {
          const screenshotBuffer = await page.screenshot({
            type: 'png',
            fullPage: true
          });
          screenshot = screenshotBuffer.toString('base64');
        } catch {
          logger.warn(` โ ๏ธ Could not capture screenshot for ${url}`);
        }
      }
      return { ...extracted, screenshot };
    } finally {
      await context.close();
    }
  }

  /**
   * Scrapes a single URL. Never throws: a failure is logged, counted and
   * returned as a placeholder entry.
   */
  private async scrapeUrl(
    browser: Browser,
    url: string,
    progressLabel: string,
    labels: { contextName: string; categoryName: string; timestamp: string }
  ): Promise<EnhancedScrapedContent> {
    const { contextName, categoryName, timestamp } = labels;
    try {
      const { title, content, metadata, rawHtml, screenshot } = await this.capturePage(browser, url, progressLabel);
      const result: EnhancedScrapedContent = {
        url,
        title,
        content,
        timestamp,
        category: categoryName,
        context: contextName,
        metadata,
        quality: this.computeQuality(content, metadata),
        rawHtml: this.options.includeRawHtml ? rawHtml : undefined,
        screenshots: screenshot ? [screenshot] : undefined
      };
      this.stats.successfulScrapes++;
      this.stats.totalContentSize += content.length;
      return result;
    } catch (error) {
      const message = this.describeError(error);
      logger.error(` โ Error scraping ${url}: ${message}`);
      this.stats.failedScrapes++;
      return {
        url,
        title: 'Error loading page',
        content: `Error: ${message}`,
        timestamp,
        category: categoryName,
        context: contextName,
        metadata: {
          sourceFiles: [],
          codeReferences: [],
          diagrams: [],
          crossReferences: [],
          navigation: [],
          breadcrumbs: [],
          tags: [],
          wordCount: 0,
          readingTime: 0,
          complexity: 'basic'
        },
        quality: {
          contentLength: 0,
          hasCode: false,
          hasImages: false,
          hasLinks: false,
          completeness: 0
        }
      };
    }
  }

  /**
   * Scrapes every URL of a category, a limited number at a time.
   *
   * @returns One entry per URL, in the order of `category.urls`.
   * @throws Error if the browser has not been initialized.
   */
  private async scrapeCategoryEnhanced(
    category: Category,
    contextName: string,
    categoryName: string
  ): Promise<EnhancedScrapedContent[]> {
    if (!this.browser) throw new Error("Browser not initialized");
    const browser = this.browser;
    const total = category.urls.length;
    logger.info(`๐ Enhanced scraping ${total} URLs for ${categoryName}...`);

    const labels = { contextName, categoryName, timestamp: new Date().toISOString() };
    const results: EnhancedScrapedContent[] = [];
    // Batches keep the number of simultaneously open browser contexts bounded.
    for (let start = 0; start < total; start += DataOrganizer.MAX_CONCURRENT_PAGES) {
      const batch = category.urls.slice(start, start + DataOrganizer.MAX_CONCURRENT_PAGES);
      const batchResults = await Promise.all(
        batch.map((url, offset) => this.scrapeUrl(browser, url, `[${start + offset + 1}/${total}]`, labels))
      );
      results.push(...batchResults);
    }

    logger.success(`✅ Enhanced scraping completed for ${categoryName}: ${results.length} pages`);
    return results;
  }

  /**
   * Returns `base`, or `base_2`, `base_3`, ... if the name was handed out
   * before, and records the returned name in `used`. Comparison ignores case.
   */
  private reserveFilename(base: string, used: Set<string>): string {
    let candidate = base;
    let counter = 2;
    while (used.has(candidate.toLowerCase())) {
      candidate = `${base}_${counter}`;
      counter++;
    }
    used.add(candidate.toLowerCase());
    return candidate;
  }

  /** Renders a page for the configured level and returns the filename suffix that goes with it. */
  private renderForLevel(item: EnhancedScrapedContent): { suffix: string; body: string } {
    switch (this.options.level) {
      case 'full':
        return { suffix: '_full', body: this.generateFullContent(item) };
      case 'categorized':
        return { suffix: '_categorized', body: this.generateCategorizedContent(item) };
      case 'polished':
        return { suffix: '_polished', body: this.generatePolishedContent(item) };
      default:
        // Unrecognized level at runtime: fall back to the full rendering without a suffix.
        return { suffix: '', body: this.generateFullContent(item) };
    }
  }

  /**
   * Writes one markdown file per page plus `_summary.md` and `_metadata.json`
   * into `<outputDir>/[<date>/]<context>/<category>/`.
   *
   * @throws Error if `content` is empty or undefined.
   */
  private async saveOrganizedContent(
    content: EnhancedScrapedContent[],
    outputDir: string
  ): Promise<void> {
    if (!content || content.length === 0) {
      throw new Error('Content array is empty or undefined');
    }
    const firstItem = content[0];
    const contextName = firstItem?.context || 'unknown-context';
    const categoryName = firstItem?.category || 'unknown-category';

    // Create organized directory structure
    const contextSlug = contextName.replace(/\s+/g, '-').toLowerCase();
    const categorySlug = categoryName.replace(/\s+/g, '-').toLowerCase();
    const dateSegment = this.options.organizeByDate ? new Date().toISOString().slice(0, 10) : undefined;
    const baseDir = dateSegment ? path.join(outputDir, dateSegment) : outputDir;
    const categoryDir = path.join(baseDir, contextSlug, categorySlug);
    // A recursive mkdir creates missing parents and succeeds if the directory already exists.
    await mkdir(categoryDir, { recursive: true });
    // Remember what this run wrote so that cleanup never removes it.
    this.freshTopLevelDirs.add(dateSegment ?? contextSlug);

    // Save each page based on processing level
    const usedNames = new Set<string>();
    for (const item of content) {
      const filename = item.url.split('/').pop() || 'index';
      const safeFilename = filename.replace(/[^a-zA-Z0-9-_]/g, '_');
      // Pages sharing a last URL segment would otherwise overwrite each other.
      const uniqueName = this.reserveFilename(safeFilename, usedNames);
      const { suffix, body } = this.renderForLevel(item);
      await writeFile(path.join(categoryDir, `${uniqueName}${suffix}.md`), body, 'utf8');
    }

    // Save enhanced summary file
    const summaryPath = path.join(categoryDir, '_summary.md');
    const summaryContent = this.generateCategorySummary(content, categoryName, contextName);
    await writeFile(summaryPath, summaryContent, 'utf8');

    // Save metadata index
    const metadataPath = path.join(categoryDir, '_metadata.json');
    const metadataContent = {
      category: categoryName,
      context: contextName,
      totalPages: content.length,
      timestamp: new Date().toISOString(),
      processingLevel: this.options.level,
      qualityMetrics: {
        averageCompleteness: content.reduce((sum, item) => sum + item.quality.completeness, 0) / content.length,
        totalCodeReferences: content.reduce((sum, item) => sum + item.metadata.codeReferences.length, 0),
        totalImages: content.reduce((sum, item) => sum + item.metadata.diagrams.length, 0),
        totalLinks: content.reduce((sum, item) => sum + item.metadata.crossReferences.length, 0)
      }
    };
    await writeFile(metadataPath, JSON.stringify(metadataContent, null, 2), 'utf8');
  }

  /** Renders a page with all metadata, a raw-HTML preview (first 2000 characters) and the first screenshot. */
  private generateFullContent(item: EnhancedScrapedContent): string {
    const rawHtml = item.rawHtml ?? '';
    // Only mark the preview as truncated when something was actually cut off.
    const rawHtmlPreview = rawHtml.length > 2000 ? `${rawHtml.substring(0, 2000)}...` : rawHtml;
    const firstScreenshot = item.screenshots?.[0];
    return `# ${item.title}
**URL:** ${item.url}
**Category:** ${item.category}
**Context:** ${item.context}
**Scraped:** ${item.timestamp}
**Quality Score:** ${item.quality.completeness}/100
## Metadata
### Content Statistics
- **Word Count:** ${item.metadata.wordCount}
- **Reading Time:** ${item.metadata.readingTime} minutes
- **Complexity Level:** ${item.metadata.complexity}
- **Content Length:** ${item.content.length} characters
### Source Files
${item.metadata.sourceFiles.length > 0 ? item.metadata.sourceFiles.map(file => `- \`${file}\``).join('\n') : '- None found'}
### Code References
${item.metadata.codeReferences.length > 0 ? item.metadata.codeReferences.map(ref => `- ${ref}`).join('\n') : '- None found'}
### Diagrams & Images
${item.metadata.diagrams.length > 0 ? item.metadata.diagrams.map(diagram => `- ${diagram}`).join('\n') : '- None found'}
### Cross-References
${item.metadata.crossReferences.length > 0 ? item.metadata.crossReferences.map(ref => `- ${ref}`).join('\n') : '- None found'}
### Navigation
${item.metadata.navigation.length > 0 ? item.metadata.navigation.map(nav => `- ${nav}`).join('\n') : '- None found'}
### Breadcrumbs
${item.metadata.breadcrumbs.length > 0 ? item.metadata.breadcrumbs.join(' > ') : '- None found'}
### Tags
${item.metadata.tags.length > 0 ? item.metadata.tags.map(tag => `- ${tag}`).join('\n') : '- None found'}
${item.metadata.lastModified ? `### Last Modified\n${item.metadata.lastModified}\n` : ''}
${item.metadata.contributors && item.metadata.contributors.length > 0 ? `### Contributors\n${item.metadata.contributors.map(contrib => `- ${contrib}`).join('\n')}\n` : ''}
## Content
${item.content}
${rawHtml ? `
## Raw HTML Context
\`\`\`html
${rawHtmlPreview}
\`\`\`
` : ''}
${firstScreenshot ? `
## Screenshot
![Screenshot](data:image/png;base64,${firstScreenshot})
` : ''}
`;
  }

  /** Renders a page with quick stats, the first few references and at most 500 characters of content. */
  private generateCategorizedContent(item: EnhancedScrapedContent): string {
    const isTruncated = item.content.length > 500;
    return `# ${item.title}
**URL:** ${item.url}
**Category:** ${item.category}
**Context:** ${item.context}
**Scraped:** ${item.timestamp}
**Quality Score:** ${item.quality.completeness}/100
## Quick Stats
- **Word Count:** ${item.metadata.wordCount}
- **Reading Time:** ${item.metadata.readingTime} minutes
- **Complexity:** ${item.metadata.complexity}
- **Has Code:** ${item.quality.hasCode ? 'Yes' : 'No'}
- **Has Images:** ${item.quality.hasImages ? 'Yes' : 'No'}
- **Has Links:** ${item.quality.hasLinks ? 'Yes' : 'No'}
## Key References
${item.metadata.codeReferences.length > 0 ? `### Code Files\n${item.metadata.codeReferences.slice(0, 5).map(ref => `- ${ref}`).join('\n')}\n` : ''}
${item.metadata.diagrams.length > 0 ? `### Images & Diagrams\n${item.metadata.diagrams.slice(0, 3).map(diagram => `- ${diagram}`).join('\n')}\n` : ''}
${item.metadata.crossReferences.length > 0 ? `### Related Links\n${item.metadata.crossReferences.slice(0, 5).map(ref => `- ${ref}`).join('\n')}\n` : ''}
## Content Summary
${isTruncated ? item.content.substring(0, 500) + '...' : item.content}
${isTruncated ? `\n*[Content truncated for categorized view. Full content available in full data mode.]*` : ''}
`;
  }

  /** Renders a page with whitespace-normalized content and the first three code and documentation references. */
  private generatePolishedContent(item: EnhancedScrapedContent): string {
    // Spaces and tabs are collapsed within a line; runs of blank lines become a
    // single paragraph break, so the paragraph structure survives.
    const polishedContent = item.content
      .replace(/\r\n?/g, '\n') // Normalize line endings
      .replace(/[^\S\n]+/g, ' ') // Collapse horizontal whitespace
      .replace(/ ?\n ?/g, '\n') // Drop spaces around line breaks
      .replace(/\n{2,}/g, '\n\n') // Clean up multiple newlines
      .trim();
    return `# ${item.title}
**URL:** ${item.url}
**Category:** ${item.category}
**Context:** ${item.context}
**Scraped:** ${item.timestamp}
## Overview
- **Complexity:** ${item.metadata.complexity}
- **Reading Time:** ${item.metadata.readingTime} minutes
- **Quality Score:** ${item.quality.completeness}/100
## Content
${polishedContent}
${item.metadata.codeReferences.length > 0 ? `
## Related Code
${item.metadata.codeReferences.slice(0, 3).map(ref => `- ${ref}`).join('\n')}
` : ''}
${item.metadata.crossReferences.length > 0 ? `
## Related Documentation
${item.metadata.crossReferences.slice(0, 3).map(ref => `- ${ref}`).join('\n')}
` : ''}
`;
  }

  /**
   * Renders the `_summary.md` body for one category.
   * The scrape counters come from the run-wide stats, so they cover every
   * category processed so far, not only this one.
   */
  private generateCategorySummary(
    content: EnhancedScrapedContent[],
    categoryName: string,
    contextName: string
  ): string {
    const totalContentSize = content.reduce((sum, item) => sum + item.content.length, 0);
    const avgQuality = content.reduce((sum, item) => sum + item.quality.completeness, 0) / content.length;
    return `# ${categoryName} Summary
**Context:** ${contextName}
**Total Pages:** ${content.length}
**Scraped:** ${new Date().toISOString()}
**Processing Level:** ${this.options.level}
## Statistics
- **Total Content Size:** ${(totalContentSize / 1024).toFixed(2)} KB
- **Average Quality Score:** ${avgQuality.toFixed(1)}/100
- **Successful Scrapes:** ${this.stats.successfulScrapes}
- **Failed Scrapes:** ${this.stats.failedScrapes}
## Quality Distribution
${this.generateQualityDistribution(content)}
## Pages by Complexity
${this.generateComplexityDistribution(content)}
## Pages
${content.map(item => `- [${item.title}](${item.url}) - ${item.quality.completeness}/100`).join('\n')}
## Processing Options Used
- **Level:** ${this.options.level}
- **Include Metadata:** ${this.options.includeMetadata}
- **Include Raw HTML:** ${this.options.includeRawHtml}
- **Include Screenshots:** ${this.options.includeScreenshots}
- **Organize by Date:** ${this.options.organizeByDate}
`;
  }

  /** Counts pages per completeness band and renders the counts as a markdown list. */
  private generateQualityDistribution(content: EnhancedScrapedContent[]): string {
    const distribution = { excellent: 0, good: 0, fair: 0, poor: 0 };
    for (const { quality } of content) {
      if (quality.completeness >= 80) distribution.excellent++;
      else if (quality.completeness >= 60) distribution.good++;
      else if (quality.completeness >= 40) distribution.fair++;
      else distribution.poor++;
    }
    return `- **Excellent (80-100):** ${distribution.excellent} pages
- **Good (60-79):** ${distribution.good} pages
- **Fair (40-59):** ${distribution.fair} pages
- **Poor (0-39):** ${distribution.poor} pages`;
  }

  /** Counts pages per complexity level and renders the counts as a markdown list. */
  private generateComplexityDistribution(content: EnhancedScrapedContent[]): string {
    const distribution = { basic: 0, intermediate: 0, advanced: 0 };
    for (const { metadata } of content) {
      distribution[metadata.complexity]++;
    }
    return `- **Basic:** ${distribution.basic} pages
- **Intermediate:** ${distribution.intermediate} pages
- **Advanced:** ${distribution.advanced} pages`;
  }

  /**
   * Removes stale directories directly inside `outputDir`. Directories written
   * during this run are always kept. Failures are logged as warnings, not thrown.
   */
  private async cleanupOldData(outputDir: string): Promise<void> {
    if (!this.options.cleanupOldData) return;
    logger.info("๐งน Cleaning up old data...");
    try {
      const millisecondsPerDay = 1000 * 60 * 60 * 24;
      const entries = await readdir(outputDir, { withFileTypes: true });
      for (const entry of entries) {
        if (!entry.isDirectory()) continue;
        // Rewriting files in nested directories does not refresh this directory's
        // mtime, so age alone could condemn data that was produced moments ago.
        if (this.freshTopLevelDirs.has(entry.name)) continue;
        const fullPath = path.join(outputDir, entry.name);
        const daysOld = (Date.now() - statSync(fullPath).mtime.getTime()) / millisecondsPerDay;
        if (daysOld > DataOrganizer.MAX_DATA_AGE_DAYS) {
          await this.removeDirectoryRecursively(fullPath);
          logger.info(`๐๏ธ Removed old directory: ${entry.name}`);
        }
      }
      logger.success("✅ Cleanup completed");
    } catch (error) {
      logger.warn(`โ ๏ธ Cleanup warning: ${this.describeError(error)}`);
    }
  }

  /** Deletes a directory tree depth-first: files are unlinked, then the emptied directory is removed. */
  private async removeDirectoryRecursively(dirPath: string): Promise<void> {
    const entries = await readdir(dirPath, { withFileTypes: true });
    for (const entry of entries) {
      const fullPath = path.join(dirPath, entry.name);
      if (entry.isDirectory()) {
        await this.removeDirectoryRecursively(fullPath);
      } else {
        await unlink(fullPath);
      }
    }
    await rmdir(dirPath);
  }

  /**
   * Runs the whole pipeline: reads the `*_links.txt` file from the current
   * directory, categorizes and scrapes its URLs, writes the output tree and
   * README, then optionally removes stale output.
   *
   * @throws Error if no links file exists; any other failure is logged and rethrown.
   */
  async processData(): Promise<void> {
    const startTime = Date.now();
    try {
      logger.info("๐ Starting data organization process...");

      // Find the links file. Sorting makes the choice deterministic when several exist.
      const entries = await readdir(".", { withFileTypes: true });
      const [linksFile] = entries
        .filter(entry => entry.isFile() && entry.name.endsWith('_links.txt'))
        .map(entry => entry.name)
        .sort();
      if (linksFile === undefined) {
        throw new Error("No links file found. Please run the crawler first.");
      }
      const domainName = linksFile.replace('_links.txt', '').replace(/_/g, '.');
      logger.info(`๐ Reading links from: ${linksFile}`);
      logger.info(`๐ Domain: ${domainName}`);

      // Read URLs; tolerate CRLF line endings and stray whitespace.
      const rawLinks = await readFile(linksFile, "utf8");
      const urls = rawLinks
        .split(/\r?\n/)
        .map(line => line.trim())
        .filter(Boolean);
      this.stats.totalUrls = urls.length;
      logger.info(`Processing ${urls.length} URLs...`);
      const contexts = this.categorizeUrls(urls);

      // Create output directory
      const outputDir = `organized-data-${this.options.level}`;
      await mkdir(outputDir, { recursive: true });
      logger.info(`๐ Output directory: ${outputDir}`);

      // Process each context and category
      for (const context of contexts) {
        logger.info(`๐ Processing ${context.name}...`);
        for (const category of context.categories) {
          if (category.urls.length === 0) continue;
          logger.info(` ๐ Category: ${category.name} (${category.urls.length} URLs)`);
          const scrapedContent = await this.scrapeCategoryEnhanced(
            category,
            context.name,
            category.name
          );
          await this.saveOrganizedContent(scrapedContent, outputDir);
        }
      }

      // The README prints the processing time, so record it before rendering.
      this.stats.processingTime = Date.now() - startTime;
      await this.generateOverallSummary(contexts, outputDir);

      // Cleanup old data if requested
      await this.cleanupOldData(outputDir);

      this.stats.processingTime = Date.now() - startTime;
      logger.success("✅ Data organization completed!");
      logger.info(`๐ All content saved to: ${outputDir}/`);
      this.printFinalStats();
    } catch (error) {
      logger.error(`โ Error during data organization: ${this.describeError(error)}`);
      throw error;
    }
  }

  /**
   * Sorts URLs into the built-in context/category tree. A URL goes to the first
   * category (in declaration order) with a pattern contained in the lower-cased
   * URL; URLs matching nothing go to the very first category.
   */
  private categorizeUrls(urls: string[]): Context[] {
    const contexts: Context[] = [
      {
        name: "Developer Context",
        description: "Technical documentation for developers building with ElizaOS",
        categories: [
          {
            name: "Architecture & Core Concepts",
            description: "Fundamental system architecture and core concepts",
            patterns: ["architecture", "core-system", "overview"],
            urls: []
          },
          {
            name: "Plugin Development",
            description: "Creating and managing plugins for ElizaOS",
            patterns: ["plugin", "creating-plugins", "plugin-architecture"],
            urls: []
          },
          {
            name: "API Reference",
            description: "Complete API documentation for developers",
            patterns: ["api-reference", "core-api", "client-api", "cli-api"],
            urls: []
          },
          {
            name: "Development Workflow",
            description: "Development tools, testing, and CI/CD",
            patterns: ["development", "building", "testing", "cicd", "contributing"],
            urls: []
          },
          {
            name: "Server & Infrastructure",
            description: "Server architecture, deployment, and configuration",
            patterns: ["server", "deployment", "configuration"],
            urls: []
          },
          {
            name: "Data & Storage",
            description: "Database integration and data management",
            patterns: ["data", "database", "memory-management", "data-models"],
            urls: []
          },
          {
            name: "Advanced Development",
            description: "Advanced features and integrations",
            patterns: ["advanced-features", "tee-integration", "scenario-testing"],
            urls: []
          }
        ]
      },
      {
        name: "User Context",
        description: "User-facing documentation and guides",
        categories: [
          {
            name: "Getting Started",
            description: "Quick start guides and tutorials",
            patterns: ["getting-started", "quickstart", "tutorial", "guide"],
            urls: []
          },
          {
            name: "User Interface",
            description: "UI components and user experience",
            patterns: ["ui", "interface", "components", "ux"],
            urls: []
          },
          {
            name: "Features & Capabilities",
            description: "End-user features and functionality",
            patterns: ["features", "capabilities", "functionality"],
            urls: []
          }
        ]
      }
    ];

    // Categorize URLs
    const categoriesInOrder = contexts.flatMap(context => context.categories);
    const fallbackCategory = categoriesInOrder[0];
    for (const url of urls) {
      const urlLower = url.toLowerCase();
      const match = categoriesInOrder.find(category =>
        category.patterns.some(pattern => urlLower.includes(pattern))
      );
      // If not categorized, add to first available category
      (match ?? fallbackCategory)?.urls.push(url);
    }
    return contexts;
  }

  /** Writes `README.md` into the output directory, describing options, structure and run statistics. */
  private async generateOverallSummary(contexts: Context[], outputDir: string): Promise<void> {
    const summaryPath = path.join(outputDir, "README.md");
    const countUrls = (context: Context): number =>
      context.categories.reduce((sum, cat) => sum + cat.urls.length, 0);
    const totalUrls = contexts.reduce((sum, context) => sum + countUrls(context), 0);
    const structureSection = contexts.map(context => {
      const categoryLines = context.categories
        .filter(cat => cat.urls.length > 0)
        .map(cat => `- **${cat.name}**: ${cat.urls.length} pages`)
        .join('\n');
      return `### ${context.name} (${countUrls(context)} pages)
${categoryLines}`;
    }).join('\n\n');

    const summaryContent = `# ElizaOS Organized Data
This directory contains organized scraped content from the ElizaOS documentation, processed at the **${this.options.level}** level.
## Processing Options
- **Level:** ${this.options.level}
- **Include Metadata:** ${this.options.includeMetadata}
- **Include Raw HTML:** ${this.options.includeRawHtml}
- **Include Screenshots:** ${this.options.includeScreenshots}
- **Cleanup Old Data:** ${this.options.cleanupOldData}
- **Organize by Date:** ${this.options.organizeByDate}
- **Compress Output:** ${this.options.compressOutput}
## Structure
${structureSection}
## Statistics
- **Total URLs:** ${totalUrls}
- **Processing Level:** ${this.options.level}
- **Generated:** ${new Date().toISOString()}
- **Processing Time:** ${this.stats.processingTime}ms
- **Successful Scrapes:** ${this.stats.successfulScrapes}
- **Failed Scrapes:** ${this.stats.failedScrapes}
- **Total Content Size:** ${(this.stats.totalContentSize / 1024).toFixed(2)} KB
## Data Levels Explained
### Full Data
Complete scraped content with all metadata, raw HTML, and screenshots. Best for comprehensive analysis and development purposes.
### Categorized Data
Organized content with key metadata and summaries. Ideal for quick reference and content discovery.
### Polished Data
Cleaned and refined content optimized for readability. Perfect for end-user consumption and documentation.
## File Organization
Each category contains:
- Individual page files (processed according to selected level)
- \`_summary.md\` - Category overview and statistics
- \`_metadata.json\` - Detailed metadata and quality metrics
## Quality Metrics
Content is automatically scored based on:
- Content completeness and length
- Presence of code references and examples
- Image and diagram content
- Cross-reference links
- Overall information density
`;
    await writeFile(summaryPath, summaryContent, 'utf8');
    logger.info(`๐ Overall summary saved to: ${summaryPath}`);
  }

  /** Logs the run-wide counters. */
  private printFinalStats(): void {
    logger.info("\n๐ Final Statistics:");
    logger.info(` Total URLs: ${this.stats.totalUrls}`);
    logger.info(` Successful: ${this.stats.successfulScrapes}`);
    logger.info(` Failed: ${this.stats.failedScrapes}`);
    logger.info(` Processing Time: ${this.stats.processingTime}ms`);
    logger.info(` Total Content: ${(this.stats.totalContentSize / 1024).toFixed(2)} KB`);
    logger.info(` Output Level: ${this.options.level}`);
  }
}
// User preference selection and main execution
/**
 * Command-line entry point: loads the stored user preferences, applies an
 * optional `--full` / `--polished` / `--categorized` override (persisting it
 * when it differs from the stored level), runs the organizer, and exits with
 * status 1 on any failure.
 */
async function main(): Promise<void> {
  try {
    logger.info("๐ฏ ElizaOS Data Organizer");

    // Load user preferences
    const preferenceManager = new UserPreferenceManager();
    const preferences = await preferenceManager.loadPreferences();

    // Check command line arguments for overrides
    const args = process.argv.slice(2);
    let level = preferences.dataProcessingLevel;
    if (args.includes('--full')) level = 'full';
    else if (args.includes('--polished')) level = 'polished';
    else if (args.includes('--categorized')) level = 'categorized';

    // Update preferences if changed via command line
    if (level !== preferences.dataProcessingLevel) {
      await preferenceManager.updatePreferences({ dataProcessingLevel: level });
    }

    const options: DataProcessingOptions = {
      level,
      includeMetadata: preferences.includeMetadata,
      includeRawHtml: preferences.includeRawHtml,
      includeScreenshots: preferences.includeScreenshots,
      cleanupOldData: preferences.cleanupOldData,
      organizeByDate: preferences.organizeByDate,
      compressOutput: preferences.compressOutput
    };
    logger.info(`\n๐ง Processing Options (from user preferences):`);
    logger.info(` Level: ${options.level}`);
    logger.info(` Include Metadata: ${options.includeMetadata}`);
    logger.info(` Include Raw HTML: ${options.includeRawHtml}`);
    logger.info(` Include Screenshots: ${options.includeScreenshots}`);
    logger.info(` Cleanup Old Data: ${options.cleanupOldData}`);
    logger.info(` Organize by Date: ${options.organizeByDate}`);
    logger.info(` Output Directory: ${preferences.outputDirectory}`);

    const organizer = new DataOrganizer(options);
    try {
      await organizer.initialize();
      await organizer.processData();
    } finally {
      // Always release the browser, including when processing throws;
      // otherwise the Chromium process is left running.
      await organizer.cleanup();
    }

    logger.success("๐ Data organization completed successfully!");
    logger.info("๐ก Use 'bun run user-preferences.ts show' to view your current preferences");
    logger.info("๐ก Use 'bun run user-preferences.ts setup' to configure your preferences");
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    logger.error(`โ Fatal error: ${message}`);
    process.exit(1);
  }
}
// Start the CLI only when `import.meta.main` is set, i.e. not on a plain import.
// `main` handles its own failures, so the returned promise is deliberately not awaited.
if (import.meta.main) void main();