/*
 * @promptordie/siphon-knowledge
 * Version:
 * AI-powered documentation generation system for AI Coding Agents.
 * 694 lines (539 loc) • 23.5 kB
 * text/typescript
 */
import { mkdir, readdir, readFile, rm, rmdir, stat, unlink, writeFile } from "node:fs/promises";
import { existsSync, statSync } from "node:fs";
import path from "node:path";
import { logger } from "../logger.ts";
import { UserPreferenceManager } from "./user-preferences.ts";
/**
 * Running counters for a single cleanup run.
 *
 * All fields start at 0 and are incremented by `DataCleanupManager` as it works.
 */
interface CleanupStats {
  /** Number of index/summary files written by the manager. */
  filesProcessed: number;

  /** Number of temporary files deleted. */
  filesRemoved: number;

  /** Number of stale top-level directories deleted. */
  directoriesRemoved: number;

  /** Accumulated file sizes of removed directories; reported as KB by dividing by 1024. */
  spaceFreed: number;

  /** Duration of the run, computed as a difference of `Date.now()` values (milliseconds). */
  processingTime: number;
}
/**
 * Aggregate, heuristic quality figures for the Markdown files in the output directory.
 */
interface DataQualityMetrics {
  /** Number of Markdown files that were analysed. */
  totalFiles: number;

  /**
   * Mean quality score (0-100). While files are being scanned this field holds the running
   * sum of scores; it is divided by `totalFiles` once the scan finishes.
   */
  averageQuality: number;

  /** File counts per score band: excellent >= 80, good >= 60, fair >= 40, poor otherwise. */
  qualityDistribution: Record<'excellent' | 'good' | 'fair' | 'poor', number>;

  /** File counts per estimated complexity level. */
  complexityDistribution: Record<'basic' | 'intermediate' | 'advanced', number>;

  /** File counts per detected content feature. */
  contentTypes: Record<'withCode' | 'withImages' | 'withLinks', number>;
}
/**
 * Cleans up, indexes, and reports on the generated documentation output directory.
 *
 * A run ({@link DataCleanupManager.performComprehensiveCleanup}) performs four steps in order:
 * remove stale top-level directories, write Markdown/JSON index files, write a heuristic
 * quality report, then delete temporary files and write a final summary. Counters for the run
 * are accumulated in `stats`.
 */
class DataCleanupManager {
  /** Directory used when the preferences do not name an output directory. */
  private static readonly DEFAULT_OUTPUT_DIR = 'organized-data';

  /** Extension of the content files that are indexed and analysed. */
  private static readonly MARKDOWN_EXT = '.md';

  /** Name prefix of files this class generates itself (`_index.md`, `_quality-report.md`, ...). */
  private static readonly GENERATED_PREFIX = '_';

  private stats: CleanupStats = {
    filesProcessed: 0,
    filesRemoved: 0,
    directoriesRemoved: 0,
    spaceFreed: 0,
    processingTime: 0
  };

  /** `Date.now()` value captured when the current run started; 0 until a run begins. */
  private runStartedAt = 0;

  /**
   * @param preferences User preference object. The fields read by this class are
   *   `outputDirectory`, `cleanupOldData`, `autoCleanupDays`, `compressOutput`,
   *   `dataProcessingLevel`, `includeMetadata`, `includeRawHtml`, `includeScreenshots` and
   *   `organizeByDate`; the whole object is also serialized into `_index.json`.
   */
  constructor(private preferences: any) {}

  /**
   * Runs every cleanup step in sequence and prints the resulting statistics.
   *
   * @throws Re-throws any error that escapes a step, after logging it.
   */
  async performComprehensiveCleanup(): Promise<void> {
    this.runStartedAt = Date.now();
    logger.info("๐งน Starting comprehensive data cleanup...");
    try {
      // 1. Clean up old data based on user preferences
      await this.cleanupOldData();
      // 2. Organize and compress data
      await this.organizeAndCompressData();
      // 3. Generate quality reports
      await this.generateQualityReports();
      // 4. Final cleanup and optimization
      await this.finalCleanup();

      this.stats.processingTime = Date.now() - this.runStartedAt;
      logger.success("✅ Comprehensive cleanup completed!");
      this.printCleanupStats();
    } catch (error) {
      logger.error(`โ Error during cleanup: ${this.describeError(error)}`);
      throw error;
    }
  }

  /** Returns a printable message for any thrown value, not only `Error` instances. */
  private describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  /** Returns the configured output directory, falling back to the default when unset or empty. */
  private resolveOutputDir(): string {
    return this.preferences.outputDirectory || DataCleanupManager.DEFAULT_OUTPUT_DIR;
  }

  /**
   * Deletes top-level directories of the output directory whose modification time is older
   * than `autoCleanupDays` days. Does nothing when `cleanupOldData` is falsy or the output
   * directory is missing. Failures are logged as warnings and do not abort the run.
   */
  private async cleanupOldData(): Promise<void> {
    if (!this.preferences.cleanupOldData) {
      logger.info("โญ๏ธ Skipping old data cleanup (disabled in preferences)");
      return;
    }
    logger.info("๐๏ธ Cleaning up old data...");

    const outputDir = this.resolveOutputDir();
    if (!existsSync(outputDir)) {
      logger.info("๐ Output directory doesn't exist, nothing to clean");
      return;
    }

    try {
      // A missing value would yield an invalid cutoff date (nothing ever matches) and a
      // negative one would put the cutoff in the future (everything matches), so refuse both.
      const retentionDays = Number(this.preferences.autoCleanupDays);
      if (!Number.isFinite(retentionDays) || retentionDays < 0) {
        logger.warn(
          `Skipping old data cleanup: invalid autoCleanupDays value (${String(this.preferences.autoCleanupDays)})`
        );
        return;
      }

      const cutoffDate = new Date();
      cutoffDate.setDate(cutoffDate.getDate() - retentionDays);

      const entries = await readdir(outputDir, { withFileTypes: true });
      for (const entry of entries) {
        if (!entry.isDirectory()) continue;

        const fullPath = path.join(outputDir, entry.name);
        const { mtime } = await stat(fullPath);
        if (mtime < cutoffDate) {
          // Measure before deleting: once the directory is gone its size can only read as 0.
          const freedBytes = await this.calculateDirectorySize(fullPath);
          await this.removeDirectoryRecursively(fullPath);
          this.stats.directoriesRemoved++;
          this.stats.spaceFreed += freedBytes;
          logger.info(`๐๏ธ Removed old directory: ${entry.name}`);
        }
      }
      logger.success(`✅ Old data cleanup completed. Removed ${this.stats.directoriesRemoved} directories`);
    } catch (error) {
      logger.warn(`โ ๏ธ Cleanup warning: ${this.describeError(error)}`);
    }
  }

  /**
   * Writes per-directory indexes, optionally runs the compression step, and writes the
   * top-level index files. Errors are logged and do not abort the run.
   */
  private async organizeAndCompressData(): Promise<void> {
    logger.info("๐ฆ Organizing and compressing data...");

    const outputDir = this.resolveOutputDir();
    if (!existsSync(outputDir)) {
      logger.info("๐ Output directory doesn't exist, nothing to organize");
      return;
    }

    try {
      // Create organized structure
      await this.createOrganizedStructure(outputDir);
      // Compress data if requested
      if (this.preferences.compressOutput) {
        await this.compressData(outputDir);
      }
      // Generate index files
      await this.generateIndexFiles(outputDir);
      logger.success("✅ Data organization completed");
    } catch (error) {
      logger.error(`โ Error organizing data: ${this.describeError(error)}`);
    }
  }

  /**
   * Writes a category index into every direct sub-directory of `baseDir`, plus a context
   * index for those sub-directories that look like context directories.
   */
  private async createOrganizedStructure(baseDir: string): Promise<void> {
    logger.info("๐ Creating organized directory structure...");

    const entries = await readdir(baseDir, { withFileTypes: true });
    for (const entry of entries) {
      if (!entry.isDirectory()) continue;

      const entryPath = path.join(baseDir, entry.name);
      // Create category index
      await this.createCategoryIndex(entryPath);
      // Create context index if it's a context directory
      if (this.isContextDirectory(entryPath)) {
        await this.createContextIndex(entryPath);
      }
    }
  }

  /**
   * Writes `_index.md` into `categoryDir`, listing its Markdown files (generated `_*` files
   * excluded). Writes nothing when the directory has no such files. Failures are logged.
   */
  private async createCategoryIndex(categoryDir: string): Promise<void> {
    try {
      const entries = await readdir(categoryDir, { withFileTypes: true });
      const markdownFiles = entries.filter(entry =>
        entry.isFile() &&
        entry.name.endsWith(DataCleanupManager.MARKDOWN_EXT) &&
        !entry.name.startsWith(DataCleanupManager.GENERATED_PREFIX)
      );
      if (markdownFiles.length === 0) return;

      // Strip only the trailing extension; a plain replace() would hit the first ".md" anywhere.
      const fileLinks = markdownFiles.map(file => {
        const title = file.name.slice(0, -DataCleanupManager.MARKDOWN_EXT.length);
        return `- [${title}](${file.name})`;
      });

      const indexContent = [
        '# Category Index',
        `**Category:** ${path.basename(categoryDir)}`,
        `**Total Files:** ${markdownFiles.length}`,
        `**Generated:** ${new Date().toISOString()}`,
        '## Files',
        fileLinks.join('\n'),
        '## Quick Stats',
        `- **Processing Level:** ${this.preferences.dataProcessingLevel}`,
        `- **Include Metadata:** ${this.preferences.includeMetadata ? 'Yes' : 'No'}`,
        `- **Include Screenshots:** ${this.preferences.includeScreenshots ? 'Yes' : 'No'}`,
        ''
      ].join('\n');

      await writeFile(path.join(categoryDir, '_index.md'), indexContent, 'utf8');
      this.stats.filesProcessed++;
    } catch (error) {
      logger.warn(`โ ๏ธ Could not create category index for ${categoryDir}: ${this.describeError(error)}`);
    }
  }

  /**
   * Writes `_index.md` into `contextDir`, listing its sub-directories as categories. Writes
   * nothing when there are no sub-directories. Failures are logged.
   *
   * Note: this targets the same `_index.md` path as {@link createCategoryIndex}; when both
   * run for one directory the context index is written last.
   */
  private async createContextIndex(contextDir: string): Promise<void> {
    try {
      const entries = await readdir(contextDir, { withFileTypes: true });
      const categoryDirs = entries.filter(entry => entry.isDirectory());
      if (categoryDirs.length === 0) return;

      const contextName = path.basename(contextDir);
      const indexContent = [
        '# Context Index',
        `**Context:** ${contextName}`,
        `**Total Categories:** ${categoryDirs.length}`,
        `**Generated:** ${new Date().toISOString()}`,
        '## Categories',
        categoryDirs.map(dir => `- [${dir.name}](${dir.name}/_index.md)`).join('\n'),
        '## Overview',
        `This context contains documentation and resources related to ${contextName.toLowerCase()} development and usage.`,
        '## Processing Information',
        `- **Level:** ${this.preferences.dataProcessingLevel}`,
        `- **Last Updated:** ${new Date().toISOString()}`,
        ''
      ].join('\n');

      await writeFile(path.join(contextDir, '_index.md'), indexContent, 'utf8');
      this.stats.filesProcessed++;
    } catch (error) {
      logger.warn(`โ ๏ธ Could not create context index for ${contextDir}: ${this.describeError(error)}`);
    }
  }

  /** Returns true when the directory's base name contains `context`, `developer` or `user`. */
  private isContextDirectory(dirPath: string): boolean {
    const dirName = path.basename(dirPath);
    return ['context', 'developer', 'user'].some(marker => dirName.includes(marker));
  }

  /** Placeholder: compression is not implemented; this only logs. */
  private async compressData(_baseDir: string): Promise<void> {
    logger.info("๐๏ธ Compressing data...");
    // For now, we'll just log compression
    // In a real implementation, you could use libraries like 'archiver' or 'tar'
    logger.info("๐ Compression feature would be implemented here");
    logger.info("๐ก Consider using 'bun add archiver' for actual compression");
  }

  /**
   * Writes `_main-index.md` (human-readable) and `_index.json` (machine-readable) into
   * `baseDir`, both listing its direct sub-directories. Errors are logged.
   */
  private async generateIndexFiles(baseDir: string): Promise<void> {
    logger.info("๐ Generating index files...");
    try {
      const entries = await readdir(baseDir, { withFileTypes: true });
      const contextDirs = entries.filter(entry => entry.isDirectory());
      const yesNo = (flag: unknown): string => (flag ? 'Yes' : 'No');

      // Generate main index
      const mainIndexContent = [
        '# Main Index',
        '**ElizaOS Data Organization**',
        `**Generated:** ${new Date().toISOString()}`,
        `**Processing Level:** ${this.preferences.dataProcessingLevel}`,
        '## Contexts',
        contextDirs.map(dir => `- [${dir.name}](${dir.name}/_index.md)`).join('\n'),
        '## Quick Navigation',
        '- [Developer Context](developer-context/_index.md) - Technical documentation',
        '- [User Context](user-context/_index.md) - User guides and tutorials',
        '## Processing Information',
        `- **Include Metadata:** ${yesNo(this.preferences.includeMetadata)}`,
        `- **Include Raw HTML:** ${yesNo(this.preferences.includeRawHtml)}`,
        `- **Include Screenshots:** ${yesNo(this.preferences.includeScreenshots)}`,
        `- **Cleanup Old Data:** ${yesNo(this.preferences.cleanupOldData)}`,
        `- **Organize by Date:** ${yesNo(this.preferences.organizeByDate)}`,
        '## Usage',
        'Navigate through the context directories to find organized content by category.',
        'Each category contains individual files and summary information.',
        ''
      ].join('\n');
      await writeFile(path.join(baseDir, '_main-index.md'), mainIndexContent, 'utf8');
      this.stats.filesProcessed++;

      // Generate JSON index for programmatic access
      const jsonIndexContent = {
        title: "ElizaOS Data Organization",
        generated: new Date().toISOString(),
        processingLevel: this.preferences.dataProcessingLevel,
        contexts: contextDirs.map(dir => ({
          name: dir.name,
          path: `${dir.name}/_index.md`,
          type: this.isContextDirectory(path.join(baseDir, dir.name)) ? 'context' : 'category'
        })),
        preferences: this.preferences,
        stats: {
          totalContexts: contextDirs.length,
          filesProcessed: this.stats.filesProcessed
        }
      };
      await writeFile(path.join(baseDir, '_index.json'), JSON.stringify(jsonIndexContent, null, 2), 'utf8');
      this.stats.filesProcessed++;

      logger.success("✅ Index files generated");
    } catch (error) {
      logger.error(`โ Error generating index files: ${this.describeError(error)}`);
    }
  }

  /** Computes quality metrics for the output directory and saves them; failures are logged. */
  private async generateQualityReports(): Promise<void> {
    logger.info("๐ Generating quality reports...");

    const outputDir = this.resolveOutputDir();
    if (!existsSync(outputDir)) return;

    try {
      const metrics = await this.calculateQualityMetrics(outputDir);
      await this.saveQualityReport(metrics, outputDir);
      logger.success("✅ Quality reports generated");
    } catch (error) {
      logger.warn(`โ ๏ธ Could not generate quality reports: ${this.describeError(error)}`);
    }
  }

  /**
   * Scans `baseDir` recursively and returns aggregate metrics for its Markdown files.
   * A scan error is logged and whatever was collected so far is returned.
   */
  private async calculateQualityMetrics(baseDir: string): Promise<DataQualityMetrics> {
    const metrics: DataQualityMetrics = {
      totalFiles: 0,
      averageQuality: 0,
      qualityDistribution: { excellent: 0, good: 0, fair: 0, poor: 0 },
      complexityDistribution: { basic: 0, intermediate: 0, advanced: 0 },
      contentTypes: { withCode: 0, withImages: 0, withLinks: 0 }
    };

    try {
      await this.scanDirectoryForMetrics(baseDir, metrics);
      // During the scan `averageQuality` holds the sum of scores; turn it into the mean.
      if (metrics.totalFiles > 0) {
        metrics.averageQuality = metrics.averageQuality / metrics.totalFiles;
      }
    } catch (error) {
      logger.warn(`โ ๏ธ Error calculating metrics: ${this.describeError(error)}`);
    }
    return metrics;
  }

  /**
   * Recursively feeds every Markdown content file under `dirPath` into `metrics`.
   * Files whose name starts with `_` are generated by this class (indexes, reports, summary)
   * and are skipped so they do not distort the figures.
   */
  private async scanDirectoryForMetrics(dirPath: string, metrics: DataQualityMetrics): Promise<void> {
    const entries = await readdir(dirPath, { withFileTypes: true });
    for (const entry of entries) {
      const fullPath = path.join(dirPath, entry.name);
      if (entry.isDirectory()) {
        await this.scanDirectoryForMetrics(fullPath, metrics);
      } else if (
        entry.isFile() &&
        entry.name.endsWith(DataCleanupManager.MARKDOWN_EXT) &&
        !entry.name.startsWith(DataCleanupManager.GENERATED_PREFIX)
      ) {
        await this.analyzeFileMetrics(fullPath, metrics);
      }
    }
  }

  /**
   * Reads one file and adds its heuristic score, quality band, complexity level and content
   * features to `metrics`. A read failure is logged and the file is not counted.
   */
  private async analyzeFileMetrics(filePath: string, metrics: DataQualityMetrics): Promise<void> {
    try {
      const content = await readFile(filePath, 'utf8');
      metrics.totalFiles++;

      // Simple analysis based on content. Any backtick counts as code (this also covers fences).
      const hasCode = content.includes('`');
      // NOTE(review): the image/link detection was unreadable in the reviewed source and has
      // been reconstructed as Markdown `![` / `](` checks - confirm against the upstream file.
      const hasImages = content.includes('![');
      const hasLinks = content.includes('](');

      if (hasCode) metrics.contentTypes.withCode++;
      if (hasImages) metrics.contentTypes.withImages++;
      if (hasLinks) metrics.contentTypes.withLinks++;

      // Estimate quality based on content length and structure, clamped to 0..100
      const quality = Math.min(100, Math.max(0,
        (content.length / 1000) * 30 +
        (hasCode ? 20 : 0) +
        (hasImages ? 15 : 0) +
        (hasLinks ? 10 : 0) +
        (content.includes('##') ? 25 : 0)
      ));
      metrics.averageQuality += quality;

      if (quality >= 80) metrics.qualityDistribution.excellent++;
      else if (quality >= 60) metrics.qualityDistribution.good++;
      else if (quality >= 40) metrics.qualityDistribution.fair++;
      else metrics.qualityDistribution.poor++;

      // Estimate complexity
      metrics.complexityDistribution[this.estimateComplexity(content)]++;
    } catch (error) {
      logger.warn(`โ ๏ธ Could not analyze file ${filePath}: ${this.describeError(error)}`);
    }
  }

  /**
   * Classifies content as `advanced` when it has both a backtick and one of the words
   * "api", "architecture", "integration" (case-insensitive), `intermediate` when it has
   * either, and `basic` otherwise.
   */
  private estimateComplexity(content: string): 'basic' | 'intermediate' | 'advanced' {
    const hasCode = content.includes('`');
    const lowered = content.toLowerCase();
    const hasTechnicalTerms = ['api', 'architecture', 'integration'].some(term => lowered.includes(term));

    if (hasCode && hasTechnicalTerms) return 'advanced';
    if (hasCode || hasTechnicalTerms) return 'intermediate';
    return 'basic';
  }

  /**
   * Writes `_quality-report.md` and `_quality-report.json` into `outputDir`.
   *
   * @throws Propagates write errors to the caller.
   */
  private async saveQualityReport(metrics: DataQualityMetrics, outputDir: string): Promise<void> {
    const reportContent = [
      '# Data Quality Report',
      `**Generated:** ${new Date().toISOString()}`,
      `**Processing Level:** ${this.preferences.dataProcessingLevel}`,
      '## Overview',
      `- **Total Files:** ${metrics.totalFiles}`,
      `- **Average Quality Score:** ${metrics.averageQuality.toFixed(1)}/100`,
      '## Quality Distribution',
      `- **Excellent (80-100):** ${metrics.qualityDistribution.excellent} files`,
      `- **Good (60-79):** ${metrics.qualityDistribution.good} files`,
      `- **Fair (40-59):** ${metrics.qualityDistribution.fair} files`,
      `- **Poor (0-39):** ${metrics.qualityDistribution.poor} files`,
      '## Complexity Distribution',
      `- **Basic:** ${metrics.complexityDistribution.basic} files`,
      `- **Intermediate:** ${metrics.complexityDistribution.intermediate} files`,
      `- **Advanced:** ${metrics.complexityDistribution.advanced} files`,
      '## Content Types',
      `- **With Code:** ${metrics.contentTypes.withCode} files`,
      `- **With Images:** ${metrics.contentTypes.withImages} files`,
      `- **With Links:** ${metrics.contentTypes.withLinks} files`,
      '## Recommendations',
      this.generateRecommendations(metrics),
      ''
    ].join('\n');
    await writeFile(path.join(outputDir, '_quality-report.md'), reportContent, 'utf8');

    // Also save as JSON for programmatic access
    await writeFile(path.join(outputDir, '_quality-report.json'), JSON.stringify(metrics, null, 2), 'utf8');
  }

  /**
   * Builds the Markdown bullet list of recommendations for the report.
   *
   * @returns One bullet per triggered rule, or a single "looks good" bullet when none
   *   trigger. With zero analysed files a single explanatory bullet is returned instead,
   *   because an average of 0 over no files says nothing about quality.
   */
  private generateRecommendations(metrics: DataQualityMetrics): string {
    if (metrics.totalFiles === 0) {
      return "- No markdown files were analyzed, so no recommendations are available.";
    }

    const recommendations: string[] = [];
    if (metrics.averageQuality < 50) {
      recommendations.push("- Consider re-scraping low-quality content");
    }
    if (metrics.contentTypes.withCode < metrics.totalFiles * 0.3) {
      recommendations.push("- Many files lack code examples - consider enhanced scraping");
    }
    if (metrics.contentTypes.withImages < metrics.totalFiles * 0.2) {
      recommendations.push("- Consider including more visual content");
    }
    if (metrics.qualityDistribution.poor > metrics.totalFiles * 0.3) {
      recommendations.push("- High percentage of low-quality files - review scraping parameters");
    }
    if (recommendations.length === 0) {
      recommendations.push("- Data quality looks good! No immediate actions needed.");
    }
    return recommendations.join('\n');
  }

  /** Removes temporary files, runs the (stub) optimization step and writes the final summary. */
  private async finalCleanup(): Promise<void> {
    logger.info("โจ Performing final cleanup...");

    const outputDir = this.resolveOutputDir();
    if (!existsSync(outputDir)) return;

    try {
      // Remove temporary files
      await this.removeTemporaryFiles(outputDir);
      // Optimize directory structure
      await this.optimizeDirectoryStructure(outputDir);
      // Generate final summary
      await this.generateFinalSummary(outputDir);
      logger.success("✅ Final cleanup completed");
    } catch (error) {
      logger.warn(`โ ๏ธ Error during final cleanup: ${this.describeError(error)}`);
    }
  }

  /** Deletes files under `baseDir` whose names mark them as temporary; failures are logged. */
  private async removeTemporaryFiles(baseDir: string): Promise<void> {
    const tempPatterns = ['temp', 'tmp', '.tmp', '.temp', 'cache', '.cache'];
    try {
      await this.removeFilesByPattern(baseDir, tempPatterns);
    } catch (error) {
      logger.warn(`โ ๏ธ Could not remove temporary files: ${this.describeError(error)}`);
    }
  }

  /**
   * Decides whether `fileName` denotes a temporary file.
   *
   * The name is split on `.`, `_`, `-` and whitespace, and a pattern matches only when it
   * equals a whole segment (leading dots of the pattern are ignored, comparison is
   * case-insensitive). So `page.tmp`, `.cache` and `tmp_123.json` match, while
   * `template.md` or `attempts.md` do not - plain substring matching would delete those.
   */
  private isTemporaryFileName(fileName: string, patterns: string[]): boolean {
    const segments = fileName.toLowerCase().split(/[._\-\s]+/).filter(segment => segment.length > 0);
    return patterns.some(pattern => {
      const wanted = pattern.toLowerCase().replace(/^\.+/, '');
      return wanted.length > 0 && segments.includes(wanted);
    });
  }

  /**
   * Recursively deletes every regular file under `baseDir` whose name matches one of
   * `patterns` (see {@link isTemporaryFileName}). A file that cannot be deleted is logged
   * and skipped; directories themselves are never removed here.
   */
  private async removeFilesByPattern(baseDir: string, patterns: string[]): Promise<void> {
    const entries = await readdir(baseDir, { withFileTypes: true });
    for (const entry of entries) {
      const fullPath = path.join(baseDir, entry.name);
      if (entry.isDirectory()) {
        await this.removeFilesByPattern(fullPath, patterns);
        continue;
      }
      if (!entry.isFile() || !this.isTemporaryFileName(entry.name, patterns)) continue;

      try {
        await unlink(fullPath);
        this.stats.filesRemoved++;
        logger.info(`๐๏ธ Removed temporary file: ${entry.name}`);
      } catch (error) {
        logger.warn(`โ ๏ธ Could not remove ${entry.name}: ${this.describeError(error)}`);
      }
    }
  }

  /** Placeholder: no optimization is performed; this only logs. */
  private async optimizeDirectoryStructure(_baseDir: string): Promise<void> {
    // This would implement directory optimization logic
    // For now, just log that it's completed
    logger.info("๐ Directory structure optimization completed");
  }

  /**
   * Writes `_final-summary.md` into `baseDir` with the preferences in effect and the
   * counters collected so far.
   *
   * @throws Propagates write errors to the caller.
   */
  private async generateFinalSummary(baseDir: string): Promise<void> {
    // The summary is written before performComprehensiveCleanup records the final duration,
    // so refresh the elapsed time here; otherwise the file would always report 0ms.
    if (this.runStartedAt > 0) {
      this.stats.processingTime = Date.now() - this.runStartedAt;
    }

    const yesNo = (flag: unknown): string => (flag ? 'Yes' : 'No');
    const enabledDisabled = (flag: unknown): string => (flag ? 'Enabled' : 'Disabled');

    const summaryContent = [
      '# Final Summary',
      '**ElizaOS Data Organization Complete**',
      `**Generated:** ${new Date().toISOString()}`,
      '## Processing Summary',
      `- **Processing Level:** ${this.preferences.dataProcessingLevel}`,
      `- **Include Metadata:** ${yesNo(this.preferences.includeMetadata)}`,
      `- **Include Raw HTML:** ${yesNo(this.preferences.includeRawHtml)}`,
      `- **Include Screenshots:** ${yesNo(this.preferences.includeScreenshots)}`,
      '## Cleanup Results',
      `- **Files Processed:** ${this.stats.filesProcessed}`,
      `- **Files Removed:** ${this.stats.filesRemoved}`,
      `- **Directories Removed:** ${this.stats.directoriesRemoved}`,
      `- **Space Freed:** ${(this.stats.spaceFreed / 1024).toFixed(2)} KB`,
      `- **Processing Time:** ${this.stats.processingTime}ms`,
      '## Organization Features',
      `- **Cleanup Old Data:** ${enabledDisabled(this.preferences.cleanupOldData)}`,
      `- **Organize by Date:** ${enabledDisabled(this.preferences.organizeByDate)}`,
      `- **Compress Output:** ${enabledDisabled(this.preferences.compressOutput)}`,
      '## Next Steps',
      '1. Review the organized content in the category directories',
      '2. Check quality reports for any issues',
      '3. Use the index files for easy navigation',
      "4. Configure preferences for future runs using 'bun run user-preferences.ts setup'",
      '## Files Generated',
      '- `_main-index.md` - Main navigation index',
      '- `_index.json` - Programmatic access index',
      '- `_quality-report.md` - Data quality analysis',
      '- `_quality-report.json` - Quality metrics in JSON format',
      '- `_final-summary.md` - This summary file',
      '๐ **Data organization and cleanup completed successfully!**',
      ''
    ].join('\n');

    await writeFile(path.join(baseDir, '_final-summary.md'), summaryContent, 'utf8');
    this.stats.filesProcessed++;
  }

  /**
   * Deletes `dirPath` and everything beneath it.
   *
   * @throws Propagates file-system errors (including a missing `dirPath`) to the caller.
   */
  private async removeDirectoryRecursively(dirPath: string): Promise<void> {
    await rm(dirPath, { recursive: true });
  }

  /**
   * Sums the sizes reported by `stat` for every non-directory entry under `dirPath`.
   * Best effort by design: on any error the total gathered so far is returned.
   */
  private async calculateDirectorySize(dirPath: string): Promise<number> {
    let totalSize = 0;
    try {
      const entries = await readdir(dirPath, { withFileTypes: true });
      for (const entry of entries) {
        const fullPath = path.join(dirPath, entry.name);
        if (entry.isDirectory()) {
          totalSize += await this.calculateDirectorySize(fullPath);
        } else {
          totalSize += (await stat(fullPath)).size;
        }
      }
    } catch {
      // Ignore errors in size calculation
    }
    return totalSize;
  }

  /** Logs the counters of the current run. */
  private printCleanupStats(): void {
    const rule = "=".repeat(40);
    logger.info("\n๐ Cleanup Statistics:");
    logger.info(rule);
    logger.info(`Files Processed: ${this.stats.filesProcessed}`);
    logger.info(`Files Removed: ${this.stats.filesRemoved}`);
    logger.info(`Directories Removed: ${this.stats.directoriesRemoved}`);
    logger.info(`Space Freed: ${(this.stats.spaceFreed / 1024).toFixed(2)} KB`);
    logger.info(`Processing Time: ${this.stats.processingTime}ms`);
    logger.info(rule);
  }
}
// Main execution
/**
 * CLI entry point for the cleanup manager.
 *
 * Loads the user preferences, then handles the command-line flags:
 * - `--help` / `-h`: print usage and return.
 * - `--dry-run`: announce dry-run mode and return without touching any data.
 * - `--force`: run even when `cleanupOldData` is disabled in the preferences.
 *
 * Any error is logged and the process exits with status 1.
 */
async function main(): Promise<void> {
  try {
    logger.info("๐งน ElizaOS Data Cleanup Manager");

    // Load user preferences
    const preferenceManager = new UserPreferenceManager();
    const preferences = await preferenceManager.loadPreferences();

    // Check command line arguments
    const args = process.argv.slice(2);
    if (args.includes('--help') || args.includes('-h')) {
      logger.info("Usage: bun run data-cleanup.ts [options]");
      logger.info("Options:");
      logger.info(" --help, -h Show this help message");
      logger.info(" --force Force cleanup even if disabled in preferences");
      logger.info(" --dry-run Show what would be cleaned without actually doing it");
      return;
    }

    const forceCleanup = args.includes('--force');
    const dryRun = args.includes('--dry-run');

    if (dryRun) {
      logger.info("๐ DRY RUN MODE - No actual cleanup will be performed");
    }

    if (!preferences.cleanupOldData && !forceCleanup) {
      logger.info("โญ๏ธ Cleanup is disabled in preferences. Use --force to override.");
      return;
    }

    if (dryRun) {
      logger.info("๐ Dry run completed - no actual cleanup performed");
      return;
    }

    // The manager checks `cleanupOldData` itself before deleting old directories, so the
    // override has to travel with the preferences; otherwise --force would start the run
    // but the old-data step would still be skipped.
    const effectivePreferences = forceCleanup
      ? { ...preferences, cleanupOldData: true }
      : preferences;

    const cleanupManager = new DataCleanupManager(effectivePreferences);
    await cleanupManager.performComprehensiveCleanup();
  } catch (error) {
    // Narrow instead of casting so non-Error throwables still produce a readable message.
    const message = error instanceof Error ? error.message : String(error);
    logger.error(`โ Fatal error: ${message}`);
    process.exit(1);
  }
}
// Start the CLI only when `import.meta.main` is truthy; `main` handles its own errors,
// so the returned promise is intentionally not awaited.
if (import.meta.main) void main();