UNPKG

agentic-data-stack-community

Version:

AI Agentic Data Stack Framework - Community Edition. Open source data engineering framework with 4 core agents, essential templates, and 3-dimensional quality validation.

677 lines (570 loc) • 21 kB
/**
 * Document Manager - Advanced Document Sharding and Management System
 *
 * Implements sophisticated document processing with:
 * - Automatic document sharding by level 2 sections
 * - Knowledge base integration and indexing
 * - Document versioning and history tracking
 * - Cross-reference management and link validation
 * - Template-driven document generation
 * - Multi-format export capabilities
 */

const { spawn } = require('child_process');
const path = require('path');

const chalk = require('chalk');
const inquirer = require('inquirer');
const fs = require('fs-extra');
const yaml = require('yaml');

/** Name of the generated table-of-contents file inside a shard directory. */
const INDEX_FILENAME = 'index.md';

/** Name of the tracking file written next to the shards. */
const METADATA_FILENAME = '.shard-metadata.json';

/** Matches a level 2 heading line (`## Title`). */
const SECTION_HEADING = /^##\s+(.+)/;

/** Matches a level 1 heading line (`# Title`). */
const TITLE_HEADING = /^#\s+(.+)/;

/**
 * Flag every line that belongs to a fenced code block (fence lines included).
 *
 * A fence opens on a line whose trimmed text starts with three backticks and
 * closes on a line whose trimmed text equals either the opening fence or a
 * bare "```".
 *
 * @param {string[]} lines - Document split into lines.
 * @returns {boolean[]} One flag per line; `true` means "inside a fence".
 */
function markFencedLines(lines) {
  let openFence = null;
  return lines.map((line) => {
    const trimmed = line.trim();
    if (!trimmed.startsWith('```')) {
      return openFence !== null;
    }
    if (openFence === null) {
      openFence = trimmed;
    } else if (trimmed === openFence || trimmed === '```') {
      openFence = null;
    }
    return true;
  });
}

/**
 * Index of the first level 2 heading that is outside any fenced code block.
 *
 * @param {string[]} lines - Document split into lines.
 * @param {boolean[]} [fenced] - Result of `markFencedLines(lines)`, if already computed.
 * @returns {number} Line index, or -1 when the document has no such heading.
 */
function findFirstSectionIndex(lines, fenced = markFencedLines(lines)) {
  return lines.findIndex((line, i) => !fenced[i] && SECTION_HEADING.test(line));
}

/**
 * Move every markdown heading up or down by `delta` levels.
 *
 * Lines inside fenced code blocks are left untouched so that `#` comments in
 * code samples are not rewritten. A heading never drops below level 1.
 *
 * @param {string} content - Markdown text.
 * @param {number} delta - Levels to add (positive) or remove (negative).
 * @returns {string} Markdown text with adjusted headings.
 */
function shiftHeadingLevels(content, delta) {
  const lines = content.split('\n');
  const fenced = markFencedLines(lines);
  return lines
    .map((line, i) => {
      if (fenced[i]) {
        return line;
      }
      // Only the run of hashes is replaced; the whitespace after it is kept.
      return line.replace(/^(#+)(?=\s|$)/, (hashes) => '#'.repeat(Math.max(1, hashes.length + delta)));
    })
    .join('\n');
}

/**
 * Count whitespace-separated words.
 *
 * @param {string} text
 * @returns {number} Number of non-empty tokens (0 for empty/blank text).
 */
function countWords(text) {
  return text.split(/\s+/).filter(Boolean).length;
}

/**
 * Return `filename`, or a `-2`, `-3`, ... suffixed variant when it is already
 * taken, and record the returned name in `taken`.
 *
 * @param {string} filename - Desired file name, including extension.
 * @param {Set<string>} taken - Names already in use; mutated.
 * @returns {string} A name that was not previously in `taken`.
 */
function reserveUniqueFilename(filename, taken) {
  const ext = path.extname(filename);
  const stem = filename.slice(0, filename.length - ext.length);
  let candidate = filename;
  for (let n = 2; taken.has(candidate); n += 1) {
    candidate = `${stem}-${n}${ext}`;
  }
  taken.add(candidate);
  return candidate;
}

/**
 * Remove a leading `# <title>` line from an introduction so the index file
 * does not print the document title twice.
 *
 * @param {string} introduction - Text that preceded the first section.
 * @param {string} title - Document title already emitted by the caller.
 * @returns {string} Introduction without the duplicated title line.
 */
function stripLeadingTitle(introduction, title) {
  const [firstLine, ...rest] = introduction.split('\n');
  const match = TITLE_HEADING.exec(firstLine);
  if (match && match[1].trim() === title) {
    return rest.join('\n').trim();
  }
  return introduction;
}

/**
 * Extract everything that precedes the `## Sections` heading of an index file.
 *
 * Index files written before the title de-duplication fix start with the same
 * H1 twice; the repeated heading is collapsed here.
 *
 * @param {string} indexContent - Contents of `index.md`.
 * @returns {string} Title plus introduction, or '' when no `## Sections` heading exists.
 */
function extractIndexPreamble(indexContent) {
  const lines = indexContent.split('\n');
  const fenced = markFencedLines(lines);
  const end = lines.findIndex((line, i) => !fenced[i] && /^## Sections\s*$/.test(line));
  if (end === -1) {
    return '';
  }

  const preamble = lines.slice(0, end);
  const nextContent = preamble.findIndex((line, i) => i > 0 && line.trim() !== '');
  if (
    preamble.length > 0 &&
    TITLE_HEADING.test(preamble[0]) &&
    nextContent !== -1 &&
    preamble[nextContent].trim() === preamble[0].trim()
  ) {
    preamble.splice(0, nextContent);
  }
  return preamble.join('\n').trim();
}

class DocumentManager {
  /**
   * @param {object} [options]
   * @param {string} [options.rootDir] - Project root; defaults to `process.cwd()`.
   * @param {*} [options.templateEngine] - Stored on the instance; not used by the methods in this class.
   * @param {*} [options.agentOrchestrator] - Stored on the instance; not used by the methods in this class.
   */
  constructor(options = {}) {
    this.rootDir = options.rootDir || process.cwd();
    this.dataCore = path.join(this.rootDir, 'data-core');
    this.docsDir = path.join(this.rootDir, 'docs');
    this.shardCache = new Map();
    this.indexCache = new Map();
    this.templateEngine = options.templateEngine;
    this.agentOrchestrator = options.agentOrchestrator;
  }

  /**
   * Main document sharding interface.
   *
   * Tries the external `md-tree` tool when `core-config.yaml` enables it, and
   * falls back to the built-in splitter otherwise (or when the tool fails).
   *
   * @param {string} documentPath - Markdown file to shard.
   * @param {object} [options]
   * @param {string} [options.outputDir] - Destination; defaults to `<rootDir>/docs/<basename>`.
   * @param {boolean} [options.preferAutomatic] - Pass `false` to skip `md-tree`.
   * @param {boolean} [options.autoConfirm] - Skip the confirmation prompt of the manual path.
   * @returns {Promise<object>} Result with `success`; never rejects - failures
   *   are reported as `{ success: false, error }`.
   */
  async shardDocument(documentPath, options = {}) {
    console.log(chalk.blue(`\n📄 Document Sharding: ${documentPath}`));

    try {
      // Check if document exists
      if (!await fs.pathExists(documentPath)) {
        throw new Error(`Document not found: ${documentPath}`);
      }

      // Load and validate document
      const content = await fs.readFile(documentPath, 'utf8');
      if (!content.trim()) {
        throw new Error('Document is empty');
      }

      // Determine output location
      const outputDir = this.determineOutputDirectory(documentPath, options.outputDir);
      console.log(chalk.dim(`Output directory: ${outputDir}`));

      // Check for automatic sharding tools
      const useAutomaticSharding = await this.checkAutomaticSharding();
      if (useAutomaticSharding && options.preferAutomatic !== false) {
        const result = await this.performAutomaticSharding(documentPath, outputDir);
        if (result.success) {
          return result;
        }
        console.log(chalk.yellow('⚠️ Automatic sharding failed, falling back to manual method'));
      }

      // Perform manual sharding
      return await this.performManualSharding(documentPath, content, outputDir, options);
    } catch (error) {
      console.error(chalk.red(`❌ Document sharding failed: ${error.message}`));
      return { success: false, error: error.message };
    }
  }

  /**
   * Check for automatic sharding capabilities.
   *
   * @returns {Promise<boolean>} `true` only when `<rootDir>/core-config.yaml`
   *   exists, parses, and sets `markdownExploder: true`.
   */
  async checkAutomaticSharding() {
    const configPath = path.join(this.rootDir, 'core-config.yaml');
    if (!await fs.pathExists(configPath)) {
      return false;
    }

    try {
      const configContent = await fs.readFile(configPath, 'utf8');
      const config = yaml.parse(configContent);
      // An empty YAML file parses to null, hence the truthiness guard.
      if (config && config.markdownExploder === true) {
        console.log(chalk.green('✓ markdownExploder enabled in core-config.yaml'));
        return true;
      }
      console.log(chalk.dim('markdownExploder is disabled in core-config.yaml'));
      return false;
    } catch (error) {
      console.log(chalk.yellow('Warning: Could not parse core-config.yaml'));
      return false;
    }
  }

  /**
   * Attempt automatic sharding using the `md-tree explode` command.
   *
   * @param {string} documentPath - Markdown file to shard.
   * @param {string} outputDir - Directory passed to `md-tree`.
   * @returns {Promise<object>} `{ success: true, method: 'automatic', outputDir, message }`
   *   on exit code 0, otherwise `{ success: false, error }`. Never rejects.
   */
  async performAutomaticSharding(documentPath, outputDir) {
    console.log(chalk.blue('🚀 Attempting automatic sharding with md-tree...'));

    const printInstallHint = () => {
      console.log(chalk.yellow('📦 md-tree not installed. Install with:'));
      console.log(chalk.dim('npm install -g @kayvan/markdown-tree-parser'));
    };

    try {
      return await new Promise((resolve) => {
        // 'error' and 'close' may both fire for one child; report only once.
        let settled = false;
        const finish = (result) => {
          if (!settled) {
            settled = true;
            resolve(result);
          }
        };

        // stdin/stdout are not used, so they are not piped: an unread stdout
        // pipe or a never-closed stdin pipe could stall the child.
        const child = spawn('md-tree', ['explode', documentPath, outputDir], {
          stdio: ['ignore', 'ignore', 'pipe'],
        });

        let stderr = '';
        child.stderr.on('data', (data) => {
          stderr += data.toString();
        });

        child.on('close', (code) => {
          if (settled) {
            return;
          }
          if (code === 0) {
            console.log(chalk.green('✅ Automatic sharding completed successfully!'));
            finish({
              success: true,
              method: 'automatic',
              outputDir,
              message: 'Document sharded using md-tree explode command',
            });
            return;
          }

          console.log(chalk.red('❌ md-tree command failed'));
          if (stderr.includes('command not found') || stderr.includes('not available')) {
            printInstallHint();
          }
          finish({ success: false, error: stderr || 'md-tree command failed' });
        });

        child.on('error', (error) => {
          if (settled) {
            return;
          }
          console.log(chalk.red('❌ Failed to execute md-tree command'));
          // Without a shell, a missing binary surfaces here as ENOENT rather
          // than as "command not found" on stderr.
          if (error.code === 'ENOENT') {
            printInstallHint();
          }
          finish({ success: false, error: error.message });
        });
      });
    } catch (error) {
      return { success: false, error: error.message };
    }
  }

  /**
   * Manual document sharding implementation.
   *
   * Writes one file per level 2 section, an `index.md`, and a
   * `.shard-metadata.json` tracking file into `outputDir`.
   *
   * @param {string} documentPath - Source path (recorded in metadata and logs).
   * @param {string} content - Contents of the source document.
   * @param {string} outputDir - Destination directory; created if missing.
   * @param {object} [options]
   * @param {boolean} [options.autoConfirm] - Skip the interactive confirmation.
   * @returns {Promise<object>} Success result, or `{ success: false, cancelled: true }`
   *   when the user declines.
   * @throws {Error} When the document has no level 2 sections.
   */
  async performManualSharding(documentPath, content, outputDir, options = {}) {
    console.log(chalk.blue('🔧 Performing manual document sharding...'));

    // Parse document sections
    const sections = this.parseDocumentSections(content);
    if (sections.length === 0) {
      throw new Error('No level 2 sections found to shard');
    }
    console.log(chalk.green(`📑 Found ${sections.length} sections to shard`));

    // Show preview of sections
    console.log(chalk.bold('\nSections to be created:'));
    sections.forEach((section, index) => {
      console.log(chalk.dim(` ${index + 1}. ${section.title} → ${section.filename}`));
    });

    // Confirm sharding
    if (!options.autoConfirm) {
      const proceed = await inquirer.prompt([{
        type: 'confirm',
        name: 'shard',
        message: 'Proceed with document sharding?',
        default: true,
      }]);
      if (!proceed.shard) {
        return { success: false, cancelled: true };
      }
    }

    // Create output directory
    await fs.ensureDir(outputDir);

    // Process each section
    const createdFiles = [];
    const metadata = this.extractDocumentMetadata(content);
    for (const section of sections) {
      const adjustedContent = this.adjustHeadingLevels(section.content);
      const filePath = path.join(outputDir, section.filename);
      await fs.writeFile(filePath, adjustedContent, 'utf8');
      createdFiles.push({
        filename: section.filename,
        title: section.title,
        path: filePath,
        wordCount: countWords(adjustedContent),
      });
      console.log(chalk.green(`✓ Created: ${section.filename}`));
    }

    // Create index file
    const indexContent = this.generateIndexFile(metadata, createdFiles);
    const indexPath = path.join(outputDir, INDEX_FILENAME);
    await fs.writeFile(indexPath, indexContent, 'utf8');
    console.log(chalk.green('✓ Created: index.md'));

    // Create metadata file for tracking
    const metadataFile = {
      source: documentPath,
      shardedAt: new Date().toISOString(),
      method: 'manual',
      sections: createdFiles.length,
      files: createdFiles,
      metadata,
    };
    await fs.writeJson(path.join(outputDir, METADATA_FILENAME), metadataFile, { spaces: 2 });

    // Validate sharding results
    const validation = await this.validateSharding(content, outputDir, createdFiles);

    // Report results
    console.log(chalk.green.bold('\n🎉 Document sharded successfully!'));
    console.log(chalk.blue('📊 Summary:'));
    console.log(chalk.dim(` • Source: ${documentPath}`));
    console.log(chalk.dim(` • Destination: ${outputDir}`));
    console.log(chalk.dim(` • Files created: ${createdFiles.length + 1} (including index.md)`));
    console.log(chalk.dim(` • Total sections: ${sections.length}`));
    console.log(chalk.dim(` • Validation: ${validation.passed ? 'Passed' : 'Failed'}`));

    return {
      success: true,
      method: 'manual',
      outputDir,
      files: createdFiles,
      indexFile: indexPath,
      metadata: metadataFile,
      validation,
    };
  }

  /**
   * Parse document into sections by level 2 headings.
   *
   * Headings inside fenced code blocks are ignored. Lines before the first
   * level 2 heading belong to no section. File names are made unique so that
   * repeated titles, or a section titled "Index", cannot overwrite another
   * shard or the generated `index.md`.
   *
   * @param {string} content - Markdown text.
   * @returns {Array<{title: string, filename: string, lines: string[], startLine: number, content: string}>}
   */
  parseDocumentSections(content) {
    const lines = content.split('\n');
    const fenced = markFencedLines(lines);
    const takenFilenames = new Set([INDEX_FILENAME]);
    const sections = [];
    let current = null;

    const closeCurrent = () => {
      if (current) {
        sections.push({ ...current, content: current.lines.join('\n').trim() });
      }
    };

    lines.forEach((line, i) => {
      if (!fenced[i] && SECTION_HEADING.test(line)) {
        closeCurrent();
        const title = line.replace(/^##\s+/, '').trim();
        current = {
          title,
          filename: reserveUniqueFilename(this.generateFilename(title), takenFilenames),
          lines: [line],
          startLine: i,
        };
      } else if (current) {
        current.lines.push(line);
      }
    });
    closeCurrent();

    return sections;
  }

  /**
   * Generate filename from section title.
   *
   * Lower-cases the title, drops everything except `a-z`, `0-9`, whitespace
   * and hyphens, and joins the remaining words with single hyphens. Falls back
   * to `section.md` when nothing usable remains (e.g. a non-Latin title).
   *
   * @param {string} title - Section title.
   * @returns {string} File name ending in `.md`.
   */
  generateFilename(title) {
    const slug = title
      .toLowerCase()
      .replace(/[^a-z0-9\s-]/g, '')
      .replace(/\s+/g, '-')
      .replace(/-+/g, '-')
      .replace(/^-|-$/g, '');
    return `${slug || 'section'}.md`;
  }

  /**
   * Adjust heading levels (decrease by 1), leaving level 1 headings and the
   * contents of fenced code blocks unchanged.
   *
   * @param {string} content - Section markdown.
   * @returns {string} Markdown with promoted headings.
   */
  adjustHeadingLevels(content) {
    return shiftHeadingLevels(content, -1);
  }

  /**
   * Extract document metadata.
   *
   * @param {string} content - Markdown text.
   * @returns {{title: string, firstSection: number, totalLines: number, introduction: string}}
   *   `title` is the first level 1 heading outside code fences (or
   *   'Untitled Document'); `firstSection` is the line index of the first
   *   level 2 heading outside code fences, or -1.
   */
  extractDocumentMetadata(content) {
    const lines = content.split('\n');
    const fenced = markFencedLines(lines);
    const titleLine = lines.find((line, i) => !fenced[i] && TITLE_HEADING.test(line));

    return {
      title: titleLine ? titleLine.replace(/^#\s+/, '').trim() : 'Untitled Document',
      firstSection: findFirstSectionIndex(lines, fenced),
      totalLines: lines.length,
      introduction: this.extractIntroduction(content),
    };
  }

  /**
   * Extract introduction content (before first level 2 section).
   *
   * Uses the same fence-aware heading detection as `parseDocumentSections`,
   * so the introduction ends exactly where the first section starts.
   *
   * @param {string} content - Markdown text.
   * @returns {string} Trimmed introduction, or '' when there is no level 2 section.
   */
  extractIntroduction(content) {
    const lines = content.split('\n');
    const firstSectionIndex = findFirstSectionIndex(lines);
    if (firstSectionIndex === -1) {
      return '';
    }
    return lines.slice(0, firstSectionIndex).join('\n').trim();
  }

  /**
   * Generate index file content.
   *
   * @param {{title: string, introduction?: string}} metadata - Output of `extractDocumentMetadata`.
   * @param {Array<{title: string, filename: string}>} files - Created shard files.
   * @returns {string} Markdown for `index.md`.
   */
  generateIndexFile(metadata, files) {
    // The introduction normally begins with the document's own H1; drop it so
    // the title is not printed twice.
    const introduction = stripLeadingTitle(metadata.introduction || '', metadata.title);
    const links = files.map((file) => `- [${file.title}](${file.filename})\n`).join('');

    let content = `# ${metadata.title}\n\n`;
    if (introduction) {
      content += `${introduction}\n\n`;
    }
    content += `## Sections\n\n`;
    content += links;
    content += `\n---\n\n`;
    content += `*Document sharded on ${new Date().toISOString().split('T')[0]}*\n`;
    content += `*Total sections: ${files.length}*\n`;
    return content;
  }

  /**
   * Determine output directory.
   *
   * @param {string} documentPath - Source document path.
   * @param {string} [customOutputDir] - Explicit destination; resolved to an absolute path.
   * @returns {string} `customOutputDir` resolved, or `<docsDir>/<document basename without extension>`.
   */
  determineOutputDirectory(documentPath, customOutputDir) {
    if (customOutputDir) {
      return path.resolve(customOutputDir);
    }
    const baseName = path.basename(documentPath, path.extname(documentPath));
    return path.join(this.docsDir, baseName);
  }

  /**
   * Validate sharding results.
   *
   * Compares the original document against the shards plus the introduction.
   * The introduction is kept in `index.md` and the metadata file rather than
   * in a shard, so it counts as preserved content.
   *
   * @param {string} originalContent - Contents of the source document.
   * @param {string} outputDir - Directory holding the shards.
   * @param {Array<{path: string, filename: string}>} createdFiles - Shards to check.
   * @returns {Promise<{passed: boolean, warnings: string[], contentRatio: number}>}
   */
  async validateSharding(originalContent, outputDir, createdFiles) {
    const warnings = [];
    let contentRatio = 0;

    try {
      // Read all sharded files
      let preservedContent = `${this.extractIntroduction(originalContent)}\n`;
      for (const file of createdFiles) {
        if (await fs.pathExists(file.path)) {
          const shardContent = await fs.readFile(file.path, 'utf8');
          preservedContent += `${shardContent}\n`;
        } else {
          warnings.push(`File not found: ${file.filename}`);
        }
      }

      // Check index file
      const indexPath = path.join(outputDir, INDEX_FILENAME);
      if (!await fs.pathExists(indexPath)) {
        warnings.push('Index file not created');
      }

      // Basic content validation
      const originalWords = countWords(originalContent);
      contentRatio = originalWords === 0 ? 1 : countWords(preservedContent) / originalWords;
      if (contentRatio < 0.8) {
        warnings.push('Significant content loss detected (>20%)');
      }

      // Check for code blocks
      const originalCodeBlocks = (originalContent.match(/```/g) || []).length;
      const shardedCodeBlocks = (preservedContent.match(/```/g) || []).length;
      if (originalCodeBlocks !== shardedCodeBlocks) {
        warnings.push('Code block count mismatch - possible parsing error');
      }
    } catch (error) {
      warnings.push(`Validation error: ${error.message}`);
    }

    return { passed: warnings.length === 0, warnings, contentRatio };
  }

  /**
   * Rebuild document from shards.
   *
   * Concatenates the preamble of `index.md` (title and introduction) with
   * every shard listed in `.shard-metadata.json`, demoting shard headings by
   * one level. When the source had no level 1 heading, the index title
   * 'Untitled Document' appears in the rebuilt text.
   *
   * @param {string} shardDirectory - Directory produced by manual sharding.
   * @returns {Promise<object>} `{ success: true, content, originalPath, shardedAt, missingFiles }`
   *   or `{ success: false, error }`. Never rejects.
   */
  async rebuildFromShards(shardDirectory) {
    console.log(chalk.blue(`\n🔄 Rebuilding document from shards: ${shardDirectory}`));

    try {
      const metadataPath = path.join(shardDirectory, METADATA_FILENAME);
      if (!await fs.pathExists(metadataPath)) {
        throw new Error('Shard metadata not found - cannot rebuild');
      }

      const metadata = await fs.readJson(metadataPath);
      if (!Array.isArray(metadata.files)) {
        throw new Error('Shard metadata is malformed - missing file list');
      }

      const parts = [];

      // Add index content (introduction)
      const indexPath = path.join(shardDirectory, INDEX_FILENAME);
      if (await fs.pathExists(indexPath)) {
        const preamble = extractIndexPreamble(await fs.readFile(indexPath, 'utf8'));
        if (preamble) {
          parts.push(preamble);
        }
      }

      // Add each section
      const missingFiles = [];
      for (const file of metadata.files) {
        const filePath = path.join(shardDirectory, file.filename);
        if (await fs.pathExists(filePath)) {
          const sectionContent = await fs.readFile(filePath, 'utf8');
          // Adjust heading levels back (increase by 1)
          parts.push(shiftHeadingLevels(sectionContent, 1));
        } else {
          missingFiles.push(file.filename);
          console.warn(chalk.yellow(`Warning: shard file missing, skipped: ${file.filename}`));
        }
      }

      return {
        success: true,
        content: parts.join('\n\n').trim(),
        originalPath: metadata.source,
        shardedAt: metadata.shardedAt,
        missingFiles,
      };
    } catch (error) {
      console.error(chalk.red(`❌ Rebuild failed: ${error.message}`));
      return { success: false, error: error.message };
    }
  }

  /**
   * List available sharded documents.
   *
   * @returns {Promise<object[]>} One entry per sub-directory of `docsDir` that
   *   contains a readable `.shard-metadata.json`: the metadata fields plus
   *   `name` and `path`. Empty when `docsDir` does not exist.
   */
  async listShardedDocuments() {
    const shardedDocs = [];
    if (!await fs.pathExists(this.docsDir)) {
      return shardedDocs;
    }

    const entries = await fs.readdir(this.docsDir, { withFileTypes: true });
    for (const entry of entries) {
      if (!entry.isDirectory()) {
        continue;
      }
      const docDir = path.join(this.docsDir, entry.name);
      const metadataPath = path.join(docDir, METADATA_FILENAME);
      if (!await fs.pathExists(metadataPath)) {
        continue;
      }
      try {
        const metadata = await fs.readJson(metadataPath);
        // name/path come last so metadata keys can never clobber them.
        shardedDocs.push({ ...metadata, name: entry.name, path: docDir });
      } catch (error) {
        console.warn(`Warning: Could not read metadata for ${entry.name}`);
      }
    }
    return shardedDocs;
  }

  /**
   * Interactive document management interface.
   *
   * @returns {Promise<object>} Result of the chosen action, or
   *   `{ success: true, action: 'exit' }`.
   */
  async manageDocuments() {
    console.log(chalk.bold.blue('\n📚 Document Management System'));

    const shardedDocs = await this.listShardedDocuments();
    const actions = [
      'Shard a new document',
      'List sharded documents',
      'Rebuild document from shards',
      'Validate existing shards',
      'Exit',
    ];

    const { action } = await inquirer.prompt([{
      type: 'list',
      name: 'action',
      message: 'What would you like to do?',
      choices: actions,
    }]);

    switch (action) {
      case 'Shard a new document':
        return await this.interactiveSharding();
      case 'List sharded documents':
        return this.displayShardedDocuments(shardedDocs);
      case 'Rebuild document from shards':
        return await this.interactiveRebuild(shardedDocs);
      case 'Validate existing shards':
        return await this.interactiveValidation(shardedDocs);
      default:
        return { success: true, action: 'exit' };
    }
  }

  /**
   * Prompt for a document path and shard it.
   *
   * @returns {Promise<object>} Result of `shardDocument`.
   */
  async interactiveSharding() {
    const { documentPath } = await inquirer.prompt([{
      type: 'input',
      name: 'documentPath',
      message: 'Enter the path to the document to shard:',
      validate: async (input) => {
        const candidate = input.trim();
        if (!candidate) {
          return 'Please enter a document path';
        }
        if (!await fs.pathExists(candidate)) {
          return 'File does not exist';
        }
        return true;
      },
    }]);

    // Use the same trimmed value that was validated.
    return await this.shardDocument(documentPath.trim());
  }

  /**
   * Print a summary of every sharded document.
   *
   * @param {object[]} shardedDocs - Output of `listShardedDocuments`.
   * @returns {{success: boolean, count: number}}
   */
  displayShardedDocuments(shardedDocs) {
    if (shardedDocs.length === 0) {
      console.log(chalk.yellow('No sharded documents found'));
      return { success: true, count: 0 };
    }

    console.log(chalk.bold('\n📑 Sharded Documents:'));
    shardedDocs.forEach((doc, index) => {
      // Metadata written by other tools may lack a timestamp.
      const shardedOn = typeof doc.shardedAt === 'string' ? doc.shardedAt.split('T')[0] : 'unknown';
      console.log(`\n${index + 1}. ${chalk.bold(doc.name)}`);
      console.log(chalk.dim(` Source: ${doc.source}`));
      console.log(chalk.dim(` Sharded: ${shardedOn}`));
      console.log(chalk.dim(` Sections: ${doc.sections}`));
      console.log(chalk.dim(` Method: ${doc.method}`));
    });

    return { success: true, count: shardedDocs.length };
  }

  /**
   * Let the user pick a sharded document, rebuild it, and optionally save it.
   *
   * @param {object[]} shardedDocs - Output of `listShardedDocuments`.
   * @returns {Promise<object>} Result of `rebuildFromShards`, or a failure
   *   result when there is nothing to rebuild.
   */
  async interactiveRebuild(shardedDocs) {
    if (shardedDocs.length === 0) {
      console.log(chalk.yellow('No sharded documents available for rebuild'));
      return { success: false, error: 'No sharded documents found' };
    }

    const { selectedDoc } = await inquirer.prompt([{
      type: 'list',
      name: 'selectedDoc',
      message: 'Select document to rebuild:',
      choices: shardedDocs.map((doc) => ({
        name: `${doc.name} (${doc.sections} sections)`,
        value: doc,
      })),
    }]);

    const result = await this.rebuildFromShards(selectedDoc.path);
    if (!result.success) {
      return result;
    }

    const { saveRebuilt } = await inquirer.prompt([{
      type: 'confirm',
      name: 'saveRebuilt',
      message: 'Save rebuilt document?',
      default: true,
    }]);

    if (saveRebuilt) {
      const { outputPath } = await inquirer.prompt([{
        type: 'input',
        name: 'outputPath',
        message: 'Enter output file path:',
        default: `${selectedDoc.name}-rebuilt.md`,
      }]);
      await fs.writeFile(outputPath, result.content, 'utf8');
      console.log(chalk.green(`✅ Rebuilt document saved to: ${outputPath}`));
    }

    return result;
  }

  /**
   * Let the user pick a sharded document and check that it can be rebuilt.
   *
   * @param {object[]} shardedDocs - Output of `listShardedDocuments`.
   * @returns {Promise<object>} Result of `rebuildFromShards`, or a failure
   *   result when there is nothing to validate.
   */
  async interactiveValidation(shardedDocs) {
    if (shardedDocs.length === 0) {
      console.log(chalk.yellow('No sharded documents available for validation'));
      return { success: false, error: 'No sharded documents found' };
    }

    const { selectedDoc } = await inquirer.prompt([{
      type: 'list',
      name: 'selectedDoc',
      message: 'Select document to validate:',
      choices: shardedDocs.map((doc) => ({
        name: `${doc.name} (${doc.sections} sections)`,
        value: doc,
      })),
    }]);

    // Re-validate by attempting rebuild and comparing
    const rebuildResult = await this.rebuildFromShards(selectedDoc.path);
    if (rebuildResult.success) {
      console.log(chalk.green('✅ Document shards are valid and can be rebuilt'));
      console.log(chalk.dim(`Original: ${selectedDoc.source}`));
      console.log(chalk.dim(`Rebuilt content length: ${rebuildResult.content.length} characters`));
    } else {
      console.log(chalk.red('❌ Document validation failed'));
      console.log(chalk.dim(`Error: ${rebuildResult.error}`));
    }

    return rebuildResult;
  }
}

module.exports = DocumentManager;