agentic-data-stack-community
Version:
AI Agentic Data Stack Framework - Community Edition. Open source data engineering framework with 4 core agents, essential templates, and 3-dimensional quality validation.
677 lines (570 loc) • 21 kB
JavaScript
/**
* Document Manager - Advanced Document Sharding and Management System
*
* Implements sophisticated document processing with:
* - Automatic document sharding by level 2 sections
* - Knowledge base integration and indexing
* - Document versioning and history tracking
* - Cross-reference management and link validation
* - Template-driven document generation
* - Multi-format export capabilities
*/
const chalk = require('chalk');
const inquirer = require('inquirer');
const fs = require('fs-extra');
const path = require('path');
const yaml = require('yaml');
/**
 * Splits Markdown documents into one file per level 2 (`##`) section,
 * records the result in `.shard-metadata.json`, and can list, rebuild and
 * validate previously sharded documents.
 */
class DocumentManager {
  /**
   * @param {object} [options]
   * @param {string} [options.rootDir] - Project root; defaults to `process.cwd()`.
   * @param {*} [options.templateEngine] - Stored on the instance; not read by any method of this class.
   * @param {*} [options.agentOrchestrator] - Stored on the instance; not read by any method of this class.
   */
  constructor(options = {}) {
    this.rootDir = options.rootDir || process.cwd();
    this.dataCore = path.join(this.rootDir, 'data-core');
    this.docsDir = path.join(this.rootDir, 'docs');
    this.shardCache = new Map();
    this.indexCache = new Map();
    this.templateEngine = options.templateEngine;
    this.agentOrchestrator = options.agentOrchestrator;
  }

  /**
   * Main document sharding interface.
   *
   * Tries the external `md-tree` tool when enabled in `core-config.yaml`
   * (unless `options.preferAutomatic === false`) and falls back to the
   * built-in splitter. Never throws: failures are logged and reported as
   * `{ success: false, error }`.
   *
   * @param {string} documentPath - Path of the Markdown file to shard.
   * @param {object} [options]
   * @param {string} [options.outputDir] - Destination directory override.
   * @param {boolean} [options.preferAutomatic] - Pass `false` to skip md-tree.
   * @param {boolean} [options.autoConfirm] - Skip the confirmation prompt.
   * @returns {Promise<object>} Result object with a boolean `success` field.
   */
  async shardDocument(documentPath, options = {}) {
    console.log(chalk.blue(`\nš Document Sharding: ${documentPath}`));
    try {
      if (!await fs.pathExists(documentPath)) {
        throw new Error(`Document not found: ${documentPath}`);
      }

      const content = await fs.readFile(documentPath, 'utf8');
      if (!content.trim()) {
        throw new Error('Document is empty');
      }

      const outputDir = this.determineOutputDirectory(documentPath, options.outputDir);
      console.log(chalk.dim(`Output directory: ${outputDir}`));

      const useAutomaticSharding = await this.checkAutomaticSharding();
      if (useAutomaticSharding && options.preferAutomatic !== false) {
        const result = await this.performAutomaticSharding(documentPath, outputDir);
        if (result.success) {
          return result;
        }
        console.log(chalk.yellow('ā ļø Automatic sharding failed, falling back to manual method'));
      }

      return await this.performManualSharding(documentPath, content, outputDir, options);
    } catch (error) {
      console.error(chalk.red(`ā Document sharding failed: ${error.message}`));
      return { success: false, error: error.message };
    }
  }

  /**
   * Check whether automatic sharding is enabled, i.e. whether
   * `<rootDir>/core-config.yaml` sets `markdownExploder: true`.
   *
   * @returns {Promise<boolean>} `false` when the file is missing, unreadable,
   *   unparsable, or does not enable the setting.
   */
  async checkAutomaticSharding() {
    const configPath = path.join(this.rootDir, 'core-config.yaml');
    if (!await fs.pathExists(configPath)) {
      return false;
    }

    try {
      const configContent = await fs.readFile(configPath, 'utf8');
      const config = yaml.parse(configContent);
      // The parsed value may be null (empty file) or a scalar; only an object
      // with the flag set to exactly `true` enables automatic sharding.
      if (config && config.markdownExploder === true) {
        console.log(chalk.green('ā markdownExploder enabled in core-config.yaml'));
        return true;
      }
      console.log(chalk.dim('markdownExploder is disabled in core-config.yaml'));
      return false;
    } catch (error) {
      console.log(chalk.yellow('Warning: Could not parse core-config.yaml'));
      return false;
    }
  }

  /**
   * Attempt automatic sharding by running `md-tree explode <doc> <outDir>`.
   *
   * The returned promise always resolves (never rejects) with either
   * `{ success: true, method: 'automatic', outputDir, message }` or
   * `{ success: false, error }`.
   *
   * @param {string} documentPath
   * @param {string} outputDir
   * @returns {Promise<object>}
   */
  async performAutomaticSharding(documentPath, outputDir) {
    console.log(chalk.blue('š Attempting automatic sharding with md-tree...'));
    const { spawn } = require('child_process');

    return new Promise((resolve) => {
      // 'error' and 'close' can both fire for one child; report only once.
      let settled = false;
      const finish = (result) => {
        if (!settled) {
          settled = true;
          resolve(result);
        }
      };
      const printInstallHint = () => {
        console.log(chalk.yellow('š¦ md-tree not installed. Install with:'));
        console.log(chalk.dim('npm install -g @kayvan/markdown-tree-parser'));
      };

      let child;
      try {
        // stdin/stdout are not used, so do not leave open pipes the child could block on.
        child = spawn('md-tree', ['explode', documentPath, outputDir], {
          stdio: ['ignore', 'ignore', 'pipe']
        });
      } catch (error) {
        // spawn throws synchronously on invalid arguments.
        finish({ success: false, error: error.message });
        return;
      }

      let stderr = '';
      child.stderr.on('data', (data) => {
        stderr += data.toString();
      });

      child.on('error', (error) => {
        if (settled) {
          return;
        }
        console.log(chalk.red('ā Failed to execute md-tree command'));
        // A missing executable is reported as an 'error' event with code ENOENT.
        if (error.code === 'ENOENT') {
          printInstallHint();
        }
        finish({ success: false, error: error.message });
      });

      child.on('close', (code) => {
        if (settled) {
          return;
        }
        if (code === 0) {
          console.log(chalk.green('✅ Automatic sharding completed successfully!'));
          finish({
            success: true,
            method: 'automatic',
            outputDir,
            message: 'Document sharded using md-tree explode command'
          });
          return;
        }
        console.log(chalk.red('ā md-tree command failed'));
        if (stderr.includes('command not found') || stderr.includes('not available')) {
          printInstallHint();
        }
        finish({ success: false, error: stderr || 'md-tree command failed' });
      });
    });
  }

  /**
   * Manual document sharding implementation.
   *
   * Writes one file per level 2 section (headings shifted up one level),
   * an `index.md`, and `.shard-metadata.json` into `outputDir`, then runs
   * `validateSharding` on the result.
   *
   * @param {string} documentPath - Source path (recorded in the metadata).
   * @param {string} content - Full text of the source document.
   * @param {string} outputDir - Directory to write into (created if needed).
   * @param {object} [options]
   * @param {boolean} [options.autoConfirm] - Skip the confirmation prompt.
   * @returns {Promise<object>} Result with `success`, or
   *   `{ success: false, cancelled: true }` when the user declines.
   * @throws {Error} When the document has no level 2 sections.
   */
  async performManualSharding(documentPath, content, outputDir, options = {}) {
    console.log(chalk.blue('š§ Performing manual document sharding...'));

    const sections = this.parseDocumentSections(content);
    if (sections.length === 0) {
      throw new Error('No level 2 sections found to shard');
    }
    console.log(chalk.green(`š Found ${sections.length} sections to shard`));

    console.log(chalk.bold('\nSections to be created:'));
    sections.forEach((section, index) => {
      console.log(chalk.dim(` ${index + 1}. ${section.title} ā ${section.filename}`));
    });

    if (!options.autoConfirm) {
      const proceed = await inquirer.prompt([{
        type: 'confirm',
        name: 'shard',
        message: 'Proceed with document sharding?',
        default: true
      }]);
      if (!proceed.shard) {
        return { success: false, cancelled: true };
      }
    }

    await fs.ensureDir(outputDir);

    const createdFiles = [];
    const metadata = this.extractDocumentMetadata(content);
    for (const section of sections) {
      const adjustedContent = this.adjustHeadingLevels(section.content);
      const filePath = path.join(outputDir, section.filename);
      await fs.writeFile(filePath, adjustedContent, 'utf8');
      createdFiles.push({
        filename: section.filename,
        title: section.title,
        path: filePath,
        // Count on any whitespace run so newlines and tabs separate words too.
        wordCount: adjustedContent.split(/\s+/).filter(Boolean).length
      });
      console.log(chalk.green(`ā Created: ${section.filename}`));
    }

    const indexContent = this.generateIndexFile(metadata, createdFiles);
    const indexPath = path.join(outputDir, 'index.md');
    await fs.writeFile(indexPath, indexContent, 'utf8');
    console.log(chalk.green('ā Created: index.md'));

    // Tracking file consumed by listShardedDocuments and rebuildFromShards.
    const metadataFile = {
      source: documentPath,
      shardedAt: new Date().toISOString(),
      method: 'manual',
      sections: createdFiles.length,
      files: createdFiles,
      metadata
    };
    await fs.writeJson(path.join(outputDir, '.shard-metadata.json'), metadataFile, { spaces: 2 });

    const validation = await this.validateSharding(content, outputDir, createdFiles);

    console.log(chalk.green.bold('\nš Document sharded successfully!'));
    console.log(chalk.blue('š Summary:'));
    console.log(chalk.dim(` ⢠Source: ${documentPath}`));
    console.log(chalk.dim(` ⢠Destination: ${outputDir}`));
    console.log(chalk.dim(` ⢠Files created: ${createdFiles.length + 1} (including index.md)`));
    console.log(chalk.dim(` ⢠Total sections: ${sections.length}`));
    console.log(chalk.dim(` ⢠Validation: ${validation.passed ? 'Passed' : 'Failed'}`));

    return {
      success: true,
      method: 'manual',
      outputDir,
      files: createdFiles,
      indexFile: indexPath,
      metadata: metadataFile,
      validation
    };
  }

  /**
   * Mark which lines belong to fenced code blocks.
   *
   * A fence opens on a line whose trimmed text starts with three or more
   * backticks or tildes, and closes on a line made only of the same
   * character, at least as long as the opener. An unclosed fence extends to
   * the end of the input.
   *
   * @param {string[]} lines
   * @returns {boolean[]} `true` at index i when line i is a fence delimiter
   *   or lies inside a fenced block.
   */
  computeFenceMask(lines) {
    const mask = new Array(lines.length).fill(false);
    let openFence = null; // { char, length } while inside a fenced block

    for (let i = 0; i < lines.length; i++) {
      const trimmed = lines[i].trim();
      const marker = trimmed.match(/^(`{3,}|~{3,})/);

      if (openFence === null) {
        if (marker) {
          const fenceChar = marker[1][0];
          const infoString = trimmed.slice(marker[1].length);
          // A backtick fence cannot carry backticks in its info string;
          // such a line is inline code, not a fence opener.
          if (fenceChar !== '`' || !infoString.includes('`')) {
            openFence = { char: fenceChar, length: marker[1].length };
            mask[i] = true;
          }
        }
        continue;
      }

      mask[i] = true;
      const closesFence = marker !== null &&
        marker[1][0] === openFence.char &&
        marker[1].length >= openFence.length &&
        trimmed.length === marker[1].length;
      if (closesFence) {
        openFence = null;
      }
    }
    return mask;
  }

  /**
   * Index of the first level 2 heading outside fenced code blocks.
   *
   * @param {string[]} lines
   * @param {boolean[]} [insideFence] - Precomputed mask from computeFenceMask.
   * @returns {number} Line index, or -1 when there is none.
   */
  findFirstSectionIndex(lines, insideFence = this.computeFenceMask(lines)) {
    return lines.findIndex((line, i) => !insideFence[i] && /^##\s+.+/.test(line));
  }

  /**
   * Return `filename`, or a `-2`, `-3`, ... suffixed variant when it is
   * already present in `used`; the returned name is added to `used`.
   *
   * @param {string} filename
   * @param {Set<string>} used
   * @returns {string}
   */
  reserveUniqueFilename(filename, used) {
    const ext = path.extname(filename);
    const stem = filename.slice(0, filename.length - ext.length);
    let candidate = filename;
    for (let n = 2; used.has(candidate); n++) {
      candidate = `${stem}-${n}${ext}`;
    }
    used.add(candidate);
    return candidate;
  }

  /**
   * Parse document into sections by level 2 headings.
   *
   * Headings inside fenced code blocks are ignored. Text before the first
   * level 2 heading is not part of any section. Filenames are unique within
   * the result and never equal `index.md`, which is reserved for the
   * generated index.
   *
   * @param {string} content
   * @returns {Array<{title: string, filename: string, lines: string[], startLine: number, content: string}>}
   */
  parseDocumentSections(content) {
    const lines = content.split('\n');
    const insideFence = this.computeFenceMask(lines);
    const usedFilenames = new Set(['index.md']);
    const sections = [];
    let currentSection = null;

    const closeCurrentSection = () => {
      if (currentSection) {
        sections.push({
          ...currentSection,
          content: currentSection.lines.join('\n').trim()
        });
      }
    };

    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];
      const isSectionHeading = !insideFence[i] && /^##\s+(.+)/.test(line);

      if (isSectionHeading) {
        closeCurrentSection();
        const title = line.replace(/^##\s+/, '').trim();
        currentSection = {
          title,
          filename: this.reserveUniqueFilename(this.generateFilename(title), usedFilenames),
          lines: [line],
          startLine: i
        };
      } else if (currentSection) {
        currentSection.lines.push(line);
      }
    }
    closeCurrentSection();

    return sections;
  }

  /**
   * Generate filename from section title: lower-cased, ASCII letters,
   * digits and hyphens only, whitespace collapsed to single hyphens.
   * Titles that contain none of those characters yield `section.md`.
   *
   * @param {string} title
   * @returns {string} Filename ending in `.md`.
   */
  generateFilename(title) {
    const slug = String(title)
      .toLowerCase()
      .replace(/[^a-z0-9\s-]/g, '')
      .replace(/\s+/g, '-')
      .replace(/-+/g, '-')
      .replace(/^-|-$/g, '');
    return `${slug || 'section'}.md`;
  }

  /**
   * Shift Markdown heading levels by `delta` (default: decrease by 1).
   *
   * Only lines that start with `#` characters followed by a space or tab are
   * touched; lines inside fenced code blocks are left alone so `#` comments
   * in code samples survive. Levels never drop below 1.
   *
   * @param {string} content
   * @param {number} [delta=-1] - Levels to add (positive) or remove (negative).
   * @returns {string}
   */
  adjustHeadingLevels(content, delta = -1) {
    const lines = content.split('\n');
    const insideFence = this.computeFenceMask(lines);

    return lines
      .map((line, i) => {
        if (insideFence[i]) {
          return line;
        }
        return line.replace(/^(#+)([ \t])/, (match, hashes, gap) => {
          const level = Math.max(1, hashes.length + delta);
          return '#'.repeat(level) + gap;
        });
      })
      .join('\n');
  }

  /**
   * Extract document metadata.
   *
   * @param {string} content
   * @returns {{title: string, firstSection: number, totalLines: number, introduction: string}}
   *   `title` is the first level 1 heading outside code fences (or
   *   'Untitled Document'); `firstSection` is the line index of the first
   *   level 2 heading, or -1.
   */
  extractDocumentMetadata(content) {
    const lines = content.split('\n');
    const insideFence = this.computeFenceMask(lines);
    const titleLine = lines.find((line, i) => !insideFence[i] && /^#\s+.+/.test(line));

    return {
      title: titleLine ? titleLine.replace(/^#\s+/, '').trim() : 'Untitled Document',
      firstSection: this.findFirstSectionIndex(lines, insideFence),
      totalLines: lines.length,
      introduction: this.extractIntroduction(content)
    };
  }

  /**
   * Extract introduction content: everything before the first level 2
   * section (including a level 1 title line, if present), trimmed.
   *
   * @param {string} content
   * @returns {string} Empty string when the document has no level 2 section.
   */
  extractIntroduction(content) {
    const lines = content.split('\n');
    const firstSectionIndex = this.findFirstSectionIndex(lines);
    if (firstSectionIndex === -1) {
      return '';
    }
    return lines.slice(0, firstSectionIndex).join('\n').trim();
  }

  /**
   * Generate index file content: title, introduction, a link list of the
   * shard files and a footer with the sharding date and section count.
   *
   * @param {{title: string, introduction?: string}} metadata
   * @param {Array<{title: string, filename: string}>} files
   * @returns {string}
   */
  generateIndexFile(metadata, files) {
    let introduction = metadata.introduction || '';

    // The extracted introduction normally begins with the document's own
    // title heading; drop it so the index does not print the title twice.
    const introLines = introduction.split('\n');
    const firstLine = introLines[0];
    if (/^#\s+/.test(firstLine) && firstLine.replace(/^#\s+/, '').trim() === metadata.title) {
      introduction = introLines.slice(1).join('\n').trim();
    }

    let content = `# ${metadata.title}\n\n`;
    if (introduction) {
      content += `${introduction}\n\n`;
    }
    content += `## Sections\n\n`;
    files.forEach(file => {
      content += `- [${file.title}](${file.filename})\n`;
    });
    content += `\n---\n\n`;
    content += `*Document sharded on ${new Date().toISOString().split('T')[0]}*\n`;
    content += `*Total sections: ${files.length}*\n`;
    return content;
  }

  /**
   * Determine output directory.
   *
   * @param {string} documentPath
   * @param {string} [customOutputDir] - Used (resolved to absolute) when given.
   * @returns {string} Otherwise `<docsDir>/<document basename without extension>`.
   */
  determineOutputDirectory(documentPath, customOutputDir) {
    if (customOutputDir) {
      return path.resolve(customOutputDir);
    }
    const baseName = path.basename(documentPath, path.extname(documentPath));
    return path.join(this.docsDir, baseName);
  }

  /**
   * Validate sharding results.
   *
   * Checks that every shard file and `index.md` exist, that at least 80% of
   * the original words are preserved, and that the number of code fence
   * markers is unchanged. The introduction is counted as preserved because
   * it is written to `index.md` rather than to a shard.
   *
   * @param {string} originalContent
   * @param {string} outputDir
   * @param {Array<{path: string, filename: string}>} createdFiles
   * @returns {Promise<{passed: boolean, warnings: string[], contentRatio: number}>}
   */
  async validateSharding(originalContent, outputDir, createdFiles) {
    const warnings = [];
    const countWords = (text) => text.split(/\s+/).filter(Boolean).length;
    let preservedContent = this.extractIntroduction(originalContent) + '\n';

    try {
      for (const file of createdFiles) {
        if (await fs.pathExists(file.path)) {
          const shardContent = await fs.readFile(file.path, 'utf8');
          preservedContent += shardContent + '\n';
        } else {
          warnings.push(`File not found: ${file.filename}`);
        }
      }

      const indexPath = path.join(outputDir, 'index.md');
      if (!await fs.pathExists(indexPath)) {
        warnings.push('Index file not created');
      }
    } catch (error) {
      warnings.push(`Validation error: ${error.message}`);
    }

    const originalWords = countWords(originalContent);
    const contentRatio = originalWords === 0 ? 1 : countWords(preservedContent) / originalWords;
    if (contentRatio < 0.8) {
      warnings.push('Significant content loss detected (>20%)');
    }

    const originalCodeBlocks = (originalContent.match(/```/g) || []).length;
    const shardedCodeBlocks = (preservedContent.match(/```/g) || []).length;
    if (originalCodeBlocks !== shardedCodeBlocks) {
      warnings.push('Code block count mismatch - possible parsing error');
    }

    return {
      passed: warnings.length === 0,
      warnings,
      contentRatio
    };
  }

  /**
   * Recover the text that preceded the first section of a sharded document.
   *
   * Prefers the introduction recorded in the shard metadata (the exact
   * original text); falls back to parsing `index.md` when the metadata does
   * not carry one. Edits made to the introduction inside `index.md` are
   * therefore only picked up in the fallback case.
   *
   * @param {string} shardDirectory
   * @param {object} shardInfo - Parsed `.shard-metadata.json`.
   * @returns {Promise<string>}
   */
  async recoverPreamble(shardDirectory, shardInfo) {
    const recorded = shardInfo.metadata && shardInfo.metadata.introduction;
    if (typeof recorded === 'string') {
      return recorded;
    }

    const indexPath = path.join(shardDirectory, 'index.md');
    if (!await fs.pathExists(indexPath)) {
      return '';
    }
    const indexContent = await fs.readFile(indexPath, 'utf8');
    const match = indexContent.match(/^([\s\S]*?)\n+## Sections\n/);
    if (!match) {
      return '';
    }
    // Index files may repeat the title heading twice in a row; keep one copy.
    return match[1].replace(/^(# .+)\n+\1(?=\n|$)/, '$1').trim();
  }

  /**
   * Rebuild document from shards.
   *
   * Never throws: failures are logged and reported as
   * `{ success: false, error }`.
   *
   * @param {string} shardDirectory - Directory containing `.shard-metadata.json`.
   * @returns {Promise<object>} On success: `{ success: true, content,
   *   originalPath, shardedAt, missingFiles }`, where `missingFiles` lists
   *   shard filenames named in the metadata that were not found on disk.
   */
  async rebuildFromShards(shardDirectory) {
    console.log(chalk.blue(`\nš Rebuilding document from shards: ${shardDirectory}`));
    try {
      const metadataPath = path.join(shardDirectory, '.shard-metadata.json');
      if (!await fs.pathExists(metadataPath)) {
        throw new Error('Shard metadata not found - cannot rebuild');
      }
      const metadata = await fs.readJson(metadataPath);
      if (!Array.isArray(metadata.files)) {
        throw new Error('Shard metadata has no file list - cannot rebuild');
      }

      const parts = [];
      const preamble = await this.recoverPreamble(shardDirectory, metadata);
      if (preamble) {
        parts.push(preamble);
      }

      const missingFiles = [];
      for (const file of metadata.files) {
        const filePath = path.join(shardDirectory, file.filename);
        if (!await fs.pathExists(filePath)) {
          missingFiles.push(file.filename);
          continue;
        }
        const sectionContent = await fs.readFile(filePath, 'utf8');
        // Undo the one-level heading shift applied when the shard was written.
        parts.push(this.adjustHeadingLevels(sectionContent, 1));
      }

      if (missingFiles.length > 0) {
        console.warn(chalk.yellow(`Warning: missing shard file(s): ${missingFiles.join(', ')}`));
      }

      return {
        success: true,
        content: parts.join('\n\n').trim(),
        originalPath: metadata.source,
        shardedAt: metadata.shardedAt,
        missingFiles
      };
    } catch (error) {
      console.error(chalk.red(`ā Rebuild failed: ${error.message}`));
      return { success: false, error: error.message };
    }
  }

  /**
   * List available sharded documents: every direct subdirectory of
   * `docsDir` that contains a readable `.shard-metadata.json`.
   *
   * @returns {Promise<object[]>} Metadata objects extended with `name` and `path`.
   */
  async listShardedDocuments() {
    const shardedDocs = [];
    if (!await fs.pathExists(this.docsDir)) {
      return shardedDocs;
    }

    const entries = await fs.readdir(this.docsDir, { withFileTypes: true });
    for (const entry of entries) {
      if (!entry.isDirectory()) {
        continue;
      }
      const docDir = path.join(this.docsDir, entry.name);
      const metadataPath = path.join(docDir, '.shard-metadata.json');
      if (!await fs.pathExists(metadataPath)) {
        continue;
      }
      try {
        const metadata = await fs.readJson(metadataPath);
        shardedDocs.push({
          name: entry.name,
          path: docDir,
          ...metadata
        });
      } catch (error) {
        console.warn(`Warning: Could not read metadata for ${entry.name}`);
      }
    }
    return shardedDocs;
  }

  /**
   * Interactive document management interface: prompts for an action and
   * dispatches to the matching interactive helper.
   *
   * @returns {Promise<object>} The chosen helper's result, or
   *   `{ success: true, action: 'exit' }`.
   */
  async manageDocuments() {
    console.log(chalk.bold.blue('\nš Document Management System'));
    const shardedDocs = await this.listShardedDocuments();
    const actions = [
      'Shard a new document',
      'List sharded documents',
      'Rebuild document from shards',
      'Validate existing shards',
      'Exit'
    ];
    const { action } = await inquirer.prompt([{
      type: 'list',
      name: 'action',
      message: 'What would you like to do?',
      choices: actions
    }]);

    switch (action) {
      case 'Shard a new document':
        return await this.interactiveSharding();
      case 'List sharded documents':
        return this.displayShardedDocuments(shardedDocs);
      case 'Rebuild document from shards':
        return await this.interactiveRebuild(shardedDocs);
      case 'Validate existing shards':
        return await this.interactiveValidation(shardedDocs);
      default:
        return { success: true, action: 'exit' };
    }
  }

  /**
   * Prompt for a document path (must exist) and shard it.
   *
   * @returns {Promise<object>} Result of shardDocument.
   */
  async interactiveSharding() {
    const { documentPath } = await inquirer.prompt([{
      type: 'input',
      name: 'documentPath',
      message: 'Enter the path to the document to shard:',
      validate: async (input) => {
        if (!input.trim()) return 'Please enter a document path';
        if (!await fs.pathExists(input)) return 'File does not exist';
        return true;
      }
    }]);
    return await this.shardDocument(documentPath);
  }

  /**
   * Print a summary of each sharded document.
   *
   * @param {object[]} shardedDocs - Entries as returned by listShardedDocuments.
   * @returns {{success: boolean, count: number}}
   */
  displayShardedDocuments(shardedDocs) {
    if (shardedDocs.length === 0) {
      console.log(chalk.yellow('No sharded documents found'));
      return { success: true, count: 0 };
    }

    console.log(chalk.bold('\nš Sharded Documents:'));
    shardedDocs.forEach((doc, index) => {
      // Metadata files can be edited by hand; do not crash on a missing timestamp.
      const shardedOn = typeof doc.shardedAt === 'string'
        ? doc.shardedAt.split('T')[0]
        : 'unknown';
      console.log(`\n${index + 1}. ${chalk.bold(doc.name)}`);
      console.log(chalk.dim(` Source: ${doc.source}`));
      console.log(chalk.dim(` Sharded: ${shardedOn}`));
      console.log(chalk.dim(` Sections: ${doc.sections}`));
      console.log(chalk.dim(` Method: ${doc.method}`));
    });
    return { success: true, count: shardedDocs.length };
  }

  /**
   * Let the user pick a sharded document, rebuild it, and optionally save
   * the rebuilt text to a file.
   *
   * @param {object[]} shardedDocs - Entries as returned by listShardedDocuments.
   * @returns {Promise<object>} Result of rebuildFromShards, or a failure
   *   object when no documents are available.
   */
  async interactiveRebuild(shardedDocs) {
    if (shardedDocs.length === 0) {
      console.log(chalk.yellow('No sharded documents available for rebuild'));
      return { success: false, error: 'No sharded documents found' };
    }

    const { selectedDoc } = await inquirer.prompt([{
      type: 'list',
      name: 'selectedDoc',
      message: 'Select document to rebuild:',
      choices: shardedDocs.map(doc => ({
        name: `${doc.name} (${doc.sections} sections)`,
        value: doc
      }))
    }]);

    const result = await this.rebuildFromShards(selectedDoc.path);
    if (!result.success) {
      return result;
    }

    const { saveRebuilt } = await inquirer.prompt([{
      type: 'confirm',
      name: 'saveRebuilt',
      message: 'Save rebuilt document?',
      default: true
    }]);
    if (saveRebuilt) {
      const { outputPath } = await inquirer.prompt([{
        type: 'input',
        name: 'outputPath',
        message: 'Enter output file path:',
        default: `${selectedDoc.name}-rebuilt.md`
      }]);
      await fs.writeFile(outputPath, result.content, 'utf8');
      console.log(chalk.green(`✅ Rebuilt document saved to: ${outputPath}`));
    }
    return result;
  }

  /**
   * Let the user pick a sharded document and check that it can be rebuilt.
   * The rebuilt text is not compared against the original source file.
   *
   * @param {object[]} shardedDocs - Entries as returned by listShardedDocuments.
   * @returns {Promise<object>} Result of rebuildFromShards, or a failure
   *   object when no documents are available.
   */
  async interactiveValidation(shardedDocs) {
    if (shardedDocs.length === 0) {
      console.log(chalk.yellow('No sharded documents available for validation'));
      return { success: false, error: 'No sharded documents found' };
    }

    const { selectedDoc } = await inquirer.prompt([{
      type: 'list',
      name: 'selectedDoc',
      message: 'Select document to validate:',
      choices: shardedDocs.map(doc => ({
        name: `${doc.name} (${doc.sections} sections)`,
        value: doc
      }))
    }]);

    const rebuildResult = await this.rebuildFromShards(selectedDoc.path);
    if (rebuildResult.success) {
      console.log(chalk.green('✅ Document shards are valid and can be rebuilt'));
      console.log(chalk.dim(`Original: ${selectedDoc.source}`));
      console.log(chalk.dim(`Rebuilt content length: ${rebuildResult.content.length} characters`));
    } else {
      console.log(chalk.red('ā Document validation failed'));
      console.log(chalk.dim(`Error: ${rebuildResult.error}`));
    }
    return rebuildResult;
  }
}
// Export the DocumentManager class itself as this module's CommonJS export value.
module.exports = DocumentManager;