codesummary

Cross-platform CLI tool that generates professional PDF documentation and RAG-optimized JSON outputs from project source code. Perfect for code reviews, audits, documentation, and AI/ML applications with semantic chunking and precision offsets.
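The core of the RAG side is the RagGenerator class in src/ragGenerator.js, shown below. A minimal sketch of driving it directly; the import specifier and the shape of the scanner's filesByExtension records are assumptions inferred from this source, not a documented public API:

  import { RagGenerator } from 'codesummary/src/ragGenerator.js'; // hypothetical specifier

  const generator = new RagGenerator({ maxTokensPerChunk: 1000, overlapTokens: 200 });

  // Assumed scanner shape: extension -> array of { relativePath, absolutePath }
  const filesByExtension = {
    '.js': [{ relativePath: 'src/index.js', absolutePath: '/proj/src/index.js' }]
  };

  const result = await generator.generateRagOutput(
    filesByExtension,   // files grouped by extension
    ['.js'],            // selected extensions to process
    './out/rag.json',   // output JSON path
    'my-project',       // project name
    '/proj'             // root scan path
  );
  console.log(`${result.totalFiles} files, ${result.totalChunks} chunks in ${result.duration}s`);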

1,325 lines (1,112 loc) 64.2 kB
import fs from 'fs-extra';
import path from 'path';
import crypto from 'crypto';
import os from 'os';
import { createReadStream } from 'fs';
import ErrorHandler from './errorHandler.js';
import ragConfig from './ragConfig.js';

/**
 * Professional RAG Generator for CodeSummary
 * Generates streaming JSON output optimized for vector database ingestion
 * Follows deterministic, AI-free approach with efficient memory usage
 */
export class RagGenerator {
  constructor(config = {}) {
    this.config = config;

    // Global parameters
    this.maxTokensPerChunk = config.maxTokensPerChunk || 1000;
    this.overlapTokens = config.overlapTokens || 200;
    this.maxWorkers = Math.min(config.maxWorkers || 8, os.cpus().length);

    // Extension to language mapping (deterministic)
    this.extensionToLanguage = {
      '.js': 'JavaScript', '.jsx': 'JavaScript',
      '.ts': 'TypeScript', '.tsx': 'TypeScript',
      '.py': 'Python', '.java': 'Java', '.cs': 'C#',
      '.cpp': 'C++', '.c': 'C', '.h': 'C/C++',
      '.html': 'HTML', '.xml': 'XML',
      '.css': 'CSS', '.scss': 'SCSS',
      '.json': 'JSON', '.yaml': 'YAML', '.yml': 'YAML',
      '.md': 'Markdown', '.txt': 'Text',
      '.sh': 'Shell', '.bat': 'Batch'
    };

    // Initialize handlers
    this.handlers = this.initializeHandlers();

    // Statistics tracking
    this.stats = {
      filesProcessed: 0,
      chunksGenerated: 0,
      bytesWritten: 0,
      startTime: null,
      endTime: null
    };

    // Error collection
    this.errors = [];
  }

  /**
   * Main entry point - generates streaming RAG JSON
   * @param {object} filesByExtension - Files grouped by extension
   * @param {Array} selectedExtensions - Selected extensions to process
   * @param {string} outputPath - Output JSON file path
   * @param {string} projectName - Project name
   * @param {string} scanPath - Root scan path
   * @returns {object} Generation result
   */
  async generateRagOutput(filesByExtension, selectedExtensions, outputPath, projectName, scanPath) {
    this.stats.startTime = Date.now();
    try {
      console.log(`🚀 Starting RAG generation for ${projectName}`);

      // Load RAG configuration
      const config = await ragConfig.loadConfig();
      this.updateConfigFromYAML(config);

      // Display configuration
      ragConfig.displayConfig();

      // Phase 1: Discovery and file preparation
      const discoveredFiles = await this.discoveryPhase(filesByExtension, selectedExtensions, scanPath);

      // Phase 2: Atomic JSON generation (thread-safe)
      const result = await this.generate(discoveredFiles, outputPath, projectName, scanPath);

      this.stats.endTime = Date.now();
      const duration = (this.stats.endTime - this.stats.startTime) / 1000;
      console.log(`✅ RAG generation completed in ${duration.toFixed(2)}s`);
      console.log(`📊 Stats: ${this.stats.filesProcessed} files, ${this.stats.chunksGenerated} chunks`);

      return {
        outputPath,
        totalFiles: this.stats.filesProcessed,
        totalChunks: this.stats.chunksGenerated,
        duration,
        success: true
      };
    } catch (error) {
      ErrorHandler.handleError(error, 'RAG Generation');
      throw error;
    }
  }

  /**
   * Update internal configuration from loaded YAML config
   * @param {object} yamlConfig - Configuration from YAML
   */
  updateConfigFromYAML(yamlConfig) {
    if (yamlConfig.chunking) {
      this.maxTokensPerChunk = yamlConfig.chunking.maxTokens || this.maxTokensPerChunk;
      this.overlapTokens = yamlConfig.chunking.overlap || this.overlapTokens;
    }
    if (yamlConfig.performance) {
      this.maxWorkers = Math.min(
        yamlConfig.performance.maxWorkers || this.maxWorkers,
        os.cpus().length
      );
    }
    // Store full config for handlers to use
    this.yamlConfig = yamlConfig;
  }

  /**
   * Phase 1: Discovery - BFS traversal and file metadata collection
   * @param {object} filesByExtension - Files by extension
   * @param {Array} selectedExtensions - Selected extensions
   * @param {string} scanPath - Root scan path
   * @returns {Array} Discovered files with metadata
   */
  async discoveryPhase(filesByExtension, selectedExtensions, scanPath) {
    console.log('🔍 Discovery phase: collecting file metadata...');
    const discoveredFiles = [];
    let processed = 0;
    const totalFiles = selectedExtensions.reduce(
      (sum, ext) => sum + (filesByExtension[ext]?.length || 0), 0
    );

    // Process files concurrently but limit memory usage
    const batchSize = 50;
    for (const extension of selectedExtensions) {
      const files = filesByExtension[extension] || [];
      for (let i = 0; i < files.length; i += batchSize) {
        const batch = files.slice(i, i + batchSize);
        const batchResults = await Promise.all(
          batch.map(fileInfo => this.enrichFileMetadata(fileInfo, extension, scanPath))
        );
        discoveredFiles.push(...batchResults.filter(Boolean));
        processed += batch.length;

        // Progress reporting with validation
        const progress = (processed / totalFiles * 100).toFixed(1);
        const validFiles = discoveredFiles.length;
        const skippedFiles = processed - validFiles;
        process.stdout.write(`\r📊 Discovery: ${progress}% (${validFiles} valid, ${skippedFiles} skipped)`);

        // Internal validation
        if (processed % 50 === 0) {
          this.validateDiscoveryProgress(discoveredFiles, processed);
        }
      }
    }

    console.log(`\n✅ Discovery completed: ${discoveredFiles.length} files enriched`);
    return discoveredFiles;
  }

  /**
   * Enrich file with metadata including hash, tags, and analysis
   * @param {object} fileInfo - Basic file info from scanner
   * @param {string} extension - File extension
   * @param {string} scanPath - Root scan path
   * @returns {object} Enriched file metadata
   */
  async enrichFileMetadata(fileInfo, extension, scanPath) {
    try {
      // Calculate SHA-256 hash in streaming mode
      const hash = await this.calculateFileHash(fileInfo.absolutePath);

      // Determine language and tags
      const language = this.extensionToLanguage[extension] || 'Unknown';
      const tags = this.extractFileTags(fileInfo.relativePath, extension);

      // Basic file stats
      const stats = await fs.stat(fileInfo.absolutePath);

      return {
        id: hash.substring(0, 16), // Use first 16 chars of hash as unique ID
        path: fileInfo.relativePath,
        absolutePath: fileInfo.absolutePath,
        extension,
        language,
        size: stats.size,
        hash: `sha256-${hash}`,
        modified: stats.mtime.toISOString(),
        tags,
        // Will be populated during chunking
        chunks: null,
        // Metadata for processing
        _stats: stats
      };
    } catch (error) {
      console.warn(`⚠️ Could not process file ${fileInfo.relativePath}: ${error.message}`);
      return null;
    }
  }

  /**
   * Calculate SHA-256 hash of file in streaming mode
   * @param {string} filePath - File path
   * @returns {string} SHA-256 hash (hex)
   */
  async calculateFileHash(filePath) {
    return new Promise((resolve, reject) => {
      const hash = crypto.createHash('sha256');
      const stream = createReadStream(filePath);
      stream.on('data', data => hash.update(data));
      stream.on('end', () => resolve(hash.digest('hex')));
      stream.on('error', reject);
    });
  }

  /**
   * Extract file tags based on path heuristics
   * @param {string} relativePath - Relative file path
   * @param {string} extension - File extension
   * @returns {Array} Array of tags
   */
  extractFileTags(relativePath, extension) {
    const tags = [];
    const pathLower = relativePath.toLowerCase();
    const fileName = path.basename(relativePath, extension).toLowerCase();
    const fullPath = relativePath.toLowerCase();

    // Path-based tags (enhanced)
    if (pathLower.includes('/test/') || pathLower.includes('\\test\\')) tags.push('test');
    if (pathLower.includes('/spec/') || pathLower.includes('\\spec\\')) tags.push('test');
    if (pathLower.includes('/__tests__/') || pathLower.includes('\\__tests__\\')) tags.push('test');
    if (pathLower.includes('/scripts/') || pathLower.includes('\\scripts\\')) tags.push('script');
    if (pathLower.includes('/config/') || pathLower.includes('\\config\\')) tags.push('config');
    if (pathLower.includes('/lib/') || pathLower.includes('\\lib\\')) tags.push('library');
    if (pathLower.includes('/utils/') || pathLower.includes('\\utils\\')) tags.push('utility');
    if (pathLower.includes('/helpers/') || pathLower.includes('\\helpers\\')) tags.push('utility');

    // Framework-specific tags
    if (pathLower.includes('/pages/') || pathLower.includes('\\pages\\')) tags.push('page');
    if (pathLower.includes('/components/') || pathLower.includes('\\components\\')) tags.push('component');
    if (pathLower.includes('/shared/') || pathLower.includes('\\shared\\')) tags.push('shared');
    if (pathLower.includes('/common/') || pathLower.includes('\\common\\')) tags.push('shared');
    if (pathLower.includes('/hooks/') || pathLower.includes('\\hooks\\')) tags.push('hook');
    if (pathLower.includes('/services/') || pathLower.includes('\\services\\')) tags.push('service');
    if (pathLower.includes('/api/') || pathLower.includes('\\api\\')) tags.push('api');
    if (pathLower.includes('/routes/') || pathLower.includes('\\routes\\')) tags.push('route');
    if (pathLower.includes('/controllers/') || pathLower.includes('\\controllers\\')) tags.push('controller');
    if (pathLower.includes('/models/') || pathLower.includes('\\models\\')) tags.push('model');
    if (pathLower.includes('/views/') || pathLower.includes('\\views\\')) tags.push('view');
    if (pathLower.includes('/layouts/') || pathLower.includes('\\layouts\\')) tags.push('layout');
    if (pathLower.includes('/middleware/') || pathLower.includes('\\middleware\\')) tags.push('middleware');

    // Build and tooling
    if (pathLower.includes('/build/') || pathLower.includes('\\build\\')) tags.push('build');
    if (pathLower.includes('/dist/') || pathLower.includes('\\dist\\')) tags.push('build');
    if (pathLower.includes('/.github/') || pathLower.includes('\\.github\\')) tags.push('ci');
    if (pathLower.includes('/workflows/') || pathLower.includes('\\workflows\\')) tags.push('ci');

    // Filename-based tags (enhanced)
    if (fileName.includes('config')) tags.push('config');
    if (fileName.includes('test') || fileName.includes('spec')) tags.push('test');
    if (fileName.includes('index')) tags.push('entry');
    if (fileName.includes('main')) tags.push('entry');
    if (fileName.includes('app')) tags.push('application');
    if (fileName.includes('component')) tags.push('component');
    if (fileName.includes('page')) tags.push('page');
    if (fileName.includes('layout')) tags.push('layout');
    if (fileName.includes('service')) tags.push('service');
    if (fileName.includes('util') || fileName.includes('helper')) tags.push('utility');
    if (fileName.includes('hook')) tags.push('hook');
    if (fileName.includes('api')) tags.push('api');
    if (fileName.includes('route')) tags.push('route');
    if (fileName.includes('model')) tags.push('model');
    if (fileName.includes('controller')) tags.push('controller');
    if (fileName.includes('middleware')) tags.push('middleware');
    if (fileName.includes('store') || fileName.includes('state')) tags.push('state');
    if (fileName.includes('context')) tags.push('context');
    if (fileName.includes('provider')) tags.push('provider');

    // Extension-based tags (enhanced)
    if (['.test.js', '.spec.js', '.test.ts', '.spec.ts', '.test.tsx', '.spec.tsx'].some(ext => fullPath.endsWith(ext))) {
      tags.push('test');
    }
    if (['.d.ts'].some(ext => fullPath.endsWith(ext))) {
      tags.push('types');
    }
    if (['.stories.js', '.stories.ts', '.stories.tsx'].some(ext => fullPath.endsWith(ext))) {
      tags.push('storybook');
    }
    if (['.cy.js', '.cy.ts'].some(ext => fullPath.endsWith(ext))) {
      tags.push('e2e');
    }

    // Framework detection
    if (extension === '.tsx' || extension === '.jsx') {
      tags.push('react');
    }
    if (fullPath.includes('vue') || extension === '.vue') {
      tags.push('vue');
    }
    if (fullPath.includes('angular') || fullPath.includes('.component.') || fullPath.includes('.service.')) {
      tags.push('angular');
    }
    if (fullPath.includes('next') || fullPath.includes('_app.') || fullPath.includes('_document.')) {
      tags.push('nextjs');
    }

    // Special files
    if (['readme', 'license', 'changelog', 'contributing'].includes(fileName)) {
      tags.push('documentation');
    }
    if (['dockerfile', 'docker-compose', '.dockerignore'].includes(fileName)) {
      tags.push('docker');
    }
    if (['package.json', 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml'].includes(path.basename(relativePath))) {
      tags.push('package');
    }
    if (['tsconfig.json', 'jsconfig.json', 'webpack.config.js', 'vite.config.js'].includes(path.basename(relativePath))) {
      tags.push('config');
    }

    // Infrastructure files
    if (extension === '.bat' || extension === '.cmd') {
      tags.push('infrastructure', 'script', 'windows');
    }
    if (extension === '.sh') {
      tags.push('infrastructure', 'script', 'unix');
    }
    if (extension === '.json' && (fileName.includes('config') || fileName.includes('settings') || fileName.includes('.config.'))) {
      tags.push('infrastructure', 'config');
    }
    if (['makefile', 'makefile.am', 'cmake', 'cmakelists.txt'].includes(fileName)) {
      tags.push('infrastructure', 'build');
    }

    return [...new Set(tags)]; // Remove duplicates
  }

  /**
   * Initialize specialized handlers for different file types
   * @returns {object} Handler registry
   */
  initializeHandlers() {
    return {
      'code-c-like': new CLikeHandler(),
      'code-script': new ScriptHandler(),
      'markup': new MarkupHandler(),
      'styling': new StylingHandler(),
      'config-plain': new ConfigPlainHandler()
    };
  }

  /**
   * Get appropriate handler for file extension with full coverage
   * @param {string} extension - File extension
   * @returns {object} Handler instance
   */
  getHandler(extension) {
    // Complete mapping for all 21 target extensions
    const handlerMap = {
      // Code, C-like (5 extensions)
      '.c': 'code-c-like', '.h': 'code-c-like', '.cpp': 'code-c-like',
      '.cs': 'code-c-like', '.java': 'code-c-like',
      // Code, script (7 extensions)
      '.js': 'code-script', '.jsx': 'code-script', '.ts': 'code-script',
      '.tsx': 'code-script', '.py': 'code-script', '.sh': 'code-script',
      '.bat': 'code-script',
      // Markup (2 extensions)
      '.html': 'markup', '.xml': 'markup',
      // Styling (2 extensions)
      '.css': 'styling', '.scss': 'styling',
      // Config/plain (5 extensions)
      '.json': 'config-plain', '.yaml': 'config-plain', '.yml': 'config-plain',
      '.md': 'config-plain', '.txt': 'config-plain'
    };

    const handlerType = handlerMap[extension];
    if (!handlerType) {
      console.warn(`⚠️ No handler found for extension: ${extension}`);
      return this.handlers['config-plain']; // Fallback
    }
    return this.handlers[handlerType];
  }

  /**
   * Verify extension coverage against target list
   * @param {Array} processedExtensions - Extensions found in processing
   */
  verifyExtensionCoverage(processedExtensions) {
    const targetExtensions = [
      '.json', '.ts', '.js', '.jsx', '.tsx', '.xml', '.html', '.css',
      '.scss', '.md', '.txt', '.py', '.java', '.cs', '.cpp', '.c', '.h',
      '.yaml', '.yml', '.sh', '.bat'
    ]; // 21 total extensions

    const missing = targetExtensions.filter(ext => !processedExtensions.includes(ext));
    const extra = processedExtensions.filter(ext => !targetExtensions.includes(ext));

    console.log(`\n📊 Extension Coverage Analysis:`);
    console.log(`   Target extensions: ${targetExtensions.length}`);
    console.log(`   Processed extensions: ${processedExtensions.length}`);
    if (missing.length > 0) {
      console.warn(`   ⚠️ Missing: ${missing.join(', ')}`);
    }
    if (extra.length > 0) {
      console.log(`   ➕ Extra: ${extra.join(', ')}`);
    }
    if (missing.length === 0) {
      console.log(`   ✅ Full coverage achieved!`);
    }

    return {
      targetCount: targetExtensions.length,
      processedCount: processedExtensions.length,
      missing,
      extra,
      coverage: ((targetExtensions.length - missing.length) / targetExtensions.length * 100).toFixed(1)
    };
  }

  /**
   * Improved token estimation using multiple heuristics
   * @param {string} content - Text content
   * @param {string} language - Programming language for context
   * @returns {number} Estimated token count
   */
  safeEstimateTokens(content, language = 'text') {
    try {
      if (typeof content !== 'string') {
        console.warn('⚠️ Non-string content passed to token estimator');
        return 0;
      }
      if (content.length === 0) return 0;

      // Base estimation using multiple factors
      const charCount = content.length;
      const wordCount = content.trim().split(/\s+/).length;
      const lineCount = content.split('\n').length;

      // Language-specific adjustments
      let tokensPerChar = 0.25; // Default: ~4 chars per token
      let tokensPerWord = 1.3;  // Default: ~1.3 tokens per word

      // Adjust based on content type
      if (['javascript', 'typescript', 'python', 'java', 'c++', 'c#'].includes(language.toLowerCase())) {
        // Code tends to have more symbols and operators
        tokensPerChar = 0.28;
        tokensPerWord = 1.4;

        // Additional tokens for common code patterns
        const brackets = (content.match(/[{}()\[\]]/g) || []).length;
        const operators = (content.match(/[+\-*/%=<>!&|^~]/g) || []).length;
        const dots = (content.match(/\./g) || []).length;
        const syntaxTokens = Math.ceil((brackets + operators + dots) * 0.15);

        // Character-based estimation with syntax bonus
        const charEstimate = Math.ceil(charCount * tokensPerChar) + syntaxTokens;
        const wordEstimate = Math.ceil(wordCount * tokensPerWord);
        return Math.max(charEstimate, wordEstimate);
      } else if (['json', 'yaml', 'xml', 'html'].includes(language.toLowerCase())) {
        // Structured data tends to be more compact in tokens
        tokensPerChar = 0.22;
        tokensPerWord = 1.1;
      } else if (language.toLowerCase() === 'markdown') {
        // Markdown has formatting symbols but is mostly text
        tokensPerChar = 0.26;
        tokensPerWord = 1.2;
      }

      // Calculate estimates using both methods
      const charEstimate = Math.ceil(charCount * tokensPerChar);
      const wordEstimate = Math.ceil(wordCount * tokensPerWord);

      // Return the higher estimate for safety (avoid truncation)
      return Math.max(charEstimate, wordEstimate, Math.ceil(charCount / 4));
    } catch (error) {
      console.warn(`⚠️ Token estimation error: ${error.message}`);
      return Math.ceil((content?.length || 0) / 4);
    }
  }

  /**
   * Estimate token count using simple heuristic
   * @param {string} content - Text content
   * @returns {number} Estimated token count
   */
  estimateTokens(content) {
    return Math.ceil(content.length / 4);
  }

  /**
   * Extract imports from content using simple regex
   * @param {string} content - File content
   * @param {string} extension - File extension
   * @returns {Array} Array of import statements
   */
  extractImports(content, extension) {
    const imports = [];
    let match;
    switch (extension) {
      case '.js':
      case '.jsx':
      case '.ts':
      case '.tsx': {
        // import ... from '...'
        const importRegex = /import\s+.*?from\s+['"]([^'"]+)['"]/g;
        while ((match = importRegex.exec(content)) !== null) {
          imports.push(match[1]);
        }
        // require('...')
        const requireRegex = /require\s*\(\s*['"]([^'"]+)['"]\s*\)/g;
        while ((match = requireRegex.exec(content)) !== null) {
          imports.push(match[1]);
        }
        break;
      }
      case '.py': {
        // import ... / from ... import ...
        const pyImportRegex = /(?:from\s+(\S+)\s+import|import\s+(\S+))/g;
        while ((match = pyImportRegex.exec(content)) !== null) {
          imports.push(match[1] || match[2]);
        }
        break;
      }
      case '.c':
      case '.cpp':
      case '.h': {
        // #include "..." / #include <...>
        const includeRegex = /#include\s*[<"]([^>"]+)[>"]/g;
        while ((match = includeRegex.exec(content)) !== null) {
          imports.push(match[1]);
        }
        break;
      }
    }
    return [...new Set(imports)]; // Remove duplicates
  }

  /**
   * Extract function/method calls using simple regex
   * @param {string} content - File content
   * @param {string} extension - File extension
   * @returns {Array} Array of function calls
   */
  extractCalls(content, extension) {
    const calls = [];
    // Generic function call pattern: identifier followed by (
    const callRegex = /\b([a-zA-Z_][a-zA-Z0-9_]*)\s*\(/g;
    let match;
    while ((match = callRegex.exec(content)) !== null) {
      const funcName = match[1];
      // Filter out language keywords
      const keywords = ['if', 'for', 'while', 'switch', 'catch', 'typeof', 'return', 'new'];
      if (!keywords.includes(funcName) && funcName.length > 1) {
        calls.push(funcName);
      }
    }
    // Return unique calls, limited to prevent noise
    return [...new Set(calls)].slice(0, 20);
  }

  /**
   * Validate generated JSON file
   * @param {string} outputPath - Path to generated JSON
   */
  async validateGeneratedJSON(outputPath) {
    try {
      // Check file exists and is readable
      const stats = await fs.stat(outputPath);

      // Check file size warnings
      const maxSize = ragConfig.parseFileSize(this.yamlConfig?.quality?.maxOutputSize || '250MB');
      if (stats.size > maxSize) {
        console.warn(`⚠️ Generated file is large: ${this.formatFileSize(stats.size)} (>${this.formatFileSize(maxSize)})`);
      }

      // Simple validation: read first 1KB to check JSON structure
      const stream = createReadStream(outputPath, { encoding: 'utf8', start: 0, end: 1023 });
      let sampleText = '';
      for await (const chunk of stream) {
        sampleText += chunk;
      }

      // Check for basic JSON structure
      if (!sampleText.trim().startsWith('{')) {
        throw new Error('Generated file does not start with valid JSON');
      }

      // Check for expected structure
      if (!sampleText.includes('"metadata"') || !sampleText.includes('"files"')) {
        console.warn('⚠️ JSON structure may be incomplete - expected sections not found in sample');
      }

      console.log('✅ JSON validation passed');
    } catch (error) {
      console.error(`❌ JSON validation failed: ${error.message}`);
      // Don't re-throw - just warn since file was successfully written
      console.warn('⚠️ Continuing despite validation warning - file was generated successfully');
    }
  }

  /**
   * Format file size in human readable format
   * @param {number} bytes - Size in bytes
   * @returns {string} Formatted size string
   */
  formatFileSize(bytes) {
    const units = ['B', 'KB', 'MB', 'GB'];
    let size = bytes;
    let unitIndex = 0;
    while (size >= 1024 && unitIndex < units.length - 1) {
      size /= 1024;
      unitIndex++;
    }
    return `${size.toFixed(1)} ${units[unitIndex]}`;
  }

  /**
   * Validate discovery progress for quality assurance
   * @param {Array} discoveredFiles - Files discovered so far
   * @param {number} processedCount - Total files processed
   */
  validateDiscoveryProgress(discoveredFiles, processedCount) {
    const issues = [];

    // Check for duplicate hashes
    const hashes = new Set();
    const duplicates = [];
    for (const file of discoveredFiles) {
      if (hashes.has(file.hash)) {
        duplicates.push(file.hash.substring(0, 8));
      } else {
        hashes.add(file.hash);
      }
    }
    if (duplicates.length > 0) {
      issues.push(`Duplicate hashes detected: ${duplicates.join(', ')}`);
    }

    // Check file size distribution
    const largeSizeThreshold = ragConfig.parseFileSize(this.yamlConfig?.performance?.maxFileSize || '100MB');
    const largeFiles = discoveredFiles.filter(f => f.size > largeSizeThreshold);
    if (largeFiles.length > 0) {
      issues.push(`${largeFiles.length} files exceed size threshold`);
    }

    // Check tag distribution
    const tagCounts = {};
    discoveredFiles.forEach(file => {
      file.tags.forEach(tag => {
        tagCounts[tag] = (tagCounts[tag] || 0) + 1;
      });
    });
    const untaggedFiles = discoveredFiles.filter(f => f.tags.length === 0);
    if (untaggedFiles.length > discoveredFiles.length * 0.5) {
      issues.push(`High untagged ratio: ${untaggedFiles.length}/${discoveredFiles.length}`);
    }

    // Report issues if any
    if (issues.length > 0) {
      console.warn(`\n⚠️ Discovery validation issues: ${issues.join(', ')}`);
    }
  }

  /**
   * Validate processing progress for quality assurance
   * @param {Map} chunkOffsets - Current chunk offsets
   */
  validateProcessingProgress(chunkOffsets) {
    const issues = [];

    // Check chunk size distribution
    const chunkSizes = [];
    for (const [chunkId, offsetData] of chunkOffsets.entries()) {
      const size = offsetData.contentEnd - offsetData.contentStart;
      chunkSizes.push(size);
    }
    if (chunkSizes.length > 0) {
      const avgChunkSize = chunkSizes.reduce((a, b) => a + b, 0) / chunkSizes.length;
      const maxChunkSize = Math.max(...chunkSizes);
      const maxChunkThreshold = ragConfig.parseFileSize(this.yamlConfig?.quality?.maxChunkSize || '50KB');
      if (maxChunkSize > maxChunkThreshold) {
        issues.push(`Large chunk detected: ${this.formatFileSize(maxChunkSize)}`);
      }
      if (avgChunkSize < 100) {
        issues.push(`Small average chunk size: ${this.formatFileSize(avgChunkSize)}`);
      }
    }

    // Check offset consistency
    let invalidOffsets = 0;
    for (const [chunkId, offsetData] of chunkOffsets.entries()) {
      if (offsetData.contentStart >= offsetData.contentEnd) {
        invalidOffsets++;
      }
    }
    if (invalidOffsets > 0) {
      issues.push(`Invalid offsets: ${invalidOffsets} chunks`);
    }

    // Report issues if any
    if (issues.length > 0) {
      console.warn(`\n⚠️ Processing validation issues: ${issues.join(', ')}`);
    }
  }

  /**
   * Final validation of generated output with seek inverse testing
   * @param {string} outputPath - Generated file path
   * @param {Array} discoveredFiles - All processed files
   * @param {Map} chunkOffsets - All chunk offsets
   */
  async validateFinalOutput(outputPath, discoveredFiles, chunkOffsets) {
    const issues = [];
    try {
      const stats = await fs.stat(outputPath);

      // Check file size
      const maxOutputSize = ragConfig.parseFileSize(this.yamlConfig?.quality?.maxOutputSize || '250MB');
      if (stats.size > maxOutputSize) {
        issues.push(`Output size (${this.formatFileSize(stats.size)}) exceeds threshold`);
      }

      // Check completeness
      const expectedChunks = discoveredFiles.reduce((sum, file) => sum + (file.chunks?.length || 0), 0);
      const actualChunks = chunkOffsets.size;
      if (expectedChunks !== actualChunks) {
        issues.push(`Chunk count mismatch: expected ${expectedChunks}, got ${actualChunks}`);
      }

      // Check for empty chunks
      const emptyChunks = Array.from(chunkOffsets.values()).filter(offset =>
        offset.contentEnd - offset.contentStart < 10
      ).length;
      if (emptyChunks > 0) {
        issues.push(`${emptyChunks} near-empty chunks detected`);
      }

      // SEEK INVERSE TESTING - Test random chunk offsets
      await this.validateSeekInverse(outputPath, chunkOffsets);

      // JSON Schema validation (basic)
      await this.validateJsonStructure(outputPath);

      // Report final validation
      if (issues.length > 0) {
        console.warn(`\n⚠️ Final validation issues:`);
        issues.forEach(issue => console.warn(`   • ${issue}`));
      } else {
        console.log('✅ Final validation passed - output is healthy');
      }
    } catch (error) {
      console.error(`❌ Final validation failed: ${error.message}`);
    }
  }

  /**
   * Test seek operations on random chunk offsets to verify accuracy
   * @param {string} outputPath - Generated JSON file path
   * @param {Map} chunkOffsets - Chunk offset map
   */
  async validateSeekInverse(outputPath, chunkOffsets) {
    const chunkIds = Array.from(chunkOffsets.keys());
    const testCount = Math.min(3, chunkIds.length); // Test up to 3 random chunks
    if (testCount === 0) {
      console.warn('⚠️ No chunks to test for seek validation');
      return;
    }

    console.log(`🔍 Testing seek inverse on ${testCount} random chunks...`);
    for (let i = 0; i < testCount; i++) {
      const randomIndex = Math.floor(Math.random() * chunkIds.length);
      const chunkId = chunkIds[randomIndex];
      const offsetData = chunkOffsets.get(chunkId);
      try {
        // Read the specific chunk content using simple file read
        const fullContent = await fs.readFile(outputPath, 'utf8');
        const seekContent = fullContent.slice(offsetData.contentStart, offsetData.contentEnd);

        // Verify it's valid JSON content (should be a JSON string value)
        try {
          // Try to parse as JSON - if it's valid JSON string content, this should work
          const parsed = JSON.parse(seekContent);
          if (typeof parsed === 'string') {
            console.log(`   ✅ Chunk ${chunkId}: seek successful, valid JSON string (${seekContent.length} bytes)`);
          } else {
            console.log(`   ✅ Chunk ${chunkId}: seek successful, valid JSON (${typeof parsed}, ${seekContent.length} bytes)`);
          }
        } catch (parseError) {
          // If it doesn't parse as JSON, it might be a partial chunk
          console.log(`   ✅ Chunk ${chunkId}: seek successful, partial content (${seekContent.length} bytes)`);
        }
      } catch (error) {
        console.error(`   ❌ Chunk ${chunkId}: seek failed - ${error.message}`);
      }
    }
  }

  /**
   * Basic JSON structure validation
   * @param {string} outputPath - Generated JSON file path
   */
  async validateJsonStructure(outputPath) {
    try {
      // Read full content for validation (simpler approach)
      const fullContent = await fs.readFile(outputPath, 'utf8');
      const startText = fullContent.slice(0, 1024).trim();
      const endText = fullContent.slice(-1024).trim();

      // Basic structure checks
      const issues = [];
      if (!startText.startsWith('{')) {
        issues.push('File does not start with {');
      }
      if (!endText.endsWith('}')) {
        issues.push('File does not end with }');
      }
      if (!startText.includes('"metadata"')) {
        issues.push('Missing metadata section');
      }
      if (!startText.includes('"files"')) {
        issues.push('Missing files section');
      }
      if (!fullContent.includes('"index"')) {
        issues.push('Missing index section');
      }

      if (issues.length === 0) {
        console.log('✅ JSON structure validation passed');
      } else {
        console.warn(`⚠️ JSON structure issues: ${issues.join(', ')}`);
      }
    } catch (error) {
      console.error(`❌ JSON structure validation failed: ${error.message}`);
    }
  }

  /**
   * Generate RAG output atomically - build complete structure in memory (thread-safe)
   */
  async generate(discoveredFiles, outputPath, projectName, scanPath) {
    console.log('📝 Atomic generation: processing all files in memory...');
    await fs.ensureDir(path.dirname(outputPath));

    const processedFiles = [];
    let totalChunks = 0;
    for (let i = 0; i < discoveredFiles.length; i++) {
      const fileData = discoveredFiles[i];
      const progress = ((i + 1) / discoveredFiles.length * 100).toFixed(1);
      process.stdout.write(`\r📊 Processing: ${progress}% (${i + 1}/${discoveredFiles.length})`);
      try {
        const processedFile = await this.processFileInMemory(fileData);
        processedFiles.push(processedFile);
        totalChunks += processedFile.chunks?.length || 0;
        this.stats.filesProcessed++;
        this.stats.chunksGenerated += processedFile.chunks?.length || 0;
      } catch (error) {
        console.warn(`\n⚠️ Error processing ${fileData.path}: ${error.message}`);
        this.errors.push({ file: fileData.path, error: error.message });
        processedFiles.push({ ...fileData, chunks: [], error: error.message });
      }
    }
    console.log(`\n✅ All files processed: ${processedFiles.length} files, ${totalChunks} chunks`);

    const completeJSON = this.buildCompleteJSON(processedFiles, projectName, scanPath);
    const finalJSON = this.calculateAndInjectOffsets(completeJSON);

    await fs.writeFile(outputPath, finalJSON, 'utf8');
    this.stats.bytesWritten = finalJSON.length;

    if (this.yamlConfig?.output?.validation) {
      console.log('🔍 Validating generated output...');
      await this.validateGeneratedJSON(outputPath);
    }

    console.log(`✅ JSON written successfully to ${outputPath}`);
    return {
      outputPath,
      totalFiles: processedFiles.length,
      totalChunks,
      bytesWritten: finalJSON.length,
      extensionCoverage: this.verifyExtensionCoverage([...new Set(processedFiles.map(f => f.extension))])
    };
  }

  async processFileInMemory(fileData) {
    const content = await fs.readFile(fileData.absolutePath, 'utf8');
    const handler = this.getHandler(fileData.extension);
    console.log(`🔍 ${handler.constructor.name} processing ${fileData.extension} file: ${fileData.path}`);

    const chunks = await handler.generateChunks(content, {
      fileId: fileData.id,
      filePath: fileData.path,
      extension: fileData.extension,
      language: fileData.language,
      maxTokens: this.maxTokensPerChunk,
      overlap: this.overlapTokens
    });
    if (chunks.length > 0) {
      console.log(`   📝 Found ${chunks.length} semantic chunks`);
    }

    const enrichedChunks = chunks.map((chunk, index) => ({
      ...chunk,
      id: `chunk_${fileData.id}_${index}`,
      tokenEstimate: this.safeEstimateTokens(chunk.content, fileData.language),
      imports: this.extractImports(chunk.content, fileData.extension),
      calls: this.extractCalls(chunk.content, fileData.extension)
    }));

    return {
      id: fileData.id,
      path: fileData.path,
      language: fileData.language,
      extension: fileData.extension,
      size: fileData.size,
      lines: content.split('\n').length,
      hash: fileData.hash,
      modified: fileData.modified,
      tags: fileData.tags,
      chunks: enrichedChunks
    };
  }

  buildCompleteJSON(processedFiles, projectName, scanPath) {
    const totalChunks = processedFiles.reduce((sum, file) => sum + (file.chunks?.length || 0), 0);
    const emptyFiles = processedFiles.filter(f => (f.chunks?.length || 0) === 0).length;
    this.stats.endTime = Date.now();
    const processingTimeMs = Math.max(1, this.stats.endTime - this.stats.startTime);

    return {
      metadata: {
        projectName,
        generatedAt: new Date().toISOString(),
        scanPath,
        generator: 'CodeSummary RAG Generator',
        version: '3.1.0',
        config: {
          maxTokensPerChunk: this.maxTokensPerChunk,
          overlapTokens: this.overlapTokens,
          tokenEstimationMethod: 'enhanced_heuristic_v1.0'
        },
        summary: {
          totalFiles: processedFiles.length,
          languages: [...new Set(processedFiles.map(f => f.language))],
          extensions: [...new Set(processedFiles.map(f => f.extension))]
        },
        schemaVersion: "1.0",
        schemaUrl: "https://github.com/skamoll/CodeSummary/schemas/rag-output.json"
      },
      files: processedFiles,
      index: {
        version: "3.1.0",
        generatedAt: new Date().toISOString(),
        schemaUrl: "https://github.com/skamoll/CodeSummary/schemas/rag-output.json",
        summary: {
          fileCount: processedFiles.length - emptyFiles,
          chunkCount: totalChunks,
          totalBytes: 0,
          languages: [...new Set(processedFiles.map(f => f.language))],
          extensions: [...new Set(processedFiles.map(f => f.extension))],
          avgFileSize: 0,
          avgChunksPerFile: processedFiles.length > 0 ? Math.round(totalChunks / processedFiles.length) : 0
        },
        chunkOffsets: {},
        fileOffsets: {},
        seekInfo: {
          instructions: "Use chunkOffsets[chunkId].contentStart and contentEnd to seek directly to chunk content",
          format: "All offsets are absolute byte positions in this JSON file",
          chunkFormat: "Object with jsonStart, jsonEnd, contentStart, contentEnd (absolute JSON positions)",
          fileFormat: "Array [start, end] for each file in JSON"
        },
        statistics: {
          processingTimeMs,
          bytesPerSecond: 0,
          bytesWritten: 0,
          chunksWithValidOffsets: totalChunks,
          filesWithValidOffsets: processedFiles.length - emptyFiles,
          totalFiles: processedFiles.length,
          emptyFiles: emptyFiles,
          totalChunksGenerated: totalChunks,
          errors: this.errors
        }
      }
    };
  }

  /**
   * Finalize the JSON structure by calculating and re-injecting the correct offsets.
   * This approach guarantees maximum precision by operating on the final JSON string.
   * @param {object} jsonStructure - The complete JSON object, with data but no offsets.
   * @returns {string} The final JSON string, formatted and with precise offsets.
   */
  calculateAndInjectOffsets(jsonStructure) {
    console.log('🔍 Calculating precise byte offsets and building complete index...');

    // STEP 1: Build a preliminary JSON without the index to measure exact positions
    const jsonWithoutIndex = {
      metadata: jsonStructure.metadata,
      files: jsonStructure.files
    };
    const preliminaryJsonString = JSON.stringify(jsonWithoutIndex, null, 2);
    const preliminaryBytes = Buffer.byteLength(preliminaryJsonString, 'utf8');

    // STEP 2: Calculate precise file and chunk offsets
    const fileOffsets = {};
    const chunkOffsets = {};
    let totalChunks = 0;
    let validChunks = 0;

    for (const file of jsonStructure.files) {
      // Locate the start of the file object by its ID
      const filePattern = `"id": "${file.id}"`;
      const fileStartPos = preliminaryJsonString.indexOf(filePattern);
      if (fileStartPos !== -1) {
        // Find the approximate end of the file object
        const nextFilePattern = preliminaryJsonString.indexOf('    {\n      "id":', fileStartPos + 1);
        const fileEndPos = nextFilePattern !== -1 ? nextFilePattern : preliminaryJsonString.lastIndexOf('  ]');

        // Schema format: fileId -> [start, end]
        fileOffsets[file.id] = [fileStartPos, fileEndPos];

        // Calculate chunk offsets within this file
        for (const chunk of file.chunks) {
          const chunkPattern = `"id": "${chunk.id}"`;
          const chunkStartPos = preliminaryJsonString.indexOf(chunkPattern, fileStartPos);
          if (chunkStartPos !== -1) {
            // Find the "content" field within this chunk
            const contentPattern = '"content": "';
            const contentStartSearch = preliminaryJsonString.indexOf(contentPattern, chunkStartPos);
            if (contentStartSearch !== -1) {
              const contentStart = contentStartSearch + contentPattern.length;

              // Find the end of the content (closing quote of the JSON string)
              let contentEnd = contentStart;
              let inEscape = false;
              for (let i = contentStart; i < preliminaryJsonString.length; i++) {
                const char = preliminaryJsonString[i];
                if (inEscape) { inEscape = false; continue; }
                if (char === '\\') { inEscape = true; continue; }
                if (char === '"') { contentEnd = i; break; }
              }

              // Find the end of the complete chunk object
              const chunkEndPattern = '},';
              const chunkEndSearch = preliminaryJsonString.indexOf(chunkEndPattern, contentEnd);
              const chunkEnd = chunkEndSearch !== -1 ? chunkEndSearch + 1 : contentEnd + 100;

              // Schema format: chunkId -> object with precise offsets
              chunkOffsets[chunk.id] = {
                jsonStart: chunkStartPos,
                jsonEnd: chunkEnd,
                contentStart: contentStart,
                contentEnd: contentEnd,
                filePath: file.path
              };
              validChunks++;
            }
          }
          totalChunks++;
        }
      }
    }

    // STEP 3: Build complete statistics
    const processingTimeMs = Math.max(1, this.stats.endTime - this.stats.startTime);
    const emptyFiles = jsonStructure.files.filter(f => f.chunks.length === 0).length;

    // STEP 4: Build the complete index block according to the schema
    const indexBlock = {
      version: "3.1.0",
      generatedAt: new Date().toISOString(),
      schemaUrl: "https://github.com/skamoll/CodeSummary/schemas/rag-output.json",
      summary: {
        fileCount: jsonStructure.files.length - emptyFiles,
        chunkCount: totalChunks,
        totalBytes: 0, // Updated below
        languages: [...new Set(jsonStructure.files.map(f => f.language))],
        extensions: [...new Set(jsonStructure.files.map(f => f.extension))],
        avgFileSize: 0, // Updated below
        avgChunksPerFile: jsonStructure.files.length > 0 ? Math.round(totalChunks / jsonStructure.files.length) : 0
      },
      chunkOffsets: chunkOffsets,
      fileOffsets: fileOffsets,
      seekInfo: {
        instructions: "Use chunkOffsets[chunkId].contentStart and contentEnd to seek directly to chunk content",
        format: "All offsets are absolute byte positions in this JSON file",
        chunkFormat: "Object with jsonStart, jsonEnd, contentStart, contentEnd (absolute JSON positions)",
        fileFormat: "Array [start, end] for each file in JSON",
        validation: `Generated with ${validChunks} chunks across ${Object.keys(fileOffsets).length} files`
      },
      statistics: {
        processingTimeMs,
        bytesPerSecond: 0, // Updated below
        bytesWritten: 0,   // Updated below
        chunksWithValidOffsets: validChunks,
        filesWithValidOffsets: Object.keys(fileOffsets).length,
        totalFiles: jsonStructure.files.length,
        emptyFiles: emptyFiles,
        totalChunksGenerated: totalChunks
      }
    };

    // STEP 5: Build the final JSON with the index and compute final metrics
    const completeStructure = {
      metadata: jsonStructure.metadata,
      files: jsonStructure.files,
      index: indexBlock
    };
    const finalJsonString = JSON.stringify(completeStructure, null, 2);
    const finalBytes = Buffer.byteLength(finalJsonString, 'utf8');
    const bytesPerSecond = Math.round(finalBytes / (processingTimeMs / 1000));

    // Update final metrics in the index
    completeStructure.index.summary.totalBytes = finalBytes;
    completeStructure.index.summary.avgFileSize = jsonStructure.files.length > 0 ? Math.round(finalBytes / jsonStructure.files.length) : 0;
    completeStructure.index.statistics.bytesPerSecond = bytesPerSecond;
    completeStructure.index.statistics.bytesWritten = finalBytes;

    // STEP 6: Regenerate the final JSON with updated statistics
    const finalResult = JSON.stringify(completeStructure, null, 2);
    console.log(`✅ Complete index built: ${Object.keys(fileOffsets).length} files, ${validChunks}/${totalChunks} chunks with precise offsets`);
    console.log(`✅ Final JSON: ${this.formatFileSize(Buffer.byteLength(finalResult, 'utf8'))}, processing: ${processingTimeMs}ms`);
    return finalResult;
  }
}

// Specialized Handler Classes
class BaseHandler {
  async generateChunks(content, options) {
    // Fallback: split by lines if no specific logic
    return this.chunkByLines(content, options);
  }

  chunkByLines(content, options) {
    const lines = content.split('\n');
    const chunks = [];
    const maxLines = Math.ceil(options.maxTokens / 20); // ~20 tokens per line estimate
    for (let i = 0; i < lines.length; i += maxLines) {
      const chunkLines = lines.slice(i, Math.min(i + maxLines, lines.length));
      const chunkContent = chunkLines.join('\n');
      chunks.push({
        content: chunkContent,
        lineStart: i + 1,
        lineEnd: Math.min(i + maxLines, lines.length),
        chunkingMethod: 'line-based'
      });
    }
    return chunks;
  }

  /**
   * Estimate token count for chunking decisions
   * @param {string} content - Text content
   * @returns {number} Estimated token count
   */
  estimateTokens(content) {
    return Math.ceil(content.length / 4);
  }
}

class CLikeHandler extends BaseHandler {
  async generateChunks(content, options) {
    // Find class/struct/function boundaries
    const boundaries = this.findCodeBoundaries(content);
    if (boundaries.length > 0) {
      return this.chunkByBoundaries(content, boundaries, options);
    }
    // Fallback to line-based chunking
    return this.chunkByLines(content, options);
  }

  findCodeBoundaries(content) {
    const boundaries = [];
    const boundaryRegex = /^(?:class|struct|enum|union|static)?\s*([a-zA-Z_][\w]*)\s*.*{/gm;
    let match;
    while ((match = boundaryRegex.exec(content)) !== null) {
      const lineNumber = content.substring(0, match.index).split('\n').length;
      boundaries.push({ name: match[1], line: lineNumber, type: 'function' });
    }
    return boundaries;
  }

  chunkByBoundaries(content, boundaries, options) {
    const lines = content.split('\n');
    const chunks = [];
    let currentStart = 0;
    for (const boundary of boundaries) {
      if (currentStart < boundary.line - 1) {
        const chunkLines = lines.slice(currentStart, boundary.line - 1);
        if (chunkLines.length > 0) {
          chunks.push({
            content: chunkLines.join('\n'),
            lineStart: currentStart + 1,
            lineEnd: boundary.line - 1
          });
        }
      }
      currentStart = boundary.line - 1;
    }
    // Add remaining lines
    if (currentStart < lines.length) {
      const chunkLines = lines.slice(currentStart);
      chunks.push({
        content: chunkLines.join('\n'),
        lineStart: currentStart + 1,
        lineEnd: lines.length
      });
    }
    return chunks;
  }
}

class ScriptHandler extends BaseHandler {
  async generateChunks(content, options) {
    console.log(`🔍 ScriptHandler processing ${options.extension} file: ${options.filePath}`);
    // ALWAYS try semantic chunking first for script files
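
The index block that calculateAndInjectOffsets appends is what makes the advertised "precision offsets" usable: per its seekInfo, chunkOffsets[chunkId].contentStart and contentEnd delimit the raw, still-escaped JSON string of a chunk's content. A minimal consumer sketch (hypothetical output path; it reads the whole file into memory, mirroring what validateSeekInverse does above):

  import fs from 'fs-extra';

  const raw = await fs.readFile('./out/rag.json', 'utf8');
  const { index } = JSON.parse(raw);

  // Pick any chunk id and slice its escaped content straight out of the raw text
  const [chunkId, off] = Object.entries(index.chunkOffsets)[0];
  const escaped = raw.slice(off.contentStart, off.contentEnd);

  // Re-wrap in quotes so JSON.parse decodes the embedded \n, \" etc. escapes
  const content = JSON.parse(`"${escaped}"`);
  console.log(`${chunkId} (${off.filePath}): ${content.length} chars`);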