codesummary
Cross-platform CLI tool that generates professional PDF documentation and RAG-optimized JSON outputs from project source code. Perfect for code reviews, audits, documentation, and AI/ML applications with semantic chunking and precision offsets.
1,325 lines (1,112 loc) • 64.2 kB
JavaScript
import fs from 'fs-extra';
import path from 'path';
import crypto from 'crypto';
import os from 'os';
import { createReadStream } from 'fs';
import ErrorHandler from './errorHandler.js';
import ragConfig from './ragConfig.js';
/**
* Professional RAG Generator for CodeSummary
* Generates JSON output optimized for vector database ingestion
* Deterministic and AI-free; file hashes are streamed, and the final JSON is built in memory and written in a single pass
*/
export class RagGenerator {
constructor(config = {}) {
this.config = config;
// Global parameters
this.maxTokensPerChunk = config.maxTokensPerChunk || 1000;
this.overlapTokens = config.overlapTokens || 200;
this.maxWorkers = Math.min(config.maxWorkers || 8, os.cpus().length);
// Extension to language mapping (deterministic)
this.extensionToLanguage = {
'.js': 'JavaScript', '.jsx': 'JavaScript', '.ts': 'TypeScript', '.tsx': 'TypeScript',
'.py': 'Python', '.java': 'Java', '.cs': 'C#', '.cpp': 'C++', '.c': 'C', '.h': 'C/C++',
'.html': 'HTML', '.xml': 'XML', '.css': 'CSS', '.scss': 'SCSS',
'.json': 'JSON', '.yaml': 'YAML', '.yml': 'YAML',
'.md': 'Markdown', '.txt': 'Text',
'.sh': 'Shell', '.bat': 'Batch'
};
// Initialize handlers
this.handlers = this.initializeHandlers();
// Statistics tracking
this.stats = {
filesProcessed: 0,
chunksGenerated: 0,
bytesWritten: 0,
startTime: null,
endTime: null
};
// Error collection
this.errors = [];
}
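// Usage sketch (illustrative; the option names match the constructor above, but the
// shape of filesByExtension comes from the scanner and the paths are assumptions):
//   const generator = new RagGenerator({ maxTokensPerChunk: 800, overlapTokens: 150 });
//   await generator.generateRagOutput(filesByExtension, ['.js', '.ts'], 'out/rag.json', 'my-project', '/repo');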
/**
* Main entry point - generates the RAG JSON output
* @param {object} filesByExtension - Files grouped by extension
* @param {Array} selectedExtensions - Selected extensions to process
* @param {string} outputPath - Output JSON file path
* @param {string} projectName - Project name
* @param {string} scanPath - Root scan path
* @returns {object} Generation result
*/
async generateRagOutput(filesByExtension, selectedExtensions, outputPath, projectName, scanPath) {
this.stats.startTime = Date.now();
try {
console.log(`🚀 Starting RAG generation for ${projectName}`);
// Load RAG configuration
const config = await ragConfig.loadConfig();
this.updateConfigFromYAML(config);
// Display configuration
ragConfig.displayConfig();
// Phase 1: Discovery and file preparation
const discoveredFiles = await this.discoveryPhase(filesByExtension, selectedExtensions, scanPath);
// Phase 2: Atomic JSON generation (single-pass write)
const result = await this.generate(discoveredFiles, outputPath, projectName, scanPath);
this.stats.endTime = Date.now();
const duration = (this.stats.endTime - this.stats.startTime) / 1000;
console.log(`✅ RAG generation completed in ${duration.toFixed(2)}s`);
console.log(`📊 Stats: ${this.stats.filesProcessed} files, ${this.stats.chunksGenerated} chunks`);
return {
outputPath,
totalFiles: this.stats.filesProcessed,
totalChunks: this.stats.chunksGenerated,
duration,
success: true
};
} catch (error) {
ErrorHandler.handleError(error, 'RAG Generation');
throw error;
}
}
/**
* Update internal configuration from loaded YAML config
* @param {object} yamlConfig - Configuration from YAML
*/
updateConfigFromYAML(yamlConfig) {
if (yamlConfig.chunking) {
this.maxTokensPerChunk = yamlConfig.chunking.maxTokens || this.maxTokensPerChunk;
this.overlapTokens = yamlConfig.chunking.overlap || this.overlapTokens;
}
if (yamlConfig.performance) {
this.maxWorkers = Math.min(
yamlConfig.performance.maxWorkers || this.maxWorkers,
os.cpus().length
);
}
// Store full config for handlers to use
this.yamlConfig = yamlConfig;
}
/**
* Phase 1: Discovery - batched metadata collection for the pre-scanned file list
* @param {object} filesByExtension - Files by extension
* @param {Array} selectedExtensions - Selected extensions
* @param {string} scanPath - Root scan path
* @returns {Array} Discovered files with metadata
*/
async discoveryPhase(filesByExtension, selectedExtensions, scanPath) {
console.log('🔍 Discovery phase: collecting file metadata...');
const discoveredFiles = [];
let processed = 0;
const totalFiles = selectedExtensions.reduce((sum, ext) => sum + (filesByExtension[ext]?.length || 0), 0);
// Process files concurrently but limit memory usage
const batchSize = 50;
for (const extension of selectedExtensions) {
const files = filesByExtension[extension] || [];
for (let i = 0; i < files.length; i += batchSize) {
const batch = files.slice(i, i + batchSize);
const batchResults = await Promise.all(
batch.map(fileInfo => this.enrichFileMetadata(fileInfo, extension, scanPath))
);
discoveredFiles.push(...batchResults.filter(Boolean));
processed += batch.length;
// Progress reporting with validation
const progress = (processed / totalFiles * 100).toFixed(1);
const validFiles = discoveredFiles.length;
const skippedFiles = processed - validFiles;
process.stdout.write(`\r📊 Discovery: ${progress}% (${validFiles} valid, ${skippedFiles} skipped)`);
// Internal validation
if (processed % 50 === 0) {
this.validateDiscoveryProgress(discoveredFiles, processed);
}
}
}
console.log(`\n✅ Discovery completed: ${discoveredFiles.length} files enriched`);
return discoveredFiles;
}
/**
* Enrich file with metadata including hash, tags, and analysis
* @param {object} fileInfo - Basic file info from scanner
* @param {string} extension - File extension
* @param {string} scanPath - Root scan path
* @returns {object} Enriched file metadata
*/
async enrichFileMetadata(fileInfo, extension, scanPath) {
try {
// Calculate SHA-256 hash in streaming mode
const hash = await this.calculateFileHash(fileInfo.absolutePath);
// Determine language and tags
const language = this.extensionToLanguage[extension] || 'Unknown';
const tags = this.extractFileTags(fileInfo.relativePath, extension);
// Basic file stats
const stats = await fs.stat(fileInfo.absolutePath);
return {
id: hash.substring(0, 16), // Use first 16 chars of hash as unique ID
path: fileInfo.relativePath,
absolutePath: fileInfo.absolutePath,
extension,
language,
size: stats.size,
hash: `sha256-${hash}`,
modified: stats.mtime.toISOString(),
tags,
// Will be populated during chunking
chunks: null,
// Metadata for processing
_stats: stats
};
} catch (error) {
console.warn(`⚠️ Could not process file ${fileInfo.relativePath}: ${error.message}`);
return null;
}
}
/**
* Calculate SHA-256 hash of file in streaming mode
* @param {string} filePath - File path
* @returns {string} SHA-256 hash (hex)
*/
async calculateFileHash(filePath) {
return new Promise((resolve, reject) => {
const hash = crypto.createHash('sha256');
const stream = createReadStream(filePath);
stream.on('data', data => hash.update(data));
stream.on('end', () => resolve(hash.digest('hex')));
stream.on('error', reject);
});
}
/**
* Extract file tags based on path heuristics
* @param {string} relativePath - Relative file path
* @param {string} extension - File extension
* @returns {Array} Array of tags
*/
extractFileTags(relativePath, extension) {
const tags = [];
const pathLower = relativePath.toLowerCase();
const fileName = path.basename(relativePath, extension).toLowerCase();
const fullPath = relativePath.toLowerCase();
// Path-based tags (enhanced)
if (pathLower.includes('/test/') || pathLower.includes('\\test\\')) tags.push('test');
if (pathLower.includes('/spec/') || pathLower.includes('\\spec\\')) tags.push('test');
if (pathLower.includes('/__tests__/') || pathLower.includes('\\__tests__\\')) tags.push('test');
if (pathLower.includes('/scripts/') || pathLower.includes('\\scripts\\')) tags.push('script');
if (pathLower.includes('/config/') || pathLower.includes('\\config\\')) tags.push('config');
if (pathLower.includes('/lib/') || pathLower.includes('\\lib\\')) tags.push('library');
if (pathLower.includes('/utils/') || pathLower.includes('\\utils\\')) tags.push('utility');
if (pathLower.includes('/helpers/') || pathLower.includes('\\helpers\\')) tags.push('utility');
// Framework-specific tags
if (pathLower.includes('/pages/') || pathLower.includes('\\pages\\')) tags.push('page');
if (pathLower.includes('/components/') || pathLower.includes('\\components\\')) tags.push('component');
if (pathLower.includes('/shared/') || pathLower.includes('\\shared\\')) tags.push('shared');
if (pathLower.includes('/common/') || pathLower.includes('\\common\\')) tags.push('shared');
if (pathLower.includes('/hooks/') || pathLower.includes('\\hooks\\')) tags.push('hook');
if (pathLower.includes('/services/') || pathLower.includes('\\services\\')) tags.push('service');
if (pathLower.includes('/api/') || pathLower.includes('\\api\\')) tags.push('api');
if (pathLower.includes('/routes/') || pathLower.includes('\\routes\\')) tags.push('route');
if (pathLower.includes('/controllers/') || pathLower.includes('\\controllers\\')) tags.push('controller');
if (pathLower.includes('/models/') || pathLower.includes('\\models\\')) tags.push('model');
if (pathLower.includes('/views/') || pathLower.includes('\\views\\')) tags.push('view');
if (pathLower.includes('/layouts/') || pathLower.includes('\\layouts\\')) tags.push('layout');
if (pathLower.includes('/middleware/') || pathLower.includes('\\middleware\\')) tags.push('middleware');
// Build and tooling
if (pathLower.includes('/build/') || pathLower.includes('\\build\\')) tags.push('build');
if (pathLower.includes('/dist/') || pathLower.includes('\\dist\\')) tags.push('build');
if (pathLower.includes('/.github/') || pathLower.includes('\\.github\\')) tags.push('ci');
if (pathLower.includes('/workflows/') || pathLower.includes('\\workflows\\')) tags.push('ci');
// Filename-based tags (enhanced)
if (fileName.includes('config')) tags.push('config');
if (fileName.includes('test') || fileName.includes('spec')) tags.push('test');
if (fileName.includes('index')) tags.push('entry');
if (fileName.includes('main')) tags.push('entry');
if (fileName.includes('app')) tags.push('application');
if (fileName.includes('component')) tags.push('component');
if (fileName.includes('page')) tags.push('page');
if (fileName.includes('layout')) tags.push('layout');
if (fileName.includes('service')) tags.push('service');
if (fileName.includes('util') || fileName.includes('helper')) tags.push('utility');
if (fileName.includes('hook')) tags.push('hook');
if (fileName.includes('api')) tags.push('api');
if (fileName.includes('route')) tags.push('route');
if (fileName.includes('model')) tags.push('model');
if (fileName.includes('controller')) tags.push('controller');
if (fileName.includes('middleware')) tags.push('middleware');
if (fileName.includes('store') || fileName.includes('state')) tags.push('state');
if (fileName.includes('context')) tags.push('context');
if (fileName.includes('provider')) tags.push('provider');
// Extension-based tags (enhanced)
if (['.test.js', '.spec.js', '.test.ts', '.spec.ts', '.test.tsx', '.spec.tsx'].some(ext => fullPath.endsWith(ext))) {
tags.push('test');
}
if (['.d.ts'].some(ext => fullPath.endsWith(ext))) {
tags.push('types');
}
if (['.stories.js', '.stories.ts', '.stories.tsx'].some(ext => fullPath.endsWith(ext))) {
tags.push('storybook');
}
if (['.cy.js', '.cy.ts'].some(ext => fullPath.endsWith(ext))) {
tags.push('e2e');
}
// Framework detection
if (extension === '.tsx' || extension === '.jsx') {
tags.push('react');
}
if (fullPath.includes('vue') || extension === '.vue') {
tags.push('vue');
}
if (fullPath.includes('angular') || fullPath.includes('.component.') || fullPath.includes('.service.')) {
tags.push('angular');
}
if (fullPath.includes('next') || fullPath.includes('_app.') || fullPath.includes('_document.')) {
tags.push('nextjs');
}
// Special files
if (['readme', 'license', 'changelog', 'contributing'].includes(fileName)) {
tags.push('documentation');
}
if (['dockerfile', 'docker-compose', '.dockerignore'].includes(fileName)) {
tags.push('docker');
}
if (['package.json', 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml'].includes(path.basename(relativePath))) {
tags.push('package');
}
if (['tsconfig.json', 'jsconfig.json', 'webpack.config.js', 'vite.config.js'].includes(path.basename(relativePath))) {
tags.push('config');
}
// Infrastructure files
if (extension === '.bat' || extension === '.cmd') {
tags.push('infrastructure', 'script', 'windows');
}
if (extension === '.sh') {
tags.push('infrastructure', 'script', 'unix');
}
if (extension === '.json' && (fileName.includes('config') || fileName.includes('settings') || fileName.includes('.config.'))) {
tags.push('infrastructure', 'config');
}
if (['makefile', 'makefile.am', 'cmake', 'cmakelists.txt'].includes(fileName)) {
tags.push('infrastructure', 'build');
}
return [...new Set(tags)]; // Remove duplicates
}
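// Example (illustrative): extractFileTags('src/components/__tests__/Button.test.tsx', '.tsx')
// returns ['test', 'component', 'react'] - the path, filename, and extension heuristics
// each fire, and the Set strips the duplicate 'test' entries.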
/**
* Initialize specialized handlers for different file types
* @returns {object} Handler registry
*/
initializeHandlers() {
return {
'code-c-like': new CLikeHandler(),
'code-script': new ScriptHandler(),
'markup': new MarkupHandler(),
'styling': new StylingHandler(),
'config-plain': new ConfigPlainHandler()
};
}
/**
* Get appropriate handler for file extension with full coverage
* @param {string} extension - File extension
* @returns {object} Handler instance
*/
getHandler(extension) {
// Complete mapping for all 21 target extensions
const handlerMap = {
// Code-C like (5 extensions)
'.c': 'code-c-like', '.h': 'code-c-like', '.cpp': 'code-c-like',
'.cs': 'code-c-like', '.java': 'code-c-like',
// Code-Script (7 extensions)
'.js': 'code-script', '.jsx': 'code-script', '.ts': 'code-script',
'.tsx': 'code-script', '.py': 'code-script', '.sh': 'code-script', '.bat': 'code-script',
// Markup (2 extensions)
'.html': 'markup', '.xml': 'markup',
// Styling (2 extensions)
'.css': 'styling', '.scss': 'styling',
// Config/Plain (5 extensions)
'.json': 'config-plain', '.yaml': 'config-plain', '.yml': 'config-plain',
'.md': 'config-plain', '.txt': 'config-plain'
};
const handlerType = handlerMap[extension];
if (!handlerType) {
console.warn(`⚠️ No handler found for extension: ${extension}`);
return this.handlers['config-plain']; // Fallback
}
return this.handlers[handlerType];
}
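// Example (illustrative): getHandler('.py') returns the shared ScriptHandler instance,
// while an unmapped extension such as '.rb' logs a warning and falls back to ConfigPlainHandler.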
/**
* Verify extension coverage against target list
* @param {Array} processedExtensions - Extensions found in processing
*/
verifyExtensionCoverage(processedExtensions) {
const targetExtensions = [
'.json', '.ts', '.js', '.jsx', '.tsx', '.xml', '.html', '.css', '.scss',
'.md', '.txt', '.py', '.java', '.cs', '.cpp', '.c', '.h', '.yaml', '.yml',
'.sh', '.bat'
]; // 21 total extensions
const missing = targetExtensions.filter(ext => !processedExtensions.includes(ext));
const extra = processedExtensions.filter(ext => !targetExtensions.includes(ext));
console.log(`\n📊 Extension Coverage Analysis:`);
console.log(` Target extensions: ${targetExtensions.length}`);
console.log(` Processed extensions: ${processedExtensions.length}`);
if (missing.length > 0) {
console.warn(` ⚠️ Missing: ${missing.join(', ')}`);
}
if (extra.length > 0) {
console.log(` ➕ Extra: ${extra.join(', ')}`);
}
if (missing.length === 0) {
console.log(` ✅ Full coverage achieved!`);
}
return {
targetCount: targetExtensions.length,
processedCount: processedExtensions.length,
missing,
extra,
coverage: ((targetExtensions.length - missing.length) / targetExtensions.length * 100).toFixed(1)
};
}
/**
* Improved token estimation using multiple heuristics
* @param {string} content - Text content
* @param {string} language - Programming language for context
* @returns {number} Estimated token count
*/
safeEstimateTokens(content, language = 'text') {
try {
if (typeof content !== 'string') {
console.warn('⚠️ Non-string content passed to token estimator');
return 0;
}
if (content.length === 0) return 0;
// Base estimation using multiple factors
const charCount = content.length;
const wordCount = content.trim().split(/\s+/).length;
const lineCount = content.split('\n').length;
// Language-specific adjustments
let tokensPerChar = 0.25; // Default: ~4 chars per token
let tokensPerWord = 1.3; // Default: ~1.3 tokens per word
// Adjust based on content type
if (['javascript', 'typescript', 'python', 'java', 'c++', 'c#'].includes(language.toLowerCase())) {
// Code tends to have more symbols and operators
tokensPerChar = 0.28;
tokensPerWord = 1.4;
// Additional tokens for common code patterns
const brackets = (content.match(/[{}()\[\]]/g) || []).length;
const operators = (content.match(/[+\-*/%=<>!&|^~]/g) || []).length;
const dots = (content.match(/\./g) || []).length;
const syntaxTokens = Math.ceil((brackets + operators + dots) * 0.15);
// Character-based estimation with syntax bonus
const charEstimate = Math.ceil(charCount * tokensPerChar) + syntaxTokens;
const wordEstimate = Math.ceil(wordCount * tokensPerWord);
return Math.max(charEstimate, wordEstimate);
} else if (['json', 'yaml', 'xml', 'html'].includes(language.toLowerCase())) {
// Structured data tends to be more compact in tokens
tokensPerChar = 0.22;
tokensPerWord = 1.1;
} else if (language.toLowerCase() === 'markdown') {
// Markdown has formatting symbols but is mostly text
tokensPerChar = 0.26;
tokensPerWord = 1.2;
}
// Calculate estimates using both methods
const charEstimate = Math.ceil(charCount * tokensPerChar);
const wordEstimate = Math.ceil(wordCount * tokensPerWord);
// Return the higher estimate for safety (avoid truncation)
return Math.max(charEstimate, wordEstimate, Math.ceil(charCount / 4));
} catch (error) {
console.warn(`⚠️ Token estimation error: ${error.message}`);
return Math.ceil((content?.length || 0) / 4);
}
}
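// Worked example (illustrative): a 400-char, 60-word JavaScript chunk containing 20
// brackets/operators/dots yields charEstimate = ceil(400 * 0.28) + ceil(20 * 0.15) = 115
// and wordEstimate = ceil(60 * 1.4) = 84, so 115 is returned (the higher estimate wins).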
/**
* Estimate token count using simple heuristic
* @param {string} content - Text content
* @returns {number} Estimated token count
*/
estimateTokens(content) {
return Math.ceil(content.length / 4);
}
/**
* Extract imports from content using simple regex
* @param {string} content - File content
* @param {string} extension - File extension
* @returns {Array} Array of import statements
*/
extractImports(content, extension) {
const imports = [];
let match; // declared before the switch so every case can assign it without a TDZ error
switch (extension) {
case '.js':
case '.jsx':
case '.ts':
case '.tsx':
// import ... from '...'
const importRegex = /import\s+.*?from\s+['"]([^'"]+)['"]/g;
while ((match = importRegex.exec(content)) !== null) {
imports.push(match[1]);
}
// require('...')
const requireRegex = /require\s*\(\s*['"]([^'"]+)['"]\s*\)/g;
while ((match = requireRegex.exec(content)) !== null) {
imports.push(match[1]);
}
break;
case '.py':
// import ... / from ... import ...
const pyImportRegex = /(?:from\s+(\S+)\s+import|import\s+(\S+))/g;
while ((match = pyImportRegex.exec(content)) !== null) {
imports.push(match[1] || match[2]);
}
break;
case '.c':
case '.cpp':
case '.h':
// #include "..." / #include <...>
const includeRegex = /#include\s*[<"]([^>"]+)[>"]/g;
while ((match = includeRegex.exec(content)) !== null) {
imports.push(match[1]);
}
break;
}
return [...new Set(imports)]; // Remove duplicates
}
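// Example (illustrative):
//   extractImports("import fs from 'fs';\nconst _ = require('lodash');", '.js')
//   returns ['fs', 'lodash'] - both ES module and CommonJS forms are captured.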
/**
* Extract function/method calls using simple regex
* @param {string} content - File content
* @param {string} extension - File extension
* @returns {Array} Array of function calls
*/
extractCalls(content, extension) {
const calls = [];
// Generic function call pattern: identifier followed by (
const callRegex = /\b([a-zA-Z_][a-zA-Z0-9_]*)\s*\(/g;
let match;
while ((match = callRegex.exec(content)) !== null) {
const funcName = match[1];
// Filter out language keywords
const keywords = ['if', 'for', 'while', 'switch', 'catch', 'typeof', 'return', 'new'];
if (!keywords.includes(funcName) && funcName.length > 1) {
calls.push(funcName);
}
}
// Return unique calls, limited to prevent noise
return [...new Set(calls)].slice(0, 20);
}
/**
* Validate generated JSON file
* @param {string} outputPath - Path to generated JSON
*/
async validateGeneratedJSON(outputPath) {
try {
// Check file exists and is readable
const stats = await fs.stat(outputPath);
// Check file size warnings
const maxSize = ragConfig.parseFileSize(this.yamlConfig?.quality?.maxOutputSize || '250MB');
if (stats.size > maxSize) {
console.warn(`⚠️ Generated file is large: ${this.formatFileSize(stats.size)} (>${this.formatFileSize(maxSize)})`);
}
// Simple validation: read first 1KB to check JSON structure
const stream = createReadStream(outputPath, { encoding: 'utf8', start: 0, end: 1023 });
let sampleText = '';
for await (const chunk of stream) {
sampleText += chunk;
}
// Check for basic JSON structure
if (!sampleText.trim().startsWith('{')) {
throw new Error('Generated file does not start with valid JSON');
}
// Check for expected structure
if (!sampleText.includes('"metadata"') || !sampleText.includes('"files"')) {
console.warn('⚠️ JSON structure may be incomplete - expected sections not found in sample');
}
console.log('✅ JSON validation passed');
} catch (error) {
console.error(`❌ JSON validation failed: ${error.message}`);
// Don't re-throw - just warn since file was successfully written
console.warn('⚠️ Continuing despite validation warning - file was generated successfully');
}
}
/**
* Format file size in human readable format
* @param {number} bytes - Size in bytes
* @returns {string} Formatted size string
*/
formatFileSize(bytes) {
const units = ['B', 'KB', 'MB', 'GB'];
let size = bytes;
let unitIndex = 0;
while (size >= 1024 && unitIndex < units.length - 1) {
size /= 1024;
unitIndex++;
}
return `${size.toFixed(1)} ${units[unitIndex]}`;
}
/**
* Validate discovery progress for quality assurance
* @param {Array} discoveredFiles - Files discovered so far
* @param {number} processedCount - Total files processed
*/
validateDiscoveryProgress(discoveredFiles, processedCount) {
const issues = [];
// Check for duplicate hashes
const hashes = new Set();
const duplicates = [];
for (const file of discoveredFiles) {
if (hashes.has(file.hash)) {
duplicates.push(file.hash.substring(0, 8));
} else {
hashes.add(file.hash);
}
}
if (duplicates.length > 0) {
issues.push(`Duplicate hashes detected: ${duplicates.join(', ')}`);
}
// Check file size distribution
const largeSizeThreshold = ragConfig.parseFileSize(this.yamlConfig?.performance?.maxFileSize || '100MB');
const largeFiles = discoveredFiles.filter(f => f.size > largeSizeThreshold);
if (largeFiles.length > 0) {
issues.push(`${largeFiles.length} files exceed size threshold`);
}
// Check tag distribution
const tagCounts = {};
discoveredFiles.forEach(file => {
file.tags.forEach(tag => {
tagCounts[tag] = (tagCounts[tag] || 0) + 1;
});
});
const untaggedFiles = discoveredFiles.filter(f => f.tags.length === 0);
if (untaggedFiles.length > discoveredFiles.length * 0.5) {
issues.push(`High untagged ratio: ${untaggedFiles.length}/${discoveredFiles.length}`);
}
// Report issues if any
if (issues.length > 0) {
console.warn(`\n⚠️ Discovery validation issues: ${issues.join(', ')}`);
}
}
/**
* Validate processing progress for quality assurance
* @param {Map} chunkOffsets - Current chunk offsets
*/
validateProcessingProgress(chunkOffsets) {
const issues = [];
// Check chunk size distribution
const chunkSizes = [];
for (const [chunkId, offsetData] of chunkOffsets.entries()) {
const size = offsetData.contentEnd - offsetData.contentStart;
chunkSizes.push(size);
}
if (chunkSizes.length > 0) {
const avgChunkSize = chunkSizes.reduce((a, b) => a + b, 0) / chunkSizes.length;
const maxChunkSize = Math.max(...chunkSizes);
const maxChunkThreshold = ragConfig.parseFileSize(this.yamlConfig?.quality?.maxChunkSize || '50KB');
if (maxChunkSize > maxChunkThreshold) {
issues.push(`Large chunk detected: ${this.formatFileSize(maxChunkSize)}`);
}
if (avgChunkSize < 100) {
issues.push(`Small average chunk size: ${this.formatFileSize(avgChunkSize)}`);
}
}
// Check offset consistency
let invalidOffsets = 0;
for (const [chunkId, offsetData] of chunkOffsets.entries()) {
if (offsetData.contentStart >= offsetData.contentEnd) {
invalidOffsets++;
}
}
if (invalidOffsets > 0) {
issues.push(`Invalid offsets: ${invalidOffsets} chunks`);
}
// Report issues if any
if (issues.length > 0) {
console.warn(`\n⚠️ Processing validation issues: ${issues.join(', ')}`);
}
}
/**
* Final validation of generated output with seek inverse testing
* @param {string} outputPath - Generated file path
* @param {Array} discoveredFiles - All processed files
* @param {Map} chunkOffsets - All chunk offsets
*/
async validateFinalOutput(outputPath, discoveredFiles, chunkOffsets) {
const issues = [];
try {
const stats = await fs.stat(outputPath);
// Check file size
const maxOutputSize = ragConfig.parseFileSize(this.yamlConfig?.quality?.maxOutputSize || '250MB');
if (stats.size > maxOutputSize) {
issues.push(`Output size (${this.formatFileSize(stats.size)}) exceeds threshold`);
}
// Check completeness
const expectedChunks = discoveredFiles.reduce((sum, file) => sum + (file.chunks?.length || 0), 0);
const actualChunks = chunkOffsets.size;
if (expectedChunks !== actualChunks) {
issues.push(`Chunk count mismatch: expected ${expectedChunks}, got ${actualChunks}`);
}
// Check for empty chunks
const emptyChunks = Array.from(chunkOffsets.values()).filter(offset =>
offset.contentEnd - offset.contentStart < 10
).length;
if (emptyChunks > 0) {
issues.push(`${emptyChunks} near-empty chunks detected`);
}
// SEEK INVERSE TESTING - Test random chunk offsets
await this.validateSeekInverse(outputPath, chunkOffsets);
// JSON Schema validation (basic)
await this.validateJsonStructure(outputPath);
// Report final validation
if (issues.length > 0) {
console.warn(`\n⚠️ Final validation issues:`);
issues.forEach(issue => console.warn(` • ${issue}`));
} else {
console.log('✅ Final validation passed - output is healthy');
}
} catch (error) {
console.error(`❌ Final validation failed: ${error.message}`);
}
}
/**
* Test seek operations on random chunk offsets to verify accuracy
* @param {string} outputPath - Generated JSON file path
* @param {Map} chunkOffsets - Chunk offset map
*/
async validateSeekInverse(outputPath, chunkOffsets) {
const chunkIds = Array.from(chunkOffsets.keys());
const testCount = Math.min(3, chunkIds.length); // Test up to 3 random chunks
if (testCount === 0) {
console.warn('⚠️ No chunks to test for seek validation');
return;
}
console.log(`🔍 Testing seek inverse on ${testCount} random chunks...`);
for (let i = 0; i < testCount; i++) {
const randomIndex = Math.floor(Math.random() * chunkIds.length);
const chunkId = chunkIds[randomIndex];
const offsetData = chunkOffsets.get(chunkId);
try {
// Read the specific chunk content using simple file read
const fullContent = await fs.readFile(outputPath, 'utf8');
const seekContent = fullContent.slice(offsetData.contentStart, offsetData.contentEnd);
// Verify it's valid JSON content (should be a JSON string value)
try {
// Try to parse as JSON - if it's valid JSON string content, this should work
const parsed = JSON.parse(seekContent);
if (typeof parsed === 'string') {
console.log(` ✅ Chunk ${chunkId}: seek successful, valid JSON string (${seekContent.length} chars)`);
} else {
console.log(` ✅ Chunk ${chunkId}: seek successful, valid JSON (${typeof parsed}, ${seekContent.length} chars)`);
}
} catch (parseError) {
// If it doesn't parse as JSON, it might be a partial chunk
console.log(` ✅ Chunk ${chunkId}: seek successful, partial content (${seekContent.length} chars)`);
}
} catch (error) {
console.error(` ❌ Chunk ${chunkId}: seek failed - ${error.message}`);
}
}
}
/**
* Basic JSON structure validation
* @param {string} outputPath - Generated JSON file path
*/
async validateJsonStructure(outputPath) {
try {
// Read full content for validation (simpler approach)
const fullContent = await fs.readFile(outputPath, 'utf8');
const startText = fullContent.slice(0, 1024).trim();
const endText = fullContent.slice(-1024).trim();
// Basic structure checks
const issues = [];
if (!startText.startsWith('{')) {
issues.push('File does not start with {');
}
if (!endText.endsWith('}')) {
issues.push('File does not end with }');
}
if (!startText.includes('"metadata"')) {
issues.push('Missing metadata section');
}
if (!startText.includes('"files"')) {
issues.push('Missing files section');
}
if (!fullContent.includes('"index"')) {
issues.push('Missing index section');
}
if (issues.length === 0) {
console.log('✅ JSON structure validation passed');
} else {
console.warn(`⚠️ JSON structure issues: ${issues.join(', ')}`);
}
} catch (error) {
console.error(`❌ JSON structure validation failed: ${error.message}`);
}
}
/**
* Generate RAG output atomically - build the complete structure in memory and write it in one pass
*/
async generate(discoveredFiles, outputPath, projectName, scanPath) {
console.log('📝 Atomic generation: processing all files in memory...');
await fs.ensureDir(path.dirname(outputPath));
const processedFiles = [];
let totalChunks = 0;
for (let i = 0; i < discoveredFiles.length; i++) {
const fileData = discoveredFiles[i];
const progress = ((i + 1) / discoveredFiles.length * 100).toFixed(1);
process.stdout.write(`\r📊 Processing: ${progress}% (${i + 1}/${discoveredFiles.length})`);
try {
const processedFile = await this.processFileInMemory(fileData);
processedFiles.push(processedFile);
totalChunks += processedFile.chunks?.length || 0;
this.stats.filesProcessed++;
this.stats.chunksGenerated += processedFile.chunks?.length || 0;
} catch (error) {
console.warn(`\n⚠️ Error processing ${fileData.path}: ${error.message}`);
this.errors.push({ file: fileData.path, error: error.message });
processedFiles.push({ ...fileData, chunks: [], error: error.message });
}
}
console.log(`\n✅ All files processed: ${processedFiles.length} files, ${totalChunks} chunks`);
const completeJSON = this.buildCompleteJSON(processedFiles, projectName, scanPath);
const finalJSON = this.calculateAndInjectOffsets(completeJSON);
await fs.writeFile(outputPath, finalJSON, 'utf8');
this.stats.bytesWritten = finalJSON.length;
if (this.yamlConfig?.output?.validation) {
console.log('🔍 Validating generated output...');
await this.validateGeneratedJSON(outputPath);
}
console.log(`✅ JSON written successfully to ${outputPath}`);
return {
outputPath,
totalFiles: processedFiles.length,
totalChunks,
bytesWritten: finalJSON.length,
extensionCoverage: this.verifyExtensionCoverage([...new Set(processedFiles.map(f => f.extension))])
};
}
async processFileInMemory(fileData) {
const content = await fs.readFile(fileData.absolutePath, 'utf8');
const handler = this.getHandler(fileData.extension);
console.log(`🔍 ${handler.constructor.name} processing ${fileData.extension} file: ${fileData.path}`);
const chunks = await handler.generateChunks(content, {
fileId: fileData.id,
filePath: fileData.path,
extension: fileData.extension,
language: fileData.language,
maxTokens: this.maxTokensPerChunk,
overlap: this.overlapTokens
});
if (chunks.length > 0) {
console.log(` 📝 Found ${chunks.length} semantic chunks`);
}
const enrichedChunks = chunks.map((chunk, index) => ({
...chunk,
id: `chunk_${fileData.id}_${index}`,
tokenEstimate: this.safeEstimateTokens(chunk.content, fileData.language),
imports: this.extractImports(chunk.content, fileData.extension),
calls: this.extractCalls(chunk.content, fileData.extension)
}));
return {
id: fileData.id,
path: fileData.path,
language: fileData.language,
extension: fileData.extension,
size: fileData.size,
lines: content.split('\n').length,
hash: fileData.hash,
modified: fileData.modified,
tags: fileData.tags,
chunks: enrichedChunks
};
}
buildCompleteJSON(processedFiles, projectName, scanPath) {
const totalChunks = processedFiles.reduce((sum, file) => sum + (file.chunks?.length || 0), 0);
const emptyFiles = processedFiles.filter(f => (f.chunks?.length || 0) === 0).length;
this.stats.endTime = Date.now();
const processingTimeMs = Math.max(1, this.stats.endTime - this.stats.startTime);
return {
metadata: {
projectName,
generatedAt: new Date().toISOString(),
scanPath,
generator: 'CodeSummary RAG Generator',
version: '3.1.0',
config: {
maxTokensPerChunk: this.maxTokensPerChunk,
overlapTokens: this.overlapTokens,
tokenEstimationMethod: 'enhanced_heuristic_v1.0'
},
summary: {
totalFiles: processedFiles.length,
languages: [...new Set(processedFiles.map(f => f.language))],
extensions: [...new Set(processedFiles.map(f => f.extension))]
},
schemaVersion: "1.0",
schemaUrl: "https://github.com/skamoll/CodeSummary/schemas/rag-output.json"
},
files: processedFiles,
index: {
version: "3.1.0",
generatedAt: new Date().toISOString(),
schemaUrl: "https://github.com/skamoll/CodeSummary/schemas/rag-output.json",
summary: {
fileCount: processedFiles.length - emptyFiles,
chunkCount: totalChunks,
totalBytes: 0,
languages: [...new Set(processedFiles.map(f => f.language))],
extensions: [...new Set(processedFiles.map(f => f.extension))],
avgFileSize: 0,
avgChunksPerFile: processedFiles.length > 0 ? Math.round(totalChunks / processedFiles.length) : 0
},
chunkOffsets: {},
fileOffsets: {},
seekInfo: {
instructions: "Use chunkOffsets[chunkId].contentStart and contentEnd to seek directly to chunk content",
format: "All offsets are absolute byte positions in this JSON file",
chunkFormat: "Object with jsonStart, jsonEnd, contentStart, contentEnd (absolute JSON positions)",
fileFormat: "Array [start, end] for each file in JSON"
},
statistics: {
processingTimeMs,
bytesPerSecond: 0,
bytesWritten: 0,
chunksWithValidOffsets: totalChunks,
filesWithValidOffsets: processedFiles.length - emptyFiles,
totalFiles: processedFiles.length,
emptyFiles: emptyFiles,
totalChunksGenerated: totalChunks,
errors: this.errors
}
}
};
}
/**
* Finalizes the JSON structure by calculating and injecting the correct offsets.
* Operating on the final JSON string guarantees maximum offset precision.
* @param {object} jsonStructure - The complete JSON object with data but no offsets.
* @returns {string} The final JSON string, formatted and with precise offsets.
*/
calculateAndInjectOffsets(jsonStructure) {
console.log('🔍 Calculating precise byte offsets and building complete index...');
// STEP 1: Build a preliminary JSON without the index to measure exact positions
// (the index is serialized after "files", so appending it later does not shift these offsets)
const jsonWithoutIndex = {
metadata: jsonStructure.metadata,
files: jsonStructure.files
};
const preliminaryJsonString = JSON.stringify(jsonWithoutIndex, null, 2);
// STEP 2: Calculate precise file and chunk offsets
const fileOffsets = {};
const chunkOffsets = {};
let totalChunks = 0;
let validChunks = 0;
for (const file of jsonStructure.files) {
// Locate the start of this file object by its ID
const filePattern = `"id": "${file.id}"`;
const fileStartPos = preliminaryJsonString.indexOf(filePattern);
if (fileStartPos !== -1) {
// Locate the approximate end of the file object (the start of the next one)
const nextFilePattern = preliminaryJsonString.indexOf('    {\n      "id":', fileStartPos + 1);
const fileEndPos = nextFilePattern !== -1 ? nextFilePattern : preliminaryJsonString.lastIndexOf('  ]');
// Schema format: fileId -> [start, end]
fileOffsets[file.id] = [fileStartPos, fileEndPos];
// Calculate offsets for the chunks inside this file
for (const chunk of file.chunks) {
const chunkPattern = `"id": "${chunk.id}"`;
const chunkStartPos = preliminaryJsonString.indexOf(chunkPattern, fileStartPos);
if (chunkStartPos !== -1) {
// Find the "content" field inside this chunk
const contentPattern = '"content": "';
const contentStartSearch = preliminaryJsonString.indexOf(contentPattern, chunkStartPos);
if (contentStartSearch !== -1) {
const contentStart = contentStartSearch + contentPattern.length;
// Find the end of the content (the closing quote of the JSON string)
let contentEnd = contentStart;
let inEscape = false;
for (let i = contentStart; i < preliminaryJsonString.length; i++) {
const char = preliminaryJsonString[i];
if (inEscape) {
inEscape = false;
continue;
}
if (char === '\\') {
inEscape = true;
continue;
}
if (char === '"') {
contentEnd = i;
break;
}
}
// Find the end of the complete chunk object
const chunkEndPattern = '},';
const chunkEndSearch = preliminaryJsonString.indexOf(chunkEndPattern, contentEnd);
const chunkEnd = chunkEndSearch !== -1 ? chunkEndSearch + 1 : contentEnd + 100;
// Schema format: chunkId -> object with precise offsets
chunkOffsets[chunk.id] = {
jsonStart: chunkStartPos,
jsonEnd: chunkEnd,
contentStart: contentStart,
contentEnd: contentEnd,
filePath: file.path
};
validChunks++;
}
}
totalChunks++;
}
}
}
// STEP 3: Build the complete statistics
const processingTimeMs = Math.max(1, this.stats.endTime - this.stats.startTime);
const emptyFiles = jsonStructure.files.filter(f => f.chunks.length === 0).length;
// STEP 4: Build the complete index block per the schema
const indexBlock = {
version: "3.1.0",
generatedAt: new Date().toISOString(),
schemaUrl: "https://github.com/skamoll/CodeSummary/schemas/rag-output.json",
summary: {
fileCount: jsonStructure.files.length - emptyFiles,
chunkCount: totalChunks,
totalBytes: 0, // Updated below
languages: [...new Set(jsonStructure.files.map(f => f.language))],
extensions: [...new Set(jsonStructure.files.map(f => f.extension))],
avgFileSize: 0, // Updated below
avgChunksPerFile: jsonStructure.files.length > 0 ? Math.round(totalChunks / jsonStructure.files.length) : 0
},
chunkOffsets: chunkOffsets,
fileOffsets: fileOffsets,
seekInfo: {
instructions: "Use chunkOffsets[chunkId].contentStart and contentEnd to seek directly to chunk content",
format: "All offsets are absolute byte positions in this JSON file",
chunkFormat: "Object with jsonStart, jsonEnd, contentStart, contentEnd (absolute JSON positions)",
fileFormat: "Array [start, end] for each file in JSON",
validation: `Generated with ${validChunks} chunks across ${Object.keys(fileOffsets).length} files`
},
statistics: {
processingTimeMs,
bytesPerSecond: 0, // Updated below
bytesWritten: 0, // Updated below
chunksWithValidOffsets: validChunks,
filesWithValidOffsets: Object.keys(fileOffsets).length,
totalFiles: jsonStructure.files.length,
emptyFiles: emptyFiles,
totalChunksGenerated: totalChunks
}
};
// STEP 5: Build the final JSON with the index and compute final metrics
const completeStructure = {
metadata: jsonStructure.metadata,
files: jsonStructure.files,
index: indexBlock
};
const finalJsonString = JSON.stringify(completeStructure, null, 2);
const finalBytes = Buffer.byteLength(finalJsonString, 'utf8');
const bytesPerSecond = Math.round(finalBytes / (processingTimeMs / 1000));
// Update the final metrics in the index
completeStructure.index.summary.totalBytes = finalBytes;
completeStructure.index.summary.avgFileSize = jsonStructure.files.length > 0 ?
Math.round(finalBytes / jsonStructure.files.length) : 0;
completeStructure.index.statistics.bytesPerSecond = bytesPerSecond;
completeStructure.index.statistics.bytesWritten = finalBytes;
// STEP 6: Regenerate the final JSON with the updated statistics
const finalResult = JSON.stringify(completeStructure, null, 2);
console.log(`✅ Complete index built: ${Object.keys(fileOffsets).length} files, ${validChunks}/${totalChunks} chunks with precise offsets`);
console.log(`✅ Final JSON: ${this.formatFileSize(Buffer.byteLength(finalResult, 'utf8'))}, processing: ${processingTimeMs}ms`);
return finalResult;
}
}
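// Consumer-side seek sketch (hypothetical; not part of this module). Offsets in
// index.chunkOffsets are string positions in the generated JSON (byte positions only
// when the output is pure ASCII), so the safe pattern mirrors validateSeekInverse:
//   const text = await fs.readFile(outputPath, 'utf8');
//   const { contentStart, contentEnd } = cachedIndex.chunkOffsets[chunkId]; // index parsed once elsewhere
//   const raw = text.slice(contentStart, contentEnd); // still a JSON-escaped string body
//   const content = JSON.parse('"' + raw + '"'); // unescape to recover the original chunk text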
// Specialized Handler Classes
class BaseHandler {
async generateChunks(content, options) {
// Fallback: split by lines if no specific logic
return this.chunkByLines(content, options);
}
chunkByLines(content, options) {
const lines = content.split('\n');
const chunks = [];
const maxLines = Math.ceil(options.maxTokens / 20); // ~20 tokens per line estimate
for (let i = 0; i < lines.length; i += maxLines) {
const chunkLines = lines.slice(i, Math.min(i + maxLines, lines.length));
const chunkContent = chunkLines.join('\n');
chunks.push({
content: chunkContent,
lineStart: i + 1,
lineEnd: Math.min(i + maxLines, lines.length),
chunkingMethod: 'line-based'
});
}
return chunks;
}
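// Example (illustrative): with the default maxTokens of 1000, maxLines = ceil(1000 / 20) = 50,
// so a 120-line file yields chunks covering lines 1-50, 51-100, and 101-120.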
/**
* Estimate token count for chunking decisions
* @param {string} content - Text content
* @returns {number} Estimated token count
*/
estimateTokens(content) {
return Math.ceil(content.length / 4);
}
}
class CLikeHandler extends BaseHandler {
async generateChunks(content, options) {
// Find class/struct/function boundaries
const boundaries = this.findCodeBoundaries(content);
if (boundaries.length > 0) {
return this.chunkByBoundaries(content, boundaries, options);
}
// Fallback to line-based chunking
return this.chunkByLines(content, options);
}
findCodeBoundaries(content) {
const boundaries = [];
const boundaryRegex = /^(?:class|struct|enum|union|static)?\s*([a-zA-Z_][\w]*)\s*.*{/gm;
let match;
while ((match = boundaryRegex.exec(content)) !== null) {
const lineNumber = content.substring(0, match.index).split('\n').length;
boundaries.push({
name: match[1],
line: lineNumber,
type: 'function'
});
}
return boundaries;
}
chunkByBoundaries(content, boundaries, options) {
const lines = content.split('\n');
const chunks = [];
let currentStart = 0;
for (const boundary of boundaries) {
if (currentStart < boundary.line - 1) {
const chunkLines = lines.slice(currentStart, boundary.line - 1);
if (chunkLines.length > 0) {
chunks.push({
content: chunkLines.join('\n'),
lineStart: currentStart + 1,
lineEnd: boundary.line - 1
});
}
}
currentStart = boundary.line - 1;
}
// Add remaining lines
if (currentStart < lines.length) {
const chunkLines = lines.slice(currentStart);
chunks.push({
content: chunkLines.join('\n'),
lineStart: currentStart + 1,
lineEnd: lines.length
});
}
return chunks;
}
}
class ScriptHandler extends BaseHandler {
async generateChunks(content, options) {
console.log(`🔍 ScriptHandler processing ${options.extension} file: ${options.filePath}`);
// ALWAYS try semantic chunking first for script files