codesummary
Version:
Cross-platform CLI tool that generates professional PDF documentation and RAG-optimized JSON outputs from project source code. Perfect for code reviews, audits, documentation, and AI/ML applications with semantic chunking and precision offsets.
468 lines (409 loc) • 16.1 kB
JavaScript
import fs from 'fs-extra';
import path from 'path';
import chalk from 'chalk';
import ErrorHandler from './errorHandler.js';
/** Largest individual file (in bytes) that the scanner will include: 100MB. */
const MAX_INDIVIDUAL_FILE_SIZE = 100 * 1024 * 1024;

/**
 * Directory names (compared lower-cased) that are always skipped, even when
 * they are missing from the configured exclude list.
 * NOTE(review): the dot-prefixed entries are already caught by the
 * hidden-directory rule in shouldSkipDirectory; they are kept for parity.
 */
const COMMON_SKIP_DIRS = new Set([
  'tmp', 'temp', 'cache', '.cache', 'logs', '.logs',
  'bower_components', 'vendor', '.vendor'
]);

/** Dot-prefixed file names that are not treated as hidden files. */
const ALLOWED_HIDDEN_FILES = new Set([
  '.gitignore', '.gitattributes', '.editorconfig',
  '.eslintrc.js', '.eslintrc.json', '.prettierrc',
  '.env.example', '.htaccess'
]);

/** Dot-prefixed directory names that are still traversed. */
const ALLOWED_HIDDEN_DIRS = new Set([
  '.github', '.gitlab', '.circleci'
]);

/** Human readable names for the extensions reported by getExtensionInfo. */
const EXTENSION_DESCRIPTIONS = Object.freeze({
  '.js': 'JavaScript',
  '.ts': 'TypeScript',
  '.jsx': 'React JSX',
  '.tsx': 'TypeScript JSX',
  '.json': 'JSON',
  '.xml': 'XML',
  '.html': 'HTML',
  '.css': 'CSS',
  '.scss': 'SCSS',
  '.sass': 'Sass',
  '.md': 'Markdown',
  '.txt': 'Text',
  '.py': 'Python',
  '.java': 'Java',
  '.cs': 'C#',
  '.cpp': 'C++',
  '.c': 'C',
  '.h': 'Header',
  '.yaml': 'YAML',
  '.yml': 'YAML',
  '.sh': 'Shell Script',
  '.bat': 'Batch File',
  '.ps1': 'PowerShell',
  '.php': 'PHP',
  '.rb': 'Ruby',
  '.go': 'Go',
  '.rs': 'Rust',
  '.swift': 'Swift',
  '.kt': 'Kotlin',
  '.scala': 'Scala',
  '.vue': 'Vue.js',
  '.svelte': 'Svelte',
  '.dockerfile': 'Dockerfile',
  '.sql': 'SQL',
  '.graphql': 'GraphQL'
});

/** Units used by formatFileSize, smallest first. */
const SIZE_UNITS = ['B', 'KB', 'MB', 'GB'];

/** Characters that carry special meaning inside a regular expression. */
const REGEX_METACHARACTERS = /[.*+?^${}()|[\]\\]/g;

/**
 * File Scanner for CodeSummary
 * Handles recursive directory traversal and file filtering
 */
export class Scanner {
  /**
   * @param {object} config - Scanner configuration
   * @param {string[]} config.allowedExtensions - Extensions to include; compared lower-cased
   *   against the result of path.extname, so each entry needs its leading dot
   * @param {Iterable<string>} [config.excludeDirs] - Directory names to skip (exact match)
   * @param {string[]} [config.excludeFiles] - File name patterns to skip (`*` wildcard)
   */
  constructor(config) {
    this.config = config;
    this.allowedExtensions = new Set(config.allowedExtensions.map(ext => ext.toLowerCase()));
    this.excludeDirs = new Set(config.excludeDirs);
    this.excludeFiles = config.excludeFiles || [];
  }

  /**
   * Scan a directory recursively and return files grouped by extension.
   * Problems met below the root are collected and printed through
   * reportScanIssues instead of being thrown.
   * @param {string} rootPath - Root directory to scan
   * @returns {Promise<object>} Object with extensions as keys and file arrays as values;
   *   each array is sorted by relative path
   * @throws {Error} When rootPath is not a non-empty string, does not exist, is not a
   *   directory, or cannot be accessed
   */
  async scanDirectory(rootPath) {
    const scanErrors = [];
    const scanWarnings = [];

    try {
      // For scanner paths, we only need basic validation (no aggressive sanitization)
      if (!rootPath || typeof rootPath !== 'string') {
        throw new Error('Invalid root path: must be a non-empty string');
      }

      // Just resolve the path and validate it exists
      const resolvedRoot = path.resolve(rootPath);
      const stats = await fs.stat(resolvedRoot);
      if (!stats.isDirectory()) {
        throw new Error(`Path is not a directory: ${resolvedRoot}`);
      }

      console.log(chalk.gray(`Scanning directory: ${resolvedRoot}`));

      const filesByExtension = {};
      const scannedFiles = new Set(); // Prevent duplicates
      const scanContext = {
        errors: scanErrors,
        warnings: scanWarnings,
        skippedDirectories: 0,
        skippedFiles: 0,
        processedFiles: 0
      };

      await this.walkDirectory(resolvedRoot, resolvedRoot, filesByExtension, scannedFiles, scanContext);

      // Sort files within each extension group
      for (const files of Object.values(filesByExtension)) {
        files.sort((a, b) => a.relativePath.localeCompare(b.relativePath));
      }

      // Report scan summary with warnings/errors
      this.reportScanIssues(scanContext);

      return filesByExtension;
    } catch (error) {
      // Translate the common filesystem failures into friendlier messages while
      // keeping the underlying error reachable through `cause`.
      if (error.code === 'ENOENT') {
        throw new Error(`Directory does not exist: ${rootPath}`, { cause: error });
      }
      if (error.code === 'EACCES') {
        throw new Error(`Permission denied accessing directory: ${rootPath}`, { cause: error });
      }
      throw error;
    }
  }

  /**
   * Recursively walk through directory structure.
   * Failures are recorded in scanContext rather than thrown, so one unreadable
   * directory does not abort the rest of the scan.
   * @param {string} currentPath - Current directory being processed
   * @param {string} rootPath - Original root path for relative path calculation
   * @param {object} filesByExtension - Accumulator object for results
   * @param {Set} scannedFiles - Set to track processed files and avoid duplicates
   * @param {object} scanContext - Context object to track scan statistics
   * @returns {Promise<void>}
   */
  async walkDirectory(currentPath, rootPath, filesByExtension, scannedFiles, scanContext) {
    try {
      const entries = await fs.readdir(currentPath, { withFileTypes: true });

      for (const entry of entries) {
        const fullPath = path.join(currentPath, entry.name);
        const relativePath = path.relative(rootPath, fullPath);

        if (entry.isDirectory()) {
          // Skip excluded directories and hidden directories (unless explicitly allowed)
          if (this.shouldSkipDirectory(entry.name, relativePath)) {
            scanContext.skippedDirectories++;
            continue;
          }
          // Recursively scan subdirectory
          await this.walkDirectory(fullPath, rootPath, filesByExtension, scannedFiles, scanContext);
        } else if (entry.isFile()) {
          // Process file if it matches criteria
          await this.processFile(fullPath, rootPath, filesByExtension, scannedFiles, scanContext);
        } else if (entry.isSymbolicLink()) {
          // Symbolic links are never followed; they are only noted as a warning
          scanContext.warnings.push(`Skipped symbolic link: ${relativePath}`);
        }
        // Any other entry type falls through and is ignored
      }
    } catch (error) {
      // Track errors in context for better reporting
      const relativePath = path.relative(rootPath, currentPath);

      if (error.code === 'EACCES' || error.code === 'EPERM') {
        scanContext.errors.push(`Permission denied: ${relativePath}`);
      } else if (error.code === 'ENOENT') {
        scanContext.warnings.push(`Directory no longer exists: ${relativePath}`);
      } else if (error.code === 'ENOTDIR') {
        scanContext.warnings.push(`Path is not a directory: ${relativePath}`);
      } else {
        scanContext.errors.push(`Cannot read directory ${relativePath}: ${error.message}`);
      }
    }
  }

  /**
   * Process a single file and add it to results if it matches criteria.
   * Every file that is looked at but not added increments
   * scanContext.skippedFiles; a path already present in scannedFiles is
   * ignored without being counted.
   * @param {string} fullPath - Absolute path to the file
   * @param {string} rootPath - Root path for relative calculation
   * @param {object} filesByExtension - Results accumulator
   * @param {Set} scannedFiles - Set of already processed files
   * @param {object} scanContext - Context object to track scan statistics
   * @returns {Promise<void>}
   */
  async processFile(fullPath, rootPath, filesByExtension, scannedFiles, scanContext) {
    try {
      const relativePath = path.relative(rootPath, fullPath);

      // Avoid processing the same file twice
      if (scannedFiles.has(fullPath)) {
        return;
      }
      scannedFiles.add(fullPath);

      const extension = path.extname(relativePath).toLowerCase();

      // Skip files without extensions or not in allowed list.
      // NOTE(review): path.extname returns '' for names such as '.gitignore', so
      // allow-listed dotfiles that have no further extension are dropped here
      // before the hidden-file allow-list below is consulted - confirm intent.
      if (!extension || !this.allowedExtensions.has(extension)) {
        scanContext.skippedFiles++;
        return;
      }

      // Skip hidden files (starting with .) unless explicitly needed
      const fileName = path.basename(relativePath);
      if (fileName.startsWith('.') && !this.isAllowedHiddenFile(fileName)) {
        scanContext.skippedFiles++;
        return;
      }

      // Check if file should be excluded by pattern (e.g., *-lock.json)
      if (this.shouldExcludeFile(fileName)) {
        scanContext.skippedFiles++;
        return;
      }

      // stat() both proves the file is reachable and supplies size / mtime
      const stats = await fs.stat(fullPath);
      if (!stats.isFile()) {
        scanContext.warnings.push(`Skipped non-regular file: ${relativePath}`);
        // Counted like every other skipped file so the statistics stay consistent
        scanContext.skippedFiles++;
        return;
      }

      // Check file size limits
      if (stats.size > MAX_INDIVIDUAL_FILE_SIZE) {
        scanContext.warnings.push(`Skipped large file (${Math.round(stats.size / 1024 / 1024)}MB): ${relativePath}`);
        scanContext.skippedFiles++;
        return;
      }

      // Add to results
      if (!filesByExtension[extension]) {
        filesByExtension[extension] = [];
      }
      filesByExtension[extension].push({
        relativePath: relativePath.replace(/\\/g, '/'), // Normalize path separators
        absolutePath: fullPath,
        size: stats.size,
        modified: stats.mtime
      });
      scanContext.processedFiles++;
    } catch (error) {
      // Handle file processing errors with appropriate context
      const relativePath = path.relative(rootPath, fullPath);

      if (error.code === 'EACCES' || error.code === 'EPERM') {
        scanContext.errors.push(`Permission denied: ${relativePath}`);
      } else if (error.code === 'ENOENT') {
        // File might have been deleted during scan
        scanContext.warnings.push(`File no longer exists: ${relativePath}`);
      } else if (error.code === 'EISDIR') {
        scanContext.warnings.push(`Path is a directory, not a file: ${relativePath}`);
      } else {
        scanContext.errors.push(`Cannot process file ${relativePath}: ${error.message}`);
      }
      scanContext.skippedFiles++;
    }
  }

  /**
   * Determine if a directory should be skipped.
   * @param {string} dirName - Directory name
   * @param {string} relativePath - Relative path from root (currently unused)
   * @returns {boolean} True if directory should be skipped
   */
  shouldSkipDirectory(dirName, relativePath) {
    // Skip directories in exclude list (exact, case-sensitive match)
    if (this.excludeDirs.has(dirName)) {
      return true;
    }

    // Skip hidden directories (starting with .) unless explicitly allowed
    if (dirName.startsWith('.') && !this.isAllowedHiddenDirectory(dirName)) {
      return true;
    }

    // Skip common build/cache directories that might not be in exclude list
    return COMMON_SKIP_DIRS.has(dirName.toLowerCase());
  }

  /**
   * Check if a file should be excluded based on patterns.
   * @param {string} fileName - File name to check
   * @returns {boolean} True if any configured exclude pattern matches
   */
  shouldExcludeFile(fileName) {
    return this.excludeFiles.some(pattern => this.matchesPattern(fileName, pattern));
  }

  /**
   * Simple glob pattern matching.
   * Only `*` is a wildcard (any run of characters, possibly empty). Every other
   * character, including regex metacharacters such as `+`, `?`, `(` or `[`, is
   * matched literally. Apart from the exact-match shortcut the comparison is
   * case-insensitive.
   * @param {string} fileName - File name to test
   * @param {string} pattern - Pattern to match (supports * wildcards)
   * @returns {boolean} True if pattern matches
   */
  matchesPattern(fileName, pattern) {
    // Exact match
    if (pattern === fileName) {
      return true;
    }

    // Escape each literal segment, then join the segments with the wildcard
    const regexSource = pattern
      .split('*')
      .map(segment => segment.replace(REGEX_METACHARACTERS, '\\$&'))
      .join('.*');

    return new RegExp(`^${regexSource}$`, 'i').test(fileName);
  }

  /**
   * Check if a hidden file should be included.
   * @param {string} fileName - File name
   * @returns {boolean} True if file should be included
   */
  isAllowedHiddenFile(fileName) {
    return ALLOWED_HIDDEN_FILES.has(fileName);
  }

  /**
   * Check if a hidden directory should be included.
   * @param {string} dirName - Directory name
   * @returns {boolean} True if directory should be included
   */
  isAllowedHiddenDirectory(dirName) {
    return ALLOWED_HIDDEN_DIRS.has(dirName);
  }

  /**
   * Get file extension descriptions for user display.
   * @param {object} filesByExtension - Files grouped by extension
   * @returns {Array} Array of `{ extension, description, count, files }` objects sorted by
   *   extension; extensions without a known description are labelled 'Unknown'
   */
  getExtensionInfo(filesByExtension) {
    return Object.keys(filesByExtension)
      .sort()
      .map(ext => ({
        extension: ext,
        description: EXTENSION_DESCRIPTIONS[ext] || 'Unknown',
        count: filesByExtension[ext].length,
        files: filesByExtension[ext]
      }));
  }

  /**
   * Calculate total statistics for scanned files.
   * @param {object} filesByExtension - Files grouped by extension
   * @returns {object} Statistics object with totalFiles, totalSize, extensionCount,
   *   averageFileSize (rounded, 0 when there are no files) and totalSizeFormatted
   */
  calculateStatistics(filesByExtension) {
    let totalFiles = 0;
    let totalSize = 0;

    for (const files of Object.values(filesByExtension)) {
      totalFiles += files.length;
      totalSize += files.reduce((sum, file) => sum + file.size, 0);
    }

    return {
      totalFiles,
      totalSize,
      extensionCount: Object.keys(filesByExtension).length,
      averageFileSize: totalFiles > 0 ? Math.round(totalSize / totalFiles) : 0,
      totalSizeFormatted: this.formatFileSize(totalSize)
    };
  }

  /**
   * Format file size in human readable format.
   * @param {number} bytes - Size in bytes
   * @returns {string} Size with one decimal and a unit from B to GB; values beyond
   *   the GB range stay expressed in GB
   */
  formatFileSize(bytes) {
    let size = bytes;
    let unitIndex = 0;

    while (size >= 1024 && unitIndex < SIZE_UNITS.length - 1) {
      size /= 1024;
      unitIndex++;
    }

    return `${size.toFixed(1)} ${SIZE_UNITS[unitIndex]}`;
  }

  /**
   * Report scan issues and statistics on the console.
   * At most five errors and three warnings are listed; warnings are only
   * listed when NODE_ENV is 'development'.
   * @param {object} scanContext - Context object with scan statistics
   */
  reportScanIssues(scanContext) {
    const { errors, warnings, skippedDirectories, skippedFiles, processedFiles } = scanContext;

    // Report critical errors
    if (errors.length > 0) {
      console.log(chalk.red(`\n⚠️ ${errors.length} scan error(s):`));
      errors.slice(0, 5).forEach(error => {
        console.log(chalk.red(` • ${error}`));
      });
      if (errors.length > 5) {
        console.log(chalk.gray(` ... and ${errors.length - 5} more errors`));
      }
    }

    // Report warnings (less critical)
    if (warnings.length > 0 && process.env.NODE_ENV === 'development') {
      console.log(chalk.yellow(`\n⚠️ ${warnings.length} scan warning(s):`));
      warnings.slice(0, 3).forEach(warning => {
        console.log(chalk.yellow(` • ${warning}`));
      });
      if (warnings.length > 3) {
        console.log(chalk.gray(` ... and ${warnings.length - 3} more warnings`));
      }
    }

    // Report summary statistics
    const totalIssues = errors.length + warnings.length;
    if (skippedFiles > 0 || skippedDirectories > 0 || totalIssues > 0) {
      console.log(chalk.gray(`\n📊 Scan Statistics:`));
      console.log(chalk.gray(` Processed: ${processedFiles} files`));
      if (skippedFiles > 0) {
        console.log(chalk.gray(` Skipped: ${skippedFiles} files`));
      }
      if (skippedDirectories > 0) {
        console.log(chalk.gray(` Skipped: ${skippedDirectories} directories`));
      }
      if (totalIssues > 0) {
        console.log(chalk.gray(` Issues: ${errors.length} errors, ${warnings.length} warnings`));
      }
    }

    // Warn if scan completeness is compromised
    if (errors.length > 0) {
      console.log(chalk.yellow(`\n⚠️ WARNING: Scan may be incomplete due to ${errors.length} access errors.`));
      console.log(chalk.gray(' Some files or directories could not be accessed.'));
    }
  }

  /**
   * Display scan results summary on the console.
   * @param {object} filesByExtension - Files grouped by extension
   */
  displayScanSummary(filesByExtension) {
    const stats = this.calculateStatistics(filesByExtension);
    const extensions = Object.keys(filesByExtension).sort();

    console.log(chalk.cyan('\n📊 Scan Summary:'));
    console.log(chalk.gray(` Extensions found: ${extensions.join(', ')}`));
    console.log(chalk.gray(` Total files: ${stats.totalFiles}`));
    console.log(chalk.gray(` Total size: ${stats.totalSizeFormatted}`));
    console.log();
  }
}
export default Scanner;