UNPKG

superaugment

Version:

Enterprise-grade MCP server with world-class C++ analysis, robust error handling, and production-ready architecture for VS Code Augment

631 lines 27.3 kB
/**
 * CUDA Code Analyzer
 *
 * Specialized analyzer for CUDA C/C++ code including kernel analysis,
 * memory management, performance optimization, and BSGS algorithm support.
 */
// import { readFile } from 'fs/promises'; // Unused import
import { FileSystemManager } from '../utils/FileSystemManager.js';
import { logger } from '../utils/logger.js';
import { AnalysisError, ErrorCode, } from '../errors/ErrorTypes.js';

/**
 * CUDA Code Analyzer
 *
 * Line-oriented, heuristic analyzer: every check below works on plain text
 * (substring / regex matching per source line), not on a parsed AST.
 */
export class CudaAnalyzer {
    fileSystemManager;

    constructor() {
        this.fileSystemManager = new FileSystemManager();
    }

    /**
     * Analyze CUDA file
     *
     * Reads the file through the FileSystemManager, runs every analysis pass
     * over its lines and returns the populated result object.
     *
     * @param {string} filePath - Path of the CUDA source file to analyze.
     * @returns {Promise<object>} Result with `summary`, `kernels`,
     *   `deviceFunctions`, `memoryOperations`, `optimizations`, `performance`,
     *   `issues`, `recommendations` and, when BSGS is detected, `bsgs`.
     * @throws {AnalysisError} Wraps any failure with ErrorCode.ANALYSIS_FAILED.
     */
    async analyzeFile(filePath) {
        try {
            logger.info(`Starting CUDA analysis for: ${filePath}`);
            const content = await this.fileSystemManager.readFileContent(filePath);
            const lines = content.split('\n');
            // Initialize result
            const result = {
                summary: {
                    totalKernels: 0,
                    totalDeviceFunctions: 0,
                    memoryTransfers: 0,
                    sharedMemoryUsage: 0,
                    registerUsage: 0,
                },
                kernels: [],
                deviceFunctions: [],
                memoryOperations: [],
                optimizations: [],
                performance: {
                    occupancy: { theoretical: 0, achieved: 0, limitingFactor: 'blocks', recommendations: [] },
                    memoryBandwidth: { utilization: 0, bottlenecks: [], recommendations: [] },
                    computeIntensity: { ratio: 0, classification: 'balanced', recommendations: [] },
                },
                issues: [],
                recommendations: [],
            };
            // Analyze different aspects. Order matters: performance and
            // optimization passes read what the earlier passes collected.
            await this.analyzeKernels(lines, result);
            await this.analyzeDeviceFunctions(lines, result);
            await this.analyzeMemoryOperations(lines, result);
            await this.analyzeBsgsPatterns(lines, result);
            await this.analyzePerformance(lines, result);
            await this.detectIssues(lines, result);
            await this.generateOptimizations(result);
            // Update summary
            result.summary.totalKernels = result.kernels.length;
            result.summary.totalDeviceFunctions = result.deviceFunctions.length;
            result.summary.memoryTransfers = result.memoryOperations.length;
            logger.info(`CUDA analysis completed for: ${filePath}`);
            return result;
        }
        catch (error) {
            throw new AnalysisError(`CUDA analysis failed: ${error instanceof Error ? error.message : 'Unknown error'}`, ErrorCode.ANALYSIS_FAILED, { additionalInfo: { filePath } }, error instanceof Error ? error : undefined);
        }
    }

    /**
     * Analyze CUDA kernels
     *
     * Records every `__global__` definition found on a single line and, for
     * each `name<<<...>>>` launch, copies the launch configuration onto the
     * already-recorded kernel of the same name.
     *
     * @param {string[]} lines - Source lines.
     * @param {object} result - Analysis result; `result.kernels` is appended to.
     */
    async analyzeKernels(lines, result) {
        // No `g` flag: these regexes are applied to a different string on every
        // iteration, and a global regex would carry `lastIndex` from one line
        // into the next and skip matches.
        const kernelRegex = /__global__\s+\w+\s+(\w+)\s*\([^)]*\)/;
        // Whitespace is allowed between the kernel name and `<<<`.
        const launchRegex = /(\w+)\s*<<<([^>]+)>>>/;
        lines.forEach((line, index) => {
            const trimmed = line.trim();
            // Find kernel definitions
            const kernelMatch = kernelRegex.exec(trimmed);
            if (kernelMatch) {
                const kernel = {
                    name: kernelMatch[1] || 'unknown_kernel',
                    line: index + 1,
                    column: line.indexOf(kernelMatch[0]),
                    gridDim: { x: '0', isDynamic: false },
                    blockDim: { x: '0', isDynamic: false },
                    sharedMemory: 0,
                    parameters: this.parseKernelParameters(kernelMatch[0]),
                    complexity: 0,
                    memoryAccess: [],
                    syncPoints: [],
                    estimatedOccupancy: 0,
                    registerPressure: 0,
                    memoryCoalescing: { score: 0, issues: [], recommendations: [] },
                    isBsgsKernel: this.detectBsgsKernel(kernelMatch[1] || 'unknown', lines),
                };
                if (kernel.isBsgsKernel) {
                    kernel.bsgsCharacteristics = this.analyzeBsgsKernel(kernelMatch[1] || 'unknown', lines);
                }
                result.kernels.push(kernel);
            }
            // Find kernel launches
            const launchMatch = launchRegex.exec(trimmed);
            if (launchMatch) {
                const kernelName = launchMatch[1] || 'unknown';
                const launchConfig = launchMatch[2] || '';
                // Find corresponding kernel and update launch configuration
                const kernel = result.kernels.find(k => k.name === kernelName);
                if (kernel) {
                    this.parseLaunchConfiguration(launchConfig, kernel);
                }
            }
        });
    }

    /**
     * Analyze device functions
     *
     * Records every single-line `__device__` function signature.
     *
     * @param {string[]} lines - Source lines.
     * @param {object} result - Analysis result; `result.deviceFunctions` is appended to.
     */
    async analyzeDeviceFunctions(lines, result) {
        // Non-global on purpose; see analyzeKernels for the lastIndex pitfall.
        const deviceFuncRegex = /__device__\s+(?:__inline__)?\s*\w+\s+(\w+)\s*\([^)]*\)/;
        lines.forEach((line, index) => {
            const trimmed = line.trim();
            const match = deviceFuncRegex.exec(trimmed);
            if (match) {
                const deviceFunc = {
                    name: match[1] || 'unknown_device_func',
                    line: index + 1,
                    column: line.indexOf(match[0]),
                    isInline: trimmed.includes('__inline__'),
                    parameters: this.parseKernelParameters(match[0]),
                    calledBy: [],
                };
                result.deviceFunctions.push(deviceFunc);
            }
        });
    }

    /**
     * Analyze memory operations
     *
     * Records one entry per (line, operation) pair for the CUDA runtime calls
     * listed below. Names are matched as whole identifiers, so
     * `cudaMemcpyAsync` or the `cudaMemcpyHostToDevice` constant are not
     * additionally counted as a plain `cudaMemcpy`.
     *
     * @param {string[]} lines - Source lines.
     * @param {object} result - Analysis result; `result.memoryOperations` is appended to.
     */
    async analyzeMemoryOperations(lines, result) {
        const memoryOps = [
            'cudaMalloc', 'cudaFree', 'cudaMemcpy', 'cudaMemcpyAsync', 'cudaMemset'
        ];
        // Built once, outside the per-line loop. The names are plain
        // identifiers, so no regex escaping is required.
        const opPatterns = memoryOps.map(op => ({ op, pattern: new RegExp(`\\b${op}\\b`) }));
        lines.forEach((line, index) => {
            const trimmed = line.trim();
            opPatterns.forEach(({ op, pattern }) => {
                const found = pattern.exec(line);
                if (!found) {
                    return;
                }
                const memOp = {
                    type: op,
                    line: index + 1,
                    column: found.index,
                    isAsync: op.includes('Async'),
                };
                // Parse additional details based on operation type
                if (op === 'cudaMemcpy' || op === 'cudaMemcpyAsync') {
                    memOp.direction = this.parseMemcpyDirection(trimmed);
                    memOp.size = this.parseMemcpySize(trimmed);
                }
                result.memoryOperations.push(memOp);
            });
        });
    }

    /**
     * Analyze BSGS patterns
     *
     * Counts keyword hits (case-insensitive, one per keyword per line) and,
     * when more than three are found, attaches a `bsgs` section to the result.
     *
     * @param {string[]} lines - Source lines.
     * @param {object} result - Analysis result; `result.bsgs` may be set.
     */
    async analyzeBsgsPatterns(lines, result) {
        const bsgsKeywords = [
            'baby_step', 'giant_step', 'bsgs', 'discrete_log',
            'collision', 'hash_table', 'lookup_table', 'precompute', 'sqrt'
        ];
        let bsgsScore = 0;
        lines.forEach((line) => {
            const lowerLine = line.toLowerCase();
            bsgsKeywords.forEach(keyword => {
                if (lowerLine.includes(keyword)) {
                    bsgsScore++;
                }
            });
        });
        if (bsgsScore > 3) { // Threshold for BSGS detection
            result.bsgs = {
                isImplemented: true,
                algorithm: this.detectBsgsAlgorithm(lines),
                characteristics: this.analyzeBsgsCharacteristics(lines),
                optimizations: this.generateBsgsOptimizations(lines),
                performance: this.analyzeBsgsPerformance(lines),
            };
        }
    }

    /**
     * Analyze performance characteristics
     *
     * Fills `result.performance` from the kernels and memory operations
     * collected by the earlier passes plus a text scan of the source.
     *
     * @param {string[]} lines - Source lines.
     * @param {object} result - Analysis result; `result.performance` is replaced.
     */
    async analyzePerformance(lines, result) {
        // Analyze occupancy
        result.performance.occupancy = this.analyzeOccupancy(result.kernels);
        // Analyze memory bandwidth
        result.performance.memoryBandwidth = this.analyzeMemoryBandwidth(result.memoryOperations);
        // Analyze compute intensity
        result.performance.computeIntensity = this.analyzeComputeIntensity(lines);
    }

    /**
     * Detect common CUDA issues
     *
     * Per-line heuristics only: a line mentioning `__shared__` without
     * `__syncthreads` on the same line is reported as a potential race, and
     * lines matching detectUncoalescedAccess are reported as potentially
     * uncoalesced.
     *
     * @param {string[]} lines - Source lines.
     * @param {object} result - Analysis result; `result.issues` is appended to.
     */
    async detectIssues(lines, result) {
        lines.forEach((line, index) => {
            const trimmed = line.trim();
            // Check for race conditions
            if (trimmed.includes('__shared__') && !trimmed.includes('__syncthreads')) {
                result.issues.push({
                    type: 'race_condition',
                    severity: 'high',
                    line: index + 1,
                    column: 0,
                    description: 'Potential race condition with shared memory access',
                    fix: 'Add __syncthreads() after shared memory writes',
                });
            }
            // Check for uncoalesced memory access
            if (this.detectUncoalescedAccess(trimmed)) {
                result.issues.push({
                    type: 'uncoalesced_access',
                    severity: 'medium',
                    line: index + 1,
                    column: 0,
                    description: 'Potentially uncoalesced global memory access',
                    fix: 'Ensure contiguous memory access patterns',
                });
            }
        });
    }

    /**
     * Generate optimization recommendations
     *
     * @param {object} result - Analysis result; `result.optimizations` is appended to.
     */
    async generateOptimizations(result) {
        // Memory optimizations
        if (result.memoryOperations.length > 10) {
            result.optimizations.push({
                type: 'memory',
                priority: 'high',
                description: 'Consider using unified memory or memory pools',
                benefit: 'Reduced memory allocation overhead',
                effort: 'medium',
                locations: result.memoryOperations.map(op => op.line),
            });
        }
        // Occupancy optimizations
        // NOTE(review): nothing in this class ever writes `estimatedOccupancy`
        // after it is initialised to 0 in analyzeKernels, so every kernel is
        // currently treated as low-occupancy here - confirm intended source.
        const lowOccupancyKernels = result.kernels.filter(k => k.estimatedOccupancy < 50);
        if (lowOccupancyKernels.length > 0) {
            result.optimizations.push({
                type: 'occupancy',
                priority: 'high',
                description: 'Optimize kernel launch configuration for better occupancy',
                benefit: 'Improved GPU utilization',
                effort: 'low',
                locations: lowOccupancyKernels.map(k => k.line),
            });
        }
    }

    // Helper methods

    /**
     * Extract the parameter list from a function signature.
     *
     * The last whitespace-separated token of each parameter is taken as its
     * name and everything before it as its type. Pointer/reference markers
     * written against the name (`int *data`) are moved onto the type so that
     * `isPointer` is reported correctly.
     *
     * @param {string} kernelSignature - Text containing `( ... )`.
     * @returns {Array<object>} One descriptor per parameter; empty when the
     *   signature has no parenthesised, non-blank parameter list.
     */
    parseKernelParameters(kernelSignature) {
        // Extract parameters from kernel signature
        const paramMatch = kernelSignature.match(/\(([^)]*)\)/);
        if (!paramMatch || !paramMatch[1])
            return [];
        const paramStr = paramMatch[1].trim();
        if (!paramStr)
            return [];
        const params = paramStr.split(',').map(p => p.trim());
        return params.map(param => {
            const parts = param.split(/\s+/);
            let name = parts[parts.length - 1] || 'unknown';
            let type = parts.slice(0, -1).join(' ') || 'unknown';
            // `int *data` tokenises as ['int', '*data']: the declarator
            // symbols belong to the type, not to the identifier.
            const declarator = /^([*&]+)(\w.*)$/.exec(name);
            if (declarator && type !== 'unknown') {
                type = `${type}${declarator[1]}`;
                name = declarator[2];
            }
            return {
                name,
                type,
                isPointer: type.includes('*'),
                isConst: type.includes('const'),
                isRestrict: type.includes('__restrict__'),
                memorySpace: type.includes('__shared__') ? 'shared' : 'global',
            };
        });
    }

    /**
     * Split text on commas that are not nested inside parentheses.
     *
     * @param {string} text - Comma-separated expression list.
     * @returns {string[]} Trimmed top-level segments (always at least one).
     */
    splitTopLevel(text) {
        const segments = [];
        let depth = 0;
        let current = '';
        for (const ch of text) {
            if (ch === '(') {
                depth++;
            }
            else if (ch === ')') {
                // Clamp so an unbalanced ')' cannot hide later separators.
                depth = Math.max(0, depth - 1);
            }
            if (ch === ',' && depth === 0) {
                segments.push(current.trim());
                current = '';
            }
            else {
                current += ch;
            }
        }
        segments.push(current.trim());
        return segments;
    }

    /**
     * Copy the first two launch arguments (grid, block) onto a kernel record.
     *
     * Arguments are split on top-level commas only, so `dim3(2, 2), 256` is
     * read as grid `dim3(2, 2)` and block `256`. Nothing is changed when fewer
     * than two arguments are present.
     *
     * @param {string} config - Text between `<<<` and `>>>`.
     * @param {object} kernel - Kernel record whose `gridDim`/`blockDim` are set.
     */
    parseLaunchConfiguration(config, kernel) {
        const parts = this.splitTopLevel(config);
        if (parts.length >= 2) {
            kernel.gridDim = { x: parts[0] || '1', isDynamic: (parts[0] || '').includes('(') };
            kernel.blockDim = { x: parts[1] || '1', isDynamic: (parts[1] || '').includes('(') };
        }
    }

    /**
     * Classify a memcpy line by the transfer-kind constant it mentions.
     *
     * @param {string} line - Source line containing the call.
     * @returns {string} 'H2D', 'D2H' or 'D2D'; 'H2D' when none is present.
     */
    parseMemcpyDirection(line) {
        if (line.includes('cudaMemcpyHostToDevice'))
            return 'H2D';
        if (line.includes('cudaMemcpyDeviceToHost'))
            return 'D2H';
        if (line.includes('cudaMemcpyDeviceToDevice'))
            return 'D2D';
        return 'H2D'; // default
    }

    /**
     * Extract the third argument (size) of a cudaMemcpy-style call.
     *
     * @param {string} line - Source line containing the call.
     * @returns {string} Trimmed size expression, or 'unknown' when not found.
     */
    parseMemcpySize(line) {
        // Extract size parameter from cudaMemcpy call
        const match = line.match(/cudaMemcpy[^(]*\([^,]*,[^,]*,([^,]*),/);
        return match && match[1] ? match[1].trim() : 'unknown';
    }

    /**
     * Decide whether a kernel is BSGS-related.
     *
     * True when any pattern occurs in the kernel name or anywhere in the
     * file's text (case-insensitive).
     *
     * @param {string} kernelName - Kernel identifier.
     * @param {string[]} lines - Source lines of the whole file.
     * @returns {boolean}
     */
    detectBsgsKernel(kernelName, lines) {
        const bsgsPatterns = ['bsgs', 'baby', 'giant', 'step', 'discrete_log'];
        const kernelNameLower = kernelName.toLowerCase();
        const codeContent = lines.join('\n').toLowerCase();
        return bsgsPatterns.some(pattern => kernelNameLower.includes(pattern) || codeContent.includes(pattern));
    }

    /**
     * Derive BSGS characteristics for a kernel from keyword matches.
     *
     * @param {string} kernelName - Kernel identifier.
     * @param {string[]} lines - Source lines of the whole file.
     * @returns {{phase: string, memoryPattern: string, computeIntensity: string, synchronizationNeeds: string}}
     */
    analyzeBsgsKernel(kernelName, lines) {
        const content = lines.join('\n').toLowerCase();
        const kernelLower = kernelName.toLowerCase();
        // Determine phase based on kernel name and content
        let phase = 'preprocessing';
        if (kernelLower.includes('baby') || content.includes('baby'))
            phase = 'baby_steps';
        else if (kernelLower.includes('giant') || content.includes('giant'))
            phase = 'giant_steps';
        else if (kernelLower.includes('collision') || content.includes('collision'))
            phase = 'collision_detection';
        // Analyze memory pattern
        let memoryPattern = 'random';
        if (content.includes('sequential') || content.includes('coalesced'))
            memoryPattern = 'sequential';
        else if (content.includes('stride'))
            memoryPattern = 'strided';
        // Determine compute intensity
        let computeIntensity = 'medium';
        const mathOps = (content.match(/\*|\/|\+|\-|pow|sqrt|exp/g) || []).length;
        if (mathOps > 20)
            computeIntensity = 'high';
        else if (mathOps < 5)
            computeIntensity = 'low';
        // Analyze synchronization needs
        let synchronizationNeeds = 'none';
        if (content.includes('__syncthreads'))
            synchronizationNeeds = 'block';
        else if (content.includes('cudadevicesynchronize'))
            synchronizationNeeds = 'grid';
        return {
            phase,
            memoryPattern,
            computeIntensity,
            synchronizationNeeds,
        };
    }

    /**
     * Name the discrete-log algorithm suggested by keywords in the source.
     *
     * @param {string[]} lines - Source lines.
     * @returns {string} 'baby_step_giant_step', 'pollard_rho', 'pohlig_hellman' or 'other'.
     */
    detectBsgsAlgorithm(lines) {
        // Analyze code patterns to determine specific algorithm
        const content = lines.join('\n').toLowerCase();
        if (content.includes('baby') && content.includes('giant')) {
            return 'baby_step_giant_step';
        }
        else if (content.includes('pollard') || content.includes('rho')) {
            return 'pollard_rho';
        }
        else if (content.includes('pohlig') || content.includes('hellman')) {
            return 'pohlig_hellman';
        }
        return 'other';
    }

    /**
     * Extract BSGS step counts and structural hints from the source text.
     *
     * @param {string[]} lines - Source lines.
     * @returns {{babySteps: number, giantSteps: number, memoryUsage: string, parallelization: string, dataStructure: string}}
     */
    analyzeBsgsCharacteristics(lines) {
        const content = lines.join('\n').toLowerCase();
        // Extract baby steps and giant steps counts
        let babySteps = 0;
        let giantSteps = 0;
        const babyMatch = content.match(/baby.*steps?\s*[=:]\s*(\d+)/);
        const giantMatch = content.match(/giant.*steps?\s*[=:]\s*(\d+)/);
        if (babyMatch && babyMatch[1])
            babySteps = Number.parseInt(babyMatch[1], 10);
        if (giantMatch && giantMatch[1])
            giantSteps = Number.parseInt(giantMatch[1], 10);
        // Detect parallelization level (later checks override earlier ones)
        let parallelization = 'none';
        if (content.includes('threadidx'))
            parallelization = 'thread_level';
        if (content.includes('blockidx'))
            parallelization = 'block_level';
        if (content.includes('griddim'))
            parallelization = 'grid_level';
        // Detect data structure (later checks override earlier ones)
        let dataStructure = 'other';
        if (content.includes('hash') || content.includes('unordered_map'))
            dataStructure = 'hash_table';
        if (content.includes('sort') || content.includes('binary_search'))
            dataStructure = 'sorted_array';
        if (content.includes('tree') || content.includes('bst'))
            dataStructure = 'binary_tree';
        return {
            babySteps,
            giantSteps,
            memoryUsage: `${Math.max(babySteps, giantSteps) * 8} bytes (estimated)`,
            parallelization,
            dataStructure,
        };
    }

    /**
     * Suggest BSGS-specific optimizations based on keyword presence/absence.
     *
     * @param {string[]} lines - Source lines.
     * @returns {Array<{type: string, description: string, benefit: string, implementation: string}>}
     */
    generateBsgsOptimizations(lines) {
        const content = lines.join('\n').toLowerCase();
        const optimizations = [];
        // Memory layout optimization
        if (!content.includes('coalesced') && content.includes('global')) {
            optimizations.push({
                type: 'memory_layout',
                description: 'Use coalesced memory access for baby steps table',
                benefit: 'Improved memory bandwidth utilization by 2-4x',
                implementation: 'Reorganize data structure for sequential access patterns',
            });
        }
        // Parallelization optimization
        if (!content.includes('shared') && content.includes('collision')) {
            optimizations.push({
                type: 'parallelization',
                description: 'Implement parallel collision detection using shared memory',
                benefit: 'Reduced computation time by utilizing block-level parallelism',
                implementation: 'Use __shared__ memory for collision detection and __syncthreads()',
            });
        }
        // Algorithm optimization
        if (content.includes('linear') && !content.includes('binary')) {
            optimizations.push({
                type: 'algorithm',
                description: 'Replace linear search with binary search',
                benefit: 'Logarithmic search complexity instead of linear',
                implementation: 'Sort baby steps table and use binary search for collision detection',
            });
        }
        // Data structure optimization
        if (!content.includes('texture') && content.includes('lookup')) {
            optimizations.push({
                type: 'data_structure',
                description: 'Use texture memory for read-only lookup tables',
                benefit: 'Better cache performance and reduced memory latency',
                implementation: 'Bind lookup tables to texture memory with cudaBindTexture',
            });
        }
        return optimizations;
    }

    /**
     * Produce keyword-driven performance estimates for a BSGS implementation.
     *
     * @param {string[]} lines - Source lines.
     * @returns {{estimatedSpeedup: number, memoryEfficiency: number, scalability: string, bottlenecks: string[]}}
     */
    analyzeBsgsPerformance(lines) {
        const content = lines.join('\n').toLowerCase();
        const bottlenecks = [];
        // Analyze potential bottlenecks
        if (content.includes('global') && !content.includes('coalesced')) {
            bottlenecks.push('Uncoalesced global memory access');
        }
        if (content.includes('hash') && content.includes('collision')) {
            bottlenecks.push('Hash table collisions');
        }
        if (content.includes('atomic') && !content.includes('shared')) {
            bottlenecks.push('Atomic operations on global memory');
        }
        if (!content.includes('async') && content.includes('memcpy')) {
            bottlenecks.push('Synchronous memory transfers');
        }
        // Estimate performance characteristics
        let estimatedSpeedup = 1.0;
        let memoryEfficiency = 50;
        let scalability = 'fair';
        // Calculate speedup based on parallelization
        if (content.includes('threadidx'))
            estimatedSpeedup *= 32; // Warp-level parallelism
        if (content.includes('blockidx'))
            estimatedSpeedup *= 4; // Block-level parallelism
        if (content.includes('shared'))
            estimatedSpeedup *= 1.5; // Shared memory usage
        // Calculate memory efficiency
        if (content.includes('coalesced'))
            memoryEfficiency += 30;
        if (content.includes('shared'))
            memoryEfficiency += 20;
        if (content.includes('texture'))
            memoryEfficiency += 15;
        memoryEfficiency = Math.min(100, memoryEfficiency);
        // Determine scalability
        if (estimatedSpeedup > 50 && memoryEfficiency > 80)
            scalability = 'excellent';
        else if (estimatedSpeedup > 20 && memoryEfficiency > 60)
            scalability = 'good';
        else if (estimatedSpeedup > 5 && memoryEfficiency > 40)
            scalability = 'fair';
        else
            scalability = 'poor';
        return {
            estimatedSpeedup: Math.round(estimatedSpeedup * 10) / 10,
            memoryEfficiency,
            scalability,
            bottlenecks,
        };
    }

    /**
     * Estimate average occupancy across kernels with a simplified model.
     *
     * Each kernel starts at 100 and is scaled down for register pressure
     * above 32, shared memory above 32768 bytes and block sizes outside
     * 128..512. A block size that does not parse to a non-zero integer is
     * treated as 256.
     *
     * @param {Array<object>} kernels - Kernel records from analyzeKernels.
     * @returns {{theoretical: number, achieved: number, limitingFactor: string, recommendations: string[]}}
     */
    analyzeOccupancy(kernels) {
        if (kernels.length === 0) {
            return {
                theoretical: 0,
                achieved: 0,
                limitingFactor: 'blocks',
                recommendations: ['No kernels found to analyze'],
            };
        }
        let totalOccupancy = 0;
        let limitingFactor = 'blocks';
        const recommendations = [];
        for (const kernel of kernels) {
            // Estimate occupancy based on launch configuration.
            // `||` is intentional: both NaN and the '0' placeholder fall back.
            const blockSize = Number.parseInt(kernel.blockDim.x, 10) || 256;
            // const warpsPerBlock = Math.ceil(blockSize / 32); // For future use
            // Simplified occupancy calculation
            let occupancy = 100;
            // Register pressure (estimated)
            if (kernel.registerPressure > 32) {
                occupancy *= 0.5;
                limitingFactor = 'registers';
                recommendations.push(`Reduce register usage in ${kernel.name} (estimated: ${kernel.registerPressure})`);
            }
            // Shared memory usage
            if (kernel.sharedMemory > 32768) { // 32KB typical limit
                occupancy *= 0.7;
                limitingFactor = 'shared_memory';
                recommendations.push(`Optimize shared memory usage in ${kernel.name} (${kernel.sharedMemory} bytes)`);
            }
            // Block size optimization
            if (blockSize < 128 || blockSize > 512) {
                occupancy *= 0.8;
                recommendations.push(`Optimize block size for ${kernel.name} (current: ${blockSize})`);
            }
            totalOccupancy += occupancy;
        }
        const avgOccupancy = totalOccupancy / kernels.length;
        // Add general recommendations
        if (avgOccupancy < 50) {
            recommendations.push('Consider using CUDA Occupancy Calculator for optimization');
            recommendations.push('Profile with nvprof or Nsight Compute for detailed analysis');
        }
        return {
            theoretical: 100,
            achieved: Math.round(avgOccupancy),
            limitingFactor,
            recommendations: [...new Set(recommendations)], // Remove duplicates
        };
    }

    /**
     * Score memory-transfer behaviour from the recorded memory operations.
     *
     * Starts from a baseline of 80 and subtracts for mostly-synchronous
     * transfers, mostly-small transfers and any blocking `cudaMemcpy`; the
     * score is clamped to 0..100.
     *
     * @param {Array<object>} memOps - Entries from analyzeMemoryOperations.
     * @returns {{utilization: number, bottlenecks: string[], recommendations: string[]}}
     */
    analyzeMemoryBandwidth(memOps) {
        if (memOps.length === 0) {
            return {
                utilization: 0,
                bottlenecks: ['No memory operations found'],
                recommendations: ['Add memory operations analysis'],
            };
        }
        const bottlenecks = [];
        const recommendations = [];
        let utilization = 80; // Start with baseline
        // Analyze memory operations
        const asyncOps = memOps.filter(op => op.isAsync).length;
        const syncOps = memOps.length - asyncOps;
        if (syncOps > asyncOps) {
            bottlenecks.push('Excessive synchronous memory transfers');
            recommendations.push('Use asynchronous memory transfers (cudaMemcpyAsync)');
            utilization -= 20;
        }
        // Check for small transfers (a non-numeric size parses to NaN and is
        // therefore not counted by the numeric comparison)
        const smallTransfers = memOps.filter(op => op.size &&
            (op.size.includes('sizeof') || Number.parseInt(op.size, 10) < 1024)).length;
        if (smallTransfers > memOps.length * 0.5) {
            bottlenecks.push('Many small memory transfers');
            recommendations.push('Batch small transfers into larger operations');
            utilization -= 15;
        }
        // Check for H2D/D2H patterns
        const h2dOps = memOps.filter(op => op.direction === 'H2D').length;
        const d2hOps = memOps.filter(op => op.direction === 'D2H').length;
        if (h2dOps > 0 && d2hOps > 0) {
            recommendations.push('Consider using unified memory for bidirectional transfers');
        }
        // Memory access pattern analysis (simplified)
        if (memOps.some(op => op.type === 'cudaMemcpy' && !op.isAsync)) {
            bottlenecks.push('Blocking memory transfers');
            recommendations.push('Use streams to overlap computation and memory transfers');
            utilization -= 10;
        }
        utilization = Math.max(0, Math.min(100, utilization));
        return {
            utilization,
            bottlenecks,
            recommendations,
        };
    }

    /**
     * Classify the source as memory-bound, compute-bound or balanced.
     *
     * The ratio is a character/keyword count heuristic: arithmetic symbols and
     * math-function names versus brackets and memory-related keywords.
     *
     * @param {string[]} lines - Source lines.
     * @returns {{ratio: number, classification: string, recommendations: string[]}}
     */
    analyzeComputeIntensity(lines) {
        const content = lines.join('\n').toLowerCase();
        // Count arithmetic operations (simplified)
        const arithmeticOps = (content.match(/[+\-*/]/g) || []).length;
        const mathFunctions = (content.match(/\b(sin|cos|tan|exp|log|sqrt|pow)\b/g) || []).length;
        const totalFlops = arithmeticOps + mathFunctions * 10; // Math functions are more expensive
        // Count memory operations
        const memoryOps = (content.match(/\[|\]|cudamemcpy|global|shared/g) || []).length;
        const bytesPerOp = 4; // Assume 4-byte operations
        const totalBytes = memoryOps * bytesPerOp;
        // Calculate compute intensity (FLOPs per byte)
        const ratio = totalBytes > 0 ? totalFlops / totalBytes : 0;
        let classification;
        const recommendations = [];
        if (ratio < 1.0) {
            classification = 'memory_bound';
            recommendations.push('Increase arithmetic intensity by fusing operations');
            recommendations.push('Use shared memory to reduce global memory accesses');
            recommendations.push('Consider loop unrolling to increase compute per memory access');
        }
        else if (ratio > 4.0) {
            classification = 'compute_bound';
            recommendations.push('Optimize arithmetic operations');
            recommendations.push('Use faster math functions (__sinf, __cosf, etc.)');
            recommendations.push('Consider reducing precision if acceptable');
        }
        else {
            classification = 'balanced';
            recommendations.push('Good balance between compute and memory operations');
            recommendations.push('Focus on occupancy optimization');
        }
        // Add specific BSGS recommendations
        if (content.includes('bsgs') || content.includes('baby') || content.includes('giant')) {
            if (classification === 'memory_bound') {
                recommendations.push('BSGS: Use shared memory for baby steps table');
                recommendations.push('BSGS: Implement collision detection in shared memory');
            }
        }
        return {
            ratio: Math.round(ratio * 100) / 100,
            classification,
            recommendations,
        };
    }

    /**
     * Flag a line as potentially uncoalesced: it contains an index bracket
     * together with a `*` or `+` anywhere on the line.
     *
     * @param {string} line - Source line.
     * @returns {boolean}
     */
    detectUncoalescedAccess(line) {
        // Simplified detection of potentially uncoalesced access
        return line.includes('[') && (line.includes('*') || line.includes('+'));
    }
}
//# sourceMappingURL=CudaAnalyzer.js.map