UNPKG

agentsqripts

Version:

Comprehensive static code analysis toolkit for identifying technical debt, security vulnerabilities, performance issues, and code quality problems

269 lines (222 loc) 9.69 kB
/**
 * @file Optimized duplicate grouping logic with O(n log n) performance
 * @description High-performance grouping for large codebases
 */
const crypto = require('crypto');
const { calculateSimilarity } = require('./similarityCalculator');
const { calculateEstimatedSavings } = require('./savingsCalculator');
const { WET_CODE_CONFIG, WET_PATTERNS } = require('./wetCodeConfig');

/**
 * Groups duplicate blocks using an optimized hash-based algorithm.
 *
 * Phase 1 buckets blocks by content hash to find exact duplicates in O(n).
 * Phase 2 (optional) pre-filters candidates by coarse characteristics and
 * then runs pairwise similarity checks in bounded batches, with sampling
 * and a comparison budget to avoid O(n²) blowup on large projects.
 *
 * @param {Array} allBlocks - All blocks from all files
 * @param {Object} options - Grouping options
 * @param {number} [options.maxBlocks=50000] - Hard cap on blocks analyzed
 * @param {number} [options.batchSize=500] - Batch size for similarity comparisons
 * @param {boolean} [options.showProgress=true] - Emit progress logs to console
 * @param {number} [options.minSimilarity] - Similarity threshold (defaults from WET_CODE_CONFIG)
 * @param {boolean} [options.skipSimilarity=false] - Skip Phase 2 entirely
 * @param {number} [options.maxSimilarityChecks=5000] - Budget for similarity work
 * @param {boolean} [options.smartSampling=true] - Sample by complexity when over budget
 * @returns {Array} Grouped duplicates with metadata, sorted by impact (descending)
 */
function groupDuplicateBlocksOptimized(allBlocks, options = {}) {
  const {
    maxBlocks = 50000,
    batchSize = 500,
    showProgress = true,
    minSimilarity = WET_CODE_CONFIG.minSimilarityThreshold,
    skipSimilarity = false,
    maxSimilarityChecks = 5000,
    smartSampling = true
  } = options;

  // Limit analysis for extremely large projects
  if (allBlocks.length > maxBlocks) {
    console.warn(`⚠️ Project has ${allBlocks.length} blocks. Analyzing first ${maxBlocks} blocks for performance.`);
    allBlocks = allBlocks.slice(0, maxBlocks);
  }

  const groups = [];
  // Indices (into allBlocks) already assigned to a group.
  const processed = new Set();

  // Phase 1: Group exact duplicates by hash (O(n))
  if (showProgress) console.log(' Phase 1: Finding exact duplicates...');
  const hashGroups = new Map();
  allBlocks.forEach((block, index) => {
    if (!block.hash) {
      // Generate hash if not present. md5 is used purely as a fast content
      // fingerprint here, not for security.
      block.hash = crypto.createHash('md5').update(block.content).digest('hex');
    }
    if (!hashGroups.has(block.hash)) {
      hashGroups.set(block.hash, []);
    }
    hashGroups.get(block.hash).push({ block, index });
  });

  // Process exact duplicate groups
  let exactDuplicatesFound = 0;
  for (const [hash, groupItems] of hashGroups) {
    if (groupItems.length > 1) {
      const blocks = groupItems.map(item => item.block);
      const group = {
        type: 'exact_duplicate',
        blocks,
        similarity: 1.0,
        pattern: WET_PATTERNS['exact_duplicate'],
        hash,
        complexity: blocks[0].complexity,
        deduplicationOpportunity: blocks.length > 3 ? 'HIGH' : 'MEDIUM'
      };
      group.estimatedSavings = calculateEstimatedSavings(group);
      groups.push(group);
      exactDuplicatesFound++;
      // Mark as processed so Phase 2 skips these blocks.
      groupItems.forEach(item => processed.add(item.index));
    }
  }
  if (showProgress) {
    console.log(` ✓ Found ${exactDuplicatesFound} exact duplicate groups`);
  }

  // Phase 2: Find similar blocks (only among unprocessed blocks)
  let similarGroupsFound = 0;
  // Skip similarity checking for very large projects or if explicitly disabled
  const unprocessedCount = allBlocks.filter((_, i) => !processed.has(i)).length;
  if (skipSimilarity || unprocessedCount > 3000) {
    if (showProgress) {
      if (skipSimilarity) {
        console.log(' Phase 2: Skipping similarity analysis (disabled)');
      } else {
        console.log(` Phase 2: Skipping similarity analysis (${unprocessedCount} blocks too large for O(n²) analysis)`);
      }
    }
  } else {
    if (showProgress) console.log(' Phase 2: Finding similar code blocks...');
    const unprocessedBlocks = allBlocks
      .map((block, index) => ({ block, index }))
      .filter(item => !processed.has(item.index));

    // Smart sampling for large datasets
    let blocksToCheck;
    if (smartSampling && unprocessedBlocks.length > maxSimilarityChecks) {
      // Sample high-complexity and diverse blocks for better coverage:
      // 40% high, 40% medium, remainder low complexity.
      const highComplexity = unprocessedBlocks.filter(item => item.block.complexity >= 3);
      const mediumComplexity = unprocessedBlocks.filter(item => item.block.complexity >= 2 && item.block.complexity < 3);
      const lowComplexity = unprocessedBlocks.filter(item => item.block.complexity < 2);
      const sampleSize = Math.min(maxSimilarityChecks, unprocessedBlocks.length);
      const highSample = highComplexity.slice(0, Math.floor(sampleSize * 0.4));
      const mediumSample = mediumComplexity.slice(0, Math.floor(sampleSize * 0.4));
      const lowSample = lowComplexity.slice(0, sampleSize - highSample.length - mediumSample.length);
      blocksToCheck = [...highSample, ...mediumSample, ...lowSample];
      if (showProgress) {
        console.log(` 📊 Smart sampling: ${blocksToCheck.length} blocks (${highSample.length} high, ${mediumSample.length} medium, ${lowSample.length} low complexity)`);
      }
    } else {
      // Standard limitation: just truncate to the budget.
      blocksToCheck = unprocessedBlocks.slice(0, Math.min(unprocessedBlocks.length, maxSimilarityChecks));
      if (blocksToCheck.length < unprocessedBlocks.length && showProgress) {
        console.log(` ⚠️ Checking similarity for first ${blocksToCheck.length} of ${unprocessedBlocks.length} blocks`);
      }
    }

    // Group by similar characteristics for efficient comparison
    const characteristicGroups = groupByCharacteristics(blocksToCheck);
    let totalComparisons = 0;
    // FIX: previously the comparison budget only broke out of the inner batch
    // loop, so analysis resumed with the next characteristic group and the
    // budget was effectively ignored. This flag stops the outer loop too.
    let budgetExhausted = false;

    // Process each characteristic group in batches
    for (const [characteristic, candidates] of characteristicGroups) {
      if (budgetExhausted) break;
      if (candidates.length < 2) continue;
      // Process in batches for memory efficiency
      for (let i = 0; i < candidates.length; i += batchSize) {
        const batch = candidates.slice(i, Math.min(i + batchSize, candidates.length));
        const batchGroups = findSimilarInBatch(batch, minSimilarity, processed);
        batchGroups.forEach(group => {
          group.estimatedSavings = calculateEstimatedSavings(group);
          groups.push(group);
          similarGroupsFound++;
        });
        // Upper bound on pairwise comparisons performed in this batch.
        totalComparisons += batch.length * (batch.length - 1) / 2;
        if (showProgress && i > 0 && i % (batchSize * 5) === 0) {
          console.log(` Processing similarity batch ${Math.floor(i / batchSize)}...`);
        }
        // Stop if we've done too many comparisons (more aggressive limit)
        if (totalComparisons > maxSimilarityChecks * 3) {
          if (showProgress) {
            console.log(` ⚠️ Stopping similarity analysis after ${totalComparisons} comparisons`);
          }
          budgetExhausted = true;
          break;
        }
      }
    }
  }
  if (showProgress) {
    console.log(` ✓ Found ${similarGroupsFound} similar code groups`);
  }

  // Sort by impact: more copies x higher complexity x higher similarity first.
  return groups.sort((a, b) => {
    const scoreA = (a.blocks.length - 1) * a.complexity * (a.similarity || 1);
    const scoreB = (b.blocks.length - 1) * b.complexity * (b.similarity || 1);
    return scoreB - scoreA;
  });
}

/**
 * Group blocks by coarse characteristics (line count, complexity, content
 * length, content prefix hash) so only plausible matches are compared
 * pairwise. Single-item buckets are dropped since they need no comparison.
 *
 * @param {Array<{block: Object, index: number}>} blockItems - Candidate items
 * @returns {Map<string, Array>} Bucket key -> items (only buckets with 2+ items)
 */
function groupByCharacteristics(blockItems) {
  const groups = new Map();
  blockItems.forEach(item => {
    const { block } = item;
    // More refined grouping for better pre-filtering
    const lineGroup = Math.floor(block.lineCount / 3) * 3; // Smaller buckets for lines
    const complexityGroup = Math.floor(block.complexity); // Integer complexity
    const lengthGroup = Math.floor(block.content.length / 100) * 100; // Group by content length
    // Generate partial content hash for similarity pre-filtering
    const firstWords = block.content.split(/\s+/).slice(0, 5).join(' ');
    const contentPrefix = crypto.createHash('md5').update(firstWords).digest('hex').substring(0, 4);
    const key = `${lineGroup}-${complexityGroup}-${lengthGroup}-${contentPrefix}`;
    if (!groups.has(key)) {
      groups.set(key, []);
    }
    groups.get(key).push(item);
  });
  // Filter out single-item groups to reduce comparisons
  const filteredGroups = new Map();
  for (const [key, items] of groups) {
    if (items.length > 1) {
      filteredGroups.set(key, items);
    }
  }
  return filteredGroups;
}

/**
 * Find similar blocks within a batch. Greedy: each unclaimed block anchors a
 * group and absorbs every later block whose similarity meets the threshold.
 *
 * @param {Array<{block: Object, index: number}>} batch - Items to compare
 * @param {number} minSimilarity - Minimum similarity to join a group
 * @param {Set<number>} processed - Global set of claimed block indices (mutated)
 * @returns {Array} Groups containing 2+ similar blocks
 */
function findSimilarInBatch(batch, minSimilarity, processed) {
  const groups = [];
  const batchProcessed = new Set();
  for (let i = 0; i < batch.length; i++) {
    if (batchProcessed.has(i)) continue;
    const { block: currentBlock, index: currentIndex } = batch[i];
    const group = {
      type: 'similar_logic',
      blocks: [currentBlock],
      // FIX: start at 1.0 so the Math.min below tracks the group's actual
      // lowest pairwise similarity. The previous initializer (minSimilarity)
      // made Math.min a no-op, so every group reported exactly minSimilarity.
      similarity: 1.0,
      pattern: WET_PATTERNS['similar_logic'],
      complexity: currentBlock.complexity,
      deduplicationOpportunity: 'LOW'
    };
    // Only compare with remaining items in batch
    for (let j = i + 1; j < batch.length; j++) {
      if (batchProcessed.has(j)) continue;
      const { block: otherBlock, index: otherIndex } = batch[j];
      // Quick checks before expensive similarity calculation
      if (Math.abs(currentBlock.lineCount - otherBlock.lineCount) > 10) continue;
      if (Math.abs(currentBlock.complexity - otherBlock.complexity) > 5) continue;
      const similarity = calculateSimilarity(currentBlock.content, otherBlock.content);
      if (similarity >= minSimilarity) {
        group.blocks.push(otherBlock);
        // Group similarity is the weakest link among its members.
        group.similarity = Math.min(group.similarity, similarity);
        batchProcessed.add(j);
        processed.add(otherIndex);
      }
    }
    // Only include groups with multiple blocks
    if (group.blocks.length > 1) {
      // Cross-file duplication is a stronger dedup signal than count alone.
      const filesInvolved = new Set(group.blocks.map(b => b.file)).size;
      group.deduplicationOpportunity = filesInvolved > 2 ? 'HIGH' : group.blocks.length > 3 ? 'HIGH' : 'MEDIUM';
      groups.push(group);
    }
    batchProcessed.add(i);
    processed.add(currentIndex);
  }
  return groups;
}

module.exports = { groupDuplicateBlocksOptimized };