agentsqripts
Comprehensive static code analysis toolkit for identifying technical debt, security vulnerabilities, performance issues, and code quality problems
269 lines (222 loc) • 9.69 kB
JavaScript
/**
 * @file Optimized duplicate grouping logic: O(n) hash-based exact-duplicate
 * detection plus a budgeted, bucketed similarity search
 * @description High-performance grouping for large codebases
 */
const crypto = require('crypto');
const { calculateSimilarity } = require('./similarityCalculator');
const { calculateEstimatedSavings } = require('./savingsCalculator');
const { WET_CODE_CONFIG, WET_PATTERNS } = require('./wetCodeConfig');
/**
* Groups duplicate blocks using optimized hash-based algorithm
* @param {Array} allBlocks - All blocks from all files
* @param {Object} options - Grouping options
* @returns {Array} Grouped duplicates with metadata
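 * @example
 * // a hedged sketch: assumes block objects shaped like
 * // { file, content, lineCount, complexity, hash? }
 * const groups = groupDuplicateBlocksOptimized(allBlocks, { showProgress: false });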
*/
function groupDuplicateBlocksOptimized(allBlocks, options = {}) {
const {
maxBlocks = 50000,
batchSize = 500,
showProgress = true,
minSimilarity = WET_CODE_CONFIG.minSimilarityThreshold,
skipSimilarity = false,
maxSimilarityChecks = 5000,
smartSampling = true
} = options;
// Limit analysis for extremely large projects
if (allBlocks.length > maxBlocks) {
console.warn(`⚠️ Project has ${allBlocks.length} blocks. Analyzing first ${maxBlocks} blocks for performance.`);
allBlocks = allBlocks.slice(0, maxBlocks);
}
const groups = [];
const processed = new Set();
// Phase 1: Group exact duplicates by hash (O(n))
if (showProgress) console.log(' Phase 1: Finding exact duplicates...');
const hashGroups = new Map();
allBlocks.forEach((block, index) => {
if (!block.hash) {
      // Generate a content fingerprint if not precomputed (md5 for speed, not security)
block.hash = crypto.createHash('md5').update(block.content).digest('hex');
}
if (!hashGroups.has(block.hash)) {
hashGroups.set(block.hash, []);
}
hashGroups.get(block.hash).push({ block, index });
});
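  // hashGroups now maps each content hash to its [{ block, index }] bucket;
  // any bucket holding 2+ entries is an exact-duplicate set.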
// Process exact duplicate groups
let exactDuplicatesFound = 0;
for (const [hash, groupItems] of hashGroups) {
if (groupItems.length > 1) {
const blocks = groupItems.map(item => item.block);
const group = {
type: 'exact_duplicate',
blocks,
similarity: 1.0,
pattern: WET_PATTERNS['exact_duplicate'],
hash,
complexity: blocks[0].complexity,
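        // 4+ identical copies make a high-value extraction target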
deduplicationOpportunity: blocks.length > 3 ? 'HIGH' : 'MEDIUM'
};
group.estimatedSavings = calculateEstimatedSavings(group);
groups.push(group);
exactDuplicatesFound++;
// Mark as processed
groupItems.forEach(item => processed.add(item.index));
}
}
if (showProgress) {
console.log(` ✓ Found ${exactDuplicatesFound} exact duplicate groups`);
}
// Phase 2: Find similar blocks (only among unprocessed blocks)
let similarGroupsFound = 0;
// Skip similarity checking for very large projects or if explicitly disabled
  const unprocessedCount = allBlocks.length - processed.size;
if (skipSimilarity || unprocessedCount > 3000) {
if (showProgress) {
if (skipSimilarity) {
console.log(' Phase 2: Skipping similarity analysis (disabled)');
} else {
console.log(` Phase 2: Skipping similarity analysis (${unprocessedCount} blocks too large for O(n²) analysis)`);
}
}
} else {
if (showProgress) console.log(' Phase 2: Finding similar code blocks...');
const unprocessedBlocks = allBlocks
.map((block, index) => ({ block, index }))
.filter(item => !processed.has(item.index));
// Smart sampling for large datasets
let blocksToCheck;
if (smartSampling && unprocessedBlocks.length > maxSimilarityChecks) {
// Sample high-complexity and diverse blocks for better coverage
const highComplexity = unprocessedBlocks.filter(item => item.block.complexity >= 3);
const mediumComplexity = unprocessedBlocks.filter(item => item.block.complexity >= 2 && item.block.complexity < 3);
const lowComplexity = unprocessedBlocks.filter(item => item.block.complexity < 2);
const sampleSize = Math.min(maxSimilarityChecks, unprocessedBlocks.length);
const highSample = highComplexity.slice(0, Math.floor(sampleSize * 0.4));
const mediumSample = mediumComplexity.slice(0, Math.floor(sampleSize * 0.4));
const lowSample = lowComplexity.slice(0, sampleSize - highSample.length - mediumSample.length);
blocksToCheck = [...highSample, ...mediumSample, ...lowSample];
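      // Deterministic ~40/40/20 split by complexity tier (prefix slices, not
      // random sampling); if a tier runs short, the total may fall below sampleSize.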
if (showProgress) {
console.log(` 📊 Smart sampling: ${blocksToCheck.length} blocks (${highSample.length} high, ${mediumSample.length} medium, ${lowSample.length} low complexity)`);
}
} else {
      // No sampling: just cap at the first maxSimilarityChecks blocks (slice clamps)
      blocksToCheck = unprocessedBlocks.slice(0, maxSimilarityChecks);
if (blocksToCheck.length < unprocessedBlocks.length && showProgress) {
console.log(` ⚠️ Checking similarity for first ${blocksToCheck.length} of ${unprocessedBlocks.length} blocks`);
}
}
// Group by similar characteristics for efficient comparison
const characteristicGroups = groupByCharacteristics(blocksToCheck);
let totalComparisons = 0;
// Process each characteristic group in batches
    outer: for (const [, candidates] of characteristicGroups) {
if (candidates.length < 2) continue;
// Process in batches for memory efficiency
for (let i = 0; i < candidates.length; i += batchSize) {
const batch = candidates.slice(i, Math.min(i + batchSize, candidates.length));
const batchGroups = findSimilarInBatch(batch, minSimilarity, processed);
batchGroups.forEach(group => {
group.estimatedSavings = calculateEstimatedSavings(group);
groups.push(group);
similarGroupsFound++;
});
totalComparisons += batch.length * (batch.length - 1) / 2;
if (showProgress && i > 0 && i % (batchSize * 5) === 0) {
console.log(` Processing similarity batch ${Math.floor(i / batchSize)}...`);
}
        // Stop once the comparison budget is exhausted; `break outer` exits the
        // characteristic-group loop, not just the current batch
        if (totalComparisons > maxSimilarityChecks * 3) {
          if (showProgress) {
            console.log(` ⚠️ Stopping similarity analysis after ${totalComparisons} comparisons`);
          }
          break outer;
        }
}
    }

    if (showProgress) {
      console.log(` ✓ Found ${similarGroupsFound} similar code groups`);
    }
  }
  // Sort by impact: (copies - 1) * complexity * similarity
return groups.sort((a, b) => {
const scoreA = (a.blocks.length - 1) * a.complexity * (a.similarity || 1);
const scoreB = (b.blocks.length - 1) * b.complexity * (b.similarity || 1);
return scoreB - scoreA;
});
}
/**
* Group blocks by characteristics for efficient comparison
*/
function groupByCharacteristics(blockItems) {
const groups = new Map();
blockItems.forEach(item => {
const { block } = item;
// More refined grouping for better pre-filtering
const lineGroup = Math.floor(block.lineCount / 3) * 3; // Smaller buckets for lines
const complexityGroup = Math.floor(block.complexity); // Integer complexity
const lengthGroup = Math.floor(block.content.length / 100) * 100; // Group by content length
// Generate partial content hash for similarity pre-filtering
const firstWords = block.content.split(/\s+/).slice(0, 5).join(' ');
const contentPrefix = crypto.createHash('md5').update(firstWords).digest('hex').substring(0, 4);
const key = `${lineGroup}-${complexityGroup}-${lengthGroup}-${contentPrefix}`;
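    // Example key (hash prefix illustrative): a 12-line, complexity-3,
    // ~250-char block would bucket as "12-3-200-9f2c".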
if (!groups.has(key)) {
groups.set(key, []);
}
groups.get(key).push(item);
});
// Filter out single-item groups to reduce comparisons
const filteredGroups = new Map();
for (const [key, items] of groups) {
if (items.length > 1) {
filteredGroups.set(key, items);
}
}
return filteredGroups;
}
/**
* Find similar blocks within a batch
*/
function findSimilarInBatch(batch, minSimilarity, processed) {
const groups = [];
const batchProcessed = new Set();
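  // batchProcessed tracks positions within this batch; the shared `processed`
  // set records the global indices of blocks that ended up in a group.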
for (let i = 0; i < batch.length; i++) {
if (batchProcessed.has(i)) continue;
const { block: currentBlock, index: currentIndex } = batch[i];
const group = {
type: 'similar_logic',
blocks: [currentBlock],
      similarity: 1.0, // tightened to the lowest pairwise similarity observed below
pattern: WET_PATTERNS['similar_logic'],
complexity: currentBlock.complexity,
deduplicationOpportunity: 'LOW'
};
// Only compare with remaining items in batch
for (let j = i + 1; j < batch.length; j++) {
if (batchProcessed.has(j)) continue;
const { block: otherBlock, index: otherIndex } = batch[j];
// Quick checks before expensive similarity calculation
if (Math.abs(currentBlock.lineCount - otherBlock.lineCount) > 10) continue;
if (Math.abs(currentBlock.complexity - otherBlock.complexity) > 5) continue;
const similarity = calculateSimilarity(currentBlock.content, otherBlock.content);
if (similarity >= minSimilarity) {
group.blocks.push(otherBlock);
group.similarity = Math.min(group.similarity, similarity);
batchProcessed.add(j);
processed.add(otherIndex);
}
}
// Only include groups with multiple blocks
if (group.blocks.length > 1) {
const filesInvolved = new Set(group.blocks.map(b => b.file)).size;
      group.deduplicationOpportunity =
        (filesInvolved > 2 || group.blocks.length > 3) ? 'HIGH' : 'MEDIUM';
groups.push(group);
}
batchProcessed.add(i);
processed.add(currentIndex);
}
return groups;
}
module.exports = {
groupDuplicateBlocksOptimized
};
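
/*
 * Illustrative usage: a minimal sketch, not part of the original file. The
 * require path './duplicateGrouping' and the block shape are assumptions;
 * blocks are expected to carry { file, content, lineCount, complexity },
 * with `hash` optional (it is generated on demand above).
 *
 *   const { groupDuplicateBlocksOptimized } = require('./duplicateGrouping');
 *   const groups = groupDuplicateBlocksOptimized(allBlocks, {
 *     maxBlocks: 10000,
 *     showProgress: false
 *   });
 *   for (const g of groups) {
 *     console.log(g.type, g.blocks.length, g.deduplicationOpportunity);
 *   }
 */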