agentsqripts
Version:
Comprehensive static code analysis toolkit for identifying technical debt, security vulnerabilities, performance issues, and code quality problems
119 lines (99 loc) • 4.28 kB
JavaScript
/**
* @file Find semantic duplicates using optimized algorithm
* @description Single responsibility: Find similar code blocks across project using optimized O(n log n) algorithm
*/
const { calculateSemanticSimilarity, identifyDuplicationPattern } = require('../../ast/semanticSimilarityCalculator');
const { classifyDuplicationType } = require('../../ast/duplicateTypeClassifier');
const { getContextAwareRecommendations } = require('../../ast/contextAnalyzer');
const calculateAverageSimilarity = require('../metrics/calculateAverageSimilarity');
const calculateDuplicationImpact = require('../metrics/calculateDuplicationImpact');
/**
* Find semantic duplicates using optimized O(n log n) algorithm
*/
/**
 * Find semantic duplicates using an optimized two-pass algorithm.
 *
 * Pass 1 groups exact duplicates by content hash in O(n). Pass 2 buckets the
 * remaining blocks by `type` plus a complexity band and compares pairs only
 * within each bucket, keeping total work close to O(n log n) in practice.
 *
 * @param {Array<Object>} allBlocks - Code blocks to analyze; each is expected
 *   to carry `hash`, `type`, and `complexity` properties (shape inferred from
 *   usage here — confirm against the block extractor upstream).
 * @param {Object} fileContexts - Per-file context, passed through unchanged to
 *   the classification and recommendation helpers.
 * @param {Object} [options] - Optional tuning knobs (backward compatible).
 * @param {number} [options.similarityThreshold=0.7] - Minimum semantic
 *   similarity for two blocks to be grouped in pass 2.
 * @param {number} [options.complexityBucketSize=5] - Width of the complexity
 *   band used to bucket blocks before pairwise comparison.
 * @returns {Array<Object>} Duplicate groups sorted by descending impact score.
 */
function findSemanticDuplicatesOptimized(allBlocks, fileContexts, options = {}) {
  const { similarityThreshold = 0.7, complexityBucketSize = 5 } = options;
  const duplicateGroups = [];
  const processed = new Set();

  // First pass: group exact duplicates by content hash (O(n)).
  const hashGroups = new Map();
  allBlocks.forEach((block, index) => {
    if (!hashGroups.has(block.hash)) {
      hashGroups.set(block.hash, []);
    }
    hashGroups.get(block.hash).push({ block, index });
  });

  // Process exact duplicate groups.
  for (const group of hashGroups.values()) {
    if (group.length < 2) continue;
    const blocks = group.map((g) => g.block);
    const pattern = identifyDuplicationPattern(blocks);
    const classification = classifyDuplicationType(blocks, fileContexts);
    // Even exact duplicates might be acceptable (e.g., test boilerplate).
    if (classification.type !== 'ACCEPTABLE_SIMILARITY') {
      duplicateGroups.push({
        type: 'exact_duplicate',
        pattern,
        blocks,
        similarity: 1.0,
        impact: calculateDuplicationImpact(blocks),
        classification,
        contextRecommendations: getContextAwareRecommendations({ blocks }, fileContexts)
      });
    }
    // Mark members as handled regardless of classification so pass 2 skips them.
    group.forEach((g) => processed.add(g.index));
  }

  // Second pass: find semantic duplicates among the remaining blocks.
  const remainingBlocks = allBlocks
    .map((block, index) => ({ block, index }))
    .filter((item) => !processed.has(item.index));

  // Bucket by type + complexity band so pairwise comparison stays cheap.
  const typeGroups = new Map();
  remainingBlocks.forEach((item) => {
    const band = Math.floor(item.block.complexity / complexityBucketSize);
    const key = `${item.block.type}-${band}`;
    if (!typeGroups.has(key)) {
      typeGroups.set(key, []);
    }
    typeGroups.get(key).push(item);
  });

  // Compare within buckets only (much smaller n per comparison loop).
  for (const group of typeGroups.values()) {
    if (group.length < 2) continue;
    for (let i = 0; i < group.length; i++) {
      if (processed.has(group[i].index)) continue;
      // Greedy grouping: seed with block i, absorb later similar blocks.
      const currentGroup = [group[i].block];
      processed.add(group[i].index);
      for (let j = i + 1; j < group.length; j++) {
        if (processed.has(group[j].index)) continue;
        const similarity = calculateSemanticSimilarity(group[i].block, group[j].block);
        if (similarity >= similarityThreshold) {
          currentGroup.push(group[j].block);
          processed.add(group[j].index);
        }
      }
      if (currentGroup.length < 2) continue;
      const pattern = identifyDuplicationPattern(currentGroup);
      const classification = classifyDuplicationType(currentGroup, fileContexts);
      // Only include real duplicates, not template patterns or acceptable similarity.
      if (classification.type !== 'TEMPLATE_PATTERN' &&
          classification.type !== 'ACCEPTABLE_SIMILARITY') {
        duplicateGroups.push({
          type: 'semantic_duplicate',
          pattern,
          blocks: currentGroup,
          similarity: calculateAverageSimilarity(currentGroup),
          impact: calculateDuplicationImpact(currentGroup),
          classification,
          contextRecommendations: getContextAwareRecommendations({ blocks: currentGroup }, fileContexts)
        });
      }
    }
  }

  // Highest-impact groups first.
  duplicateGroups.sort((a, b) => b.impact.score - a.impact.score);
  return duplicateGroups;
}
module.exports = findSemanticDuplicatesOptimized;