// agentsqripts
// Version:
// Comprehensive static code analysis toolkit for identifying technical debt, security vulnerabilities, performance issues, and code quality problems
// 135 lines (120 loc) • 5.13 kB
// JavaScript
/**
* @file Advanced similarity calculation using optimized Levenshtein distance
* @description Single responsibility: Compute accurate code similarity metrics for duplicate detection
*
* This module implements sophisticated similarity calculation using optimized Levenshtein
* distance algorithms with performance enhancements and length-aware scoring. It serves
* as the core similarity engine for WET code detection, balancing accuracy with performance
* for large-scale codebase analysis.
*
* Design rationale:
* - Levenshtein distance provides mathematically sound similarity measurement
* - Space-optimized algorithm maintains performance for large code blocks
* - Length penalty prevents false matches between significantly different block sizes
* - Line-by-line analysis preserves code structure awareness
*/
const { WET_CODE_CONFIG } = require('./wetCodeConfig');
/**
 * Calculate similarity between two code blocks using line-by-line Levenshtein analysis.
 *
 * Similarity calculation strategy:
 * - Phase 1: Strict-equality shortcut; identical strings score 1.0 without further work.
 * - Phase 2: Split each block into lines (LF or CRLF terminated) and drop blank lines,
 *   so whitespace-only lines and line-ending style do not influence the score.
 * - Phase 3a (equal line counts): average the per-line similarity of the lines that
 *   share the same index.
 * - Phase 3b (different line counts): count the index-aligned line pairs whose
 *   similarity exceeds 0.8, divide by the larger line count, then multiply by the
 *   length penalty.
 *
 * Length penalty algorithm:
 * - The penalty factor is (smaller line count / larger line count).
 * - It is applied on top of the matching-line ratio, so a small block cannot score
 *   highly against a much larger block on the strength of a few similar lines.
 *
 * Notes:
 * - Lines are paired strictly by position (line i with line i); an inserted or removed
 *   line shifts every following pair.
 * - Lines are compared as-is (not trimmed), so indentation differences lower the score.
 *
 * Edge cases:
 * - Neither block has a non-blank line (e.g. "" vs "\n"): returns 1.0. This avoids the
 *   0 / 0 division that would otherwise yield NaN.
 * - Exactly one block has no non-blank line: returns 0.
 *
 * @param {string} block1 - First code block content for similarity comparison
 * @param {string} block2 - Second code block content for similarity comparison
 * @returns {number} Similarity score from 0.0 (completely different) to 1.0 (identical)
 * @example
 * const similarity = calculateSimilarity(`
 * const x = 1;
 * return x + 2;
 * `, `
 * const y = 1;
 * return y + 2;
 * `);
 * // Returns high similarity (~0.9): each line pair differs by a single character
 */
function calculateSimilarity(block1, block2) {
  if (block1 === block2) return 1.0;

  // A line pair counts as "matching" in the different-length branch above this score.
  const LINE_MATCH_THRESHOLD = 0.8;

  // Accept both LF and CRLF so a stray '\r' never becomes part of a compared line.
  const toCodeLines = (block) =>
    block.split(/\r?\n/).filter((line) => line.trim().length > 0);

  const lines1 = toCodeLines(block1);
  const lines2 = toCodeLines(block2);
  const minLength = Math.min(lines1.length, lines2.length);
  const maxLength = Math.max(lines1.length, lines2.length);

  // Both blocks contain only blank lines: equivalent content, and dividing by the
  // (zero) line count below would produce NaN.
  if (maxLength === 0) return 1.0;

  if (minLength !== maxLength) {
    // Different lengths - calculate partial similarity
    const lengthPenalty = minLength / maxLength;
    let matchingLines = 0;
    for (let i = 0; i < minLength; i++) {
      if (calculateLineSimilarity(lines1[i], lines2[i]) > LINE_MATCH_THRESHOLD) {
        matchingLines++;
      }
    }
    return (matchingLines / maxLength) * lengthPenalty;
  }

  // Same length - average the line-by-line similarity
  let totalSimilarity = 0;
  for (let i = 0; i < maxLength; i++) {
    totalSimilarity += calculateLineSimilarity(lines1[i], lines2[i]);
  }
  return totalSimilarity / maxLength;
}
/**
 * Scores how alike two lines are, based on their Levenshtein edit distance.
 *
 * The score is `1 - distance / longerLength`, so identical lines yield 1 and lines
 * that share nothing yield 0. Characters are compared by string index (UTF-16 code
 * units).
 *
 * Shortcuts:
 * - Strictly equal lines return 1.0 immediately.
 * - When the lengths differ by more than half of the longer length, the lines are
 *   treated as unrelated and 0 is returned without computing the distance.
 *
 * @param {string} line1 - First line
 * @param {string} line2 - Second line
 * @returns {number} Similarity score (0-1)
 */
function calculateLineSimilarity(line1, line2) {
  if (line1 === line2) {
    return 1.0;
  }

  const width = line1.length;
  const height = line2.length;
  const longest = Math.max(width, height);

  // Cheap rejection: too large a length gap to be worth an edit-distance pass.
  if (Math.abs(width - height) > longest * 0.5) {
    return 0;
  }

  // Only two rows of the distance table are kept alive at any time:
  // `previousRow` holds the finished row, `currentRow` the one being filled.
  let previousRow = Array.from({ length: width + 1 }, (_, column) => column);
  let currentRow = new Array(width + 1).fill(0);

  for (let row = 1; row <= height; row += 1) {
    const target = line2[row - 1];
    currentRow[0] = row;

    for (let column = 1; column <= width; column += 1) {
      const viaLeft = currentRow[column - 1] + 1;
      const viaAbove = previousRow[column] + 1;
      const viaDiagonal =
        previousRow[column - 1] + (line1[column - 1] === target ? 0 : 1);
      currentRow[column] = Math.min(viaLeft, viaAbove, viaDiagonal);
    }

    // Rotate the rows; the old previous row becomes scratch space for the next pass.
    const finishedRow = currentRow;
    currentRow = previousRow;
    previousRow = finishedRow;
  }

  const distance = previousRow[width];
  if (longest > 0) {
    return 1 - (distance / longest);
  }
  return 1;
}
// Public API: block-level scoring plus the per-line scorer it is built on.
module.exports = { calculateSimilarity, calculateLineSimilarity };