UNPKG

agentsqripts

Version:

Comprehensive static code analysis toolkit for identifying technical debt, security vulnerabilities, performance issues, and code quality problems

135 lines (120 loc) 5.13 kB
/** * @file Advanced similarity calculation using optimized Levenshtein distance * @description Single responsibility: Compute accurate code similarity metrics for duplicate detection * * This module implements sophisticated similarity calculation using optimized Levenshtein * distance algorithms with performance enhancements and length-aware scoring. It serves * as the core similarity engine for WET code detection, balancing accuracy with performance * for large-scale codebase analysis. * * Design rationale: * - Levenshtein distance provides mathematically sound similarity measurement * - Space-optimized algorithm maintains performance for large code blocks * - Length penalty prevents false matches between significantly different block sizes * - Line-by-line analysis preserves code structure awareness */ const { WET_CODE_CONFIG } = require('./wetCodeConfig'); /** * Calculate similarity between code blocks using length-aware Levenshtein analysis * * Technical function: Multi-phase similarity calculation with length penalty and line-based analysis * * Implementation rationale: * - Exact match early return optimizes for identical blocks * - Length difference analysis prevents false positives from size mismatches * - Line-by-line comparison preserves code structure in similarity assessment * - Partial similarity calculation handles blocks with different line counts * * Similarity calculation strategy: * - Phase 1: Exact match detection for O(1) identical block identification * - Phase 2: Length analysis with penalty for significant size differences * - Phase 3: Line-by-line similarity aggregation for structural comparison * - Phase 4: Length penalty application to prevent size-based false matches * * Length penalty algorithm: * - Calculates ratio of minimum to maximum length as penalty factor * - Applies penalty to partial match scores for fair comparison * - Prevents small blocks from matching large blocks with few similar lines * - Maintains accuracy across diverse block size distributions * * Line-based analysis advantages: * - Preserves code structure in similarity assessment * - Enables identification of structurally similar but differently formatted code * - Supports detection of refactored code with similar logic patterns * - Balances granular analysis with computational efficiency * * @param {string} block1 - First code block content for similarity comparison * @param {string} block2 - Second code block content for similarity comparison * @returns {number} Similarity score from 0.0 (completely different) to 1.0 (identical) * @example * const similarity = calculateSimilarity(` * const x = 1; * return x + 2; * `, ` * const y = 1; * return y + 2; * `); * // Returns high similarity (~0.9) due to structural similarity */ function calculateSimilarity(block1, block2) { if (block1 === block2) return 1.0; const lines1 = block1.split('\n').filter(l => l.trim().length > 0); const lines2 = block2.split('\n').filter(l => l.trim().length > 0); if (lines1.length !== lines2.length) { // Different lengths - calculate partial similarity const minLength = Math.min(lines1.length, lines2.length); const maxLength = Math.max(lines1.length, lines2.length); const lengthPenalty = minLength / maxLength; let matchingLines = 0; for (let i = 0; i < minLength; i++) { if (calculateLineSimilarity(lines1[i], lines2[i]) > 0.8) { matchingLines++; } } return (matchingLines / maxLength) * lengthPenalty; } // Same length - calculate line-by-line similarity let totalSimilarity = 0; for (let i = 0; i < lines1.length; i++) { totalSimilarity += calculateLineSimilarity(lines1[i], lines2[i]); } return totalSimilarity / lines1.length; } /** * Calculates similarity between two lines of code * @param {string} line1 - First line * @param {string} line2 - Second line * @returns {number} Similarity score (0-1) */ function calculateLineSimilarity(line1, line2) { if (line1 === line2) return 1.0; // Optimized Levenshtein distance with early termination const len1 = line1.length; const len2 = line2.length; // Early exit for significant length difference if (Math.abs(len1 - len2) > Math.max(len1, len2) * 0.5) { return 0; } // Use single array optimization for space complexity let prev = Array(len1 + 1).fill(0).map((_, i) => i); let curr = Array(len1 + 1).fill(0); for (let j = 1; j <= len2; j++) { curr[0] = j; for (let i = 1; i <= len1; i++) { const substitutionCost = line1[i - 1] === line2[j - 1] ? 0 : 1; curr[i] = Math.min( curr[i - 1] + 1, // deletion prev[i] + 1, // insertion prev[i - 1] + substitutionCost // substitution ); } [prev, curr] = [curr, prev]; // Swap arrays } const distance = prev[len1]; const maxLength = Math.max(len1, len2); return maxLength > 0 ? 1 - (distance / maxLength) : 1; } module.exports = { calculateSimilarity, calculateLineSimilarity };