agentsqripts
Comprehensive static code analysis toolkit for identifying technical debt, security vulnerabilities, performance issues, and code quality problems
/**
* @file Code block extraction utilities for WET code duplicate detection
* @description Single responsibility: Extract logical code blocks with metadata for similarity analysis
*
* This module implements sliding window extraction of code blocks with normalization,
* hashing, and complexity analysis. It serves as the foundation for duplicate code
* detection by converting source code into comparable, analyzable blocks while
* maintaining sufficient context for accurate similarity assessment.
*
* Design rationale:
* - Sliding window approach captures all possible duplicate patterns
* - Line normalization reduces false negatives from formatting differences
* - Block metadata enables sophisticated duplicate analysis and prioritization
* - Configurable thresholds adapt to different project requirements
*/
const { WET_CODE_CONFIG } = require('./wetCodeConfig');
const { generateBlockHash } = require('./blockHasher');
const { calculateBlockComplexity } = require('./complexityCalculator');
const { normalizeLine } = require('./lineNormalizer');
/**
* Extract logical code blocks using sliding window with normalization and metadata enrichment
*
* Technical function: Sliding window block extraction with quality filtering and metadata annotation
*
* Implementation rationale:
* - Sliding window ensures comprehensive coverage of all potential duplicate patterns
* - Line normalization reduces formatting noise while preserving semantic content
* - Quality threshold (70% non-empty lines) filters out low-value blocks
* - Rich metadata enables advanced duplicate analysis and prioritization
*
* Block extraction strategy:
* - Fixed-size sliding window for consistent comparison across blocks
* - Normalization applied before similarity comparison for robustness
* - Original content preserved for accurate location reporting
* - Hash generation enables fast duplicate identification
*
* Quality filtering approach:
* - 70% non-empty line threshold balances inclusivity with quality
* - Empty line tolerance accommodates natural code formatting
* - Block size configurability adapts to different duplication granularities
* - Minimum line count prevents trivial duplicate detection
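 * - For example, with minLines = 5 the filter requires Math.floor(5 * 0.7) = 3 non-empty normalized lines per window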
*
* Metadata enrichment:
* - Block hashing enables O(1) exact duplicate identification
* - Complexity scoring prioritizes refactoring high-impact duplicates
* - Line position tracking enables precise IDE navigation
* - Line count metrics support duplicate severity assessment
*
* Performance considerations:
 * - Sliding window over the line array costs O(n*m), where n is the line count and m is the block size
* - Normalized line caching prevents redundant normalization
* - Hash computation deferred until block validation for efficiency
* - Memory usage scales with block size and content length
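 * - For n input lines and a window of m lines, at most n - m + 1 candidate windows are examined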
*
* @param {string} content - Source code content to extract blocks from
 * @param {number} minLines - Lines in each extracted block window (default from config)
* @returns {Array<Object>} Array of code blocks with hash, complexity, and position metadata
* @example
 * const blocks = extractLogicalBlocks(`
 *   function helper() {
 *     const x = 1;
 *     return x + 2;
 *   }
 *
 *   function main() {
 *     const y = 1;
 *     return y + 2;
 *   }
 * `);
* // Returns blocks with startLine, endLine, content, hash, complexity metrics
*/
function extractLogicalBlocks(content, minLines = WET_CODE_CONFIG.minDuplicateLines) {
  const lines = content.split('\n');
  const normalizedLines = lines.map(line => normalizeLine(line, WET_CODE_CONFIG));
  const blocks = [];
  // Slide a fixed-size window across the normalized lines and keep windows with enough non-empty content
  for (let i = 0; i <= normalizedLines.length - minLines; i++) {
    const blockLines = normalizedLines.slice(i, i + minLines);
    const validLines = blockLines.filter(line => line.length > 0);
    if (validLines.length >= Math.floor(minLines * 0.7)) { // At least 70% non-empty lines
      const blockContent = blockLines.join('\n');
      const originalContent = lines.slice(i, i + minLines).join('\n');
      blocks.push({
        startLine: i + 1,
        endLine: i + minLines,
        content: blockContent,
        originalContent: originalContent,
        hash: generateBlockHash(blockContent),
        complexity: calculateBlockComplexity(blockContent),
        lineCount: minLines,
        nonEmptyLines: validLines.length
      });
    }
  }
  return blocks;
}
module.exports = {
extractLogicalBlocks
};
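
/**
 * Illustrative usage sketch: group the extracted blocks by hash to surface exact
 * duplicates, then use the complexity metadata to prioritize refactoring. The helper
 * below is hypothetical and not exported by this module; it assumes only the block
 * shape documented above ({ hash, complexity, startLine, ... }) and that complexity
 * is a numeric score.
 */
function findExactDuplicateGroups(content) {
  const byHash = new Map();
  for (const block of extractLogicalBlocks(content)) {
    const group = byHash.get(block.hash) || [];
    group.push(block);
    byHash.set(block.hash, group);
  }
  // Keep only hashes that occur at more than one location; list the
  // highest-complexity groups first (assumes complexity is a numeric score)
  return [...byHash.values()]
    .filter(group => group.length > 1)
    .sort((a, b) => b[0].complexity - a[0].complexity);
}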