// @drmhse/remove-comments
// Version:
// A robust, zero-dependency CLI tool for removing comments from source code using token-based state machine parsing. Supports JavaScript, TypeScript, Java, Kotlin, Python, Vue, React, and Dart with advanced template literal processing.
// 553 lines (489 loc) • 19.5 kB
// JavaScript
const fs = require('fs').promises;
const path = require('path');
// Zero-dependency, superior comment removal implementation
// Based on token-based state machine with enhanced template literal support
// Names of the lexer states used by the comment-stripping state machines
// in this file.
const STATES = {
  // Ordinary source text outside any literal or comment.
  CODE: 'code',

  // Inside '...', "..." or `...` respectively.
  STRING_SINGLE: 'string_single',
  STRING_DOUBLE: 'string_double',
  STRING_TEMPLATE: 'string_template',

  // Inside the ${...} part of a template literal.
  TEMPLATE_EXPR: 'template_expr',

  // Inside a /.../ regular expression literal.
  REGEX: 'regex',

  // Inside a // comment or a /* */ comment.
  LINE_COMMENT: 'line_comment',
  BLOCK_COMMENT: 'block_comment',

  // Immediately after a backslash; the next character is taken literally.
  ESCAPE: 'escape',
};
// Strips // line comments and /* block */ comments from JavaScript-family
// source text.
//
// The text is scanned once, character by character. Whenever the scanner
// enters a nested construct (string, template literal, ${...} expression,
// regex literal, comment or backslash escape) the enclosing state is pushed
// onto a stack and restored when the construct ends. That way a string, regex
// or nested template opened inside ${...} resumes the expression it came from
// instead of falling back to top-level code.
//
// @param {string} content - source text to clean
// @returns {string} the text without comments; line breaks found inside
//   comments are kept (both \r and \n) so line numbers and line endings are
//   unchanged
function processJavaScriptFamily(content) {
  let result = '';
  let state = STATES.CODE;
  let braceDepth = 0; // unmatched `{` inside the current ${...} expression
  let inRegexClass = false; // inside [...] of a regex, where `/` is literal
  const enclosing = []; // saved { state, braceDepth } of the outer constructs

  // States in which a backslash escapes the character that follows it.
  const escapingStates = new Set([
    STATES.STRING_SINGLE,
    STATES.STRING_DOUBLE,
    STATES.STRING_TEMPLATE,
    STATES.REGEX
  ]);

  const enter = (nextState) => {
    enclosing.push({ state, braceDepth });
    state = nextState;
  };
  const leave = () => {
    const outer = enclosing.pop();
    state = outer ? outer.state : STATES.CODE;
    braceDepth = outer ? outer.braceDepth : 0;
  };

  for (let i = 0; i < content.length; i++) {
    const char = content[i];
    const next = content[i + 1];

    // The character right after a backslash is copied verbatim.
    if (state === STATES.ESCAPE) {
      result += char;
      leave();
      continue;
    }

    if (char === '\\' && escapingStates.has(state)) {
      result += char;
      enter(STATES.ESCAPE);
      continue;
    }

    switch (state) {
      // ${...} holds ordinary code; it only needs extra brace counting so the
      // scanner knows which `}` ends the expression.
      case STATES.CODE:
      case STATES.TEMPLATE_EXPR:
        if (char === '/' && (next === '/' || next === '*')) {
          enter(next === '/' ? STATES.LINE_COMMENT : STATES.BLOCK_COMMENT);
          i++; // skip the second character of the comment opener
          break;
        }
        result += char;
        if (char === "'") {
          enter(STATES.STRING_SINGLE);
        } else if (char === '"') {
          enter(STATES.STRING_DOUBLE);
        } else if (char === '`') {
          enter(STATES.STRING_TEMPLATE);
        } else if (char === '/' && isRegexContext(content, i)) {
          inRegexClass = false;
          enter(STATES.REGEX);
        } else if (state === STATES.TEMPLATE_EXPR) {
          if (char === '{') {
            braceDepth++;
          } else if (char === '}' && --braceDepth === 0) {
            leave(); // back to the text of the surrounding template literal
          }
        }
        break;

      case STATES.STRING_SINGLE:
        result += char;
        if (char === "'") leave();
        break;

      case STATES.STRING_DOUBLE:
        result += char;
        if (char === '"') leave();
        break;

      case STATES.STRING_TEMPLATE:
        result += char;
        if (char === '`') {
          leave();
        } else if (char === '$' && next === '{') {
          result += next;
          i++; // skip the {
          enter(STATES.TEMPLATE_EXPR);
          braceDepth = 1;
        }
        break;

      case STATES.REGEX:
        result += char;
        if (char === '\n' || char === '\r') {
          // A regex literal cannot span lines, so this `/` was really a
          // division (or similar). Resume normal scanning on the next line.
          inRegexClass = false;
          leave();
        } else if (inRegexClass) {
          if (char === ']') inRegexClass = false;
        } else if (char === '[') {
          inRegexClass = true;
        } else if (char === '/') {
          // Any flags that follow are plain identifier characters and are
          // copied by the code state.
          leave();
        }
        break;

      case STATES.LINE_COMMENT:
        if (char === '\n' || char === '\r') {
          result += char;
          leave();
        }
        break;

      case STATES.BLOCK_COMMENT:
        if (char === '*' && next === '/') {
          leave();
          i++; // skip the /
        } else if (char === '\n' || char === '\r') {
          result += char; // keep line structure and line-ending style
        }
        break;
    }
  }
  return result;
}
// Helper functions
function getPreviousState(currentState) {
  // Placeholder: whatever state is passed in, the answer is always the plain
  // code state. Nothing in this file calls it.
  const { CODE } = STATES;
  return CODE;
}
// Keywords after which a `/` starts a regex literal rather than a division.
const REGEX_PRECEDING_KEYWORDS = new Set([
  'return', 'new', 'typeof', 'case', 'do', 'else',
  'void', 'delete', 'throw', 'yield', 'await'
]);

// Heuristically decides whether the `/` at `content[pos]` starts a regex
// literal, by looking at the nearest non-whitespace text before it.
//
// @param {string} content - full source text
// @param {number} pos - index of the `/` in question
// @returns {boolean} true when a regex literal is the likely reading
function isRegexContext(content, pos) {
  let j = pos - 1;
  while (j >= 0 && /\s/.test(content[j])) j--; // Skip whitespace
  if (j < 0) return true; // nothing before it: start of input

  const prevChar = content[j];

  // `i++ / 2` and `i-- / 2`: a slash after a postfix operator is a division.
  if ((prevChar === '+' || prevChar === '-') && content[j - 1] === prevChar) {
    return false;
  }

  // After an operator or an opening bracket an operand is expected.
  if (/[=,({\[;:!&|?+\-*%^~<>]/.test(prevChar)) return true;

  // Otherwise only a whole keyword can introduce a regex. Comparing the whole
  // word keeps identifiers that merely end in one (`renew`, `myreturn`) from
  // matching, and a leading dot marks a property access such as `it.return`.
  if (!/[a-z]/.test(prevChar)) return false;
  let wordStart = j;
  while (wordStart > 0 && /[\w$]/.test(content[wordStart - 1])) wordStart--;
  if (content[wordStart - 1] === '.') return false;
  return REGEX_PRECEDING_KEYWORDS.has(content.slice(wordStart, j + 1));
}
// Given the character just seen and the current flag, says whether a regex
// literal may start next.
//
// @param {string} char - the character to classify
// @param {boolean} current - the flag value to keep when `char` is neutral
// @returns {boolean} the updated flag
function updateRegexAllowed(char, current) {
  const endsOperand = /[)\]}\w]/.test(char); // identifier char, ), ] or }
  const expectsOperand = /[=,({\[;:!&|?+\-*%^~<>]/.test(char);

  if (endsOperand) {
    return false;
  }
  return expectsOperand ? true : current;
}
// Processors using the new implementation
const JS_TS_PROCESSOR = { process: processJavaScriptFamily };
const DART_PROCESSOR = { process: processJavaScriptFamily };

// Braces that contain nothing but a single block comment: {/* ... */}.
// The comment body is not allowed to run past its own closing `*/`, so code
// sitting between two comments inside the same braces never becomes part of
// a match.
const JSX_COMMENT_CONTAINER = /\{\s*\/\*(?:(?!\*\/)[\s\S])*\*\/\s*\}/g;

// Walks from index `from` in direction `step` (+1 or -1) past whitespace and
// returns the index reached; it is out of range when only whitespace was found.
function skipWhitespace(text, from, step) {
  let k = from;
  while (k >= 0 && k < text.length && /\s/.test(text[k])) k += step;
  return k;
}

// Removes JSX comment containers ({/* ... */} used as a JSX child), keeping
// the line breaks they contained.
//
// The same character sequence is also an ordinary block or object literal
// holding a comment (`catch (e) { /* ignore */ }`, `() => { /* noop */ }`),
// and deleting that would delete code. A container is therefore only removed
// when it clearly sits between JSX neighbours: it follows a tag end `>` (but
// not an arrow `=>`) or another container's `}`, and is followed by `<` or
// `{`. Every other occurrence is left alone; the regular JavaScript pass
// still strips the comment itself and leaves an empty `{}` behind.
function stripJsxCommentContainers(content) {
  return content.replace(JSX_COMMENT_CONTAINER, (match, offset) => {
    const before = skipWhitespace(content, offset - 1, -1);
    const after = skipWhitespace(content, offset + match.length, 1);
    const prev = content[before];
    const next = content[after];

    const followsJsx = prev === '}' || (prev === '>' && content[before - 1] !== '=');
    const precedesJsx = next === '<' || next === '{';
    if (!followsJsx || !precedesJsx) return match;

    return '\n'.repeat((match.match(/\n/g) || []).length);
  });
}

const REACT_PROCESSOR = {
  // First drop JSX comment containers, then process regular JavaScript.
  process: (content) => processJavaScriptFamily(stripJsxCommentContainers(content))
};
// Simple Java/Kotlin processor (C-style comments).
//
// Removes // line comments and /* */ block comments in one pass while
// leaving '...' and "..." literals untouched, including any comment markers
// and backslash escapes inside them.
//
// @param {string} content - source text to clean
// @returns {string} the text without comments; line breaks found inside
//   comments are kept (both \r and \n) so line numbers and line endings are
//   unchanged
function processJavaFamily(content) {
  let result = '';
  let state = STATES.CODE;
  let escapedFrom = null; // string state to resume after an escaped character

  for (let i = 0; i < content.length; i++) {
    const char = content[i];
    const next = content[i + 1];

    switch (state) {
      case STATES.ESCAPE:
        // The character after a backslash is copied verbatim, so an escaped
        // quote can never close the string.
        result += char;
        state = escapedFrom;
        break;

      case STATES.CODE:
        if (char === '/' && next === '/') {
          state = STATES.LINE_COMMENT;
          i++; // Skip second /
        } else if (char === '/' && next === '*') {
          state = STATES.BLOCK_COMMENT;
          i++; // Skip *
        } else {
          result += char;
          if (char === "'") {
            state = STATES.STRING_SINGLE;
          } else if (char === '"') {
            state = STATES.STRING_DOUBLE;
          }
        }
        break;

      case STATES.STRING_SINGLE:
      case STATES.STRING_DOUBLE:
        result += char;
        if (char === '\\') {
          escapedFrom = state;
          state = STATES.ESCAPE;
        } else if (char === (state === STATES.STRING_SINGLE ? "'" : '"')) {
          state = STATES.CODE;
        }
        break;

      case STATES.LINE_COMMENT:
        if (char === '\n' || char === '\r') {
          result += char;
          state = STATES.CODE;
        }
        break;

      case STATES.BLOCK_COMMENT:
        if (char === '*' && next === '/') {
          state = STATES.CODE;
          i++; // Skip /
        } else if (char === '\n' || char === '\r') {
          result += char; // keep line structure and line-ending style
        }
        break;
    }
  }
  return result;
}
const JAVA_KOTLIN_PROCESSOR = { process: processJavaFamily };
// Python processor - keep our working regex-based approach as strip-comments is too aggressive.
//
// One alternation matches, in order of priority, triple-quoted strings,
// ordinary strings and # comments. Matching the strings first means a `#`
// inside a literal is consumed as part of that literal and never seen as a
// comment.
const PYTHON_TOKEN_PATTERN = new RegExp([
  /("""[\s\S]*?""")/, // Triple double quotes
  /('''[\s\S]*?''')/, // Triple single quotes
  /("(?:[^"\\]|\\.)*")/, // Double quoted strings
  /('(?:[^'\\]|\\.)*')/, // Single quoted strings
  /(#[^\r\n]*)/ // Hash comments
].map((r) => r.source).join('|'), 'g');

// True for # comments that carry meaning beyond documentation and therefore
// must survive:
//   - a `#!` shebang at the very start of the file, and
//   - a PEP 263 encoding declaration (`# -*- coding: utf-8 -*-`), which only
//     counts on the first or second line and only when nothing but
//     whitespace precedes it on that line.
function isPythonDirectiveComment(content, comment, offset) {
  if (offset === 0 && comment.startsWith('#!')) return true;
  if (!/coding[:=][ \t]*[-_.a-zA-Z0-9]+/.test(comment)) return false;

  const lineStart = content.lastIndexOf('\n', offset - 1) + 1;
  const aloneOnLine = content.slice(lineStart, offset).trim() === '';
  const precedingLines = content.slice(0, lineStart).split('\n').length - 1;
  return aloneOnLine && precedingLines <= 1;
}

const PYTHON_PROCESSOR = {
  // Removes # comments from Python source. All string literals (including
  // docstrings) are kept, as are the shebang and encoding declaration.
  process: (content) => {
    return content.replace(
      PYTHON_TOKEN_PATTERN,
      (match, tripleDouble, tripleSingle, doubleQuote, singleQuote, hashComment, offset) => {
        // Any of the four string groups matched: keep the literal as is.
        if (hashComment === undefined) return match;
        return isPythonDirectiveComment(content, hashComment, offset) ? match : '';
      }
    );
  }
};
// Maps each supported file extension to the processor that cleans it. The
// table is filled group by group, in the order the extensions should appear
// as keys.
const LANGUAGE_PROCESSORS = {};
for (const [extensions, processor] of [
  [['.js', '.ts'], JS_TS_PROCESSOR],
  [['.dart'], DART_PROCESSOR],
  [['.vue'], JS_TS_PROCESSOR], // Vue script sections use JavaScript
  [['.jsx', '.tsx'], REACT_PROCESSOR],
  [['.java', '.kt', '.kts'], JAVA_KOTLIN_PROCESSOR],
  [['.py'], PYTHON_PROCESSOR]
]) {
  for (const extension of extensions) {
    LANGUAGE_PROCESSORS[extension] = processor;
  }
}

// The extensions a directory walk will pick up: exactly the keys above.
const TARGET_EXTENSIONS = new Set(Object.keys(LANGUAGE_PROCESSORS));
// Vue processor - only processes <script> sections.
// Everything outside <script ...>...</script> is returned untouched; the text
// between each pair of tags is run through the JavaScript/TypeScript
// processor.
const VUE_PROCESSOR = {
  process(content) {
    const scriptSection = /(<script[^>]*>)([\s\S]*?)(<\/script>)/g;
    return content.replace(scriptSection, function (whole, openTag, body, closeTag) {
      return openTag + JS_TS_PROCESSOR.process(body) + closeTag;
    });
  }
};
// Cleans `content` with the processor that matches the extension of
// `filePath`.
//
// @param {string} content - file text
// @param {string} filePath - path whose extension selects the processor
// @returns {string} the processed text, or `content` unchanged when the
//   extension has no processor
function cleanFileContent(content, filePath) {
  const extension = path.extname(filePath);

  // .vue files get the dedicated processor rather than the table entry.
  const processor = extension === '.vue'
    ? VUE_PROCESSOR
    : LANGUAGE_PROCESSORS[extension];

  return processor ? processor.process(content) : content;
}
// Pieces of a gitignore glob that need translating: the wildcards, plus every
// character that would otherwise be special inside a regular expression.
const GITIGNORE_GLOB_TOKEN = /\*\*\/|\/\*\*$|\*\*|\*|\?|[.+^${}()|[\]\\]/g;

// Translates one gitignore glob token into regex source.
function translateGitignoreToken(token) {
  switch (token) {
    case '**/': return '(?:.*/)?'; // zero or more leading directories
    case '/**': return '/.*'; // everything below (only matched at the end)
    case '**': return '.*';
    case '*': return '[^/]*'; // anything except a path separator
    case '?': return '[^/]'; // exactly one character, not a separator
    default: return `\\${token}`; // literal regex metacharacter
  }
}

// Converts one line of a .gitignore file into a matcher for `/`-separated
// paths relative to the directory holding that .gitignore.
//
// The whole glob is translated in a single pass. Replacing `**` and then `*`
// in two passes would rewrite the `*` produced by the first step and break
// every `**` pattern.
//
// @param {string} pattern - a raw .gitignore line
// @returns {{regex: RegExp, isNegated: boolean} | null} the rule, or null
//   for blank lines and # comments
function gitignorePatternToRegex(pattern) {
  let glob = pattern.trim();
  if (glob === '' || glob.startsWith('#')) return null;

  const isNegated = glob.startsWith('!');
  if (isNegated) glob = glob.slice(1);

  const isAnchored = glob.startsWith('/');
  if (isAnchored) glob = glob.slice(1);

  // A trailing slash restricts a pattern to directories. Only paths are
  // available here, so it is dropped and the pattern is matched like any other.
  if (glob.endsWith('/')) glob = glob.slice(0, -1);

  const body = glob.replace(GITIGNORE_GLOB_TOKEN, translateGitignoreToken);

  // A leading or inner slash pins the pattern to this directory; a bare name
  // may match at any depth.
  const prefix = isAnchored || glob.includes('/') ? '^' : '(^|/)';

  // Match the entry itself or anything beneath it.
  return { regex: new RegExp(`${prefix}${body}(/|$)`), isNegated };
}
// Builds the ignore predicate for `directory`: a path is ignored when the
// parent predicate ignores it, or when the rules of this directory's own
// .gitignore do.
//
// @param {string} directory - directory whose .gitignore is read
// @param {string} rootDir - root that the predicate's argument is relative to
// @param {(relativePath: string) => boolean} parentIsIgnored - predicate
//   inherited from the enclosing directory
// @returns {Promise<(relativePathFromRoot: string) => boolean>}
async function createHierarchicalFilter(directory, rootDir, parentIsIgnored) {
  let localRules = [];

  try {
    const text = await fs.readFile(path.join(directory, '.gitignore'), 'utf8');
    const parsed = [];
    for (const line of text.split(/\r?\n/)) {
      const rule = gitignorePatternToRegex(line);
      if (rule !== null) {
        parsed.push(rule);
      }
    }
    // Assigned only once every line has parsed, so a failure half-way leaves
    // the rule list empty.
    localRules = parsed;
  } catch (error) {
    // A directory without a .gitignore is the normal case and stays silent.
    if (error.code !== 'ENOENT') {
      console.error(`[ERROR] Could not read .gitignore in ${directory}:`, error);
    }
  }

  return (relativePathFromRoot) => {
    if (parentIsIgnored(relativePathFromRoot)) {
      return true;
    }

    const absolutePath = path.join(rootDir, relativePathFromRoot);
    const localPath = path.relative(directory, absolutePath);
    if (localPath === '') {
      return false; // the directory itself is never ignored by its own rules
    }

    const normalizedPath = localPath.replace(/\\/g, '/');
    // Rules apply in file order; the last one that matches decides.
    return localRules.reduce(
      (ignored, rule) => (rule.regex.test(normalizedPath) ? !rule.isNegated : ignored),
      false
    );
  };
}
// --- Core File System & Execution Logic ---
// Directory patterns, written in .gitignore syntax, that a recursive scan
// always skips.
const BASE_EXCLUSIONS = [
  'node_modules/',
  'dist/',
  '.git/',
  '.vscode/'
];
// Cleans a single file in place.
//
// Reads the file, runs it through cleanFileContent and, when the result
// differs, either writes it back or (in dry-run mode) only reports that it
// would. Any failure is logged and swallowed.
//
// @param {string} filePath - file to process
// @param {boolean} isDryRun - when true, nothing is written
// @returns {Promise<void>}
async function processFile(filePath, isDryRun) {
  try {
    const before = await fs.readFile(filePath, 'utf8');
    const after = cleanFileContent(before, filePath);

    if (before === after) {
      return; // nothing to remove
    }
    if (isDryRun) {
      console.log(`[DRY RUN] Would clean: ${filePath}`);
      return;
    }

    await fs.writeFile(filePath, after, 'utf8');
    console.log(`[CLEANED] ${filePath}`);
  } catch (error) {
    console.error(`[ERROR] Failed to process ${filePath}:`, error);
  }
}
// Recursively cleans every supported file below `directory`, honouring the
// ignore rules inherited from above plus those of each directory visited.
//
// @param {string} directory - directory to scan
// @param {string} rootDir - root of the whole scan; ignore rules are
//   evaluated on paths relative to it
// @param {(relativePath: string) => boolean} parentIsIgnored - predicate
//   inherited from the enclosing directory
// @param {boolean} isDryRun - passed through to processFile
// @returns {Promise<void>}
async function walkAndClean(directory, rootDir, parentIsIgnored, isDryRun) {
  const isIgnored = await createHierarchicalFilter(directory, rootDir, parentIsIgnored);

  try {
    const entries = await fs.readdir(directory, { withFileTypes: true });

    for (const entry of entries) {
      const fullPath = path.join(directory, entry.name);
      if (isIgnored(path.relative(rootDir, fullPath))) {
        continue;
      }

      if (entry.isDirectory()) {
        await walkAndClean(fullPath, rootDir, isIgnored, isDryRun);
        continue;
      }

      if (TARGET_EXTENSIONS.has(path.extname(entry.name))) {
        await processFile(fullPath, isDryRun);
      }
    }
  } catch (error) {
    // A directory that has disappeared in the meantime is skipped silently.
    if (error.code !== 'ENOENT') {
      console.error(`[ERROR] Could not read directory ${directory}:`, error);
    }
  }
}
// CLI entry point.
//
// Usage: [--dry-run] [path]. The first argument that does not start with
// `--` is the target and defaults to the current directory. A directory is
// scanned recursively; a single file is processed when its extension is
// supported.
//
// @returns {Promise<void>}
async function main() {
  console.log('Starting comment cleanup...');

  const cliArgs = process.argv.slice(2);
  const isDryRun = cliArgs.includes('--dry-run');
  const positional = cliArgs.find((arg) => !arg.startsWith('--'));
  const targetPath = path.resolve(positional || '.');

  console.log(`Target: ${targetPath}`);
  if (isDryRun) {
    console.log('--- DRY RUN MODE: No files will be changed. ---');
  }

  try {
    const stats = await fs.stat(targetPath);

    if (stats.isDirectory()) {
      console.log('Target is a directory. Starting recursive scan...');

      // The built-in exclusions form the outermost ignore predicate; any one
      // matching rule is enough.
      const baseRules = BASE_EXCLUSIONS.map(gitignorePatternToRegex).filter((rule) => rule !== null);
      const baseIsIgnored = (relativePath) => {
        const normalizedPath = relativePath.replace(/\\/g, '/');
        return baseRules.some((rule) => rule.regex.test(normalizedPath));
      };

      await walkAndClean(targetPath, targetPath, baseIsIgnored, isDryRun);
    } else if (!stats.isFile()) {
      console.error(`[ERROR] Unsupported path type for: ${targetPath}`);
    } else {
      console.log('Target is a single file.');
      const extension = path.extname(targetPath);
      if (TARGET_EXTENSIONS.has(extension)) {
        await processFile(targetPath, isDryRun);
      } else {
        console.log(`[SKIPPED] File type (${path.extname(targetPath)}) is not supported.`);
      }
    }
  } catch (error) {
    if (error.code !== 'ENOENT') {
      throw error;
    }
    console.error(`[ERROR] Path not found: ${targetPath}`);
  }

  console.log('Cleanup complete.');
}
// Only run main if this file is executed directly (not imported)
if (require.main === module) {
  main().catch((err) => {
    console.error('An unexpected error occurred:', err);
    // Signal the failure to the calling shell or script. Setting exitCode
    // rather than calling process.exit() lets pending output flush first.
    process.exitCode = 1;
  });
}
// Export for testing: the per-file entry point, the extension-to-processor
// table, and the Vue processor (which is not an entry of that table).
module.exports = {
cleanFileContent,
LANGUAGE_PROCESSORS,
VUE_PROCESSOR
};