/*
 * mergerocket
 * Version:
 * A CLI tool for recursively merging text file contents into a single output file, featuring customizable parameters and formatting optimized for LLMs.
 * 451 lines (393 loc) • 13.2 kB
 * JavaScript
 */
import fs from "fs";
import path from "path";
/**
* @typedef {Object} DefaultConfig
* @property {string} defaultDir - Default directory to process
* @property {string} defaultOutput - Default output file path
* @property {string} defaultIgnore - Default file extensions to ignore (comma separated)
* @property {string} defaultStartTemplate - Default start marker template
* @property {string} defaultEndTemplate - Default end marker template
*/
/**
 * Fallback values applied when a caller omits the corresponding option.
 *
 * `defaultOutput` embeds `Date.now()`, which is evaluated once when this
 * module is loaded. `defaultIgnore` is a comma-separated string of
 * extensions, each including its leading dot.
 *
 * @type {DefaultConfig}
 */
export const DEFAULT_CONFIG = {
  defaultDir: ".",
  defaultOutput: `merged_${Date.now()}.txt`,
  defaultIgnore: [
    // Images
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".bmp",
    ".ico",
    // Archives
    ".zip",
    ".gz",
    ".tar",
    ".rar",
    // Executables
    ".exe",
  ].join(","),
  defaultStartTemplate: "--- START: {file} ---",
  defaultEndTemplate: "--- END: {file} ---",
};
/**
* @typedef {Object} MergeOptions
* @property {string} [dir=DEFAULT_CONFIG.defaultDir] - Base directory to process
* @property {string} [out=DEFAULT_CONFIG.defaultOutput] - Output file path
* @property {string[]} [blacklist] - File extensions to ignore
* @property {string} [start=DEFAULT_CONFIG.defaultStartTemplate] - Start marker template
* @property {string} [end=DEFAULT_CONFIG.defaultEndTemplate] - End marker template
* @property {boolean} [keepHidden=false] - Whether to include hidden files
* @property {boolean} [ignoreGitignore=false] - Whether to ignore .gitignore rules
* @property {boolean} [attachSummary=false] - Whether to add summary to output file
*/
/**
* @typedef {Object} MergeResult
* @property {number} mergedCount - Total number of files processed
* @property {number} textFileCount - Number of text files merged
* @property {number} binarySkippedCount - Number of binary files skipped
* @property {number} failedReadCount - Number of files that failed to read
* @property {Object.<string, number>} fileTypeCounts - Count of files by extension
* @property {string} summaryText - Generated summary text
* @property {string} outFile - Path to the output file
* @property {number} durationMs - Duration of the operation in milliseconds
*/
/**
* @typedef {Object} WalkDirectoryOptions
* @property {boolean} [keepHidden=false] - Whether to include hidden files and directories
* @property {boolean} [ignoreGitignore=false] - Whether to ignore .gitignore rules
* @property {string[]} [gitignorePatterns=[]] - Patterns from .gitignore file
* @property {string} [baseDir] - Base directory for resolving relative paths
*/
/**
 * Loads the rules from the `.gitignore` file located directly inside `baseDir`.
 *
 * Every line is trimmed; blank lines and comment lines (those starting with
 * "#" after trimming) are discarded. A missing or unreadable file yields an
 * empty list rather than an error.
 *
 * @param {string} baseDir - Directory expected to contain the `.gitignore` file
 * @returns {string[]} - Trimmed, non-empty, non-comment lines in file order
 */
const readGitignorePatterns = (baseDir) => {
  const gitignoreFile = path.join(baseDir, ".gitignore");
  if (!fs.existsSync(gitignoreFile)) {
    return [];
  }

  let rawText;
  try {
    rawText = fs.readFileSync(gitignoreFile, "utf8");
  } catch {
    // Best effort: an unreadable .gitignore is treated like an absent one.
    return [];
  }

  const rules = [];
  for (const rawLine of rawText.split(/\r?\n/)) {
    const rule = rawLine.trim();
    if (rule !== "" && !rule.startsWith("#")) {
      rules.push(rule);
    }
  }
  return rules;
};
/**
 * Builds the human-readable summary block for a merge run.
 *
 * The block is framed by the start/end marker templates, with the first
 * "{file}" placeholder of each replaced by the fixed title
 * "Merged File Summary". The execution date is taken from the current time
 * when this function runs, formatted with `toLocaleString()`.
 *
 * @param {Object} params - Values to report
 * @param {number} params.mergedCount - Total files processed
 * @param {number} params.textFileCount - Text files merged
 * @param {number} params.binarySkippedCount - Binary files skipped
 * @param {number} params.failedReadCount - Files that could not be read
 * @param {Object} params.fileTypeCounts - Map of extension to count; an empty extension is shown as "[none]"
 * @param {number} params.durationMs - Duration in milliseconds
 * @param {string} params.startMarker - Start marker template
 * @param {string} params.endMarker - End marker template
 * @returns {string} - Summary lines joined with "\n" (no trailing newline)
 */
const generateSummaryText = ({
  mergedCount,
  textFileCount,
  binarySkippedCount,
  failedReadCount,
  fileTypeCounts,
  durationMs,
  startMarker,
  endMarker,
}) => {
  const title = "Merged File Summary";

  return [
    startMarker.replace("{file}", title),
    `Execution Date: ${new Date().toLocaleString()}`,
    `Duration: ${durationMs} ms`,
    `Files processed for merging: ${mergedCount}`,
    `Text files merged: ${textFileCount}`,
    `Binary files skipped: ${binarySkippedCount}`,
    `Files failed to read: ${failedReadCount}`,
    "Merged file count by type:",
    // One line per extension, in the insertion order of fileTypeCounts.
    ...Object.entries(fileTypeCounts).map(
      ([ext, count]) => ` ${ext || "[none]"}: ${count}`
    ),
    endMarker.replace("{file}", title),
  ].join("\n");
};
/**
 * Heuristically decides whether a file is binary.
 *
 * Reads up to the first 8000 bytes and reports the file as binary when that
 * sample contains a NUL (0x00) byte. Any error while opening or reading the
 * file also yields `true`, so unreadable paths are treated as binary.
 *
 * @param {string} filePath - Path to the file to inspect
 * @returns {boolean} - True when a NUL byte is found in the sample or the file cannot be read
 */
export const isBinaryFile = (filePath) => {
  const SAMPLE_SIZE = 8000;
  const sample = Buffer.alloc(SAMPLE_SIZE);
  let descriptor;

  try {
    descriptor = fs.openSync(filePath, "r");
    const sampleLength = fs.readSync(descriptor, sample, 0, SAMPLE_SIZE, 0);
    // Only the bytes actually read are inspected; the rest of the buffer is zero-filled padding.
    return sample.subarray(0, sampleLength).includes(0);
  } catch {
    return true;
  } finally {
    if (descriptor !== undefined) {
      fs.closeSync(descriptor);
    }
  }
};
/**
 * Tests a path against a glob-like pattern.
 *
 * Supported wildcards: `*` matches any run of characters (including "/"),
 * `?` matches exactly one character. Bracket expressions such as `[oa]` or
 * `[^a]` are passed through to the regular expression engine unchanged.
 * Every other regex operator (`. + ( ) { } | $ ^`) is matched literally, so
 * names like `a+b.txt` or `file(1).txt` behave as plain text.
 *
 * If the pattern still does not form a valid regular expression (for example
 * an unbalanced "["), it is retried with every character except `*` and `?`
 * treated literally, so this function never throws on a string pattern.
 *
 * @param {string} relativePath - Path relative to base directory
 * @param {string} pattern - The glob pattern to match
 * @returns {boolean} - True if the whole path matches the pattern
 */
const globMatch = (relativePath, pattern) => {
  // Wildcards are expanded last so the escaping step cannot touch the ".*" / "." they produce.
  const expandWildcards = (source) =>
    source.replace(/\*/g, ".*").replace(/\?/g, ".");

  const preferredSource = expandWildcards(
    pattern
      // Operators with no glob meaning are escaped wherever they occur.
      .replace(/[.+(){}|$]/g, "\\$&")
      // "^" is left alone only directly after "[", where it negates a bracket expression.
      .replace(/(?<!\[)\^/g, "\\^")
  );

  let regex;
  try {
    regex = new RegExp(`^${preferredSource}$`);
  } catch {
    // Invalid bracket expression: fall back to a fully literal interpretation.
    const literalSource = expandWildcards(
      pattern.replace(/[.+(){}|$^[\]\\]/g, "\\$&")
    );
    regex = new RegExp(`^${literalSource}$`);
  }

  return regex.test(relativePath);
};
/**
 * Tests a single, non-negated rule against a "/"-separated relative path.
 * A leading "/" on the rule is stripped before matching.
 *
 * @param {string} relativePath - Path relative to the base directory, using "/" separators
 * @param {string} rule - One ignore rule without a leading "!"
 * @returns {boolean} - True if the rule matches the path
 */
const matchesGitignoreRule = (relativePath, rule) => {
  const normalizedRule = rule.startsWith("/") ? rule.substring(1) : rule;
  const ruleWithoutTrailingSlash = normalizedRule.replace(/\/$/, "");
  const hasWildcard =
    normalizedRule.includes("*") || normalizedRule.includes("?");

  // Exact match, with or without the rule's trailing slash.
  if (
    relativePath === normalizedRule ||
    relativePath === ruleWithoutTrailingSlash
  ) {
    return true;
  }

  if (normalizedRule.endsWith("/")) {
    // Directory rule ("dir/"): everything below that directory matches.
    if (relativePath.startsWith(normalizedRule)) {
      return true;
    }
  } else if (!hasWildcard) {
    // Plain name: matches any path segment, or acts as a path prefix.
    if (relativePath.split("/").includes(normalizedRule)) {
      return true;
    }
    if (relativePath.startsWith(`${normalizedRule}/`)) {
      return true;
    }
  }

  if (!hasWildcard) {
    return false;
  }

  // Wildcard rule against the whole relative path.
  if (globMatch(relativePath, normalizedRule)) {
    return true;
  }

  // Extension-style rule like "*.js": also try the file name alone.
  if (normalizedRule.startsWith("*") && normalizedRule.includes(".")) {
    const segments = relativePath.split("/");
    const lastSegment = segments[segments.length - 1];
    if (globMatch(lastSegment, normalizedRule)) {
      return true;
    }
  }

  // Directory wildcard like "dir/*": everything below "dir/".
  if (normalizedRule.endsWith("/*")) {
    const folderPrefix = normalizedRule.slice(0, -2);
    if (relativePath.startsWith(`${folderPrefix}/`)) {
      return true;
    }
  }

  return false;
};

/**
 * Checks if a file is ignored by gitignore patterns.
 *
 * Rules are evaluated from last to first and the first one that matches
 * decides the outcome, which gives "last matching rule wins" semantics:
 * a normal rule ignores the path, while a rule prefixed with "!" re-includes
 * it. When no rule matches, the path is not ignored.
 *
 * This covers only a subset of the gitignore format; wildcard matching is
 * delegated to `globMatch`.
 *
 * @param {string} filePath - Path to the file
 * @param {string} baseDir - Base directory for relative paths
 * @param {string[]} gitignorePatterns - Patterns from .gitignore, in file order
 * @returns {boolean} - True if the file should be ignored
 */
export const isIgnoredByGitignore = (filePath, baseDir, gitignorePatterns) => {
  if (!gitignorePatterns.length) {
    return false;
  }

  // Normalise to "/" separators so rules behave the same on every platform.
  const relativePath = path
    .relative(baseDir, filePath)
    .split(path.sep)
    .join("/");

  for (let index = gitignorePatterns.length - 1; index >= 0; index--) {
    const pattern = gitignorePatterns[index];
    const isNegated = pattern.startsWith("!");
    const rule = isNegated ? pattern.slice(1) : pattern;
    if (matchesGitignoreRule(relativePath, rule)) {
      return !isNegated;
    }
  }

  return false;
};
/**
 * Depth-first, synchronous traversal that invokes `callback` with the path of
 * every non-directory entry found under `dirPath`.
 *
 * Entries whose name starts with "." are skipped unless `keepHidden` is set.
 * Unless `ignoreGitignore` is set, entries matched by `gitignorePatterns`
 * (relative to `baseDir`, defaulting to `dirPath`) are skipped as well; a
 * skipped directory is not descended into.
 *
 * Errors are swallowed per entry and per directory. Because the callback runs
 * inside the per-entry `try`, exceptions thrown by `callback` are swallowed
 * too.
 *
 * @param {string} dirPath - Directory path to walk
 * @param {Function} callback - Called with the full path of each file
 * @param {WalkDirectoryOptions} options - Options for walking
 */
export const walkDirectory = (dirPath, callback, options = {}) => {
  const {
    keepHidden = false,
    ignoreGitignore = false,
    gitignorePatterns = [],
  } = options;
  const baseDir = options.baseDir || dirPath;

  try {
    for (const entryName of fs.readdirSync(dirPath)) {
      const entryPath = path.join(dirPath, entryName);

      try {
        // statSync follows symlinks and throws for entries that cannot be accessed.
        const entryStats = fs.statSync(entryPath);

        if (entryName.startsWith(".") && !keepHidden) {
          continue;
        }

        const honoursGitignore = !ignoreGitignore;
        if (
          honoursGitignore &&
          isIgnoredByGitignore(entryPath, baseDir, gitignorePatterns)
        ) {
          continue;
        }

        if (entryStats.isDirectory()) {
          // Pin baseDir so nested levels resolve ignore rules against the same root.
          walkDirectory(entryPath, callback, { ...options, baseDir });
          continue;
        }

        callback(entryPath);
      } catch {
        // Skip files that can't be accessed
      }
    }
  } catch {
    // Skip directories that can't be accessed
  }
};
/**
 * Appends one file's section to the output file and reports how it was handled.
 *
 * The section consists of the start marker, the file's text (or a "[SKIP]"
 * note when the file is binary or cannot be read) and the end marker. A blank
 * separator line precedes the section unless this is the first file. The
 * whole section is written with a single append.
 *
 * The first "{file}" placeholder in each marker template is replaced by
 * `filePath` verbatim: characters such as "$" in the path are NOT interpreted
 * as replacement patterns.
 *
 * @param {Object} params - Parameters for processing file
 * @param {string} params.filePath - Path to the file
 * @param {string} params.outFile - Output file path
 * @param {string} params.startMarker - Start marker template
 * @param {string} params.endMarker - End marker template
 * @param {boolean} params.isFirstFile - Whether this is the first file
 * @returns {{textProcessed: boolean, binary: boolean, failed: boolean, fileExt: string}} -
 *   Exactly one of the three flags is true; `fileExt` is the lower-cased extension
 * @throws {Error} If writing to `outFile` fails
 */
const processFile = ({
  filePath,
  outFile,
  startMarker,
  endMarker,
  isFirstFile,
}) => {
  const result = {
    textProcessed: false,
    binary: false,
    failed: false,
    fileExt: path.extname(filePath).toLowerCase(),
  };

  // A replacer function makes String.prototype.replace insert the path as-is;
  // a plain string replacement would expand "$$", "$&", "$`" and "$'".
  const fillMarker = (template) => template.replace("{file}", () => filePath);

  let sectionBody;
  if (isBinaryFile(filePath)) {
    sectionBody = `[SKIP] Binary file: ${filePath}`;
    result.binary = true;
  } else {
    try {
      sectionBody = fs.readFileSync(filePath, "utf8");
      result.textProcessed = true;
    } catch {
      sectionBody = `[SKIP] Failed to read file: ${filePath}`;
      result.failed = true;
    }
  }

  const separator = isFirstFile ? "" : "\n";
  fs.appendFileSync(
    outFile,
    `${separator}${fillMarker(startMarker)}\n${sectionBody}\n${fillMarker(endMarker)}\n`
  );

  return result;
};
/**
 * Walks `dir` and concatenates every eligible file into a single output file.
 *
 * Steps, in order: delete any existing output file, create the output
 * directory if needed, load `.gitignore` rules from `dir` (unless
 * `ignoreGitignore` is set), walk the tree and append each file via
 * `processFile`, then build the summary. The output file itself and files
 * whose lower-cased extension is in `blacklist` are not processed. When
 * `attachSummary` is set and the output file exists, the summary plus a blank
 * line is prepended to it.
 *
 * @param {MergeOptions} options - Merge options
 * @returns {MergeResult} - Result of the merge operation
 */
export const mergeFiles = (options = {}) => {
  const {
    dir = DEFAULT_CONFIG.defaultDir,
    out = DEFAULT_CONFIG.defaultOutput,
    blacklist = DEFAULT_CONFIG.defaultIgnore
      .split(",")
      .map((ext) => ext.trim().toLowerCase()),
    start = DEFAULT_CONFIG.defaultStartTemplate,
    end = DEFAULT_CONFIG.defaultEndTemplate,
    keepHidden = false,
    ignoreGitignore = false,
    attachSummary = false,
  } = options;

  // Start from a clean slate: remove a previous output, then ensure its folder exists.
  if (fs.existsSync(out)) {
    fs.unlinkSync(out);
  }
  const outputFolder = path.dirname(out);
  if (!fs.existsSync(outputFolder)) {
    fs.mkdirSync(outputFolder, { recursive: true });
  }

  const gitignorePatterns = ignoreGitignore ? [] : readGitignorePatterns(dir);

  const counters = {
    mergedCount: 0,
    textFileCount: 0,
    binarySkippedCount: 0,
    failedReadCount: 0,
  };
  const fileTypeCounts = {};
  // Stays true until one processFile call has returned normally.
  let awaitingFirstFile = true;

  const handleFile = (filePath) => {
    // Never merge the output file into itself, and honour the extension blacklist.
    if (
      path.resolve(filePath) === path.resolve(out) ||
      blacklist.includes(path.extname(filePath).toLowerCase())
    ) {
      return;
    }

    counters.mergedCount += 1;
    const { textProcessed, binary, failed, fileExt } = processFile({
      filePath,
      outFile: out,
      startMarker: start,
      endMarker: end,
      isFirstFile: awaitingFirstFile,
    });

    if (textProcessed) {
      counters.textFileCount += 1;
      fileTypeCounts[fileExt] = (fileTypeCounts[fileExt] || 0) + 1;
    }
    if (binary) {
      counters.binarySkippedCount += 1;
    }
    if (failed) {
      counters.failedReadCount += 1;
    }
    awaitingFirstFile = false;
  };

  const startTime = Date.now();
  walkDirectory(dir, handleFile, {
    keepHidden,
    ignoreGitignore,
    gitignorePatterns,
    baseDir: dir,
  });
  const durationMs = Date.now() - startTime;

  const summaryText = generateSummaryText({
    ...counters,
    fileTypeCounts,
    durationMs,
    startMarker: start,
    endMarker: end,
  });

  // The summary goes on top, so the merged content is re-read and rewritten after it.
  if (attachSummary && fs.existsSync(out)) {
    const mergedContent = fs.readFileSync(out, "utf8");
    fs.writeFileSync(out, `${summaryText}\n\n${mergedContent}`);
  }

  return {
    ...counters,
    fileTypeCounts,
    summaryText,
    outFile: out,
    durationMs,
  };
};