UNPKG

git-spark

Version:

Git repository analytics and reporting tool for analyzing commit patterns and contributor activity

453 lines 19.3 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.DataCollector = void 0;
const git_1 = require("../utils/git");
const input_validation_1 = require("../utils/input-validation");
const logger_1 = require("../utils/logger");
const logger = (0, logger_1.createLogger)('collector');
/**
 * Git repository data collection engine.
 *
 * Extracts raw commit data from a Git repository by streaming `git log`
 * output and parsing it incrementally, so large histories are never held in
 * memory as a single string.
 *
 * Key features:
 * - Streaming, memory-optimized parsing of `git log --numstat`
 * - Safe record framing via record (0x1E) / unit (0x1F) separators, so commit
 *   bodies containing newlines parse correctly
 * - Input validation of all git parameters to prevent command injection
 * - Progress reporting for long-running operations
 * - Filtering by date, author, branch, and path
 *
 * @example
 * ```typescript
 * const collector = new DataCollector('/path/to/repo', (phase, current, total) => {
 *   console.log(`${phase}: ${current}/${total}`);
 * });
 *
 * const commits = await collector.collectCommits({
 *   since: '2024-01-01',
 *   author: 'john@example.com',
 * });
 * console.log(`Collected ${commits.length} commits`);
 * ```
 */
class DataCollector {
    /**
     * Create a new DataCollector instance.
     *
     * @param repoPath - Absolute path to the Git repository
     * @param progressCallback - Optional callback invoked as
     *   `(phase, current, total)` during collection (positional arguments,
     *   not a progress object)
     */
    constructor(repoPath, progressCallback) {
        this.lastWarnings = [];
        this.repoPath = repoPath;
        this.git = new git_1.GitExecutor(repoPath);
        this.progressCallback = progressCallback;
    }
    /**
     * Get the repository path.
     * @returns The absolute path to the Git repository
     */
    getRepositoryPath() {
        return this.repoPath;
    }
    /**
     * Collect commit data from the repository by streaming `git log`.
     *
     * Non-merge commits are collected (`--no-merges`) together with per-file
     * numstat change counts. The requested date range is capped to the
     * repository lifetime, and `options.days` is translated into a `since`
     * date when no explicit `since` is given. NOTE(review): `options` is
     * mutated in place when `since` is derived/adjusted — callers can observe
     * the adjusted value.
     *
     * @param options - Collection options and filters
     * @param options.since - Start date (ISO string or git date format)
     * @param options.until - End date (ISO string or git date format)
     * @param options.days - Look-back window used to derive `since` when absent
     * @param options.branch - Specific branch to analyze (default: current branch)
     * @param options.author - Filter commits by author
     * @param options.path - Restrict analysis to a path (passed after `--`)
     * @returns Promise resolving to the array of finalized commit objects
     * @throws {Error} When git parameters fail validation or `git log` fails
     */
    async collectCommits(options) {
        this.reportProgress('Collecting commit data', 0, 100);
        const warnings = [];
        // Get the first commit date in the repository to cap the analysis range
        const firstCommitDate = await this.git.getFirstCommitDate(options.branch);
        logger.debug(`Repository first commit date: ${firstCommitDate?.toISOString() || 'null'}`);
        // Derive since date from --days if user supplied days but not an explicit since
        if (!options.since && options.days && options.days > 0) {
            const d = new Date();
            d.setDate(d.getDate() - options.days);
            // Use ISO date (no time) to let git interpret midnight boundary
            options.since = d.toISOString().split('T')[0];
            logger.debug(`Calculated since date from --days=${options.days}: ${options.since}`);
        }
        // Cap the analysis range to repository lifetime
        if (options.since && firstCommitDate) {
            const requestedStartDate = new Date(options.since);
            logger.debug(`Comparing dates: requested=${requestedStartDate.toISOString()}, firstCommit=${firstCommitDate.toISOString()}`);
            if (requestedStartDate < firstCommitDate) {
                const originalSince = options.since;
                // Subtract one day to ensure we capture commits on the first commit date
                const adjustedDate = new Date(firstCommitDate);
                adjustedDate.setDate(adjustedDate.getDate() - 1);
                options.since = adjustedDate.toISOString().split('T')[0];
                logger.info(`Analysis start date (${originalSince}) is before repository first commit. ` +
                    `Adjusting range to start from ${options.since}.`);
            }
        }
        const gitOptions = {};
        if (options.since) gitOptions.since = options.since;
        if (options.until) gitOptions.until = options.until;
        if (options.branch) gitOptions.branch = options.branch;
        if (options.author) gitOptions.author = options.author;
        const totalCommits = await this.git.getCommitCount(gitOptions);
        logger.info(`Collecting up to ${totalCommits} commits (streaming)`, { options });
        // Validate and sanitize all input parameters to prevent command injection
        const gitParams = {};
        if (gitOptions.since) gitParams.since = gitOptions.since;
        if (gitOptions.until) gitParams.until = gitOptions.until;
        if (gitOptions.author) gitParams.author = gitOptions.author;
        if (gitOptions.branch) gitParams.branch = gitOptions.branch;
        if (options.path) gitParams.path = options.path;
        const validation = (0, input_validation_1.validateGitOptions)(gitParams);
        if (!validation.isValid) {
            throw new Error(`Invalid Git parameters: ${validation.errors.join(', ')}`);
        }
        const safeOptions = validation.sanitized;
        // Each commit record starts with a record separator (0x1E); header fields
        // are joined with the unit separator (0x1F). Using control characters lets
        // us split reliably even when commit bodies contain newlines.
        // Format: \x1e<8 header fields> then numstat lines until the next \x1e.
        const args = [
            'log',
            '--no-merges',
            '--numstat',
            `--pretty=format:%x1e%H%x1f%h%x1f%an%x1f%ae%x1f%ai%x1f%s%x1f%b%x1f%P`,
        ];
        if (safeOptions.since) args.push('--since', safeOptions.since);
        if (safeOptions.until) args.push('--until', safeOptions.until);
        if (safeOptions.author) args.push('--author', safeOptions.author);
        // BUGFIX: the revision (branch) must precede the `--` pathspec separator;
        // previously it was appended after `-- <path>`, so git treated the branch
        // name as a path filter and the branch option was silently ignored.
        if (safeOptions.branch) args.push(safeOptions.branch);
        if (safeOptions.path) args.push('--', safeOptions.path);
        const spawn = require('child_process').spawn;
        const child = spawn('git', args, { cwd: this.git.repoPath });
        const commits = [];
        let buffer = '';
        let processed = 0;
        let stderrOutput = '';
        child.stderr.on('data', (chunk) => {
            stderrOutput += chunk.toString();
        });
        child.stdout.on('data', (chunk) => {
            buffer += chunk.toString();
            // Split on record separator (each commit starts with \x1e); the last
            // element may be a partial record, so it goes back into the buffer.
            const records = buffer.split('\x1e');
            buffer = records.pop() || '';
            for (const record of records) {
                if (!record.trim()) continue;
                if (!this.isRecordHeaderComplete(record)) {
                    // Header truncated across a chunk boundary (the body %b may
                    // contain newlines) - re-buffer and wait for more data.
                    buffer = '\x1e' + record + buffer;
                    continue;
                }
                const raw = this.parseRecord(record, warnings, false);
                if (!raw) continue;
                commits.push(this.finalizeCommit(raw));
                processed++;
                if (processed % 200 === 0 || processed === totalCommits) {
                    const pct = totalCommits ? Math.min(100, Math.round((processed / totalCommits) * 70)) : 0;
                    this.reportProgress('Streaming commit collection', pct, 100);
                }
            }
        });
        await new Promise((resolve, reject) => {
            child.on('error', (e) => reject(e));
            child.on('close', (code) => {
                // BUGFIX: a non-zero exit previously resolved silently, returning
                // partial/empty results with no indication of failure.
                if (code !== 0) {
                    reject(new Error(`git log failed with exit code ${code}${stderrOutput ? `: ${stderrOutput.trim()}` : ''}`));
                    return;
                }
                // Process the remaining buffered record (the final commit). The
                // shared parser handles multi-line bodies, which the previous
                // first-newline-only logic silently dropped.
                let tail = buffer;
                if (tail.startsWith('\x1e')) tail = tail.slice(1);
                if (tail.trim()) {
                    const raw = this.parseRecord(tail, warnings, true);
                    if (raw) commits.push(this.finalizeCommit(raw));
                }
                resolve();
            });
        });
        this.reportProgress('Enhancing commits', 80, 100);
        for (let i = 0; i < commits.length; i++) {
            try {
                commits[i] = await this.enhanceCommit(commits[i]);
            } catch (e) {
                warnings.push(`Enhancement failed for ${commits[i].hash}: ${e.message || 'unknown'}`);
            }
            if (i % 250 === 0) {
                const pct = 80 + Math.min(15, Math.round((i / commits.length) * 15));
                this.reportProgress('Enhancing commits', pct, 100);
            }
        }
        this.reportProgress('Commit collection complete', 100, 100);
        if (warnings.length) {
            logger.warn(`Completed with ${warnings.length} warnings`);
        }
        this.lastWarnings = warnings.slice();
        return commits;
    }
    /**
     * True when a record contains a complete header: 7 unit separators (8
     * fields, hash..parents) AND the newline terminating the parents field.
     * Counts 0x1F by char code rather than regex (lint: no-control-regex).
     */
    isRecordHeaderComplete(record) {
        let sepCount = 0;
        for (let i = 0; i < record.length; i++) {
            if (record.charCodeAt(i) === 0x1f) {
                sepCount++;
                if (sepCount === 7) {
                    return record.indexOf('\n', i + 1) !== -1;
                }
            }
        }
        return false;
    }
    /**
     * Parse one \x1e-delimited record (header + numstat lines) into a raw
     * commit object. Shared by the streaming handler and the end-of-stream
     * flush so both handle multi-line bodies identically.
     *
     * @param record - Record text without the leading \x1e
     * @param warnings - Sink for malformed-record warnings
     * @param allowUnterminated - When true (final record at EOF), a header
     *   whose parents field is not newline-terminated is still accepted
     * @returns Raw commit object, or null when the record is malformed
     */
    parseRecord(record, warnings, allowUnterminated) {
        // Locate the end of the header: the newline after the 7th separator
        // (the parents field). Numstat lines follow on subsequent lines.
        let sepCount = 0;
        let headerEnd = -1;
        for (let i = 0; i < record.length; i++) {
            if (record.charCodeAt(i) === 0x1f) {
                sepCount++;
                if (sepCount === 7) {
                    headerEnd = record.indexOf('\n', i + 1);
                    if (headerEnd === -1 && allowUnterminated) {
                        headerEnd = record.length; // last commit may have no numstat lines
                    }
                    break;
                }
            }
        }
        if (sepCount < 7 || headerEnd === -1) {
            warnings.push('Malformed commit header encountered (insufficient fields)');
            return null;
        }
        const parts = record.slice(0, headerEnd).split('\x1f');
        if (parts.length < 8) {
            warnings.push('Malformed commit header encountered (insufficient fields)');
            return null;
        }
        const commit = this.parseCommitHeader(parts);
        const remainder = record.slice(headerEnd + 1);
        if (remainder) {
            for (const line of remainder.split('\n')) {
                if (!line.trim()) continue;
                const fileChange = this.parseFileStats(line.replace(/\r$/, ''));
                if (fileChange) {
                    commit.files = commit.files || [];
                    commit.files.push(fileChange);
                }
            }
        }
        return commit;
    }
    /**
     * Convert the 8 header fields into a raw commit object.
     * Field order matches the --pretty format: %H %h %an %ae %ai %s %b %P.
     */
    parseCommitHeader(parts) {
        const [hash, shortHash, author, authorEmail, date, subject, body, parents] = parts;
        return {
            hash: hash.trim(),
            shortHash: shortHash.trim(),
            author: author.trim(),
            authorEmail: authorEmail.trim(),
            date: new Date(date.trim()),
            subject: subject.trim(),
            body: body.trim(),
            message: `${subject.trim()}\n${body.trim()}`.trim(),
            // More than one parent hash means a merge commit (excluded by
            // --no-merges in the streaming query, but kept for completeness).
            isMerge: parents ? parents.trim().split(' ').length > 1 : false,
            files: [],
            insertions: 0,
            deletions: 0,
            filesChanged: 0,
            isCoAuthored: false,
            coAuthors: [],
        };
    }
    /**
     * Parse a single `git log --numstat` line into a file-change record.
     *
     * @param line - A line of the form "<ins>\t<dels>\t<path>"; `-` counts
     *   mark binary files
     * @returns File-change object, or null when the line is not a numstat line
     */
    parseFileStats(line) {
        const match = line.match(/^(\d+|-)\s+(\d+|-)\s+(.+)$/);
        if (!match) return null;
        const [, insertions, deletions, rawPath] = match;
        // Binary files report '-' for both counts
        const ins = insertions === '-' ? 0 : parseInt(insertions, 10);
        const dels = deletions === '-' ? 0 : parseInt(deletions, 10);
        let status = 'modified';
        let filePath = rawPath;
        let oldPath;
        // BUGFIX: numstat renders renames either as "old => new" or in brace
        // form "dir/{old => new}/file"; the brace form previously produced
        // garbage paths like "dir/{old". Expand it into full old/new paths.
        const braceRename = rawPath.match(/^(.*)\{(.*) => (.*)\}(.*)$/);
        if (braceRename) {
            status = 'renamed';
            const [, prefix, oldPart, newPart, suffix] = braceRename;
            // An empty side (e.g. "src/{ => lib}/a.js") leaves a doubled slash
            oldPath = (prefix + oldPart + suffix).replace(/\/\//g, '/');
            filePath = (prefix + newPart + suffix).replace(/\/\//g, '/');
        } else if (rawPath.includes(' => ')) {
            status = 'renamed';
            const parts = rawPath.split(' => ');
            oldPath = parts[0].trim();
            filePath = parts[1].trim();
        } else if (ins > 0 && dels === 0) {
            // Heuristic: pure insertions look like an added file
            status = 'added';
        } else if (ins === 0 && dels > 0) {
            status = 'deleted';
        }
        const result = {
            path: filePath,
            insertions: ins,
            deletions: dels,
            status,
        };
        if (oldPath) {
            result.oldPath = oldPath;
        }
        return result;
    }
    /**
     * Compute derived totals (insertions/deletions/filesChanged) and detect
     * Co-authored-by trailers, producing the final commit shape.
     */
    finalizeCommit(commit) {
        const insertions = commit.files?.reduce((sum, f) => sum + f.insertions, 0) || 0;
        const deletions = commit.files?.reduce((sum, f) => sum + f.deletions, 0) || 0;
        const filesChanged = commit.files?.length || 0;
        // Detect co-authored commits from "Co-authored-by: Name <email>" trailers
        const coAuthorMatches = commit.body?.match(/Co-authored-by: (.+) <(.+)>/g) || [];
        const coAuthors = coAuthorMatches
            .map(match => {
                const authorMatch = match.match(/Co-authored-by: (.+) <(.+)>/);
                return authorMatch ? authorMatch[1] : '';
            })
            .filter(Boolean);
        return {
            hash: commit.hash,
            shortHash: commit.shortHash,
            author: commit.author,
            authorEmail: commit.authorEmail,
            date: commit.date,
            message: commit.message,
            subject: commit.subject,
            body: commit.body || '',
            insertions,
            deletions,
            filesChanged,
            isMerge: commit.isMerge,
            isCoAuthored: coAuthors.length > 0,
            coAuthors,
            files: commit.files || [],
        };
    }
    /**
     * Hook for per-commit enrichment (message analysis, language detection,
     * complexity metrics). Currently a pass-through.
     */
    async enhanceCommit(commit) {
        return commit;
    }
    /** List repository branches (delegates to GitExecutor). */
    async getBranches() {
        return this.git.getBranches();
    }
    /** Get the currently checked-out branch (delegates to GitExecutor). */
    async getCurrentBranch() {
        return this.git.getCurrentBranch();
    }
    /** Get the origin remote URL (delegates to GitExecutor). */
    async getRemoteUrl() {
        return this.git.getRemoteUrl();
    }
    /** Get per-language file statistics (delegates to GitExecutor). */
    async getLanguageStats() {
        return this.git.getLanguageStats();
    }
    /** Validate that repoPath is a usable git repository (delegates to GitExecutor). */
    async validateRepository() {
        return this.git.validateRepository();
    }
    /** Invoke the progress callback, if one was supplied, as (phase, current, total). */
    reportProgress(phase, current, total) {
        if (this.progressCallback) {
            this.progressCallback(phase, current, total);
        }
    }
    /**
     * Get warnings from the most recent collection run.
     * @returns A defensive copy of the warning list
     */
    getWarnings() {
        return this.lastWarnings.slice();
    }
}
exports.DataCollector = DataCollector;
//# sourceMappingURL=collector.js.map