UNPKG

datapilot-cli

Version:

Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform

502 lines 17.8 kB
"use strict";
/**
 * Structural Analyzer - Dataset dimensions and memory analysis
 * Handles memory estimation, sparsity analysis, and column profiling
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.StructuralAnalyzer = void 0;

/** Lower-cased, trimmed cell contents that are treated as "missing". */
const EMPTY_CELL_TOKENS = new Set(['', 'null', 'undefined', 'na', 'n/a', '-', '#n/a']);

/** Lower-cased, trimmed cell contents that are treated as boolean literals. */
const BOOLEAN_TOKENS = new Set(['true', 'false', 'yes', 'no', 'y', 'n', '1', '0']);

/** Strict shape checks for the most common date layouts. */
const DATE_PATTERNS = [
    /^\d{4}-\d{2}-\d{2}$/, // YYYY-MM-DD
    /^\d{2}\/\d{2}\/\d{4}$/, // MM/DD/YYYY
    /^\d{2}-\d{2}-\d{4}$/, // MM-DD-YYYY
    /^\d{4}\/\d{2}\/\d{2}$/, // YYYY/MM/DD
];

/** Column-name shapes that suggest an identifier column. */
const ID_NAME_PATTERNS = [/id$/i, /^id/i, /key$/i, /^key/i, /index$/i, /^index/i];

class StructuralAnalyzer {
    config;
    warnings = [];

    /**
     * @param {object} config - Analyzer options. This class reads
     *   `enableQuickStatistics` and `maxSampleSizeForSparsity` from it.
     */
    constructor(config) {
        this.config = config;
    }

    /**
     * Analyze dataset structural dimensions and memory characteristics.
     *
     * Side effect: appends to the analyzer's warning list (see getWarnings);
     * warnings accumulate across calls until clearWarnings is invoked.
     *
     * @param {Array<{data: string[]}>} rows - Parsed rows; each row exposes its fields as `data`.
     * @param {boolean} hasHeader - Whether the first row holds column names.
     * @returns {object} Dimensions, column inventory, memory estimate, sparsity
     *   analysis and (when enabled in config) quick statistics.
     */
    analyzeStructure(rows, hasHeader) {
        if (rows.length === 0) {
            return this.createEmptyStructure();
        }
        // Basic dimensions
        const totalRowsRead = rows.length;
        const totalDataRows = hasHeader ? totalRowsRead - 1 : totalRowsRead;
        // Column count is taken from the first row only.
        const totalColumns = rows[0]?.data.length || 0;
        const totalDataCells = totalDataRows * totalColumns;
        // Column inventory
        const columnInventory = this.createColumnInventory(rows, hasHeader);
        // Memory estimation
        const estimatedInMemorySizeMB = this.estimateMemoryUsage(rows, totalDataRows);
        // Row length analysis
        const averageRowLengthBytes = this.calculateAverageRowLength(rows);
        // Sparsity analysis
        const sparsityAnalysis = this.analyzeSparsity(rows, hasHeader);
        // Quick statistics if enabled
        const quickStatistics = this.config.enableQuickStatistics
            ? this.generateQuickStatistics(rows, hasHeader, columnInventory)
            : undefined;
        // Add warnings for structural issues
        this.addStructuralWarnings(totalDataRows, totalColumns, estimatedInMemorySizeMB);
        return {
            totalRowsRead,
            totalDataRows,
            totalColumns,
            totalDataCells,
            columnInventory,
            estimatedInMemorySizeMB,
            averageRowLengthBytes,
            sparsityAnalysis,
            quickStatistics,
        };
    }

    /**
     * Create column inventory with names and indices.
     *
     * @param {Array<{data: string[]}>} rows - Parsed rows.
     * @param {boolean} hasHeader - When true, names come from the first row
     *   (blank names fall back to `Column_<i>`); otherwise `Col_<i>` is generated.
     * @returns {Array<{index: number, name: string, originalIndex: number}>}
     */
    createColumnInventory(rows, hasHeader) {
        if (rows.length === 0) {
            return [];
        }
        const firstRow = rows[0];
        const columnCount = firstRow.data.length;
        const inventory = [];
        for (let i = 0; i < columnCount; i++) {
            const columnName = hasHeader
                ? firstRow.data[i] || `Column_${i}` // header cell, or placeholder when blank
                : `Col_${i}`; // generic generated name
            inventory.push({
                index: i + 1, // 1-based indexing for display
                name: columnName,
                originalIndex: i, // 0-based original index
            });
        }
        return inventory;
    }

    /**
     * Estimate memory usage of the dataset.
     *
     * Measures up to the first 1000 rows and extrapolates the per-row average
     * to `dataRows`, then adds 30% for container overhead.
     *
     * @param {Array<{data: string[]}>} rows - Parsed rows.
     * @param {number} dataRows - Number of rows to extrapolate to.
     * @returns {number} Estimated size in MB, rounded to two decimals.
     */
    estimateMemoryUsage(rows, dataRows) {
        if (rows.length === 0) {
            return 0;
        }
        // Sample-based estimation for large datasets
        const sampleSize = Math.min(rows.length, 1000);
        const sampleRows = rows.slice(0, sampleSize);
        let totalSampleBytes = 0;
        for (const row of sampleRows) {
            for (const field of row.data) {
                // Estimate memory per field:
                // - String storage overhead (~24 bytes for V8 string object)
                // - Character storage (2 bytes per character for UTF-16 in V8)
                totalSampleBytes += 24 + field.length * 2;
            }
            // Row object overhead (~16 bytes)
            totalSampleBytes += 16;
        }
        // Average bytes per row
        const avgBytesPerRow = totalSampleBytes / sampleSize;
        // Extrapolate to full dataset
        const totalEstimatedBytes = avgBytesPerRow * dataRows;
        // Add overhead for data structures (arrays, indices, etc.) - roughly 30%
        const totalWithOverhead = totalEstimatedBytes * 1.3;
        // Convert to MB
        return Number((totalWithOverhead / (1024 * 1024)).toFixed(2));
    }

    /**
     * Calculate average row length in bytes.
     *
     * Uses the first 100 rows; each row is the UTF-8 size of its fields plus
     * one byte per delimiter between fields and one byte for the line ending.
     *
     * @param {Array<{data: string[]}>} rows - Parsed rows.
     * @returns {number} Rounded average byte length per sampled row.
     */
    calculateAverageRowLength(rows) {
        if (rows.length === 0) {
            return 0;
        }
        // Sample first 100 rows for performance
        const sampleRows = rows.slice(0, Math.min(100, rows.length));
        let totalBytes = 0;
        for (const row of sampleRows) {
            let rowBytes = 0;
            for (const field of row.data) {
                rowBytes += this.estimateUtf8Bytes(field);
            }
            // Add delimiter and line ending bytes
            rowBytes += row.data.length - 1; // delimiters between fields
            rowBytes += 1; // line ending
            totalBytes += rowBytes;
        }
        return Math.round(totalBytes / sampleRows.length);
    }

    /**
     * Compute the UTF-8 byte count for a string.
     *
     * Iterates by Unicode code point rather than UTF-16 code unit, so a
     * character outside the Basic Multilingual Plane (a surrogate pair in
     * JavaScript strings) is counted once as 4 bytes instead of twice as
     * 3 bytes. An unpaired surrogate counts as 3 bytes.
     *
     * @param {string} str - Text to measure.
     * @returns {number} Number of bytes the text occupies in UTF-8.
     */
    estimateUtf8Bytes(str) {
        let bytes = 0;
        // String iteration yields whole code points, pairing surrogates.
        for (const char of str) {
            const codePoint = char.codePointAt(0);
            if (codePoint < 0x80) {
                bytes += 1;
            }
            else if (codePoint < 0x800) {
                bytes += 2;
            }
            else if (codePoint < 0x10000) {
                bytes += 3;
            }
            else {
                bytes += 4;
            }
        }
        return bytes;
    }

    /**
     * Analyze dataset sparsity (empty/null values).
     *
     * Examines at most `config.maxSampleSizeForSparsity` data rows
     * (10000 when the option is unset or falsy), skipping the header row.
     *
     * @param {Array<{data: string[]}>} rows - Parsed rows.
     * @param {boolean} hasHeader - Whether the first row is a header.
     * @returns {{sparsityPercentage: number, method: string, sampleSize: number, description: string}}
     */
    analyzeSparsity(rows, hasHeader) {
        if (rows.length === 0) {
            return {
                sparsityPercentage: 0,
                method: 'No data available',
                sampleSize: 0,
                description: 'Empty dataset',
            };
        }
        // Determine sample size
        const dataRows = hasHeader ? rows.slice(1) : rows;
        const maxSampleSize = this.config.maxSampleSizeForSparsity || 10000;
        const sampleSize = Math.min(dataRows.length, maxSampleSize);
        const sampleRows = dataRows.slice(0, sampleSize);
        let emptyCells = 0;
        let totalCells = 0;
        for (const row of sampleRows) {
            for (const field of row.data) {
                totalCells++;
                if (this.isEmptyCell(field)) {
                    emptyCells++;
                }
            }
        }
        const sparsityPercentage = totalCells > 0
            ? Number(((emptyCells / totalCells) * 100).toFixed(2))
            : 0;
        let description;
        if (sparsityPercentage < 5) {
            description = 'Dense dataset with minimal missing values';
        }
        else if (sparsityPercentage < 20) {
            description = 'Moderately dense with some missing values';
        }
        else if (sparsityPercentage < 50) {
            description = 'Moderately sparse with significant missing values';
        }
        else {
            description = 'Highly sparse dataset with extensive missing values';
        }
        const method = sampleSize === dataRows.length
            ? 'Full dataset analysis'
            : `Statistical sampling of ${sampleSize} rows`;
        return {
            sparsityPercentage,
            method,
            sampleSize,
            description,
        };
    }

    /**
     * Check if a cell is considered empty.
     *
     * Matching is case-insensitive and ignores surrounding whitespace; the
     * recognised markers are '', 'null', 'undefined', 'na', 'n/a', '-' and '#n/a'.
     *
     * @param {string} value - Raw cell text.
     * @returns {boolean} True when the cell represents a missing value.
     */
    isEmptyCell(value) {
        return EMPTY_CELL_TOKENS.has(value.trim().toLowerCase());
    }

    /**
     * Add warnings for structural characteristics.
     *
     * Appends a warning for each threshold crossed: more than 1,000,000 rows,
     * more than 100 columns, more than 1000 MB estimated, fewer than 10 rows.
     *
     * @param {number} dataRows - Number of data rows.
     * @param {number} columns - Number of columns.
     * @param {number} memoryMB - Estimated in-memory size in MB.
     */
    addStructuralWarnings(dataRows, columns, memoryMB) {
        // Large dataset warnings
        if (dataRows > 1000000) {
            this.warnings.push({
                category: 'structural',
                severity: 'medium',
                message: `Large dataset detected (${dataRows.toLocaleString()} rows)`,
                impact: 'Higher memory usage and longer processing times',
                suggestion: 'Consider using sampling for exploratory analysis',
            });
        }
        // Wide dataset warnings
        if (columns > 100) {
            this.warnings.push({
                category: 'structural',
                severity: 'medium',
                message: `Wide dataset detected (${columns} columns)`,
                impact: 'Complex correlation analysis and visualization challenges',
                suggestion: 'Consider feature selection or dimensionality reduction',
            });
        }
        // Memory warnings
        if (memoryMB > 1000) {
            this.warnings.push({
                category: 'structural',
                severity: 'high',
                message: `High memory usage estimated (${memoryMB}MB)`,
                impact: 'May exceed available system memory',
                suggestion: 'Consider processing in chunks or using sampling',
            });
        }
        // Small dataset warnings
        if (dataRows < 10) {
            this.warnings.push({
                category: 'structural',
                severity: 'low',
                message: `Very small dataset (${dataRows} rows)`,
                impact: 'Limited statistical analysis capability',
                suggestion: 'Statistical tests may have low power',
            });
        }
    }

    /**
     * Create empty structure for edge cases.
     *
     * @returns {object} A zeroed result with the same fields analyzeStructure
     *   returns, except that no `quickStatistics` key is present.
     */
    createEmptyStructure() {
        return {
            totalRowsRead: 0,
            totalDataRows: 0,
            totalColumns: 0,
            totalDataCells: 0,
            columnInventory: [],
            estimatedInMemorySizeMB: 0,
            averageRowLengthBytes: 0,
            sparsityAnalysis: {
                sparsityPercentage: 0,
                method: 'No data available',
                sampleSize: 0,
                description: 'Empty dataset',
            },
        };
    }

    /**
     * Generate quick column statistics for overview.
     *
     * Works on the first 1000 data rows. Every column lands in exactly one
     * type bucket: numeric, date, boolean, empty, or text (which also absorbs
     * the 'mixed' type). A column whose sampled values are all missing
     * markers is counted as empty only, not additionally as text.
     *
     * @param {Array<{data: string[]}>} rows - Parsed rows.
     * @param {boolean} hasHeader - Whether the first row is a header.
     * @param {Array<{name: string}>} columnInventory - Columns to profile, in positional order.
     * @returns {object} Per-type counts, cardinality counts, potential ID
     *   column names and per-column type details.
     */
    generateQuickStatistics(rows, hasHeader, columnInventory) {
        if (rows.length === 0) {
            return this.createEmptyStatistics();
        }
        const dataStartIndex = hasHeader ? 1 : 0;
        const sampleSize = Math.min(rows.length - dataStartIndex, 1000); // Sample first 1000 data rows
        const sampleRows = rows.slice(dataStartIndex, dataStartIndex + sampleSize);
        let numericColumns = 0;
        let textColumns = 0;
        let dateColumns = 0;
        let booleanColumns = 0;
        let emptyColumns = 0;
        let highCardinalityColumns = 0;
        let lowCardinalityColumns = 0;
        const potentialIdColumns = [];
        const columnTypes = [];
        for (let colIndex = 0; colIndex < columnInventory.length; colIndex++) {
            const column = columnInventory[colIndex];
            // Collect values from sample; short rows simply contribute nothing.
            const values = [];
            for (const row of sampleRows) {
                if (row.data[colIndex] !== undefined) {
                    values.push(row.data[colIndex]);
                }
            }
            if (values.length === 0) {
                emptyColumns++;
                columnTypes.push({
                    columnName: column.name,
                    detectedType: 'empty',
                    uniqueValueCount: 0,
                    cardinality: 'low',
                });
                continue;
            }
            // Analyze column type and characteristics
            const typeAnalysis = this.analyzeColumnType(values);
            const uniqueValueCount = new Set(values).size;
            const cardinalityRatio = uniqueValueCount / values.length;
            let cardinality = 'medium';
            if (cardinalityRatio > 0.5) {
                cardinality = 'high';
                highCardinalityColumns++;
            }
            else if (cardinalityRatio < 0.1) {
                cardinality = 'low';
                lowCardinalityColumns++;
            }
            // Count by type
            switch (typeAnalysis.primaryType) {
                case 'numeric':
                    numericColumns++;
                    break;
                case 'date':
                    dateColumns++;
                    break;
                case 'boolean':
                    booleanColumns++;
                    break;
                case 'empty':
                    // Handled explicitly so an empty column is not also
                    // swept into the text bucket by the default branch.
                    emptyColumns++;
                    break;
                case 'text':
                default:
                    textColumns++;
                    break;
            }
            // Check if potentially an ID column
            if (this.isPotentialIdColumn(column.name, values, uniqueValueCount)) {
                potentialIdColumns.push(column.name);
            }
            columnTypes.push({
                columnName: column.name,
                detectedType: typeAnalysis.primaryType,
                uniqueValueCount,
                cardinality,
            });
        }
        return {
            numericColumns,
            textColumns,
            dateColumns,
            booleanColumns,
            emptyColumns,
            highCardinalityColumns,
            lowCardinalityColumns,
            potentialIdColumns,
            columnTypes,
            analysisMethod: `Sample-based analysis (${sampleSize} rows)`,
            sampleSize,
        };
    }

    /**
     * Analyze the type of a column based on its values.
     *
     * Classifies the first 50 values; a type wins when it covers at least 60%
     * of that sample, checked in the order numeric, date, boolean, empty.
     * If no single type wins but typed values together reach the threshold
     * the column is 'mixed'; otherwise it is 'text'.
     *
     * @param {string[]} values - Column values.
     * @returns {{primaryType: string}} One of 'numeric', 'date', 'boolean',
     *   'empty', 'mixed' or 'text'.
     */
    analyzeColumnType(values) {
        if (values.length === 0) {
            return { primaryType: 'empty' };
        }
        let numericCount = 0;
        let dateCount = 0;
        let booleanCount = 0;
        let emptyCount = 0;
        // Sample first 50 values for type detection
        for (const value of values.slice(0, 50)) {
            const trimmed = value.trim();
            // Order matters: e.g. '1' is counted as numeric, not boolean.
            if (this.isEmptyCell(trimmed)) {
                emptyCount++;
            }
            else if (this.isNumeric(trimmed)) {
                numericCount++;
            }
            else if (this.isDate(trimmed)) {
                dateCount++;
            }
            else if (this.isBoolean(trimmed)) {
                booleanCount++;
            }
        }
        const sampleSize = Math.min(values.length, 50);
        const threshold = sampleSize * 0.6; // 60% threshold for type determination
        if (numericCount >= threshold) {
            return { primaryType: 'numeric' };
        }
        if (dateCount >= threshold) {
            return { primaryType: 'date' };
        }
        if (booleanCount >= threshold) {
            return { primaryType: 'boolean' };
        }
        if (emptyCount >= threshold) {
            return { primaryType: 'empty' };
        }
        // Check for mixed types
        const totalTypedValues = numericCount + dateCount + booleanCount;
        if (totalTypedValues >= threshold) {
            return { primaryType: 'mixed' };
        }
        return { primaryType: 'text' };
    }

    /**
     * Check if a value is numeric.
     *
     * Commas are stripped before parsing so '1,234.5' is accepted. A value
     * that is blank once commas and whitespace are removed (e.g. ',' or '  ')
     * is rejected; Number('') would otherwise evaluate to 0 and pass.
     *
     * @param {string} value - Cell text.
     * @returns {boolean} True when the text parses to a finite number.
     */
    isNumeric(value) {
        const normalized = value.replace(/,/g, '').trim(); // Remove commas
        if (normalized === '') {
            return false;
        }
        return Number.isFinite(Number(normalized));
    }

    /**
     * Check if a value looks like a date.
     *
     * Accepts the fixed layouts in DATE_PATTERNS, or anything Date.parse
     * understands.
     * NOTE(review): Date.parse handling of non-ISO text is implementation
     * defined and can be lenient, so free-form text may be accepted here.
     *
     * @param {string} value - Cell text.
     * @returns {boolean} True when the text is date-like.
     */
    isDate(value) {
        if (value === '') {
            return false;
        }
        return DATE_PATTERNS.some((pattern) => pattern.test(value))
            || !Number.isNaN(Date.parse(value));
    }

    /**
     * Check if a value is boolean.
     *
     * @param {string} value - Cell text.
     * @returns {boolean} True for true/false, yes/no, y/n, 1/0 (case-insensitive).
     */
    isBoolean(value) {
        return BOOLEAN_TOKENS.has(value.toLowerCase().trim());
    }

    /**
     * Check if a column might be an ID column.
     *
     * A column qualifies when its name starts or ends with id/key/index, or
     * when more than 90% of its values are unique and the first 10 values are
     * at least 3 characters of letters, digits, '-' or '_'.
     *
     * @param {string} columnName - Column name.
     * @param {string[]} values - Sampled column values (non-empty).
     * @param {number} uniqueValueCount - Number of distinct values in `values`.
     * @returns {boolean}
     */
    isPotentialIdColumn(columnName, values, uniqueValueCount) {
        // Check name patterns
        const hasIdName = ID_NAME_PATTERNS.some((pattern) => pattern.test(columnName));
        // Check uniqueness (high cardinality suggests ID)
        const uniquenessRatio = uniqueValueCount / values.length;
        const isHighlyUnique = uniquenessRatio > 0.9;
        // Check if values look like IDs (sequential numbers, UUIDs, etc.)
        const sampleValues = values.slice(0, 10);
        const looksLikeIds = sampleValues.every((value) => {
            const trimmed = value.trim();
            return /^[a-zA-Z0-9\-_]+$/.test(trimmed) && trimmed.length >= 3;
        });
        return hasIdName || (isHighlyUnique && looksLikeIds);
    }

    /**
     * Create empty statistics for edge cases.
     *
     * @returns {object} Quick-statistics result with all counts at zero.
     */
    createEmptyStatistics() {
        return {
            numericColumns: 0,
            textColumns: 0,
            dateColumns: 0,
            booleanColumns: 0,
            emptyColumns: 0,
            highCardinalityColumns: 0,
            lowCardinalityColumns: 0,
            potentialIdColumns: [],
            columnTypes: [],
            analysisMethod: 'No data available',
            sampleSize: 0,
        };
    }

    /**
     * Get collected warnings.
     *
     * @returns {object[]} A shallow copy of the warning list.
     */
    getWarnings() {
        return [...this.warnings];
    }

    /**
     * Clear warnings.
     */
    clearWarnings() {
        this.warnings = [];
    }
}
exports.StructuralAnalyzer = StructuralAnalyzer;
//# sourceMappingURL=structural-analyzer.js.map