UNPKG

datapilot-cli

Version:

Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform

github.com/Mrassimo/datapilot

Mrassimo/datapilot

998 lines • 46.1 kB

JavaScript

"use strict"; /** * Streaming Data Analysis Engine * Memory-efficient analysis using online algorithms and chunk processing */ Object.defineProperty(exports, "__esModule", { value: true }); exports.StreamingAnalyzer = void 0; exports.analyzeFileStreaming = analyzeFileStreaming; exports.analyzeFileStreamingWithSampling = analyzeFileStreamingWithSampling; const fs_1 = require("fs"); const fs_2 = require("fs"); const stream_1 = require("stream"); const promises_1 = require("stream/promises"); const logger_1 = require("../../utils/logger"); const csv_parser_1 = require("../../parsers/csv-parser"); const config_1 = require("../../core/config"); const types_1 = require("../../core/types"); const memory_optimizer_1 = require("../../performance/memory-optimizer"); const adaptive_streamer_1 = require("../../performance/adaptive-streamer"); const streaming_univariate_analyzer_1 = require("./streaming-univariate-analyzer"); const streaming_bivariate_analyzer_1 = require("./streaming-bivariate-analyzer"); const multivariate_orchestrator_1 = require("../multivariate/multivariate-orchestrator"); const enhanced_type_detector_1 = require("./enhanced-type-detector"); const smart_sampler_1 = require("./smart-sampler"); const types_2 = require("../eda/types"); /** * Main Streaming Analysis Engine * Processes any size dataset with constant memory usage */ class StreamingAnalyzer { config; state; progressCallback; // Analyzers columnAnalyzers = new Map(); bivariateAnalyzer; // Smart sampling smartSampler; // Metadata headers = []; detectedTypes = []; semanticTypes = []; warnings = []; typeDetectionResults = []; // Store enhanced detection results (limited) hasHeaders = false; // Track if CSV has headers // Data collection for multivariate analysis (when enabled) collectedData = []; maxCollectedRows; /** * Create a StreamingAnalyzer with smart sampling options from CLI */ static withSamplingOptions(config = {}, samplingOptions) { return new StreamingAnalyzer({ ...config, samplingOptions, }); } constructor(config = {}) { const configManager = (0, config_1.getConfig)(); const streamingConfig = configManager.getStreamingConfig(); const analysisConfig = configManager.getAnalysisConfig(); const statisticalConfig = configManager.getStatisticalConfig(); this.config = { // Default streaming config from configuration manager chunkSize: streamingConfig.adaptiveChunkSizing.minChunkSize, memoryThresholdMB: streamingConfig.memoryThresholdMB, maxRowsAnalyzed: streamingConfig.maxRowsAnalyzed, adaptiveChunkSizing: streamingConfig.adaptiveChunkSizing.enabled, enableMemoryOptimization: true, enableAdaptiveStreaming: true, enableParallelProcessing: false, // Keep sequential for base analyzer // Default Section3Config from configuration manager enabledAnalyses: analysisConfig.enabledAnalyses, significanceLevel: statisticalConfig.significanceLevel, maxCorrelationPairs: analysisConfig.maxCorrelationPairs, outlierMethods: analysisConfig.outlierMethods, normalityTests: analysisConfig.normalityTests, maxCategoricalLevels: analysisConfig.maxCategoricalLevels, enableMultivariate: analysisConfig.enableMultivariate, samplingThreshold: analysisConfig.samplingThreshold, ...config, }; // Set maxCollectedRows from configuration const perfConfig = configManager.getPerformanceConfig(); this.maxCollectedRows = perfConfig.maxCollectedRowsMultivariate; this.state = { rowsProcessed: 0, chunksProcessed: 0, currentMemoryMB: 0, peakMemoryMB: 0, startTime: 0, currentChunkSize: this.config.chunkSize, hasSkippedHeader: false, samplingEnabled: false, }; this.bivariateAnalyzer = new streaming_bivariate_analyzer_1.StreamingBivariateAnalyzer(this.config.maxCorrelationPairs); // Initialize memory optimization if enabled if (this.config.enableMemoryOptimization) { this.initializeMemoryOptimization(); } } setProgressCallback(callback) { this.progressCallback = callback; } /** * Initialize smart sampling if conditions are met */ initializeSmartSampling(filePath, fileSize) { const samplingOptions = this.config.samplingOptions; if (!samplingOptions) return; const shouldSample = smart_sampler_1.SmartSampler.shouldEnableSampling(fileSize, samplingOptions); if (shouldSample) { this.state.samplingEnabled = true; // Create default options for any missing required fields const defaultSamplingOptions = { autoSample: false, samplePercentage: undefined, sampleRows: undefined, sampleSizeBytes: undefined, sampleMethod: 'random', stratifyBy: undefined, seed: undefined, ...samplingOptions, }; this.smartSampler = new smart_sampler_1.SmartSampler(defaultSamplingOptions, fileSize, { section: 'eda', analyzer: 'StreamingAnalyzer', operation: 'smart-sampling', filePath, }); logger_1.logger.info(`Smart sampling enabled for ${this.formatBytes(fileSize)} file`, { section: 'eda', analyzer: 'StreamingAnalyzer', operation: 'initializeSmartSampling', filePath, }); } } /** * Analyze a CSV file using streaming processing */ async analyzeFile(filePath) { // Wrap with memory optimization if enabled if (this.config.enableMemoryOptimization) { return this.analyzeFileWithMemoryOptimization(filePath); } return this.analyzeFileInternal(filePath); } /** * Internal file analysis with memory optimization wrapper */ analyzeFileWithMemoryOptimization; initializeMemoryOptimization() { this.analyzeFileWithMemoryOptimization = (0, memory_optimizer_1.withMemoryOptimization)(async (filePath) => { return this.analyzeFileInternal(filePath); }, { enableGc: true, bufferPooling: true, memoryThreshold: this.config.memoryThresholdMB / 512, // Convert MB to ratio }); } /** * Core file analysis implementation */ async analyzeFileInternal(filePath) { const context = { section: 'eda', analyzer: 'StreamingAnalyzer', filePath, operation: 'analyzeFile', }; logger_1.logger.info('Starting streaming analysis of file', context); this.state.startTime = Date.now(); // Initialize smart sampling if configured try { const fileStats = await fs_2.promises.stat(filePath); this.initializeSmartSampling(filePath, fileStats.size); } catch (error) { logger_1.logger.warn('Could not determine file size for sampling decisions', context, error); } // Initialize memory optimizer if enabled if (this.config.enableMemoryOptimization) { this.initializeMemoryOptimization(); const memoryOptimizer = (0, memory_optimizer_1.getGlobalMemoryOptimizer)({ maxMemoryMB: this.config.memoryThresholdMB, enableMemoryPooling: true, adaptiveChunkSizing: this.config.adaptiveChunkSizing, }); // Listen for memory pressure events memoryOptimizer.on('memory-pressure', (data) => { this.handleMemoryPressure(data.pressure); }); } try { // Enhanced streaming with adaptive processing if (this.config.enableAdaptiveStreaming) { return await this.analyzeFileWithAdaptiveStreaming(filePath); } // Traditional streaming analysis return await this.analyzeFileTraditional(filePath); } catch (error) { logger_1.logger.errorWithStack(error instanceof Error ? error : new Error(String(error)), context); throw error; } } /** * Traditional streaming analysis (backward compatibility) */ async analyzeFileTraditional(filePath) { // Phase 1: Initialize parsers and detect format this.reportProgress('initialization', 0, 'Initializing streaming analysis...'); const parser = new csv_parser_1.CSVParser({ maxRows: this.config.maxRowsAnalyzed, autoDetect: true, }); // Phase 2: First pass - data type detection and initialization await this.firstPass(parser, filePath); // Phase 3: Main streaming analysis await this.streamingPass(parser, filePath); // Phase 4: Finalize results return await this.finalizeResults(); } /** * Enhanced streaming analysis with adaptive chunk sizing */ async analyzeFileWithAdaptiveStreaming(filePath) { const adaptiveStreamer = (0, adaptive_streamer_1.getGlobalAdaptiveStreamer)({ initialChunkSize: this.config.chunkSize, memoryPressureThreshold: this.config.memoryThresholdMB / 512, }); // Phase 1: Initialize this.reportProgress('initialization', 0, 'Initializing adaptive streaming analysis...'); const parser = new csv_parser_1.CSVParser({ maxRows: this.config.maxRowsAnalyzed, autoDetect: true, }); // Create streaming session const sessionId = await adaptiveStreamer.createSession(filePath); try { // Phase 2: Quick sample for type detection await this.firstPass(parser, filePath); // Phase 3: Adaptive streaming analysis this.reportProgress('univariate', 0, 'Starting adaptive streaming analysis...'); let processedChunks = 0; for await (const result of adaptiveStreamer.streamFile(sessionId, async (chunk, metadata) => { // Convert buffer to string and parse const chunkText = chunk.toString('utf8'); const rows = await this.parseChunkText(chunkText, parser, metadata.chunkIndex > 0); // Process the chunk if (rows.length > 0) { this.processChunk(rows); processedChunks++; // Update progress const progress = Math.min(95, (metadata.filePosition / metadata.fileSize) * 100); this.reportProgress('univariate', progress, `Processed ${processedChunks} chunks (${this.formatBytes(metadata.filePosition)}/${this.formatBytes(metadata.fileSize)})`); } return { processed: rows.length, bytes: chunk.length }; })) { // Optional: collect streaming results } // Phase 4: Finalize this.reportProgress('finalization', 95, 'Finalizing analysis...'); return await this.finalizeResults(); } finally { // Cleanup streaming session const sessionStats = adaptiveStreamer.getSessionStats(sessionId); if (sessionStats) { logger_1.logger.info(`Adaptive streaming completed: ${sessionStats.metrics.processingRate.toFixed(2)} MB/s, ${sessionStats.metrics.adaptationCount} adaptations`); } } } /** * First pass: Quick scan for headers, types, and basic metadata */ async firstPass(parser, filePath) { this.reportProgress('initialization', 25, 'Detecting data types...'); let sampleData; if (this.state.samplingEnabled && this.smartSampler) { // Use smart sampling for data collection sampleData = await this.collectSmartSample(parser, filePath); } else { // Use traditional fixed sampling sampleData = await this.collectFixedSample(parser, filePath); } if (sampleData.length === 0) { throw new Error('No data found in file'); } // Extract headers from parser config and sample data this.headers = this.extractHeaders(sampleData, parser); // Store whether CSV has headers for later use in data processing const parserOptions = parser.getOptions(); this.hasHeaders = parserOptions.hasHeader ?? true; // Detect column types from sample this.detectedTypes = this.detectColumnTypes(sampleData); // Infer semantic types this.semanticTypes = this.inferSemanticTypes(); // Initialize column analyzers this.initializeColumnAnalyzers(); // Initialize bivariate analysis this.initializeBivariateAnalysis(); this.reportProgress('initialization', 100, 'Initialization complete'); } /** * Collect sample data using smart sampling */ async collectSmartSample(parser, filePath) { if (!this.smartSampler) { throw new Error('Smart sampler not initialized'); } // First, collect all data (or a reasonable subset for very large files) const allData = []; let rowCount = 0; const maxInitialSample = 100000; // Limit initial collection to prevent memory issues const dataStream = new stream_1.Transform({ objectMode: true, transform(chunk, _encoding, callback) { if (rowCount < maxInitialSample) { allData.push(chunk); rowCount++; } callback(); }, }); const readStream = (0, fs_1.createReadStream)(filePath); const parseStream = parser.createStream(); await (0, promises_1.pipeline)(readStream, parseStream, dataStream); // Get preliminary headers for sampling strategy calculation const preliminaryHeaders = this.extractHeaders(allData.slice(0, 10), parser); const estimatedRowCount = Math.max(allData.length, rowCount); // Calculate sampling strategy const strategy = this.smartSampler.calculateSamplingStrategy(estimatedRowCount, preliminaryHeaders); // Perform sampling const samplingResult = await this.smartSampler.performSampling(allData, strategy, preliminaryHeaders); // Store sampling result for reporting this.state.samplingResult = samplingResult; // Add sampling notice to warnings this.warnings.push({ category: 'data', severity: 'medium', message: 'Smart sampling was applied to this dataset for efficient processing', suggestion: samplingResult.warnings.join('; ') || 'Use --seed option for reproducible sampling', }); logger_1.logger.info(`Smart sampling completed: ${samplingResult.sampledRowCount} samples from ${samplingResult.originalRowCount} rows using ${samplingResult.strategy.name} method`, { section: 'eda', analyzer: 'StreamingAnalyzer', operation: 'collectSmartSample', filePath, }); return samplingResult.samples; } /** * Collect sample data using traditional fixed sampling */ async collectFixedSample(parser, filePath) { let sampleRowCount = 0; const maxSampleRows = 1000; const sampleData = []; const sampleStream = new stream_1.Transform({ objectMode: true, transform(chunk, _encoding, callback) { if (sampleRowCount < maxSampleRows) { sampleData.push(chunk); sampleRowCount++; } callback(); }, }); const readStream = (0, fs_1.createReadStream)(filePath); const parseStream = parser.createStream(); await (0, promises_1.pipeline)(readStream, parseStream, sampleStream); return sampleData; } /** * Parse chunk text into rows */ async parseChunkText(chunkText, parser, skipHeader = false) { const lines = chunkText.split('\n').filter((line) => line.trim()); const rows = []; const startIndex = skipHeader ? 1 : 0; for (let i = startIndex; i < lines.length; i++) { try { // Simple CSV line parsing - this would need proper implementation const fields = lines[i].split(',').map((field) => field.trim()); const parsedRow = { index: i, data: fields, raw: lines[i], }; if (parsedRow && parsedRow.data) { rows.push(parsedRow); } } catch (error) { // Skip invalid rows logger_1.logger.warn(`Skipped invalid row at line ${i}: ${error.message}`); } } return rows; } /** * Handle memory pressure by adapting chunk size */ handleMemoryPressure(pressure) { if (pressure > 0.8) { // Reduce chunk size under memory pressure const reductionFactor = Math.max(0.3, 1 - pressure); this.state.currentChunkSize = Math.max(100, // Minimum chunk size Math.floor(this.state.currentChunkSize * reductionFactor)); logger_1.logger.warn(`Memory pressure detected (${(pressure * 100).toFixed(1)}%), reduced chunk size to ${this.state.currentChunkSize}`); this.warnings.push({ category: 'performance', severity: 'medium', message: `Reduced chunk size due to memory pressure (${(pressure * 100).toFixed(1)}%)`, suggestion: 'Consider reducing maxRowsAnalyzed or increasing available memory', }); } } /** * Format bytes for human readable display */ formatBytes(bytes) { if (bytes === undefined || bytes === null || isNaN(bytes)) { return '0B'; } const units = ['B', 'KB', 'MB', 'GB']; let size = bytes; let unitIndex = 0; while (size >= 1024 && unitIndex < units.length - 1) { size /= 1024; unitIndex++; } return `${size.toFixed(unitIndex > 0 ? 1 : 0)}${units[unitIndex]}`; } /** * Main streaming pass: Process data in chunks */ async streamingPass(parser, filePath) { this.reportProgress('univariate', 0, 'Starting streaming analysis...'); let currentChunk = []; const chunkProcessor = new stream_1.Transform({ objectMode: true, transform: (row, _encoding, callback) => { try { currentChunk.push(row); // Process chunk when it reaches target size if (currentChunk.length >= this.state.currentChunkSize) { this.processChunk(currentChunk); currentChunk = []; // Adaptive memory management this.manageMemory(); // Check if we've hit row limit if (this.state.rowsProcessed >= this.config.maxRowsAnalyzed) { this.warnings.push({ category: 'performance', severity: 'medium', message: `Analysis stopped at ${this.config.maxRowsAnalyzed} rows to prevent memory issues`, impact: 'Results based on subset of data', suggestion: 'Increase maxRowsAnalyzed in configuration if more memory is available', }); return callback(); } } callback(); } catch (error) { callback(error instanceof Error ? error : new Error(String(error))); } }, flush: (callback) => { // Process final partial chunk if (currentChunk.length > 0) { this.processChunk(currentChunk); } callback(); }, }); const readStream = (0, fs_1.createReadStream)(filePath); const parseStream = parser.createStream(); await (0, promises_1.pipeline)(readStream, parseStream, chunkProcessor); this.reportProgress('univariate', 100, 'Streaming analysis complete'); } /** * Process a single chunk of data */ processChunk(chunk) { this.state.chunksProcessed++; for (const row of chunk) { // Skip header row if this is the first row and CSV has headers if (this.hasHeaders && !this.state.hasSkippedHeader && row.index === 0) { this.state.hasSkippedHeader = true; continue; // Skip processing the header row as data } this.state.rowsProcessed++; // Process each column for univariate analysis for (let colIndex = 0; colIndex < this.headers.length; colIndex++) { const columnName = this.headers[colIndex]; const analyzer = this.columnAnalyzers.get(columnName); if (analyzer && row.data[colIndex] !== undefined) { analyzer.processValue(row.data[colIndex]); } } // Process for bivariate analysis this.bivariateAnalyzer.processRow(row.data, this.detectedTypes); // Collect data for multivariate analysis (if enabled and within limit) if (this.config.enableMultivariate && this.collectedData.length < this.maxCollectedRows) { this.collectedData.push([...row.data]); // Store a copy of the row data } } // Clear chunk from memory immediately chunk.length = 0; // Update progress periodically based on configuration const configManager = (0, config_1.getConfig)(); const perfConfig = configManager.getPerformanceConfig(); if (this.state.chunksProcessed % perfConfig.performanceMonitoringInterval === 0) { const progress = Math.min(90, (this.state.rowsProcessed / this.config.maxRowsAnalyzed) * 90); this.reportProgress('univariate', progress, `Processed ${this.state.rowsProcessed.toLocaleString()} rows in ${this.state.chunksProcessed} chunks`); } // Memory cleanup based on configuration interval if (this.state.chunksProcessed % perfConfig.memoryCleanupInterval === 0) { this.performMemoryCleanup(); } } /** * Perform aggressive memory cleanup */ performMemoryCleanup() { const configManager = (0, config_1.getConfig)(); const perfConfig = configManager.getPerformanceConfig(); const streamingConfig = configManager.getStreamingConfig(); // Clear type detection results after initial setup if (this.state.chunksProcessed > perfConfig.performanceMonitoringInterval) { this.typeDetectionResults = []; } // Clear memory from all column analyzers for (const analyzer of this.columnAnalyzers.values()) { if (analyzer.clearMemory) { analyzer.clearMemory(); } } // If under extreme memory pressure and we have sufficient data for multivariate analysis, // limit the collected data to prevent memory issues const emergencyThreshold = this.config.memoryThresholdMB * streamingConfig.memoryManagement.emergencyThresholdMultiplier; const minMultivariateRows = Math.min(1000, perfConfig.maxCollectedRowsMultivariate / 2); if (this.state.currentMemoryMB > emergencyThreshold && this.collectedData.length > minMultivariateRows) { // Keep only the minimum required rows for multivariate analysis this.collectedData = this.collectedData.slice(0, minMultivariateRows); } // Force garbage collection if available and enabled if (streamingConfig.memoryManagement.forceGarbageCollection && global.gc) { global.gc(); } } /** * Adaptive memory management with aggressive cleanup */ manageMemory() { const memUsage = process.memoryUsage(); this.state.currentMemoryMB = Math.round(memUsage.heapUsed / (1024 * 1024)); this.state.peakMemoryMB = Math.max(this.state.peakMemoryMB, this.state.currentMemoryMB); const configManager = (0, config_1.getConfig)(); const streamingConfig = configManager.getStreamingConfig(); if (this.config.adaptiveChunkSizing) { if (this.state.currentMemoryMB > this.config.memoryThresholdMB) { // Reduce chunk size to use less memory based on configuration this.state.currentChunkSize = Math.max(streamingConfig.adaptiveChunkSizing.minChunkSize, Math.floor(this.state.currentChunkSize * streamingConfig.adaptiveChunkSizing.reductionFactor)); // Clear type detection results to free memory this.typeDetectionResults = []; // Force garbage collection if available and enabled if (streamingConfig.memoryManagement.forceGarbageCollection && global.gc) { global.gc(); } } else if (this.state.currentMemoryMB < this.config.memoryThresholdMB * 0.3) { // Increase chunk size for better performance this.state.currentChunkSize = Math.min(streamingConfig.adaptiveChunkSizing.maxChunkSize, Math.floor(this.state.currentChunkSize * streamingConfig.adaptiveChunkSizing.expansionFactor)); } } // Emergency brake if memory gets too high const emergencyThreshold = this.config.memoryThresholdMB * streamingConfig.memoryManagement.emergencyThresholdMultiplier; if (this.state.currentMemoryMB > emergencyThreshold) { this.warnings.push({ category: 'performance', severity: 'high', message: `High memory usage detected (${this.state.currentMemoryMB}MB). Consider reducing maxRowsAnalyzed in configuration.`, impact: 'Analysis may slow down or fail', suggestion: 'Reduce dataset size, increase available memory, or adjust memory thresholds in configuration', }); // Aggressive memory cleanup this.typeDetectionResults = []; if (streamingConfig.memoryManagement.forceGarbageCollection && global.gc) { global.gc(); } } } /** * Finalize analysis and generate results */ async finalizeResults() { this.reportProgress('finalization', 0, 'Finalizing results...'); // Collect univariate results const univariateAnalysis = []; for (const [columnName, analyzer] of this.columnAnalyzers) { try { const result = analyzer.finalize(); univariateAnalysis.push(result); this.warnings.push(...analyzer.getWarnings()); } catch (error) { logger_1.logger.error(`Error finalizing analysis for column ${columnName}:`, { section: 'eda', analyzer: 'StreamingAnalyzer', operation: 'finalizeColumnAnalysis', }, error); this.warnings.push({ category: 'error', severity: 'high', message: `Failed to complete analysis for column ${columnName}`, impact: 'Column excluded from results', suggestion: 'Check data quality or column type detection', }); } } // Collect bivariate results const bivariateAnalysis = this.bivariateAnalyzer.finalize(this.headers); this.warnings.push(...this.bivariateAnalyzer.getWarnings()); // Generate insights const insights = this.generateStreamingInsights(univariateAnalysis); const endTime = Date.now(); const analysisTime = endTime - this.state.startTime; // Perform multivariate analysis if enabled and applicable let multivariateAnalysis; if (this.config.enableMultivariate && this.state.rowsProcessed > 50) { this.reportProgress('multivariate', 90, 'Performing multivariate analysis...'); try { multivariateAnalysis = await multivariate_orchestrator_1.MultivariateOrchestrator.analyze(this.collectedData || [], this.headers, this.detectedTypes, this.state.rowsProcessed); logger_1.logger.info('Multivariate analysis completed successfully'); } catch (error) { logger_1.logger.warn('Multivariate analysis failed:', { section: 'eda', analyzer: 'StreamingAnalyzer', operation: 'multivariateAnalysis', }, error); // Fallback to minimal analysis multivariateAnalysis = await multivariate_orchestrator_1.MultivariateOrchestrator.analyze([], [], [], 0); } } else { // Create minimal multivariate analysis when disabled or insufficient data multivariateAnalysis = await multivariate_orchestrator_1.MultivariateOrchestrator.analyze([], [], [], 0); } const edaAnalysis = { univariateAnalysis, bivariateAnalysis, multivariateAnalysis, crossVariableInsights: insights, }; this.reportProgress('finalization', 100, 'Analysis complete'); return { edaAnalysis, warnings: this.warnings, performanceMetrics: { analysisTimeMs: analysisTime, rowsAnalyzed: this.state.rowsProcessed, chunksProcessed: this.state.chunksProcessed, peakMemoryMB: this.state.peakMemoryMB, avgChunkSize: Math.round(this.state.rowsProcessed / this.state.chunksProcessed), memoryEfficiency: `Constant ~${this.state.peakMemoryMB}MB usage`, }, metadata: { analysisApproach: this.state.samplingEnabled ? 'Streaming with smart sampling and online algorithms' : 'Streaming with online algorithms', datasetSize: this.state.rowsProcessed, columnsAnalyzed: this.headers.length, samplingApplied: this.state.samplingEnabled || this.state.rowsProcessed >= this.config.maxRowsAnalyzed, }, }; } extractHeaders(sampleData, parser) { if (sampleData.length === 0) return []; const firstRow = sampleData[0]; // Get header setting from parser options const parserOptions = parser.getOptions(); const hasHeader = parserOptions.hasHeader ?? true; // Default to true for CSV files if (hasHeader) { // Use actual column names from header row return firstRow.data.map((headerValue, index) => headerValue && headerValue.trim() ? headerValue.trim() : `Column_${index + 1}`); } else { // Generate generic column names only if no headers return firstRow.data.map((_, index) => `Column_${index + 1}`); } } detectColumnTypes(sampleData) { if (sampleData.length === 0) return []; const columnCount = sampleData[0].data.length; // Skip header row if present when sampling for type detection const dataStartIndex = this.hasHeaders ? 1 : 0; const effectiveSampleData = sampleData.slice(dataStartIndex); // Prepare column samples for enhanced detection const columnSamples = []; for (let colIndex = 0; colIndex < columnCount; colIndex++) { const values = effectiveSampleData.slice(0, 500).map((row) => row.data[colIndex]); // Use more samples, excluding header const columnName = this.headers[colIndex] || `Column_${colIndex + 1}`; columnSamples.push({ values, columnName, columnIndex: colIndex, }); } // Use enhanced type detection const detectionResults = enhanced_type_detector_1.EnhancedTypeDetector.detectColumnTypes(columnSamples); // Store detection results for semantic type inference (clear after use to save memory) this.typeDetectionResults = detectionResults; // Log detection results for debugging for (let i = 0; i < detectionResults.length; i++) { const result = detectionResults[i]; if (result.confidence > 0.7) { logger_1.logger.info(`Column ${this.headers[i]}: ${result.dataType} (${result.semanticType}) - Confidence: ${result.confidence.toFixed(2)}`); } } return detectionResults.map((result) => result.dataType); } inferSemanticTypes() { // Use enhanced detection results if available if (this.typeDetectionResults && this.typeDetectionResults.length > 0) { return this.typeDetectionResults.map((result) => result.semanticType); } // Fallback to simple inference return this.headers.map((header, index) => { const headerLower = header.toLowerCase(); const type = this.detectedTypes[index]; // Simple semantic type inference if (headerLower.includes('price') || headerLower.includes('cost') || headerLower.includes('amount')) { return types_2.SemanticType.CURRENCY; } else if (headerLower.includes('age')) { return types_2.SemanticType.AGE; } else if (headerLower.includes('id') || headerLower.includes('identifier')) { return types_2.SemanticType.IDENTIFIER; } else if (type === types_2.EdaDataType.CATEGORICAL) { return types_2.SemanticType.CATEGORY; } return types_2.SemanticType.UNKNOWN; }); } initializeColumnAnalyzers() { for (let i = 0; i < this.headers.length; i++) { const columnName = this.headers[i]; const columnType = this.detectedTypes[i]; const semanticType = this.semanticTypes[i]; let analyzer; // Select appropriate analyzer based on detected column type switch (columnType) { case types_2.EdaDataType.NUMERICAL_FLOAT: case types_2.EdaDataType.NUMERICAL_INTEGER: analyzer = new streaming_univariate_analyzer_1.StreamingNumericalAnalyzer(columnName, columnType, semanticType); break; case types_2.EdaDataType.DATE_TIME: analyzer = new streaming_univariate_analyzer_1.StreamingDateTimeAnalyzer(columnName, columnType, semanticType); break; case types_2.EdaDataType.BOOLEAN: analyzer = new streaming_univariate_analyzer_1.StreamingBooleanAnalyzer(columnName, columnType, semanticType); break; case types_2.EdaDataType.TEXT_GENERAL: case types_2.EdaDataType.TEXT_ADDRESS: analyzer = new streaming_univariate_analyzer_1.StreamingTextAnalyzer(columnName, columnType, semanticType); break; case types_2.EdaDataType.CATEGORICAL: default: analyzer = new streaming_univariate_analyzer_1.StreamingCategoricalAnalyzer(columnName, columnType, semanticType); break; } this.columnAnalyzers.set(columnName, analyzer); } } initializeBivariateAnalysis() { const pairs = []; for (let i = 0; i < this.headers.length; i++) { for (let j = i + 1; j < this.headers.length; j++) { pairs.push({ col1Index: i, col1Name: this.headers[i], col1Type: this.detectedTypes[i], col2Index: j, col2Name: this.headers[j], col2Type: this.detectedTypes[j], }); } } this.bivariateAnalyzer.initializePairs(pairs); } generateStreamingInsights(univariateAnalysis) { const topFindings = []; const dataQualityIssues = []; const hypothesesGenerated = []; const preprocessingRecommendations = []; // Analyze data quality const poorQualityColumns = univariateAnalysis.filter((col) => col.missingPercentage > 20); if (poorQualityColumns.length > 0) { dataQualityIssues.push(`${poorQualityColumns.length} columns have >20% missing values: ${poorQualityColumns.map((c) => c.columnName).join(', ')}`); } // High cardinality detection const highCardinalityColumns = univariateAnalysis.filter((col) => col.uniquePercentage > 80 && col.totalValues > 100); if (highCardinalityColumns.length > 0) { preprocessingRecommendations.push(`Consider encoding or grouping high-cardinality columns: ${highCardinalityColumns.map((c) => c.columnName).join(', ')}`); } // Memory efficiency insight topFindings.push(`Streaming analysis processed ${this.state.rowsProcessed.toLocaleString()} rows using only ${this.state.peakMemoryMB}MB peak memory`); return { topFindings, dataQualityIssues, hypothesesGenerated, preprocessingRecommendations, }; } reportProgress(stage, percentage, message) { if (this.progressCallback) { this.progressCallback({ stage: stage, percentage, message, currentStep: this.state.chunksProcessed, totalSteps: Math.ceil(this.config.maxRowsAnalyzed / this.state.currentChunkSize), }); } } /** * Handle analysis errors with graceful degradation */ async handleAnalysisError(error, logContext) { logger_1.logger.errorWithStack(error instanceof Error ? error : new Error(String(error)), logContext); if (error instanceof types_1.DataPilotError) { // Check if we can provide a degraded result if (error.recoverable && this.state.rowsProcessed > 0) { this.warnings.push({ category: 'error', message: `Analysis completed with errors: ${error.message}`, severity: 'high', impact: 'Partial results available', suggestion: 'Check data quality or review error logs', }); logger_1.logger.warn('Returning partial results due to recoverable error', logContext); return await this.createDegradedResult(error); } } // Re-throw non-recoverable errors throw error; } /** * Create a degraded result when full analysis fails */ async createDegradedResult(error) { // Use the existing MultivariateOrchestrator to create an empty analysis const emptyMultivariateAnalysis = await multivariate_orchestrator_1.MultivariateOrchestrator.analyze([], [], [], 0); return { edaAnalysis: { univariateAnalysis: [], bivariateAnalysis: { numericalVsNumerical: { totalPairsAnalyzed: 0, correlationPairs: [], strongestPositiveCorrelation: null, strongestNegativeCorrelation: null, strongCorrelations: [], scatterPlotInsights: [], regressionInsights: [], }, numericalVsCategorical: [], categoricalVsCategorical: [], }, multivariateAnalysis: emptyMultivariateAnalysis, crossVariableInsights: { topFindings: [`Analysis interrupted: ${error.message}`], dataQualityIssues: ['Incomplete analysis due to processing error'], hypothesesGenerated: [], preprocessingRecommendations: [], }, }, warnings: [ ...this.warnings, { category: 'error', message: 'Analysis completed with reduced functionality due to errors', severity: 'high', impact: 'No analysis results available', suggestion: 'Check error logs and retry with different configuration', }, ], performanceMetrics: { analysisTimeMs: Date.now() - this.state.startTime, peakMemoryMB: this.state.peakMemoryMB, rowsAnalyzed: this.state.rowsProcessed, chunksProcessed: this.state.chunksProcessed, }, metadata: { analysisApproach: 'StreamingAnalyzer (degraded)', datasetSize: this.state.rowsProcessed, columnsAnalyzed: this.headers.length, samplingApplied: false, }, }; } /** * Check if multivariate data should be collected */ shouldCollectMultivariateData() { return (this.collectedData.length < this.maxCollectedRows && this.state.currentMemoryMB < this.config.memoryThresholdMB * 0.8); } /** * Collect data for multivariate analysis with memory-efficient approach */ collectMultivariateData(rowData) { // Use a more memory-efficient approach by sampling if needed const shouldSample = this.collectedData.length > this.maxCollectedRows * 0.8; if (!shouldSample || Math.random() < 0.1) { this.collectedData.push(Object.freeze([...rowData])); } } /** * Handle memory pressure for multivariate data collection */ handleMultivariateMemoryPressure(streamingConfig, perfConfig) { const emergencyThreshold = this.config.memoryThresholdMB * streamingConfig.memoryManagement.emergencyThresholdMultiplier; const minMultivariateRows = Math.min(1000, perfConfig.maxCollectedRowsMultivariate / 2); if (this.state.currentMemoryMB > emergencyThreshold && this.collectedData.length > minMultivariateRows) { // Keep only the minimum required rows for multivariate analysis this.collectedData = this.collectedData.slice(0, minMultivariateRows); } } /** * Perform multivariate analysis with enhanced type safety */ async performMultivariateAnalysis() { const dataArray = this.collectedData.map((row) => [...row]); return await multivariate_orchestrator_1.MultivariateOrchestrator.analyze(dataArray, this.headers, this.detectedTypes, this.state.rowsProcessed); } /** * Create minimal multivariate analysis when disabled or insufficient data */ async createMinimalMultivariateAnalysis() { return await multivariate_orchestrator_1.MultivariateOrchestrator.analyze([], [], [], 0); } /** * Validate analyzer state before operations */ validateAnalyzerState(operation) { if (this.headers.length === 0) { throw types_1.DataPilotError.analysis(`Cannot perform ${operation}: no headers detected`, 'NO_HEADERS_DETECTED', { analyzer: 'StreamingAnalyzer', operationName: operation }, [ { action: 'Check data format', description: 'Ensure the CSV file has proper column headers', severity: types_1.ErrorSeverity.HIGH, }, ]); } if (this.detectedTypes.length !== this.headers.length) { throw types_1.DataPilotError.analysis(`Type detection mismatch: ${this.headers.length} headers, ${this.detectedTypes.length} types`, 'TYPE_HEADER_MISMATCH', { analyzer: 'StreamingAnalyzer', operationName: operation }, [ { action: 'Re-run type detection', description: 'Retry the analysis to fix type detection', severity: types_1.ErrorSeverity.MEDIUM, }, ]); } } } exports.StreamingAnalyzer = StreamingAnalyzer; /** * Convenience function to analyze a file using streaming approach */ async function analyzeFileStreaming(filePath, config = {}) { const analyzer = new StreamingAnalyzer(config); return analyzer.analyzeFile(filePath); } /** * Convenience function to analyze a file using streaming approach with smart sampling */ async function analyzeFileStreamingWithSampling(filePath, samplingOptions, config = {}) { const analyzer = StreamingAnalyzer.withSamplingOptions(config, samplingOptions); return analyzer.analyzeFile(filePath); } //# sourceMappingURL=streaming-analyzer.js.map