UNPKG

datapilot-cli

Version:

Enterprise-grade streaming, multi-format data analysis with comprehensive statistical insights and intelligent relationship detection. Supports CSV, JSON, Excel, TSV, and Parquet; memory-efficient and cross-platform.

424 lines 19 kB
"use strict";
/**
 * Parallel Streaming Analyzer
 * High-performance streaming analysis with parallel processing capabilities.
 */
// --- TypeScript compiler interop helpers (standard tslib-style emit) ---
var __createBinding = (this && this.__createBinding) || (Object.create ? (function (o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function () { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function (o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function (o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function (o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function (o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.ParallelStreamingAnalyzer = void 0;
exports.createParallelStreamingAnalyzer = createParallelStreamingAnalyzer;
const events_1 = require("events");
const perf_hooks_1 = require("perf_hooks");
const parallel_analyzer_1 = require("../../performance/parallel-analyzer");
const logger_1 = require("../../utils/logger");
const types_1 = require("../../core/types");
const streaming_analyzer_1 = require("./streaming-analyzer");
/**
 * Enhanced streaming analyzer with parallel processing capabilities.
 *
 * Routes each file to either a worker-pool-backed parallel pipeline (large or
 * complex datasets) or the plain sequential StreamingAnalyzer (small datasets),
 * based on a quick sample of the file.
 */
class ParallelStreamingAnalyzer extends events_1.EventEmitter {
    options;
    parallelAnalyzer;
    baseAnalyzer;
    isInitialized = false;
    /**
     * @param {object} [options] - Streaming and parallelism settings; every field
     *   is optional and falls back to the defaults below.
     */
    constructor(options = {}) {
        super();
        this.options = {
            // Base streaming options
            chunkSize: options.chunkSize || 1000,
            memoryThresholdMB: options.memoryThresholdMB || 100,
            maxRowsAnalyzed: options.maxRowsAnalyzed || 500000,
            enabledAnalyses: options.enabledAnalyses || ['univariate', 'bivariate', 'correlations'],
            significanceLevel: options.significanceLevel || 0.05,
            maxCorrelationPairs: options.maxCorrelationPairs || 50,
            enableMultivariate: options.enableMultivariate ?? true,
            // Parallel processing options
            enableParallelProcessing: options.enableParallelProcessing ?? true,
            maxWorkers: options.maxWorkers || Math.max(2, require('os').cpus().length - 1),
            batchSize: options.batchSize || 1000,
            parallelThreshold: options.parallelThreshold || 5000, // 5K rows minimum for parallel processing
            memoryLimitPerWorker: options.memoryLimitPerWorker || 256,
        };
        // Initialize parallel analyzer if enabled
        if (this.options.enableParallelProcessing) {
            this.parallelAnalyzer = (0, parallel_analyzer_1.getGlobalParallelAnalyzer)({
                maxWorkers: this.options.maxWorkers,
                memoryLimitMB: this.options.memoryLimitPerWorker,
                batchSize: this.options.batchSize,
            });
        }
        // Initialize base analyzer for fallback and small datasets
        this.baseAnalyzer = new streaming_analyzer_1.StreamingAnalyzer({
            chunkSize: this.options.chunkSize,
            memoryThresholdMB: this.options.memoryThresholdMB,
            maxRowsAnalyzed: this.options.maxRowsAnalyzed,
            enabledAnalyses: this.options.enabledAnalyses,
            significanceLevel: this.options.significanceLevel,
            maxCorrelationPairs: this.options.maxCorrelationPairs,
            enableMultivariate: this.options.enableMultivariate,
        });
        logger_1.logger.info(`Parallel streaming analyzer initialized with ${this.options.maxWorkers} workers`);
    }
    /**
     * Analyze file with intelligent parallel/sequential routing.
     * @param {string} filePath - Path of the file to analyze.
     * @returns combined analysis result (overview, stats, correlations, …).
     * @throws {DataPilotError} when either pipeline fails.
     */
    async analyzeFile(filePath) {
        const startTime = perf_hooks_1.performance.now();
        try {
            // First, analyze the file to determine if parallel processing is beneficial
            const fileInfo = await this.analyzeFileCharacteristics(filePath);
            if (this.shouldUseParallelProcessing(fileInfo)) {
                logger_1.logger.info(`Using parallel processing for large dataset (${fileInfo.estimatedRows} rows)`);
                return await this.analyzeFileParallel(filePath, fileInfo);
            }
            else {
                logger_1.logger.info(`Using sequential processing for smaller dataset (${fileInfo.estimatedRows} rows)`);
                return await this.analyzeFileSequential(filePath, fileInfo);
            }
        }
        catch (error) {
            throw new types_1.DataPilotError(`Parallel streaming analysis failed: ${error.message}`, 'PARALLEL_STREAMING_ERROR', types_1.ErrorSeverity.HIGH, types_1.ErrorCategory.PERFORMANCE);
        }
    }
    /**
     * Analyze file characteristics to determine processing strategy.
     * Reads a 64KB sample and estimates row count, column count and data
     * complexity from it.
     */
    async analyzeFileCharacteristics(filePath) {
        const { promises: fs } = await Promise.resolve().then(() => __importStar(require('fs')));
        const fileStats = await fs.stat(filePath);
        // Quick sample analysis to estimate data characteristics
        const sampleSize = Math.min(fileStats.size, 64 * 1024); // 64KB sample
        const sampleBuffer = Buffer.alloc(sampleSize);
        const fileHandle = await fs.open(filePath, 'r');
        try {
            await fileHandle.read(sampleBuffer, 0, sampleSize, 0);
        }
        finally {
            await fileHandle.close();
        }
        const sampleText = sampleBuffer.toString('utf8');
        const lineCount = (sampleText.match(/\n/g) || []).length;
        // FIX: guard against a sample with no newline (single-line or empty file);
        // the original divided by zero, yielding Infinity/NaN row estimates.
        const avgLineLength = sampleText.length / Math.max(1, lineCount);
        // Estimate total rows and complexity
        const estimatedRows = Math.floor(fileStats.size / Math.max(1, avgLineLength));
        const estimatedColumns = (sampleText.split('\n')[0] || '').split(',').length;
        // Determine data complexity based on content analysis
        const hasQuotedFields = /["']/.test(sampleText);
        const hasComplexStructures = /[\[\]{},:]/.test(sampleText);
        const complexity = hasComplexStructures ? 'high' : hasQuotedFields ? 'medium' : 'low';
        return {
            fileSize: fileStats.size,
            estimatedRows,
            estimatedColumns,
            avgLineLength,
            complexity,
            sample: sampleText.slice(0, 1000), // Keep first 1KB for format detection
        };
    }
    /**
     * Determine if parallel processing should be used.
     * True when the dataset is large, structurally complex, or very wide.
     */
    shouldUseParallelProcessing(fileInfo) {
        if (!this.options.enableParallelProcessing)
            return false;
        // Use parallel processing for large datasets or complex data
        const sizeCriteria = fileInfo.estimatedRows >= this.options.parallelThreshold;
        const complexityCriteria = fileInfo.complexity === 'high' && fileInfo.estimatedRows >= 1000;
        const columnCriteria = fileInfo.estimatedColumns >= 20 && fileInfo.estimatedRows >= 2000;
        return sizeCriteria || complexityCriteria || columnCriteria;
    }
    /**
     * Parallel file analysis for large datasets.
     * Runs parse, stats, correlations, outlier and frequency phases, each phase
     * fanned out across the worker pool.
     */
    async analyzeFileParallel(filePath, fileInfo) {
        const parallelStartTime = perf_hooks_1.performance.now();
        try {
            // Step 1: Parse data in parallel chunks
            logger_1.logger.info('Phase 1: Parallel data parsing');
            const parseResult = await this.parseFileInParallel(filePath, fileInfo);
            // Step 2: Parallel statistical analysis
            logger_1.logger.info('Phase 2: Parallel statistical analysis');
            const statsResult = await this.calculateParallelStatistics(parseResult.data);
            // Step 3: Parallel correlation analysis (if enabled)
            let correlationResult = { results: [] };
            if (this.options.enabledAnalyses.includes('correlations')) {
                logger_1.logger.info('Phase 3: Parallel correlation analysis');
                correlationResult = await this.calculateParallelCorrelations(parseResult.numericColumns);
            }
            // Step 4: Parallel outlier detection
            let outlierResult = { results: [] };
            if (this.options.enabledAnalyses.includes('outlier_detection')) {
                logger_1.logger.info('Phase 4: Parallel outlier detection');
                outlierResult = await this.detectParallelOutliers(parseResult.numericColumns);
            }
            // Step 5: Parallel frequency analysis for categorical data
            let frequencyResult = { results: [] };
            if (this.options.enabledAnalyses.includes('frequency_analysis')) {
                logger_1.logger.info('Phase 5: Parallel frequency analysis');
                frequencyResult = await this.calculateParallelFrequencies(parseResult.categoricalColumns);
            }
            const parallelExecutionTime = perf_hooks_1.performance.now() - parallelStartTime;
            // Get performance metrics
            const performanceMetrics = this.calculatePerformanceMetrics(parallelExecutionTime, parseResult, statsResult, correlationResult, outlierResult, frequencyResult);
            return {
                overview: {
                    totalRows: parseResult.totalRows,
                    totalColumns: parseResult.totalColumns,
                    numericColumns: parseResult.numericColumns.length,
                    categoricalColumns: parseResult.categoricalColumns.length,
                    processingMode: 'parallel',
                    fileInfo,
                },
                descriptiveStats: statsResult.results,
                correlations: correlationResult.results,
                outliers: outlierResult.results,
                frequencyDistributions: frequencyResult.results,
                performanceMetrics,
            };
        }
        catch (error) {
            logger_1.logger.error(`Parallel analysis failed: ${error.message}`);
            throw error;
        }
    }
    /**
     * Sequential file analysis for smaller datasets.
     * Delegates to the base StreamingAnalyzer and adapts its result to the
     * parallel result shape.
     */
    async analyzeFileSequential(filePath, fileInfo) {
        const sequentialStartTime = perf_hooks_1.performance.now();
        try {
            // Use base analyzer for sequential processing
            const baseResult = await this.baseAnalyzer.analyzeFile(filePath);
            const sequentialExecutionTime = perf_hooks_1.performance.now() - sequentialStartTime;
            // Convert base result to parallel result format
            return {
                overview: {
                    ...baseResult,
                    processingMode: 'sequential',
                    fileInfo,
                },
                descriptiveStats: baseResult.edaAnalysis?.univariateAnalysis || [],
                correlations: Array.isArray(baseResult.edaAnalysis?.bivariateAnalysis?.correlation)
                    ? baseResult.edaAnalysis.bivariateAnalysis.correlation
                    : [],
                outliers: [],
                frequencyDistributions: [],
                performanceMetrics: {
                    totalExecutionTime: sequentialExecutionTime,
                    parallelExecutionTime: 0,
                    sequentialExecutionTime,
                    speedupFactor: 1,
                    tasksExecuted: 1,
                    memoryEfficiency: 1,
                },
            };
        }
        catch (error) {
            logger_1.logger.error(`Sequential analysis failed: ${error.message}`);
            throw error;
        }
    }
    /**
     * Parse file in parallel chunks.
     *
     * FIX: the original sliced the file at raw byte offsets, which could split a
     * CSV row — and even a multi-byte UTF-8 sequence — across two chunks,
     * corrupting parsed data at every chunk boundary. Chunks are now built on
     * line boundaries from a single decode. Memory profile is unchanged: the
     * original implementation also buffered every chunk before parsing.
     */
    async parseFileInParallel(filePath, fileInfo) {
        const { promises: fs } = await Promise.resolve().then(() => __importStar(require('fs')));
        const chunkSize = this.calculateOptimalChunkSize(fileInfo.fileSize);
        const content = await fs.readFile(filePath, 'utf8');
        // Build chunk strings that always end on a newline (except possibly the last).
        const chunkStrings = [];
        let start = 0;
        while (start < content.length) {
            let end = Math.min(start + chunkSize, content.length);
            if (end < content.length) {
                const newlineAt = content.indexOf('\n', end);
                end = newlineAt === -1 ? content.length : newlineAt + 1;
            }
            chunkStrings.push(content.slice(start, end));
            start = end;
        }
        // Parse chunks in parallel
        const parseResult = await this.parallelAnalyzer.parseMultipleCSVChunks(chunkStrings, {
            delimiter: ',', // NOTE(review): could be detected from fileInfo.sample — TODO confirm
            hasHeader: true,
            trimFields: true,
        });
        // Combine and organize results; the first row of the first chunk is
        // treated as the header row, all other rows are data.
        const allRows = [];
        let headers = [];
        parseResult.results.forEach((chunkResult, index) => {
            if (index === 0 && chunkResult.rows.length > 0) {
                headers = chunkResult.rows[0]; // First row as headers
                allRows.push(...chunkResult.rows.slice(1));
            }
            else {
                allRows.push(...chunkResult.rows);
            }
        });
        // Separate numeric and categorical columns: a column is numeric when more
        // than 70% of its values parse as finite numbers.
        const numericColumns = [];
        const categoricalColumns = [];
        for (let colIndex = 0; colIndex < headers.length; colIndex++) {
            const columnData = allRows.map((row) => row[colIndex]);
            const numericValues = columnData
                .map((val) => parseFloat(val))
                .filter((val) => !Number.isNaN(val));
            if (columnData.length > 0 && numericValues.length / columnData.length > 0.7) {
                numericColumns.push(numericValues);
            }
            else {
                categoricalColumns.push(columnData);
            }
        }
        return {
            data: allRows,
            headers,
            totalRows: allRows.length,
            totalColumns: headers.length,
            numericColumns,
            categoricalColumns,
        };
    }
    /**
     * Calculate statistics in parallel.
     */
    async calculateParallelStatistics(numericColumns) {
        if (numericColumns.length === 0) {
            return { results: [], executionTime: 0 };
        }
        return await this.parallelAnalyzer.calculateMultipleDescriptiveStats(numericColumns);
    }
    /**
     * Calculate correlations in parallel, capped at maxCorrelationPairs pairs.
     */
    async calculateParallelCorrelations(numericColumns) {
        if (numericColumns.length < 2) {
            return { results: [], executionTime: 0 };
        }
        // Generate correlation pairs (limit to maxCorrelationPairs)
        const pairs = [];
        const maxPairs = Math.min(this.options.maxCorrelationPairs, (numericColumns.length * (numericColumns.length - 1)) / 2);
        let pairCount = 0;
        for (let i = 0; i < numericColumns.length && pairCount < maxPairs; i++) {
            for (let j = i + 1; j < numericColumns.length && pairCount < maxPairs; j++) {
                pairs.push({ x: numericColumns[i], y: numericColumns[j] });
                pairCount++;
            }
        }
        return await this.parallelAnalyzer.calculateMultipleCorrelations(pairs);
    }
    /**
     * Detect outliers in parallel (IQR fence factor 1.5).
     */
    async detectParallelOutliers(numericColumns) {
        if (numericColumns.length === 0) {
            return { results: [], executionTime: 0 };
        }
        return await this.parallelAnalyzer.detectMultipleOutliers(numericColumns, 1.5);
    }
    /**
     * Calculate frequency distributions in parallel.
     */
    async calculateParallelFrequencies(categoricalColumns) {
        if (categoricalColumns.length === 0) {
            return { results: [], executionTime: 0 };
        }
        return await this.parallelAnalyzer.calculateMultipleFrequencyDistributions(categoricalColumns);
    }
    /**
     * Calculate optimal chunk size for file parsing.
     * Targets two chunks per worker, clamped to the 1MB..64MB range.
     */
    calculateOptimalChunkSize(fileSize) {
        const targetChunks = this.options.maxWorkers * 2; // 2 chunks per worker
        let chunkSize = Math.ceil(fileSize / targetChunks);
        // Ensure reasonable bounds (1MB to 64MB)
        chunkSize = Math.max(1024 * 1024, Math.min(64 * 1024 * 1024, chunkSize));
        return chunkSize;
    }
    /**
     * Calculate performance metrics for a parallel run.
     * The "sequential" time is an estimate (70% per-worker efficiency), not a
     * measured value.
     */
    calculatePerformanceMetrics(parallelTime, parseResult, statsResult, correlationResult, outlierResult, frequencyResult) {
        const totalTasks = 1 + // parsing
            (statsResult.totalTasks || 0) +
            (correlationResult.totalTasks || 0) +
            (outlierResult.totalTasks || 0) +
            (frequencyResult.totalTasks || 0);
        // Estimate sequential execution time (for comparison)
        const estimatedSequentialTime = parallelTime * this.options.maxWorkers * 0.7; // Assume 70% efficiency
        // FIX: guard the division — a ~0ms parallel run produced NaN here, and
        // Math.max(1, NaN) is NaN, so the NaN leaked into the returned metrics.
        const speedupFactor = parallelTime > 0 ? estimatedSequentialTime / parallelTime : 1;
        return {
            totalExecutionTime: parallelTime,
            parallelExecutionTime: parallelTime,
            sequentialExecutionTime: estimatedSequentialTime,
            speedupFactor: Math.max(1, speedupFactor),
            tasksExecuted: totalTasks,
            memoryEfficiency: this.calculateMemoryEfficiency(),
        };
    }
    /**
     * Calculate memory efficiency metric: 1 when heap usage is negligible,
     * falling to 0 as usage approaches memoryThresholdMB.
     */
    calculateMemoryEfficiency() {
        const memoryUsage = process.memoryUsage();
        const totalMemoryMB = memoryUsage.heapUsed / 1024 / 1024;
        const maxAllowedMemoryMB = this.options.memoryThresholdMB;
        return Math.max(0, 1 - totalMemoryMB / maxAllowedMemoryMB);
    }
    /**
     * Get real-time performance statistics from the worker pool, or null when
     * parallel processing is disabled.
     */
    getPerformanceStats() {
        if (this.parallelAnalyzer) {
            return this.parallelAnalyzer.getPerformanceStats();
        }
        return null;
    }
    /**
     * Gracefully shutdown parallel resources.
     */
    async shutdown() {
        if (this.parallelAnalyzer) {
            await this.parallelAnalyzer.shutdown();
        }
        logger_1.logger.info('Parallel streaming analyzer shutdown complete');
    }
}
exports.ParallelStreamingAnalyzer = ParallelStreamingAnalyzer;
/**
 * Factory function for creating parallel streaming analyzer.
 * @param {object} [options] - Same options accepted by the constructor.
 * @returns {ParallelStreamingAnalyzer}
 */
function createParallelStreamingAnalyzer(options) {
    return new ParallelStreamingAnalyzer(options);
}
//# sourceMappingURL=parallel-streaming-analyzer.js.map