UNPKG

datapilot-cli

Version:

Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform

436 lines 17.8 kB
"use strict"; /** * Parallel Analysis Engine * Orchestrates parallel processing for multi-format data analysis */ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); Object.defineProperty(exports, "__esModule", { value: true }); exports.ParallelAnalyzer = void 0; exports.getGlobalParallelAnalyzer = getGlobalParallelAnalyzer; exports.shutdownGlobalParallelAnalyzer = shutdownGlobalParallelAnalyzer; const path = __importStar(require("path")); const uuid_1 = require("uuid"); const worker_pool_1 = require("./worker-pool"); const logger_1 = require("../utils/logger"); const types_1 = require("../core/types"); const perf_hooks_1 = require("perf_hooks"); /** * High-performance parallel analysis engine */ class ParallelAnalyzer { statisticalWorkerPool; parsingWorkerPool; options; constructor(options = {}) { this.options = { maxWorkers: options.maxWorkers || Math.max(2, require('os').cpus().length - 1), enableMemoryMonitoring: options.enableMemoryMonitoring ?? true, memoryLimitMB: options.memoryLimitMB || 256, batchSize: options.batchSize || 1000, taskTimeout: options.taskTimeout || 60000, }; // Initialize worker pools with different scripts const statisticalWorkerScript = path.join(__dirname, 'workers', 'statistical-worker.js'); const parsingWorkerScript = path.join(__dirname, 'workers', 'parsing-worker.js'); this.statisticalWorkerPool = new worker_pool_1.WorkerPool(statisticalWorkerScript, { maxWorkers: this.options.maxWorkers, enableMemoryMonitoring: this.options.enableMemoryMonitoring, memoryLimitMB: this.options.memoryLimitMB, taskTimeout: this.options.taskTimeout, }); this.parsingWorkerPool = new worker_pool_1.WorkerPool(parsingWorkerScript, { maxWorkers: this.options.maxWorkers, enableMemoryMonitoring: this.options.enableMemoryMonitoring, memoryLimitMB: this.options.memoryLimitMB, taskTimeout: this.options.taskTimeout, }); logger_1.logger.info(`Parallel analyzer initialized with ${this.options.maxWorkers} workers per pool`); } /** * Calculate descriptive statistics for multiple columns in parallel */ async calculateMultipleDescriptiveStats(datasets) { const startTime = perf_hooks_1.performance.now(); try { // Create tasks for each dataset const tasks = datasets.map((values, index) => ({ id: `desc-stats-${index}-${(0, uuid_1.v4)()}`, type: 'descriptive-stats', data: { values }, priority: 'normal', })); logger_1.logger.info(`Computing descriptive statistics for ${datasets.length} columns in parallel`); const results = await this.statisticalWorkerPool.executeAll(tasks); const executionTime = perf_hooks_1.performance.now() - startTime; return { success: true, results, executionTime, totalTasks: tasks.length, failedTasks: 0, }; } catch (error) { const executionTime = perf_hooks_1.performance.now() - startTime; logger_1.logger.error(`Parallel descriptive statistics failed: ${error.message}`); return { success: false, results: [], executionTime, totalTasks: datasets.length, failedTasks: datasets.length, }; } } /** * Calculate correlations between multiple column pairs in parallel */ async calculateMultipleCorrelations(pairs) { const startTime = perf_hooks_1.performance.now(); try { // Create tasks for each correlation pair const tasks = pairs.map((pair, index) => ({ id: `correlation-${index}-${(0, uuid_1.v4)()}`, type: 'correlation', data: pair, priority: 'normal', })); logger_1.logger.info(`Computing ${pairs.length} correlations in parallel`); const results = await this.statisticalWorkerPool.executeAll(tasks); const executionTime = perf_hooks_1.performance.now() - startTime; return { success: true, results, executionTime, totalTasks: tasks.length, failedTasks: 0, }; } catch (error) { const executionTime = perf_hooks_1.performance.now() - startTime; logger_1.logger.error(`Parallel correlation calculation failed: ${error.message}`); return { success: false, results: [], executionTime, totalTasks: pairs.length, failedTasks: pairs.length, }; } } /** * Detect outliers in multiple columns in parallel */ async detectMultipleOutliers(datasets, multiplier = 1.5) { const startTime = perf_hooks_1.performance.now(); try { const tasks = datasets.map((values, index) => ({ id: `outliers-${index}-${(0, uuid_1.v4)()}`, type: 'outlier-detection', data: { values, multiplier }, priority: 'normal', })); logger_1.logger.info(`Detecting outliers in ${datasets.length} columns in parallel`); const results = await this.statisticalWorkerPool.executeAll(tasks); const executionTime = perf_hooks_1.performance.now() - startTime; return { success: true, results, executionTime, totalTasks: tasks.length, failedTasks: 0, }; } catch (error) { const executionTime = perf_hooks_1.performance.now() - startTime; logger_1.logger.error(`Parallel outlier detection failed: ${error.message}`); return { success: false, results: [], executionTime, totalTasks: datasets.length, failedTasks: datasets.length, }; } } /** * Calculate frequency distributions for multiple categorical columns in parallel */ async calculateMultipleFrequencyDistributions(datasets) { const startTime = perf_hooks_1.performance.now(); try { const tasks = datasets.map((values, index) => ({ id: `freq-dist-${index}-${(0, uuid_1.v4)()}`, type: 'frequency-distribution', data: { values }, priority: 'normal', })); logger_1.logger.info(`Computing frequency distributions for ${datasets.length} columns in parallel`); const results = await this.statisticalWorkerPool.executeAll(tasks); const executionTime = perf_hooks_1.performance.now() - startTime; return { success: true, results, executionTime, totalTasks: tasks.length, failedTasks: 0, }; } catch (error) { const executionTime = perf_hooks_1.performance.now() - startTime; logger_1.logger.error(`Parallel frequency distribution calculation failed: ${error.message}`); return { success: false, results: [], executionTime, totalTasks: datasets.length, failedTasks: datasets.length, }; } } /** * Parse multiple CSV chunks in parallel */ async parseMultipleCSVChunks(chunks, options = {}) { const startTime = perf_hooks_1.performance.now(); try { const tasks = chunks.map((chunk, index) => ({ id: `csv-parse-${index}-${(0, uuid_1.v4)()}`, type: 'parse-csv-chunk', data: { chunk, options }, priority: 'high', // Parsing is often blocking })); logger_1.logger.info(`Parsing ${chunks.length} CSV chunks in parallel`); const results = await this.parsingWorkerPool.executeAll(tasks); const executionTime = perf_hooks_1.performance.now() - startTime; return { success: true, results, executionTime, totalTasks: tasks.length, failedTasks: 0, }; } catch (error) { const executionTime = perf_hooks_1.performance.now() - startTime; logger_1.logger.error(`Parallel CSV parsing failed: ${error.message}`); return { success: false, results: [], executionTime, totalTasks: chunks.length, failedTasks: chunks.length, }; } } /** * Parse multiple JSON objects in parallel */ async parseMultipleJSON(jsonStrings, options = {}) { const startTime = perf_hooks_1.performance.now(); try { const tasks = jsonStrings.map((content, index) => ({ id: `json-parse-${index}-${(0, uuid_1.v4)()}`, type: 'parse-json', data: { content, options }, priority: 'high', })); logger_1.logger.info(`Parsing ${jsonStrings.length} JSON objects in parallel`); const results = await this.parsingWorkerPool.executeAll(tasks); const executionTime = perf_hooks_1.performance.now() - startTime; return { success: true, results, executionTime, totalTasks: tasks.length, failedTasks: 0, }; } catch (error) { const executionTime = perf_hooks_1.performance.now() - startTime; logger_1.logger.error(`Parallel JSON parsing failed: ${error.message}`); return { success: false, results: [], executionTime, totalTasks: jsonStrings.length, failedTasks: jsonStrings.length, }; } } /** * Detect data types for multiple columns in parallel */ async detectMultipleDataTypes(columns) { const startTime = perf_hooks_1.performance.now(); try { // Split columns into batches for parallel processing const batchSize = Math.ceil(columns.length / this.options.maxWorkers); const batches = []; for (let i = 0; i < columns.length; i += batchSize) { batches.push(columns.slice(i, i + batchSize)); } const tasks = batches.map((batch, index) => ({ id: `type-detection-${index}-${(0, uuid_1.v4)()}`, type: 'detect-data-types', data: { columns: batch }, priority: 'normal', })); logger_1.logger.info(`Detecting data types for ${columns.length} columns in ${batches.length} parallel batches`); const batchResults = await this.parsingWorkerPool.executeAll(tasks); // Flatten batch results const results = batchResults.flat(); const executionTime = perf_hooks_1.performance.now() - startTime; return { success: true, results, executionTime, totalTasks: tasks.length, failedTasks: 0, }; } catch (error) { const executionTime = perf_hooks_1.performance.now() - startTime; logger_1.logger.error(`Parallel data type detection failed: ${error.message}`); return { success: false, results: [], executionTime, totalTasks: columns.length, failedTasks: columns.length, }; } } /** * Execute mixed workload (statistical + parsing) with intelligent scheduling */ async executeMixedWorkload(statisticalTasks, parsingTasks) { const startTime = perf_hooks_1.performance.now(); try { logger_1.logger.info(`Executing mixed workload: ${statisticalTasks.length} statistical + ${parsingTasks.length} parsing tasks`); // Execute both types of tasks in parallel const [statisticalResults, parsingResults] = await Promise.all([ this.statisticalWorkerPool.executeAll(statisticalTasks), this.parsingWorkerPool.executeAll(parsingTasks), ]); const executionTime = perf_hooks_1.performance.now() - startTime; return { statistical: { success: true, results: statisticalResults, executionTime, totalTasks: statisticalTasks.length, failedTasks: 0, }, parsing: { success: true, results: parsingResults, executionTime, totalTasks: parsingTasks.length, failedTasks: 0, }, }; } catch (error) { const executionTime = perf_hooks_1.performance.now() - startTime; logger_1.logger.error(`Mixed workload execution failed: ${error.message}`); throw new types_1.DataPilotError(`Mixed workload execution failed: ${error.message}`, 'PARALLEL_MIXED_WORKLOAD_ERROR', types_1.ErrorSeverity.HIGH, types_1.ErrorCategory.PERFORMANCE); } } /** * Get performance statistics from both worker pools */ getPerformanceStats() { const statisticalStats = this.statisticalWorkerPool.getStats(); const parsingStats = this.parsingWorkerPool.getStats(); return { statistical: statisticalStats, parsing: parsingStats, total: { totalWorkers: statisticalStats.totalWorkers + parsingStats.totalWorkers, availableWorkers: statisticalStats.availableWorkers + parsingStats.availableWorkers, busyWorkers: statisticalStats.busyWorkers + parsingStats.busyWorkers, queuedTasks: statisticalStats.queuedTasks + parsingStats.queuedTasks, activeTasksCount: statisticalStats.activeTasksCount + parsingStats.activeTasksCount, }, }; } /** * Adaptive batch size calculation based on data size and available workers */ calculateOptimalBatchSize(dataSize, complexity = 'medium') { const baseComplexity = complexity === 'low' ? 1 : complexity === 'medium' ? 2 : 4; const availableWorkers = this.options.maxWorkers; // Calculate optimal batch size based on data size, complexity, and available workers const targetTasksPerWorker = 2; // Keep workers busy with 2 tasks each const targetTotalTasks = availableWorkers * targetTasksPerWorker; let batchSize = Math.ceil(dataSize / targetTotalTasks / baseComplexity); // Ensure reasonable bounds batchSize = Math.max(100, Math.min(10000, batchSize)); return batchSize; } /** * Gracefully shutdown both worker pools */ async shutdown() { logger_1.logger.info('Shutting down parallel analyzer'); await Promise.all([this.statisticalWorkerPool.shutdown(), this.parsingWorkerPool.shutdown()]); logger_1.logger.info('Parallel analyzer shutdown complete'); } } exports.ParallelAnalyzer = ParallelAnalyzer; /** * Global parallel analyzer instance */ let globalParallelAnalyzer = null; /** * Get or create the global parallel analyzer */ function getGlobalParallelAnalyzer(options) { if (!globalParallelAnalyzer) { globalParallelAnalyzer = new ParallelAnalyzer(options); } return globalParallelAnalyzer; } /** * Shutdown the global parallel analyzer */ async function shutdownGlobalParallelAnalyzer() { if (globalParallelAnalyzer) { await globalParallelAnalyzer.shutdown(); globalParallelAnalyzer = null; } } //# sourceMappingURL=parallel-analyzer.js.map