datapilot-cli
Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform
;
/**
* Parallel Analysis Engine
* Orchestrates parallel processing for multi-format data analysis
*/
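// TypeScript-generated CommonJS interop helpers (standard tsc emit, not hand-written code).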
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.ParallelAnalyzer = void 0;
exports.getGlobalParallelAnalyzer = getGlobalParallelAnalyzer;
exports.shutdownGlobalParallelAnalyzer = shutdownGlobalParallelAnalyzer;
const path = __importStar(require("path"));
const uuid_1 = require("uuid");
const worker_pool_1 = require("./worker-pool");
const logger_1 = require("../utils/logger");
const types_1 = require("../core/types");
const perf_hooks_1 = require("perf_hooks");
/**
* High-performance parallel analysis engine backed by two dedicated worker
* pools: one for statistical computation and one for parsing.
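*
* @example
* // Minimal lifecycle sketch. Option names and result fields are taken from
* // this file; the input shape is inferred from the method bodies below.
* const analyzer = new ParallelAnalyzer({ maxWorkers: 4, memoryLimitMB: 256 });
* const stats = await analyzer.calculateMultipleDescriptiveStats([
*   [1, 2, 3, 4],
*   [10, 20, 30, 40],
* ]);
* console.log(stats.success, stats.executionTime);
* await analyzer.shutdown(); // always release both worker pools when done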
*/
class ParallelAnalyzer {
// Two dedicated pools so statistical work and parsing work never contend
// for the same workers.
statisticalWorkerPool;
parsingWorkerPool;
options;
constructor(options = {}) {
this.options = {
maxWorkers: options.maxWorkers || Math.max(2, require('os').cpus().length - 1),
enableMemoryMonitoring: options.enableMemoryMonitoring ?? true,
memoryLimitMB: options.memoryLimitMB || 256,
batchSize: options.batchSize || 1000,
taskTimeout: options.taskTimeout || 60000,
};
// Initialize worker pools with different scripts
const statisticalWorkerScript = path.join(__dirname, 'workers', 'statistical-worker.js');
const parsingWorkerScript = path.join(__dirname, 'workers', 'parsing-worker.js');
this.statisticalWorkerPool = new worker_pool_1.WorkerPool(statisticalWorkerScript, {
maxWorkers: this.options.maxWorkers,
enableMemoryMonitoring: this.options.enableMemoryMonitoring,
memoryLimitMB: this.options.memoryLimitMB,
taskTimeout: this.options.taskTimeout,
});
this.parsingWorkerPool = new worker_pool_1.WorkerPool(parsingWorkerScript, {
maxWorkers: this.options.maxWorkers,
enableMemoryMonitoring: this.options.enableMemoryMonitoring,
memoryLimitMB: this.options.memoryLimitMB,
taskTimeout: this.options.taskTimeout,
});
logger_1.logger.info(`Parallel analyzer initialized with ${this.options.maxWorkers} workers per pool`);
}
/**
* Calculate descriptive statistics for multiple columns in parallel
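*
* @example
* // Sketch only: each entry in `datasets` is assumed to be one column's
* // numeric values (inferred from the `{ values }` task payload below).
* const analyzer = new ParallelAnalyzer();
* const { results, failedTasks } = await analyzer.calculateMultipleDescriptiveStats([
*   [1.2, 3.4, 5.6],
*   [10, 20, 30],
* ]);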
*/
async calculateMultipleDescriptiveStats(datasets) {
const startTime = perf_hooks_1.performance.now();
try {
// Create tasks for each dataset
const tasks = datasets.map((values, index) => ({
id: `desc-stats-${index}-${(0, uuid_1.v4)()}`,
type: 'descriptive-stats',
data: { values },
priority: 'normal',
}));
logger_1.logger.info(`Computing descriptive statistics for ${datasets.length} columns in parallel`);
const results = await this.statisticalWorkerPool.executeAll(tasks);
const executionTime = perf_hooks_1.performance.now() - startTime;
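// Success here is all-or-nothing: executeAll is expected to resolve only
// when every task produced a result, so any failure falls through to the
// catch block below, which marks the whole batch as failed.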
return {
success: true,
results,
executionTime,
totalTasks: tasks.length,
failedTasks: 0,
};
}
catch (error) {
const executionTime = perf_hooks_1.performance.now() - startTime;
logger_1.logger.error(`Parallel descriptive statistics failed: ${error.message}`);
return {
success: false,
results: [],
executionTime,
totalTasks: datasets.length,
failedTasks: datasets.length,
};
}
}
/**
* Calculate correlations between multiple column pairs in parallel
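*
* @example
* // Sketch only: each pair is forwarded verbatim as the 'correlation' task
* // payload, so its shape must match what statistical-worker.js expects;
* // `{ x, y }` here is a hypothetical shape, not a confirmed API.
* const analyzer = new ParallelAnalyzer();
* const { results } = await analyzer.calculateMultipleCorrelations([
*   { x: [1, 2, 3], y: [2, 4, 6] },
* ]);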
*/
async calculateMultipleCorrelations(pairs) {
const startTime = perf_hooks_1.performance.now();
try {
// Create tasks for each correlation pair
const tasks = pairs.map((pair, index) => ({
id: `correlation-${index}-${(0, uuid_1.v4)()}`,
type: 'correlation',
data: pair,
priority: 'normal',
}));
logger_1.logger.info(`Computing ${pairs.length} correlations in parallel`);
const results = await this.statisticalWorkerPool.executeAll(tasks);
const executionTime = perf_hooks_1.performance.now() - startTime;
return {
success: true,
results,
executionTime,
totalTasks: tasks.length,
failedTasks: 0,
};
}
catch (error) {
const executionTime = perf_hooks_1.performance.now() - startTime;
logger_1.logger.error(`Parallel correlation calculation failed: ${error.message}`);
return {
success: false,
results: [],
executionTime,
totalTasks: pairs.length,
failedTasks: pairs.length,
};
}
}
/**
* Detect outliers in multiple columns in parallel
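*
* @example
* // Sketch only: `multiplier` is forwarded to the 'outlier-detection'
* // worker; 1.5 is the conventional Tukey IQR fence value, but the actual
* // detection method lives in statistical-worker.js.
* const analyzer = new ParallelAnalyzer();
* const { results } = await analyzer.detectMultipleOutliers([
*   [1, 2, 3, 100], // 100 is the likely outlier
* ], 1.5);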
*/
async detectMultipleOutliers(datasets, multiplier = 1.5) {
const startTime = perf_hooks_1.performance.now();
try {
const tasks = datasets.map((values, index) => ({
id: `outliers-${index}-${(0, uuid_1.v4)()}`,
type: 'outlier-detection',
data: { values, multiplier },
priority: 'normal',
}));
logger_1.logger.info(`Detecting outliers in ${datasets.length} columns in parallel`);
const results = await this.statisticalWorkerPool.executeAll(tasks);
const executionTime = perf_hooks_1.performance.now() - startTime;
return {
success: true,
results,
executionTime,
totalTasks: tasks.length,
failedTasks: 0,
};
}
catch (error) {
const executionTime = perf_hooks_1.performance.now() - startTime;
logger_1.logger.error(`Parallel outlier detection failed: ${error.message}`);
return {
success: false,
results: [],
executionTime,
totalTasks: datasets.length,
failedTasks: datasets.length,
};
}
}
/**
* Calculate frequency distributions for multiple categorical columns in parallel
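*
* @example
* // Sketch only: each entry is assumed to be one categorical column's raw values.
* const analyzer = new ParallelAnalyzer();
* const { results } = await analyzer.calculateMultipleFrequencyDistributions([
*   ['red', 'blue', 'red', 'green'],
* ]);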
*/
async calculateMultipleFrequencyDistributions(datasets) {
const startTime = perf_hooks_1.performance.now();
try {
const tasks = datasets.map((values, index) => ({
id: `freq-dist-${index}-${(0, uuid_1.v4)()}`,
type: 'frequency-distribution',
data: { values },
priority: 'normal',
}));
logger_1.logger.info(`Computing frequency distributions for ${datasets.length} columns in parallel`);
const results = await this.statisticalWorkerPool.executeAll(tasks);
const executionTime = perf_hooks_1.performance.now() - startTime;
return {
success: true,
results,
executionTime,
totalTasks: tasks.length,
failedTasks: 0,
};
}
catch (error) {
const executionTime = perf_hooks_1.performance.now() - startTime;
logger_1.logger.error(`Parallel frequency distribution calculation failed: ${error.message}`);
return {
success: false,
results: [],
executionTime,
totalTasks: datasets.length,
failedTasks: datasets.length,
};
}
}
/**
* Parse multiple CSV chunks in parallel
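*
* @example
* // Sketch only: chunks are assumed to be raw CSV text segments, and the
* // accepted `options` keys are defined by parsing-worker.js, so
* // `delimiter` here is illustrative rather than a confirmed option.
* const analyzer = new ParallelAnalyzer();
* const { results } = await analyzer.parseMultipleCSVChunks(
*   ['id,name\n1,Ada', '2,Grace\n3,Edsger'],
*   { delimiter: ',' },
* );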
*/
async parseMultipleCSVChunks(chunks, options = {}) {
const startTime = perf_hooks_1.performance.now();
try {
const tasks = chunks.map((chunk, index) => ({
id: `csv-parse-${index}-${(0, uuid_1.v4)()}`,
type: 'parse-csv-chunk',
data: { chunk, options },
priority: 'high', // Parsing is often blocking
}));
logger_1.logger.info(`Parsing ${chunks.length} CSV chunks in parallel`);
const results = await this.parsingWorkerPool.executeAll(tasks);
const executionTime = perf_hooks_1.performance.now() - startTime;
return {
success: true,
results,
executionTime,
totalTasks: tasks.length,
failedTasks: 0,
};
}
catch (error) {
const executionTime = perf_hooks_1.performance.now() - startTime;
logger_1.logger.error(`Parallel CSV parsing failed: ${error.message}`);
return {
success: false,
results: [],
executionTime,
totalTasks: chunks.length,
failedTasks: chunks.length,
};
}
}
/**
* Parse multiple JSON objects in parallel
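*
* @example
* // Sketch only: each entry is one JSON document as a string.
* const analyzer = new ParallelAnalyzer();
* const { results } = await analyzer.parseMultipleJSON([
*   '{"a": 1}',
*   '[1, 2, 3]',
* ]);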
*/
async parseMultipleJSON(jsonStrings, options = {}) {
const startTime = perf_hooks_1.performance.now();
try {
const tasks = jsonStrings.map((content, index) => ({
id: `json-parse-${index}-${(0, uuid_1.v4)()}`,
type: 'parse-json',
data: { content, options },
priority: 'high',
}));
logger_1.logger.info(`Parsing ${jsonStrings.length} JSON objects in parallel`);
const results = await this.parsingWorkerPool.executeAll(tasks);
const executionTime = perf_hooks_1.performance.now() - startTime;
return {
success: true,
results,
executionTime,
totalTasks: tasks.length,
failedTasks: 0,
};
}
catch (error) {
const executionTime = perf_hooks_1.performance.now() - startTime;
logger_1.logger.error(`Parallel JSON parsing failed: ${error.message}`);
return {
success: false,
results: [],
executionTime,
totalTasks: jsonStrings.length,
failedTasks: jsonStrings.length,
};
}
}
/**
* Detect data types for multiple columns in parallel
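*
* @example
* // Sketch only: columns are split into one batch per worker and the batch
* // results are flattened. `{ name, values }` is a hypothetical column
* // shape; the real one is whatever parsing-worker.js expects for
* // 'detect-data-types' tasks.
* const analyzer = new ParallelAnalyzer();
* const { results } = await analyzer.detectMultipleDataTypes([
*   { name: 'age', values: ['23', '45', '31'] },
* ]);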
*/
async detectMultipleDataTypes(columns) {
const startTime = perf_hooks_1.performance.now();
try {
// Split columns into batches for parallel processing
const batchSize = Math.ceil(columns.length / this.options.maxWorkers);
const batches = [];
for (let i = 0; i < columns.length; i += batchSize) {
batches.push(columns.slice(i, i + batchSize));
}
const tasks = batches.map((batch, index) => ({
id: `type-detection-${index}-${(0, uuid_1.v4)()}`,
type: 'detect-data-types',
data: { columns: batch },
priority: 'normal',
}));
logger_1.logger.info(`Detecting data types for ${columns.length} columns in ${batches.length} parallel batches`);
const batchResults = await this.parsingWorkerPool.executeAll(tasks);
// Flatten batch results
const results = batchResults.flat();
const executionTime = perf_hooks_1.performance.now() - startTime;
return {
success: true,
results,
executionTime,
totalTasks: tasks.length,
failedTasks: 0,
};
}
catch (error) {
const executionTime = perf_hooks_1.performance.now() - startTime;
logger_1.logger.error(`Parallel data type detection failed: ${error.message}`);
return {
success: false,
results: [],
executionTime,
totalTasks: columns.length,
failedTasks: columns.length,
};
}
}
/**
* Execute mixed workload (statistical + parsing) with intelligent scheduling
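*
* @example
* // Sketch only: tasks use the same `{ id, type, data, priority }` shape
* // the helper methods above construct; both pools run concurrently, and a
* // failure in either pool rejects with a DataPilotError.
* const analyzer = new ParallelAnalyzer();
* const { statistical, parsing } = await analyzer.executeMixedWorkload(
*   [{ id: 's1', type: 'descriptive-stats', data: { values: [1, 2, 3] }, priority: 'normal' }],
*   [{ id: 'p1', type: 'parse-json', data: { content: '{"a":1}', options: {} }, priority: 'high' }],
* );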
*/
async executeMixedWorkload(statisticalTasks, parsingTasks) {
const startTime = perf_hooks_1.performance.now();
try {
logger_1.logger.info(`Executing mixed workload: ${statisticalTasks.length} statistical + ${parsingTasks.length} parsing tasks`);
// Execute both types of tasks in parallel
const [statisticalResults, parsingResults] = await Promise.all([
this.statisticalWorkerPool.executeAll(statisticalTasks),
this.parsingWorkerPool.executeAll(parsingTasks),
]);
const executionTime = perf_hooks_1.performance.now() - startTime;
return {
statistical: {
success: true,
results: statisticalResults,
executionTime,
totalTasks: statisticalTasks.length,
failedTasks: 0,
},
parsing: {
success: true,
results: parsingResults,
executionTime,
totalTasks: parsingTasks.length,
failedTasks: 0,
},
};
}
catch (error) {
const executionTime = perf_hooks_1.performance.now() - startTime;
logger_1.logger.error(`Mixed workload execution failed: ${error.message}`);
throw new types_1.DataPilotError(`Mixed workload execution failed: ${error.message}`, 'PARALLEL_MIXED_WORKLOAD_ERROR', types_1.ErrorSeverity.HIGH, types_1.ErrorCategory.PERFORMANCE);
}
}
/**
* Get performance statistics from both worker pools
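*
* @example
* // Per-pool stats come from WorkerPool.getStats(); the `total` block
* // simply sums the two pools.
* const analyzer = new ParallelAnalyzer();
* const stats = analyzer.getPerformanceStats();
* console.log(`busy: ${stats.total.busyWorkers}/${stats.total.totalWorkers}`);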
*/
getPerformanceStats() {
const statisticalStats = this.statisticalWorkerPool.getStats();
const parsingStats = this.parsingWorkerPool.getStats();
return {
statistical: statisticalStats,
parsing: parsingStats,
total: {
totalWorkers: statisticalStats.totalWorkers + parsingStats.totalWorkers,
availableWorkers: statisticalStats.availableWorkers + parsingStats.availableWorkers,
busyWorkers: statisticalStats.busyWorkers + parsingStats.busyWorkers,
queuedTasks: statisticalStats.queuedTasks + parsingStats.queuedTasks,
activeTasksCount: statisticalStats.activeTasksCount + parsingStats.activeTasksCount,
},
};
}
/**
* Adaptive batch size calculation based on data size and available workers
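*
* @example
* // Worked example with maxWorkers = 8 and 'medium' complexity (factor 2):
* //   targetTotalTasks = 8 * 2 = 16
* //   batchSize = ceil(100000 / 16 / 2) = 3125, within the [100, 10000] bounds
* analyzer.calculateOptimalBatchSize(100000, 'medium'); // => 3125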
*/
calculateOptimalBatchSize(dataSize, complexity = 'medium') {
const baseComplexity = complexity === 'low' ? 1 : complexity === 'medium' ? 2 : 4;
const availableWorkers = this.options.maxWorkers;
// Calculate optimal batch size based on data size, complexity, and available workers
const targetTasksPerWorker = 2; // Keep workers busy with 2 tasks each
const targetTotalTasks = availableWorkers * targetTasksPerWorker;
let batchSize = Math.ceil(dataSize / targetTotalTasks / baseComplexity);
// Ensure reasonable bounds
batchSize = Math.max(100, Math.min(10000, batchSize));
return batchSize;
}
/**
* Gracefully shutdown both worker pools
*/
async shutdown() {
logger_1.logger.info('Shutting down parallel analyzer');
await Promise.all([this.statisticalWorkerPool.shutdown(), this.parsingWorkerPool.shutdown()]);
logger_1.logger.info('Parallel analyzer shutdown complete');
}
}
exports.ParallelAnalyzer = ParallelAnalyzer;
/**
* Global parallel analyzer instance
*/
let globalParallelAnalyzer = null;
/**
* Get or create the global parallel analyzer
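*
* @example
* // Note: `options` only take effect on the first call; later calls return
* // the existing singleton unchanged.
* const analyzer = getGlobalParallelAnalyzer({ maxWorkers: 4 });
* const { results } = await analyzer.parseMultipleJSON(['{"ok": true}']);
* await shutdownGlobalParallelAnalyzer();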
*/
function getGlobalParallelAnalyzer(options) {
if (!globalParallelAnalyzer) {
globalParallelAnalyzer = new ParallelAnalyzer(options);
}
return globalParallelAnalyzer;
}
/**
* Shutdown the global parallel analyzer
*/
async function shutdownGlobalParallelAnalyzer() {
if (globalParallelAnalyzer) {
await globalParallelAnalyzer.shutdown();
globalParallelAnalyzer = null;
}
}
//# sourceMappingURL=parallel-analyzer.js.map