/*
 * datapilot-cli — universal-analyzer.js (compiled output)
 * Version: 1.2.1
 * Enterprise-grade streaming multi-format data analysis with comprehensive
 * statistical insights and intelligent relationship detection — supports CSV,
 * JSON, Excel, TSV, Parquet — memory-efficient, cross-platform.
 * (938 lines, 43 kB, JavaScript)
 */
;
/**
* Universal Analyzer - Multi-format data analysis orchestrator
* Integrates with the existing 6-section analysis pipeline
*/
// TypeScript-emitted helper: re-exports property `k` of module `m` onto object `o`
// as `k2` (defaults to `k`). Uses a live getter when supported so the binding
// tracks later mutations of the source module; falls back to a plain copy.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    // Replace the descriptor when the source is a CommonJS module (no __esModule
    // marker behind a getter) or the property is writable/configurable.
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
// TypeScript-emitted helper: attaches the original module `v` as the `default`
// property of the namespace object `o`, so `import mod from '...'` interop works.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
// TypeScript-emitted helper: implements `import * as ns from '...'` interop.
// Real ES modules pass through untouched; CommonJS modules are wrapped in a
// fresh namespace object with all own keys re-bound plus a `default` property.
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        // Lazily pick the key-enumeration strategy once, then self-replace so
        // subsequent calls skip the feature check.
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        // Copy every own key except "default", which is set separately below.
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
// Mark this CommonJS module as transpiled-from-ESM and pre-declare the export
// slot (assigned at the bottom of the file after the class definition).
Object.defineProperty(exports, "__esModule", { value: true });
exports.UniversalAnalyzer = void 0;
const parser_registry_1 = require("../parsers/base/parser-registry");
const json_parser_1 = require("../parsers/json-parser");
const csv_parser_adapter_1 = require("../parsers/adapters/csv-parser-adapter");
const excel_parser_1 = require("../parsers/excel-parser");
const tsv_parser_1 = require("../parsers/tsv-parser");
const parquet_parser_1 = require("../parsers/parquet-parser");
const types_1 = require("../core/types");
const logger_1 = require("../utils/logger");
const error_handler_1 = require("../utils/error-handler");
// Import existing analyzers (these remain unchanged)
const overview_1 = require("../analyzers/overview");
const quality_1 = require("../analyzers/quality");
const streaming_analyzer_1 = require("../analyzers/streaming/streaming-analyzer");
const visualization_1 = require("../analyzers/visualization");
const engineering_1 = require("../analyzers/engineering");
const modeling_1 = require("../analyzers/modeling");
/**
* Universal analyzer that works with any supported data format
*/
class UniversalAnalyzer {
    // Single source of truth for the version stamped into result metadata.
    // (Previously duplicated as an inline '1.2.1' literal in two methods.)
    static VERSION = '1.2.1';
    // Shared parser registry: format detection + parser factories.
    registry;
    // Guards initializeParsers() against registering parsers more than once.
    initialized = false;
    constructor() {
        this.registry = parser_registry_1.globalParserRegistry;
    }
    /**
     * Initialize the analyzer with all available parsers.
     * Idempotent: subsequent calls are no-ops. Registration priority (higher
     * wins on ambiguous detection): CSV 100, TSV 90, JSON 80, Excel 70, Parquet 60.
     */
    initializeParsers() {
        if (this.initialized)
            return;
        // Register CSV parser (highest priority for backwards compatibility)
        this.registry.register({
            format: 'csv',
            parserFactory: (options) => (0, csv_parser_adapter_1.createCSVParserAdapter)(options),
            detector: {
                detect: async (filePath) => {
                    const adapter = (0, csv_parser_adapter_1.createCSVParserAdapter)();
                    return adapter.detect(filePath);
                },
                getSupportedExtensions: () => ['.csv'],
                getFormatName: () => 'csv',
            },
            priority: 100,
            extensions: ['.csv'],
        });
        // Register TSV parser
        this.registry.register({
            format: 'tsv',
            parserFactory: (options) => (0, tsv_parser_1.createTSVParser)(options),
            detector: new tsv_parser_1.TSVDetector(),
            priority: 90,
            extensions: ['.tsv', '.tab'],
        });
        // Register JSON parser
        this.registry.register({
            format: 'json',
            parserFactory: (options) => (0, json_parser_1.createJSONParser)(options),
            detector: new json_parser_1.JSONDetector(),
            priority: 80,
            extensions: ['.json', '.jsonl', '.ndjson'],
        });
        // Register Excel parser
        this.registry.register({
            format: 'excel',
            parserFactory: (options) => (0, excel_parser_1.createExcelParser)(options),
            detector: new excel_parser_1.ExcelDetector(),
            priority: 70,
            extensions: ['.xlsx', '.xls', '.xlsm'],
        });
        // Register Parquet parser
        this.registry.register({
            format: 'parquet',
            parserFactory: (options) => (0, parquet_parser_1.createParquetParser)(options),
            detector: new parquet_parser_1.ParquetDetector(),
            priority: 60,
            extensions: ['.parquet'],
        });
        this.initialized = true;
        logger_1.logger.info(`Initialized universal analyzer with ${this.registry.getSupportedFormats().length} formats`);
    }
    /**
     * Analyze multiple files for join analysis (engineering command with multiple files).
     *
     * @param filePaths - paths of all files to include in the join analysis
     * @param options   - CLI options (confidence threshold, output format, ...)
     * @returns a CLI result envelope; on failure, an error envelope from handleAnalysisError
     */
    async analyzeMultipleFiles(filePaths, options) {
        this.initializeParsers();
        try {
            logger_1.logger.info(`Starting multi-file join analysis for: ${filePaths.join(', ')}`);
            // Validate all files exist and are supported before doing any heavy work.
            for (const filePath of filePaths) {
                const validation = await this.validateFile(filePath);
                if (!validation.supported) {
                    throw new types_1.DataPilotError(`File ${filePath} is not supported: ${validation.suggestions.join(', ')}`, 'MULTI_FILE_VALIDATION_ERROR', types_1.ErrorSeverity.HIGH, types_1.ErrorCategory.VALIDATION);
                }
            }
            // Dynamic import avoids a circular dependency with the joins analyzer.
            const { JoinAnalyzer } = await Promise.resolve().then(() => __importStar(require('../analyzers/joins')));
            const joinAnalyzer = new JoinAnalyzer({
                maxTables: Math.max(10, filePaths.length),
                confidenceThreshold: options.confidence || 0.7,
                enableFuzzyMatching: true,
                enableSemanticAnalysis: true,
                enableTemporalJoins: false,
                performanceMode: 'BALANCED',
                outputFormats: [{ type: 'MARKDOWN' }]
            });
            const joinResult = await joinAnalyzer.analyzeJoins(filePaths);
            return {
                success: true,
                exitCode: 0,
                data: {
                    joinAnalysis: joinResult
                },
                format: options.format || 'markdown',
                metadata: {
                    command: 'engineering',
                    filePaths,
                    analysisType: 'multi-file-join',
                    filesAnalyzed: filePaths.length,
                    timestamp: new Date().toISOString(),
                    version: UniversalAnalyzer.VERSION,
                },
            };
        }
        catch (error) {
            return this.handleAnalysisError(error, filePaths.join(', '));
        }
    }
    /**
     * Analyze any supported file format through the full pipeline:
     * detect format -> validate -> parse to common dataset -> run 6-section analysis.
     *
     * @param filePath - path of the file to analyze
     * @param options  - CLI options; note options.format is the OUTPUT format
     * @returns a CLI result envelope (success or error)
     */
    async analyzeFile(filePath, options) {
        this.initializeParsers();
        // Enable verbose mode in error handler if verbose CLI option is set
        error_handler_1.globalErrorHandler.setVerboseMode(options.verbose || false);
        return await error_handler_1.ErrorUtils.withEnhancedContext(async () => {
            // 1. Auto-detect format and get parser
            logger_1.logger.info(`Starting universal analysis for: ${filePath}`);
            // Convert CLIOptions to ParseOptions
            const parseOptions = {
                maxRows: options.maxRows,
                encoding: options.encoding,
                // NOTE: Do NOT pass options.format here - that's for OUTPUT format, not input file format
                delimiter: options.delimiter,
                quote: options.quote,
                hasHeader: options.hasHeader,
                jsonPath: options.jsonPath,
                arrayMode: options.arrayMode,
                flattenObjects: options.flattenObjects,
                sheetName: options.sheetName,
                sheetIndex: options.sheetIndex,
                columns: options.columns,
                rowStart: options.rowStart,
                rowEnd: options.rowEnd,
            };
            const { parser, format, detection } = await error_handler_1.ErrorUtils.withEnhancedContext(() => this.registry.getParser(filePath, parseOptions), {
                operationName: 'format_detection',
                filePath,
                additionalContext: { parseOptions }
            });
            logger_1.logger.info(`Detected format: ${format} (confidence: ${(detection.confidence * 100).toFixed(1)}%)`);
            // 2. Validate file can be parsed
            const validation = await error_handler_1.ErrorUtils.withEnhancedContext(() => parser.validate(filePath), {
                operationName: 'parser_validation',
                filePath,
                additionalContext: { format, confidence: detection.confidence }
            });
            if (!validation.canProceed) {
                throw error_handler_1.ErrorUtils.createContextualError(`Cannot parse file: ${validation.errors.join(', ')}`, 'UNIVERSAL_PARSE_ERROR', types_1.ErrorCategory.VALIDATION, types_1.ErrorSeverity.HIGH, {
                    operationName: 'parser_validation',
                    filePath,
                    additionalContext: {
                        format,
                        parserErrors: validation.errors,
                        parserWarnings: validation.warnings
                    }
                });
            }
            if (validation.warnings.length > 0) {
                logger_1.logger.warn(`Parsing warnings: ${validation.warnings.join(', ')}`);
            }
            // 3. Convert to common dataset format
            const dataset = await error_handler_1.ErrorUtils.withEnhancedContext(() => this.parseToDataset(parser, filePath, parseOptions, format, detection), {
                operationName: 'dataset_conversion',
                filePath,
                additionalContext: { format, parseOptions }
            });
            // 4. Run the same 6-section analysis pipeline
            const analysisResult = await error_handler_1.ErrorUtils.withEnhancedContext(() => this.runAnalysisPipeline(dataset, options), {
                operationName: 'analysis_pipeline',
                filePath,
                additionalContext: {
                    format,
                    datasetSize: dataset.rows.length,
                    columns: dataset.headers.length
                }
            });
            return {
                success: true,
                exitCode: 0,
                data: analysisResult,
                format: options.format || 'markdown',
                metadata: {
                    command: options.command || 'all',
                    filePath,
                    originalFormat: format,
                    detection: {
                        format,
                        confidence: detection.confidence,
                        metadata: detection.metadata,
                    },
                    parserStats: parser.getStats(),
                    timestamp: new Date().toISOString(),
                    version: UniversalAnalyzer.VERSION, // Multi-format support version
                },
            };
        }, {
            operationName: 'universal_file_analysis',
            filePath,
            additionalContext: {
                command: options.command,
                verboseMode: options.verbose
            }
        }).catch((error) => {
            return this.handleAnalysisError(error, filePath, options);
        });
    }
    /**
     * Parse file using detected parser and convert to universal dataset format.
     * Materializes all rows in memory; maxRows should bound this for large files.
     *
     * @returns { headers, rows, metadata } dataset consumed by the section analyzers
     */
    async parseToDataset(parser, filePath, options, format, detection) {
        const rows = [];
        let headers = [];
        let hasHeader = false;
        logger_1.logger.info('Parsing file to dataset format...');
        // Parse file and collect rows
        for await (const row of parser.parse(filePath, {
            maxRows: options.maxRows,
            hasHeader: options.hasHeader,
            encoding: options.encoding,
            delimiter: options.delimiter,
            quote: options.quote,
            jsonPath: options.jsonPath,
            arrayMode: options.arrayMode,
            flattenObjects: options.flattenObjects,
            sheetName: options.sheetName,
            sheetIndex: options.sheetIndex,
            columns: options.columns,
            rowStart: options.rowStart,
            rowEnd: options.rowEnd,
        })) {
            if (row.index === 0 && !hasHeader) {
                // First row - determine if it's headers or data.
                // NOTE(review): this heuristic runs even when options.hasHeader was set
                // explicitly; presumably the parser already honored it — confirm.
                const isHeaderRow = this.detectHeaderRow(row.data, format);
                if (isHeaderRow) {
                    headers = row.data;
                    hasHeader = true;
                    continue;
                }
                else {
                    // Generate column names
                    headers = row.data.map((_, i) => `column_${i + 1}`);
                }
            }
            rows.push(row.data);
        }
        const stats = parser.getStats();
        logger_1.logger.info(`Parsed ${rows.length} rows with ${headers.length} columns`);
        return {
            headers,
            rows,
            metadata: {
                format,
                filePath,
                totalRows: rows.length,
                parserStats: stats,
                detection,
            },
        };
    }
    /**
     * Detect if first row contains headers, using format-specific knowledge.
     */
    detectHeaderRow(row, format) {
        switch (format) {
            case 'json':
                // JSON usually has meaningful keys as headers
                return true;
            case 'parquet':
                // Parquet always has schema-defined column names
                return true;
            case 'csv':
                // CSV header detection (existing logic)
                return this.detectCSVHeaders(row);
            default:
                // Default: assume first row is header if it contains non-numeric values
                return row.some((cell) => isNaN(Number(cell)) && cell.trim() !== '');
        }
    }
    /**
     * Simple heuristic: if more than half the cells are non-numeric, likely headers.
     */
    detectCSVHeaders(row) {
        const nonNumeric = row.filter((cell) => {
            const trimmed = cell.trim();
            return trimmed !== '' && isNaN(Number(trimmed));
        });
        return nonNumeric.length > row.length / 2;
    }
    /**
     * Run the existing 6-section analysis pipeline on the universal dataset.
     * Intelligently uses SequentialExecutor or individual execution based on context.
     */
    async runAnalysisPipeline(dataset, options) {
        // Determine which sections to run based on options
        const requestedSections = this.getRequestedSections(options);
        if (requestedSections.length === 0) {
            logger_1.logger.warn('No sections requested for analysis');
            return {};
        }
        // Intelligent execution mode detection for backward compatibility
        const shouldUseSequentialExecution = this.shouldUseSequentialExecution(requestedSections, options);
        if (shouldUseSequentialExecution) {
            // Use new sequential execution for complex dependencies
            return this.runSequentialExecution(dataset, options, requestedSections);
        }
        else {
            // Use traditional individual execution for single sections without dependencies
            return this.runIndividualExecution(dataset, options, requestedSections);
        }
    }
    /**
     * Determine whether to use sequential execution or individual execution.
     * This maintains backward compatibility while enabling advanced features.
     */
    shouldUseSequentialExecution(requestedSections, options) {
        // Force sequential execution if explicitly requested
        if (options.forceSequential) {
            logger_1.logger.info('Sequential execution forced by --force-sequential flag');
            return true;
        }
        // Force individual execution if explicitly requested (for testing/debugging)
        if (options.forceIndividual) {
            logger_1.logger.info('Individual execution forced by --force-individual flag');
            return false;
        }
        // Check if any section with dependencies is requested
        const sectionsWithDependencies = ['section4', 'section5', 'section6'];
        const hasDependentSection = requestedSections.some(section => sectionsWithDependencies.includes(section));
        // Use sequential execution if:
        // 1. Multiple sections are requested (better memory management)
        // 2. Any section with dependencies is requested
        // 3. Command is 'all' or 'analysis' (full pipeline)
        if (requestedSections.length > 1 || hasDependentSection ||
            ['all', 'analysis', 'modeling'].includes(options.command || '')) {
            logger_1.logger.info('Using sequential execution for optimal dependency resolution', {
                sections: requestedSections,
                reason: hasDependentSection ? 'dependencies' : 'multiple sections',
                command: options.command
            });
            return true;
        }
        // Use individual execution for single sections without dependencies
        logger_1.logger.info('Using individual execution for single section', { section: requestedSections[0], command: options.command });
        return false;
    }
    /**
     * Run sequential execution with full dependency resolution.
     * Falls back to individual execution on failure unless options.fallbackOnError === false.
     */
    async runSequentialExecution(dataset, options, requestedSections) {
        logger_1.logger.info(`Starting analysis pipeline with SequentialExecutor for sections: ${requestedSections.join(', ')}`, { sections: requestedSections, executor: 'SequentialExecutor' });
        try {
            // Import SequentialExecutor (dynamic import to avoid circular dependencies)
            const { createSequentialExecutor } = await Promise.resolve().then(() => __importStar(require('./sequential-executor')));
            // Create progress callbacks for CLI feedback
            const progressCallbacks = {
                onPhaseStart: (phase, message) => {
                    if (options.verbose) {
                        logger_1.logger.info(`Phase started: ${phase} - ${message}`);
                    }
                },
                onProgress: (state) => {
                    if (options.verbose) {
                        logger_1.logger.debug(`Progress: ${state.progress}% - ${state.message}`);
                    }
                },
                onPhaseComplete: (message, timeElapsed) => {
                    if (options.verbose) {
                        logger_1.logger.info(`Phase completed: ${message} (${timeElapsed}ms)`);
                    }
                },
                onError: (message) => {
                    logger_1.logger.error(`Execution error: ${message}`);
                },
                onWarning: (message) => {
                    logger_1.logger.warn(`Execution warning: ${message}`);
                },
            };
            // Create and configure sequential executor
            const executor = createSequentialExecutor(dataset, options, progressCallbacks, {
                operation: 'pipeline_execution',
                filePath: dataset.metadata.filePath,
                format: dataset.metadata.format,
            });
            // Execute with sophisticated dependency resolution and memory management
            const result = await executor.execute(requestedSections);
            if (!result.success) {
                throw new types_1.DataPilotError(`Sequential execution failed: ${result.error}`, 'SEQUENTIAL_EXECUTION_FAILED', types_1.ErrorSeverity.HIGH, types_1.ErrorCategory.ANALYSIS, {}, result.suggestions?.map(suggestion => ({
                    action: 'Follow suggestion',
                    description: suggestion,
                    severity: types_1.ErrorSeverity.MEDIUM,
                })));
            }
            logger_1.logger.info('Sequential execution completed successfully', {
                sectionsCompleted: result.metadata?.sectionsExecuted?.length || 0,
                executionTime: result.metadata?.executionTime || 0,
                memoryPeak: result.metadata?.memoryPeakUsage || 0,
            });
            return result.data;
        }
        catch (error) {
            // If sequential execution fails, fall back to individual execution
            if (options.fallbackOnError !== false) {
                logger_1.logger.warn('Sequential execution failed, falling back to individual execution', { error: error.message });
                return this.runIndividualExecution(dataset, options, requestedSections);
            }
            throw error;
        }
    }
    /**
     * Run individual section execution (legacy mode for backward compatibility).
     * This is used for single sections without dependencies to maintain performance.
     *
     * FIX: previously sections 4 and 5 awaited full section1/2/3 analyses into
     * locals that were never used (runSection4Analysis / runSection5Analysis
     * build their own mocks internally), so expensive analysis results were
     * computed and immediately discarded. Those dead awaits are removed.
     */
    async runIndividualExecution(dataset, options, requestedSections) {
        logger_1.logger.info(`Running individual section execution for: ${requestedSections.join(', ')}`, { sections: requestedSections, executor: 'Individual' });
        const results = {};
        const sectionStartTime = Date.now();
        // Execute each section individually (original behavior)
        for (const section of requestedSections) {
            try {
                switch (section) {
                    case 'section1':
                        if (this.shouldRunSection(1, options)) {
                            results.section1 = await this.runSection1Analysis(dataset, options);
                        }
                        break;
                    case 'section2':
                        if (this.shouldRunSection(2, options)) {
                            results.section2 = await this.runSection2Analysis(dataset, options);
                        }
                        break;
                    case 'section3':
                        if (this.shouldRunSection(3, options)) {
                            results.section3 = await this.runSection3Analysis(dataset, options);
                        }
                        break;
                    case 'section4':
                        if (this.shouldRunSection(4, options)) {
                            // Section 4 dependencies are mocked inside runSection4Analysis;
                            // use sequential execution for real dependency data.
                            results.section4 = await this.runSection4Analysis(dataset, options);
                        }
                        break;
                    case 'section5':
                        if (this.shouldRunSection(5, options)) {
                            // Section 5 dependencies are mocked inside runSection5Analysis;
                            // use sequential execution for real dependency data.
                            results.section5 = await this.runSection5Analysis(dataset, options);
                        }
                        break;
                    case 'section6':
                        if (this.shouldRunSection(6, options)) {
                            // Section 6 needs multiple dependencies - warn about using sequential execution
                            logger_1.logger.warn('Section 6 has complex dependencies. Consider using sequential execution for better results.', { hint: 'Sequential execution would provide real dependency data instead of mocks' });
                            results.section6 = await this.runSection6Analysis(dataset, options);
                        }
                        break;
                }
            }
            catch (error) {
                logger_1.logger.error(`Section ${section} failed in individual execution`, error);
                if (options.continueOnError) {
                    continue;
                }
                throw error;
            }
        }
        const totalTime = Date.now() - sectionStartTime;
        logger_1.logger.info('Individual execution completed', {
            sectionsCompleted: Object.keys(results).length,
            totalTime,
            mode: 'legacy'
        });
        return results;
    }
    /**
     * Determine which sections to run based on CLI options.
     * Accepts both '4' and 'section4' spellings in options.sections.
     */
    getRequestedSections(options) {
        if (options.sections && options.sections.length > 0) {
            // Convert numbered sections to section names
            return options.sections.map(section => {
                if (section.startsWith('section')) {
                    return section;
                }
                return `section${section}`;
            }).filter(section => {
                // Validate section exists
                const validSections = ['section1', 'section2', 'section3', 'section4', 'section5', 'section6'];
                return validSections.includes(section);
            });
        }
        // Default sections based on command
        switch (options.command) {
            case 'overview':
                return ['section1'];
            case 'quality':
                return ['section2'];
            case 'eda':
                return ['section3'];
            case 'visualization':
                return ['section4'];
            case 'engineering':
                return ['section5'];
            case 'modeling':
                return ['section6'];
            case 'all':
                return ['section1', 'section2', 'section3', 'section4', 'section5', 'section6'];
            default:
                return ['section1', 'section2', 'section3', 'section4', 'section5', 'section6'];
        }
    }
    /**
     * Whether the numbered section should run under the given options.
     *
     * FIX: previously only the bare numeric spelling ('4') was matched against
     * options.sections, while getRequestedSections also accepts 'section4' —
     * so `--sections section4` selected the section but this gate then skipped
     * it. Both spellings are now accepted, consistently.
     */
    shouldRunSection(sectionNumber, options) {
        if (options.sections && options.sections.length > 0) {
            const shouldRun = options.sections.includes(sectionNumber.toString()) ||
                options.sections.includes(`section${sectionNumber}`);
            if (options.verbose) {
                logger_1.logger.info(`Section ${sectionNumber} ${shouldRun ? 'ENABLED' : 'SKIPPED'} by --sections parameter`);
            }
            return shouldRun;
        }
        // Default sections based on command
        switch (options.command) {
            case 'overview':
                return sectionNumber === 1;
            case 'quality':
                return sectionNumber === 2;
            case 'eda':
                return sectionNumber === 3;
            case 'visualization':
                return sectionNumber === 4;
            case 'engineering':
                return sectionNumber === 5;
            case 'modeling':
                return sectionNumber === 6;
            case 'all':
                return true;
            default:
                return true;
        }
    }
    // Section analysis methods (these adapt existing analyzers to work with universal dataset)
    async runSection1Analysis(dataset, options) {
        const analyzer = new overview_1.Section1Analyzer({
            enableFileHashing: options.enableHashing !== false,
            includeHostEnvironment: options.includeEnvironment !== false,
            privacyMode: options.privacyMode || 'redacted',
            detailedProfiling: options.verbose || false,
            maxSampleSizeForSparsity: 10000,
            enableCompressionAnalysis: options.enableCompressionAnalysis !== false,
            enableDataPreview: options.enableDataPreview !== false,
            previewRows: options.previewRows || 5,
            enableHealthChecks: options.enableHealthChecks !== false,
            enableQuickStatistics: options.enableQuickStats !== false,
        });
        // Section1 expects filePath, command, and analysis sections
        return analyzer.analyze(dataset.metadata.filePath, `datapilot ${options.command || 'analysis'} ${dataset.metadata.filePath}`, []);
    }
    async runSection2Analysis(dataset, options) {
        const analyzer = new quality_1.Section2Analyzer({
            data: dataset.rows,
            headers: dataset.headers,
            // All columns typed as string here; real type inference happens downstream.
            columnTypes: dataset.headers.map(() => 'string'),
            rowCount: dataset.rows.length,
            columnCount: dataset.headers.length,
            config: {
                enabledDimensions: ['completeness', 'uniqueness', 'validity'],
                strictMode: false,
                maxOutlierDetection: 100,
                semanticDuplicateThreshold: 0.85,
            },
        });
        return analyzer.analyze();
    }
    async runSection3Analysis(dataset, options) {
        const analyzer = new streaming_analyzer_1.StreamingAnalyzer({
            chunkSize: options.chunkSize || 500,
            memoryThresholdMB: options.memoryLimit || 100,
            maxRowsAnalyzed: options.maxRows || 500000,
            enabledAnalyses: ['univariate', 'bivariate', 'correlations'],
            significanceLevel: 0.05,
            maxCorrelationPairs: 50,
            enableMultivariate: true,
        });
        // Section3 expects a file path, not data stream
        return analyzer.analyzeFile(dataset.metadata.filePath);
    }
    async runSection4Analysis(dataset, options) {
        const analyzer = new visualization_1.Section4Analyzer({
            accessibilityLevel: options.accessibility || 'good',
            complexityThreshold: options.complexity || 'moderate',
            maxRecommendationsPerChart: options.maxRecommendations || 3,
            includeCodeExamples: options.includeCode || false,
            enabledRecommendations: [
                'UNIVARIATE',
                'BIVARIATE',
                'DASHBOARD',
                'ACCESSIBILITY',
                'PERFORMANCE',
            ],
            targetLibraries: ['d3', 'plotly', 'observable'],
        });
        // Section4 needs dependencies from previous sections.
        // For now, we'll need to create mock dependencies;
        // this will be replaced with actual dependency resolution.
        const mockSection1 = {
            overview: { structuralDimensions: { totalDataRows: dataset.rows.length } },
        };
        const mockSection3 = { performanceMetrics: { rowsAnalyzed: dataset.rows.length } };
        return analyzer.analyze(mockSection1, mockSection3);
    }
    async runSection5Analysis(dataset, options) {
        const analyzer = new engineering_1.Section5Analyzer({
            targetDatabaseSystem: options.database || 'postgresql',
            mlFrameworkTarget: options.framework || 'scikit_learn',
        });
        // Section5 needs dependencies from previous sections.
        // Create more complete mock data that matches expected structure.
        const mockSection1 = {
            overview: {
                structuralDimensions: {
                    totalDataRows: dataset.rows.length,
                    totalColumns: dataset.headers.length,
                    columnInventory: dataset.headers.map((header, index) => ({
                        name: header,
                        index: index,
                        dataType: 'string',
                        sampleValues: dataset.rows.slice(0, 3).map(row => row[index] || '').filter(v => v)
                    })),
                    // Rough estimate: ~50 bytes per cell.
                    estimatedInMemorySizeMB: Math.ceil(dataset.rows.length * dataset.headers.length * 50 / 1024 / 1024)
                },
                fileDetails: {
                    originalFilename: dataset.metadata.filePath.split('/').pop() || 'unknown.csv',
                    fileSizeBytes: dataset.metadata.parserStats?.totalBytesRead || dataset.rows.length * dataset.headers.length * 10,
                    fileSizeMB: (dataset.metadata.parserStats?.totalBytesRead || dataset.rows.length * dataset.headers.length * 10) / 1024 / 1024,
                    lastModified: new Date()
                },
                parsingMetadata: {
                    encoding: { encoding: 'utf-8' }
                }
            }
        };
        const mockSection2 = {
            qualityAudit: {
                cockpit: {
                    compositeScore: { score: 85 }
                }
            }
        };
        const mockSection3 = {
            performanceMetrics: { rowsAnalyzed: dataset.rows.length },
            edaAnalysis: {
                // Safe structure that won't cause crashes in PCA extraction
                multivariateAnalysis: null
            }
        };
        return analyzer.analyze(mockSection1, mockSection2, mockSection3);
    }
    async runSection6Analysis(dataset, options) {
        const analyzer = new modeling_1.Section6Analyzer({
            focusAreas: options.focus || ['regression', 'binary_classification', 'clustering'],
            complexityPreference: options.complexity || 'moderate',
            interpretabilityRequirement: options.interpretability || 'medium',
        });
        // Section6 needs dependencies from previous sections
        const mockSection1 = {
            overview: {
                structuralDimensions: {
                    totalDataRows: dataset.rows.length,
                    totalColumns: dataset.headers.length,
                    columnInventory: dataset.headers.map((header, index) => ({
                        name: header,
                        index: index,
                        originalIndex: index,
                        dataType: 'string',
                        sampleValues: dataset.rows.slice(0, 3).map(row => row[index] || '').filter(v => v)
                    })),
                    estimatedInMemorySizeMB: Math.ceil(dataset.rows.length * dataset.headers.length * 50 / 1024 / 1024)
                },
                fileDetails: {
                    originalFilename: dataset.metadata.filePath.split('/').pop() || 'unknown.csv',
                    fileSizeBytes: dataset.metadata.parserStats?.totalBytesRead || dataset.rows.length * dataset.headers.length * 10,
                    fileSizeMB: (dataset.metadata.parserStats?.totalBytesRead || dataset.rows.length * dataset.headers.length * 10) / 1024 / 1024,
                    lastModified: new Date()
                },
                parsingMetadata: {
                    encoding: { encoding: 'utf-8' }
                }
            },
        };
        const mockSection2 = { qualityAudit: { overallScore: 85 } };
        const mockSection3 = {
            performanceMetrics: { rowsAnalyzed: dataset.rows.length },
            edaAnalysis: {
                univariateAnalysis: [], // Empty array for safe iteration
                bivariateAnalysis: {
                    numericalVsNumerical: {
                        correlationPairs: [] // Empty array for safe iteration
                    }
                },
                multivariateAnalysis: {
                    principalComponentAnalysis: null,
                    clusteringAnalysis: null,
                    outlierAnalysis: {
                        multivariateOutliers: [],
                        outlierSummary: {
                            totalOutliers: 0,
                            outlierPercentage: 0,
                            method: 'IQR',
                            detectionThreshold: 1.5,
                        }
                    },
                    normalityTests: {
                        overallNormality: {
                            isNormal: true,
                            confidence: 0.95,
                            testMethod: 'Shapiro-Wilk',
                        }
                    }
                }
            }
        };
        const mockSection5 = {
            engineeringAnalysis: {
                mlReadiness: {
                    overallScore: 85,
                    enhancingFactors: [
                        {
                            factor: "Clean Data Structure",
                            impact: "high",
                            description: "Well-structured data with consistent formatting"
                        }
                    ],
                    remainingChallenges: [
                        {
                            challenge: "Type Detection",
                            severity: "medium",
                            impact: "May require manual type specification",
                            mitigationStrategy: "Implement enhanced type detection",
                            estimatedEffort: "2-4 hours"
                        }
                    ],
                    featurePreparationMatrix: dataset.headers.map(header => ({
                        featureName: `ml_${header}`,
                        originalColumn: header,
                        finalDataType: "String",
                        keyIssues: ["Type detection needed"],
                        engineeringSteps: ["Type inference", "Encoding if categorical"],
                        finalMLFeatureType: "Categorical",
                        modelingNotes: []
                    })),
                    modelingConsiderations: []
                }
            }
        };
        return analyzer.analyze(mockSection1, mockSection2, mockSection3, mockSection5);
    }
    /**
     * Create async iterable data stream from dataset rows.
     */
    async *createDataStream(dataset) {
        for (const row of dataset.rows) {
            yield row;
        }
    }
    /**
     * Handle analysis errors with enhanced debugging information.
     * Builds a non-throwing error envelope (success: false, exitCode: 1).
     */
    handleAnalysisError(error, filePath, options) {
        const supportedFormats = this.registry.getSupportedFormats();
        const supportedExtensions = this.registry.getSupportedExtensions();
        let errorMessage = 'Analysis failed';
        let enhancedSuggestions = [];
        let errorDetails = {};
        if (error instanceof types_1.DataPilotError) {
            // Enhanced error handling for DataPilotError
            errorMessage = error.getFormattedMessage(options?.verbose || false);
            enhancedSuggestions = error.getEnhancedSuggestions(options?.verbose || false);
            if (options?.verbose && error.verboseInfo) {
                errorDetails = {
                    fullContext: error.verboseInfo.fullContext,
                    performanceMetrics: error.verboseInfo.performanceMetrics,
                    memorySnapshot: error.verboseInfo.memorySnapshot,
                };
            }
        }
        else {
            // Convert generic error to enhanced format
            errorMessage = error instanceof Error ? error.message : 'Unknown analysis error';
            if (options?.verbose) {
                errorMessage += `\n Stack: ${error instanceof Error ? error.stack?.split('\n').slice(0, 3).join('\n ') : 'No stack available'}`;
            }
        }
        // Default suggestions enhanced with debugging context
        const defaultSuggestions = [
            `Check if file format is supported: ${supportedFormats.join(', ')}`,
            `Supported extensions: ${supportedExtensions.join(', ')}`,
            'Try specifying format explicitly: --format json',
            'Verify file is not corrupted',
            'Check file permissions',
        ];
        if (options?.verbose) {
            defaultSuggestions.push('Run with --verbose for more detailed error information', 'Check memory usage with system monitor during analysis', 'Use --maxRows to limit data size for testing');
        }
        else {
            defaultSuggestions.push('Use --verbose for detailed debugging information');
        }
        defaultSuggestions.push('Use --help for more information');
        // Combine enhanced suggestions with defaults
        const allSuggestions = enhancedSuggestions.length > 0 ?
            [...enhancedSuggestions, '---', ...defaultSuggestions] :
            defaultSuggestions;
        return {
            success: false,
            exitCode: 1,
            error: errorMessage,
            suggestions: allSuggestions,
            metadata: {
                filePath,
                supportedFormats,
                supportedExtensions,
                timestamp: new Date().toISOString(),
                errorCategory: error instanceof types_1.DataPilotError ? error.category : 'unknown',
                errorSeverity: error instanceof types_1.DataPilotError ? error.severity : 'medium',
                verboseMode: options?.verbose || false,
                ...(options?.verbose && errorDetails ? { errorDetails } : {}),
            },
        };
    }
    /**
     * Get supported formats for help/error messages.
     */
    getSupportedFormats() {
        this.initializeParsers();
        return this.registry.getSupportedFormats();
    }
    /**
     * Validate file format is supported. Never throws; failures are reported
     * as { supported: false, suggestions }.
     */
    async validateFile(filePath) {
        this.initializeParsers();
        try {
            const validation = await this.registry.validateFile(filePath);
            if (validation.supported && validation.bestMatch) {
                return {
                    supported: true,
                    format: validation.bestMatch.format,
                    confidence: validation.bestMatch.detection.confidence,
                    suggestions: [],
                };
            }
            else {
                return {
                    supported: false,
                    suggestions: [
                        `File format not supported or confidence too low`,
                        `Supported formats: ${this.getSupportedFormats().join(', ')}`,
                        'Try converting to a supported format',
                        'Check if file is corrupted',
                    ],
                };
            }
        }
        catch (error) {
            return {
                supported: false,
                suggestions: [
                    `Error validating file: ${error.message}`,
                    'Check file exists and is readable',
                    'Verify file format is supported',
                ],
            };
        }
    }
}
// Publish the class on the CommonJS exports object (slot declared at top of file).
exports.UniversalAnalyzer = UniversalAnalyzer;
//# sourceMappingURL=universal-analyzer.js.map