/*
 * datapilot-cli — universal-analyzer.js (compiled output)
 * Version: 1.2.1
 * Enterprise-grade streaming multi-format data analysis with comprehensive
 * statistical insights and intelligent relationship detection — supports CSV,
 * JSON, Excel, TSV, Parquet — memory-efficient, cross-platform.
 * (938 lines, 43 kB, JavaScript)
 */
;
/**
* Universal Analyzer - Multi-format data analysis orchestrator
* Integrates with the existing 6-section analysis pipeline
*/
// TypeScript-emitted helper: re-exports property `k` of module `m` onto object `o`
// as `k2` (defaults to `k`). Uses a live getter when supported so the binding
// tracks later mutations of the source module; falls back to a plain copy.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    // Replace the descriptor when the source is a CommonJS module (no __esModule
    // marker behind a getter) or the property is writable/configurable.
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
// TypeScript-emitted helper: attaches the original module `v` as the `default`
// property of the namespace object `o`, so `import mod from '...'` interop works.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
// TypeScript-emitted helper: implements `import * as ns from '...'` interop.
// Real ES modules pass through untouched; CommonJS modules are wrapped in a
// fresh namespace object with all own keys re-bound plus a `default` property.
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        // Lazily pick the key-enumeration strategy once, then self-replace so
        // subsequent calls skip the feature check.
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        // Copy every own key except "default", which is set separately below.
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
// Mark this CommonJS module as transpiled-from-ESM and pre-declare the export
// slot (assigned at the bottom of the file after the class definition).
Object.defineProperty(exports, "__esModule", { value: true });
exports.UniversalAnalyzer = void 0;
const parser_registry_1 = require("../parsers/base/parser-registry");
const json_parser_1 = require("../parsers/json-parser");
const csv_parser_adapter_1 = require("../parsers/adapters/csv-parser-adapter");
const excel_parser_1 = require("../parsers/excel-parser");
const tsv_parser_1 = require("../parsers/tsv-parser");
const parquet_parser_1 = require("../parsers/parquet-parser");
const types_1 = require("../core/types");
const logger_1 = require("../utils/logger");
const error_handler_1 = require("../utils/error-handler");
// Import existing analyzers (these remain unchanged)
const overview_1 = require("../analyzers/overview");
const quality_1 = require("../analyzers/quality");
const streaming_analyzer_1 = require("../analyzers/streaming/streaming-analyzer");
const visualization_1 = require("../analyzers/visualization");
const engineering_1 = require("../analyzers/engineering");
const modeling_1 = require("../analyzers/modeling");
/**
* Universal analyzer that works with any supported data format
*/
class UniversalAnalyzer {
    // Single source of truth for the version stamped into result metadata.
    // (Previously duplicated as an inline '1.2.1' literal in two methods.)
    static VERSION = '1.2.1';
    // Shared parser registry: format detection + parser factories.
    registry;
    // Guards initializeParsers() against registering parsers more than once.
    initialized = false;
    constructor() {
        this.registry = parser_registry_1.globalParserRegistry;
    }
    /**
     * Initialize the analyzer with all available parsers.
     * Idempotent: subsequent calls are no-ops. Registration priority (higher
     * wins on ambiguous detection): CSV 100, TSV 90, JSON 80, Excel 70, Parquet 60.
     */
    initializeParsers() {
        if (this.initialized)
            return;
        // Register CSV parser (highest priority for backwards compatibility)
        this.registry.register({
            format: 'csv',
            parserFactory: (options) => (0, csv_parser_adapter_1.createCSVParserAdapter)(options),
            detector: {
                detect: async (filePath) => {
                    const adapter = (0, csv_parser_adapter_1.createCSVParserAdapter)();
                    return adapter.detect(filePath);
                },
                getSupportedExtensions: () => ['.csv'],
                getFormatName: () => 'csv',
            },
            priority: 100,
            extensions: ['.csv'],
        });
        // Register TSV parser
        this.registry.register({
            format: 'tsv',
            parserFactory: (options) => (0, tsv_parser_1.createTSVParser)(options),
            detector: new tsv_parser_1.TSVDetector(),
            priority: 90,
            extensions: ['.tsv', '.tab'],
        });
        // Register JSON parser
        this.registry.register({
            format: 'json',
            parserFactory: (options) => (0, json_parser_1.createJSONParser)(options),
            detector: new json_parser_1.JSONDetector(),
            priority: 80,
            extensions: ['.json', '.jsonl', '.ndjson'],
        });
        // Register Excel parser
        this.registry.register({
            format: 'excel',
            parserFactory: (options) => (0, excel_parser_1.createExcelParser)(options),
            detector: new excel_parser_1.ExcelDetector(),
            priority: 70,
            extensions: ['.xlsx', '.xls', '.xlsm'],
        });
        // Register Parquet parser
        this.registry.register({
            format: 'parquet',
            parserFactory: (options) => (0, parquet_parser_1.createParquetParser)(options),
            detector: new parquet_parser_1.ParquetDetector(),
            priority: 60,
            extensions: ['.parquet'],
        });
        this.initialized = true;
        logger_1.logger.info(`Initialized universal analyzer with ${this.registry.getSupportedFormats().length} formats`);
    }
    /**
     * Analyze multiple files for join analysis (engineering command with multiple files).
     *
     * @param filePaths - paths of all files to include in the join analysis
     * @param options   - CLI options (confidence threshold, output format, ...)
     * @returns a CLI result envelope; on failure, an error envelope from handleAnalysisError
     */
    async analyzeMultipleFiles(filePaths, options) {
        this.initializeParsers();
        try {
            logger_1.logger.info(`Starting multi-file join analysis for: ${filePaths.join(', ')}`);
            // Validate all files exist and are supported before doing any heavy work.
            for (const filePath of filePaths) {
                const validation = await this.validateFile(filePath);
                if (!validation.supported) {
                    throw new types_1.DataPilotError(`File ${filePath} is not supported: ${validation.suggestions.join(', ')}`, 'MULTI_FILE_VALIDATION_ERROR', types_1.ErrorSeverity.HIGH, types_1.ErrorCategory.VALIDATION);
                }
            }
            // Dynamic import avoids a circular dependency with the joins analyzer.
            const { JoinAnalyzer } = await Promise.resolve().then(() => __importStar(require('../analyzers/joins')));
            const joinAnalyzer = new JoinAnalyzer({
                maxTables: Math.max(10, filePaths.length),
                confidenceThreshold: options.confidence || 0.7,
                enableFuzzyMatching: true,
                enableSemanticAnalysis: true,
                enableTemporalJoins: false,
                performanceMode: 'BALANCED',
                outputFormats: [{ type: 'MARKDOWN' }]
            });
            const joinResult = await joinAnalyzer.analyzeJoins(filePaths);
            return {
                success: true,
                exitCode: 0,
                data: {
                    joinAnalysis: joinResult
                },
                format: options.format || 'markdown',
                metadata: {
                    command: 'engineering',
                    filePaths,
                    analysisType: 'multi-file-join',
                    filesAnalyzed: filePaths.length,
                    timestamp: new Date().toISOString(),
                    version: UniversalAnalyzer.VERSION,
                },
            };
        }
        catch (error) {
            return this.handleAnalysisError(error, filePaths.join(', '));
        }
    }
    /**
     * Analyze any supported file format through the full pipeline:
     * detect format -> validate -> parse to common dataset -> run 6-section analysis.
     *
     * @param filePath - path of the file to analyze
     * @param options  - CLI options; note options.format is the OUTPUT format
     * @returns a CLI result envelope (success or error)
     */
    async analyzeFile(filePath, options) {
        this.initializeParsers();
        // Enable verbose mode in error handler if verbose CLI option is set
        error_handler_1.globalErrorHandler.setVerboseMode(options.verbose || false);
        return await error_handler_1.ErrorUtils.withEnhancedContext(async () => {
            // 1. Auto-detect format and get parser
            logger_1.logger.info(`Starting universal analysis for: ${filePath}`);
            // Convert CLIOptions to ParseOptions
            const parseOptions = {
                maxRows: options.maxRows,
                encoding: options.encoding,
                // NOTE: Do NOT pass options.format here - that's for OUTPUT format, not input file format
                delimiter: options.delimiter,
                quote: options.quote,
                hasHeader: options.hasHeader,
                jsonPath: options.jsonPath,
                arrayMode: options.arrayMode,
                flattenObjects: options.flattenObjects,
                sheetName: options.sheetName,
                sheetIndex: options.sheetIndex,
                columns: options.columns,
                rowStart: options.rowStart,
                rowEnd: options.rowEnd,
            };
            const { parser, format, detection } = await error_handler_1.ErrorUtils.withEnhancedContext(() => this.registry.getParser(filePath, parseOptions), {
                operationName: 'format_detection',
                filePath,
                additionalContext: { parseOptions }
            });
            logger_1.logger.info(`Detected format: ${format} (confidence: ${(detection.confidence * 100).toFixed(1)}%)`);
            // 2. Validate file can be parsed
            const validation = await error_handler_1.ErrorUtils.withEnhancedContext(() => parser.validate(filePath), {
                operationName: 'parser_validation',
                filePath,
                additionalContext: { format, confidence: detection.confidence }
            });
            if (!validation.canProceed) {
                throw error_handler_1.ErrorUtils.createContextualError(`Cannot parse file: ${validation.errors.join(', ')}`, 'UNIVERSAL_PARSE_ERROR', types_1.ErrorCategory.VALIDATION, types_1.ErrorSeverity.HIGH, {
                    operationName: 'parser_validation',
                    filePath,
                    additionalContext: {
                        format,
                        parserErrors: validation.errors,
                        parserWarnings: validation.warnings
                    }
                });
            }
            if (validation.warnings.length > 0) {
                logger_1.logger.warn(`Parsing warnings: ${validation.warnings.join(', ')}`);
            }
            // 3. Convert to common dataset format
            const dataset = await error_handler_1.ErrorUtils.withEnhancedContext(() => this.parseToDataset(parser, filePath, parseOptions, format, detection), {
                operationName: 'dataset_conversion',
                filePath,
                additionalContext: { format, parseOptions }
            });
            // 4. Run the same 6-section analysis pipeline
            const analysisResult = await error_handler_1.ErrorUtils.withEnhancedContext(() => this.runAnalysisPipeline(dataset, options), {
                operationName: 'analysis_pipeline',
                filePath,
                additionalContext: {
                    format,
                    datasetSize: dataset.rows.length,
                    columns: dataset.headers.length
                }
            });
            return {
                success: true,
                exitCode: 0,
                data: analysisResult,
                format: options.format || 'markdown',
                metadata: {
                    command: options.command || 'all',
                    filePath,
                    originalFormat: format,
                    detection: {
                        format,
                        confidence: detection.confidence,
                        metadata: detection.metadata,
                    },
                    parserStats: parser.getStats(),
                    timestamp: new Date().toISOString(),
                    version: UniversalAnalyzer.VERSION, // Multi-format support version
                },
            };
        }, {
            operationName: 'universal_file_analysis',
            filePath,
            additionalContext: {
                command: options.command,
                verboseMode: options.verbose
            }
        }).catch((error) => {
            return this.handleAnalysisError(error, filePath, options);
        });
    }
    /**
     * Parse file using detected parser and convert to universal dataset format.
     * Materializes all rows in memory; maxRows should bound this for large files.
     *
     * @returns { headers, rows, metadata } dataset consumed by the section analyzers
     */
    async parseToDataset(parser, filePath, options, format, detection) {
        const rows = [];
        let headers = [];
        let hasHeader = false;
        logger_1.logger.info('Parsing file to dataset format...');
        // Parse file and collect rows
        for await (const row of parser.parse(filePath, {
            maxRows: options.maxRows,
            hasHeader: options.hasHeader,
            encoding: options.encoding,
            delimiter: options.delimiter,
            quote: options.quote,
            jsonPath: options.jsonPath,
            arrayMode: options.arrayMode,
            flattenObjects: options.flattenObjects,
            sheetName: options.sheetName,
            sheetIndex: options.sheetIndex,
            columns: options.columns,
            rowStart: options.rowStart,
            rowEnd: options.rowEnd,
        })) {
            if (row.index === 0 && !hasHeader) {
                // First row - determine if it's headers or data.
                // NOTE(review): this heuristic runs even when options.hasHeader was set
                // explicitly; presumably the parser already honored it — confirm.
                const isHeaderRow = this.detectHeaderRow(row.data, format);
                if (isHeaderRow) {
                    headers = row.data;
                    hasHeader = true;
                    continue;
                }
                else {
                    // Generate column names
                    headers = row.data.map((_, i) => `column_${i + 1}`);
                }
            }
            rows.push(row.data);
        }
        const stats = parser.getStats();
        logger_1.logger.info(`Parsed ${rows.length} rows with ${headers.length} columns`);
        return {
            headers,
            rows,
            metadata: {
                format,
                filePath,
                totalRows: rows.length,
                parserStats: stats,
                detection,
            },
        };
    }
    /**
     * Detect if first row contains headers, using format-specific knowledge.
     */
    detectHeaderRow(row, format) {
        switch (format) {
            case 'json':
                // JSON usually has meaningful keys as headers
                return true;
            case 'parquet':
                // Parquet always has schema-defined column names
                return true;
            case 'csv':
                // CSV header detection (existing logic)
                return this.detectCSVHeaders(row);
            default:
                // Default: assume first row is header if it contains non-numeric values
                return row.some((cell) => isNaN(Number(cell)) && cell.trim() !== '');
        }
    }
    /**
     * Simple heuristic: if more than half the cells are non-numeric, likely headers.
     */
    detectCSVHeaders(row) {
        const nonNumeric = row.filter((cell) => {
            const trimmed = cell.trim();
            return trimmed !== '' && isNaN(Number(trimmed));
        });
        return nonNumeric.length > row.length / 2;
    }
    /**
     * Run the existing 6-section analysis pipeline on the universal dataset.
     * Intelligently uses SequentialExecutor or individual execution based on context.
     */
    async runAnalysisPipeline(dataset, options) {
        // Determine which sections to run based on options
        const requestedSections = this.getRequestedSections(options);
        if (requestedSections.length === 0) {
            logger_1.logger.warn('No sections requested for analysis');
            return {};
        }
        // Intelligent execution mode detection for backward compatibility
        const shouldUseSequentialExecution = this.shouldUseSequentialExecution(requestedSections, options);
        if (shouldUseSequentialExecution) {
            // Use new sequential execution for complex dependencies
            return this.runSequentialExecution(dataset, options, requestedSections);
        }
        else {
            // Use traditional individual execution for single sections without dependencies
            return this.runIndividualExecution(dataset, options, requestedSections);
        }
    }
    /**
     * Determine whether to use sequential execution or individual execution.
     * This maintains backward compatibility while enabling advanced features.
     */
    shouldUseSequentialExecution(requestedSections, options) {
        // Force sequential execution if explicitly requested
        if (options.forceSequential) {
            logger_1.logger.info('Sequential execution forced by --force-sequential flag');
            return true;
        }
        // Force individual execution if explicitly requested (for testing/debugging)
        if (options.forceIndividual) {
            logger_1.logger.info('Individual execution forced by --force-individual flag');
            return false;
        }
        // Check if any section with dependencies is requested
        const sectionsWithDependencies = ['section4', 'section5', 'section6'];
        const hasDependentSection = requestedSections.some(section => sectionsWithDependencies.includes(section));
        // Use sequential execution if:
        // 1. Multiple sections are requested (better memory management)
        // 2. Any section with dependencies is requested
        // 3. Command is 'all' or 'analysis' (full pipeline)
        if (requestedSections.length > 1 || hasDependentSection ||
            ['all', 'analysis', 'modeling'].includes(options.command || '')) {
            logger_1.logger.info('Using sequential execution for optimal dependency resolution', {
                sections: requestedSections,
                reason: hasDependentSection ? 'dependencies' : 'multiple sections',
                command: options.command
            });
            return true;
        }
        // Use individual execution for single sections without dependencies
        logger_1.logger.info('Using individual execution for single section', { section: requestedSections[0], command: options.command });
        return false;
    }
    /**
     * Run sequential execution with full dependency resolution.
     * Falls back to individual execution on failure unless options.fallbackOnError === false.
     */
    async runSequentialExecution(dataset, options, requestedSections) {
        logger_1.logger.info(`Starting analysis pipeline with SequentialExecutor for sections: ${requestedSections.join(', ')}`, { sections: requestedSections, executor: 'SequentialExecutor' });
        try {
            // Import SequentialExecutor (dynamic import to avoid circular dependencies)
            const { createSequentialExecutor } = await Promise.resolve().then(() => __importStar(require('./sequential-executor')));
            // Create progress callbacks for CLI feedback
            const progressCallbacks = {
                onPhaseStart: (phase, message) => {
                    if (options.verbose) {
                        logger_1.logger.info(`Phase started: ${phase} - ${message}`);
                    }
                },
                onProgress: (state) => {
                    if (options.verbose) {
                        logger_1.logger.debug(`Progress: ${state.progress}% - ${state.message}`);
                    }
                },
                onPhaseComplete: (message, timeElapsed) => {
                    if (options.verbose) {
                        logger_1.logger.info(`Phase completed: ${message} (${timeElapsed}ms)`);
                    }
                },
                onError: (message) => {
                    logger_1.logger.error(`Execution error: ${message}`);
                },
                onWarning: (message) => {
                    logger_1.logger.warn(`Execution warning: ${message}`);
                },
            };
            // Create and configure sequential executor
            const executor = createSequentialExecutor(dataset, options, progressCallbacks, {
                operation: 'pipeline_execution',
                filePath: dataset.metadata.filePath,
                format: dataset.metadata.format,
            });
            // Execute with sophisticated dependency resolution and memory management
            const result = await executor.execute(requestedSections);
            if (!result.success) {
                throw new types_1.DataPilotError(`Sequential execution failed: ${result.error}`, 'SEQUENTIAL_EXECUTION_FAILED', types_1.ErrorSeverity.HIGH, types_1.ErrorCategory.ANALYSIS, {}, result.suggestions?.map(suggestion => ({
                    action: 'Follow suggestion',
                    description: suggestion,
                    severity: types_1.ErrorSeverity.MEDIUM,
                })));
            }
            logger_1.logger.info('Sequential execution completed successfully', {
                sectionsCompleted: result.metadata?.sectionsExecuted?.length || 0,
                executionTime: result.metadata?.executionTime || 0,
                memoryPeak: result.metadata?.memoryPeakUsage || 0,
            });
            return result.data;
        }
        catch (error) {
            // If sequential execution fails, fall back to individual execution
            if (options.fallbackOnError !== false) {
                logger_1.logger.warn('Sequential execution failed, falling back to individual execution', { error: error.message });
                return this.runIndividualExecution(dataset, options, requestedSections);
            }
            throw error;
        }
    }
    /**
     * Run individual section execution (legacy mode for backward compatibility).
     * This is used for single sections without dependencies to maintain performance.
     *
     * FIX: previously sections 4 and 5 awaited full section1/2/3 analyses into
     * locals that were never used (runSection4Analysis / runSection5Analysis
     * build their own mocks internally), so expensive analysis results were
     * computed and immediately discarded. Those dead awaits are removed.
     */
    async runIndividualExecution(dataset, options, requestedSections) {
        logger_1.logger.info(`Running individual section execution for: ${requestedSections.join(', ')}`, { sections: requestedSections, executor: 'Individual' });
        const results = {};
        const sectionStartTime = Date.now();
        // Execute each section individually (original behavior)
        for (const section of requestedSections) {
            try {
                switch (section) {
                    case 'section1':
                        if (this.shouldRunSection(1, options)) {
                            results.section1 = await this.runSection1Analysis(dataset, options);
                        }
                        break;
                    case 'section2':
                        if (this.shouldRunSection(2, options)) {
                            results.section2 = await this.runSection2Analysis(dataset, options);
                        }
                        break;
                    case 'section3':
                        if (this.shouldRunSection(3, options)) {
                            results.section3 = await this.runSection3Analysis(dataset, options);
                        }
                        break;
                    case 'section4':
                        if (this.shouldRunSection(4, options)) {
                            // Section 4 dependencies are mocked inside runSection4Analysis;
                            // use sequential execution for real dependency data.
                            results.section4 = await this.runSection4Analysis(dataset, options);
                        }
                        break;
                    case 'section5':
                        if (this.shouldRunSection(5, options)) {
                            // Section 5 dependencies are mocked inside runSection5Analysis;
                            // use sequential execution for real dependency data.
                            results.section5 = await this.runSection5Analysis(dataset, options);
                        }
                        break;
                    case 'section6':
                        if (this.shouldRunSection(6, options)) {
                            // Section 6 needs multiple dependencies - warn about using sequential execution
                            logger_1.logger.warn('Section 6 has complex dependencies. Consider using sequential execution for better results.', { hint: 'Sequential execution would provide real dependency data instead of mocks' });
                            results.section6 = await this.runSection6Analysis(dataset, options);
                        }
                        break;
                }
            }
            catch (error) {
                logger_1.logger.error(`Section ${section} failed in individual execution`, error);
                if (options.continueOnError) {
                    continue;
                }
                throw error;
            }
        }
        const totalTime = Date.now() - sectionStartTime;
        logger_1.logger.info('Individual execution completed', {
            sectionsCompleted: Object.keys(results).length,
            totalTime,
            mode: 'legacy'
        });
        return results;
    }
    /**
     * Determine which sections to run based on CLI options.
     * Accepts both '4' and 'section4' spellings in options.sections.
     */
    getRequestedSections(options) {
        if (options.sections && options.sections.length > 0) {
            // Convert numbered sections to section names
            return options.sections.map(section => {
                if (section.startsWith('section')) {
                    return section;
                }
                return `section${section}`;
            }).filter(section => {
                // Validate section exists
                const validSections = ['section1', 'section2', 'section3', 'section4', 'section5', 'section6'];
                return validSections.includes(section);
            });
        }
        // Default sections based on command
        switch (options.command) {
            case 'overview':
                return ['section1'];
            case 'quality':
                return ['section2'];
            case 'eda':
                return ['section3'];
            case 'visualization':
                return ['section4'];
            case 'engineering':
                return ['section5'];
            case 'modeling':
                return ['section6'];
            case 'all':
                return ['section1', 'section2', 'section3', 'section4', 'section5', 'section6'];
            default:
                return ['section1', 'section2', 'section3', 'section4', 'section5', 'section6'];
        }
    }
    /**
     * Whether the numbered section should run under the given options.
     *
     * FIX: previously only the bare numeric spelling ('4') was matched against
     * options.sections, while getRequestedSections also accepts 'section4' —
     * so `--sections section4` selected the section but this gate then skipped
     * it. Both spellings are now accepted, consistently.
     */
    shouldRunSection(sectionNumber, options) {
        if (options.sections && options.sections.length > 0) {
            const shouldRun = options.sections.includes(sectionNumber.toString()) ||
                options.sections.includes(`section${sectionNumber}`);
            if (options.verbose) {
                logger_1.logger.info(`Section ${sectionNumber} ${shouldRun ? 'ENABLED' : 'SKIPPED'} by --sections parameter`);
            }
            return shouldRun;
        }
        // Default sections based on command
        switch (options.command) {
            case 'overview':
                return sectionNumber === 1;
            case 'quality':
                return sectionNumber === 2;
            case 'eda':
                return sectionNumber === 3;
            case 'visualization':
                return sectionNumber === 4;
            case 'engineering':
                return sectionNumber === 5;
            case 'modeling':
                return sectionNumber === 6;
            case 'all':
                return true;
            default:
                return true;
        }
    }
    // Section analysis methods (these adapt existing analyzers to work with universal dataset)
    async runSection1Analysis(dataset, options) {
        const analyzer = new overview_1.Section1Analyzer({
            enableFileHashing: options.enableHashing !== false,
            includeHostEnvironment: options.includeEnvironment !== false,
            privacyMode: options.privacyMode || 'redacted',
            detailedProfiling: options.verbose || false,
            maxSampleSizeForSparsity: 10000,
            enableCompressionAnalysis: options.enableCompressionAnalysis !== false,
            enableDataPreview: options.enableDataPreview !== false,
            previewRows: options.previewRows || 5,
            enableHealthChecks: options.enableHealthChecks !== false,
            enableQuickStatistics: options.enableQuickStats !== false,
        });
        // Section1 expects filePath, command, and analysis sections
        return analyzer.analyze(dataset.metadata.filePath, `datapilot ${options.command || 'analysis'} ${dataset.metadata.filePath}`, []);
    }
    async runSection2Analysis(dataset, options) {
        const analyzer = new quality_1.Section2Analyzer({
            data: dataset.rows,
            headers: dataset.headers,
            // All columns typed as string here; real type inference happens downstream.
            columnTypes: dataset.headers.map(() => 'string'),
            rowCount: dataset.rows.length,
            columnCount: dataset.headers.length,
            config: {
                enabledDimensions: ['completeness', 'uniqueness', 'validity'],
                strictMode: false,
                maxOutlierDetection: 100,
                semanticDuplicateThreshold: 0.85,
            },
        });
        return analyzer.analyze();
    }
    async runSection3Analysis(dataset, options) {
        const analyzer = new streaming_analyzer_1.StreamingAnalyzer({
            chunkSize: options.chunkSize || 500,
            memoryThresholdMB: options.memoryLimit || 100,
            maxRowsAnalyzed: options.maxRows || 500000,
            enabledAnalyses: ['univariate', 'bivariate', 'correlations'],
            significanceLevel: 0.05,
            maxCorrelationPairs: 50,
            enableMultivariate: true,
        });
        // Section3 expects a file path, not data stream
        return analyzer.analyzeFile(dataset.metadata.filePath);
    }
    async runSection4Analysis(dataset, options) {
        const analyzer = new visualization_1.Section4Analyzer({
            accessibilityLevel: options.accessibility || 'good',
            complexityThreshold: options.complexity || 'moderate',
            maxRecommendationsPerChart: options.maxRecommendations || 3,
            includeCodeExamples: options.includeCode || false,
            enabledRecommendations: [
                'UNIVARIATE',
                'BIVARIATE',
                'DASHBOARD',
                'ACCESSIBILITY',
                'PERFORMANCE',
            ],
            targetLibraries: ['d3', 'plotly', 'observable'],
        });
        // Section4 needs dependencies from previous sections.
        // For now, we'll need to create mock dependencies;
        // this will be replaced with actual dependency resolution.
        const mockSection1 = {
            overview: { structuralDimensions: { totalDataRows: dataset.rows.length } },
        };
        const mockSection3 = { performanceMetrics: { rowsAnalyzed: dataset.rows.length } };
        return analyzer.analyze(mockSection1, mockSection3);
    }
    async runSection5Analysis(dataset, options) {
        const analyzer = new engineering_1.Section5Analyzer({
            targetDatabaseSystem: options.database || 'postgresql',
            mlFrameworkTarget: options.framework || 'scikit_learn',
        });
        // Section5 needs dependencies from previous sections.
        // Create more complete mock data that matches expected structure.
        const mockSection1 = {
            overview: {
                structuralDimensions: {
                    totalDataRows: dataset.rows.length,
                    totalColumns: dataset.headers.length,
                    columnInventory: dataset.headers.map((header, index) => ({
                        name: header,
                        index: index,
                        dataType: 'string',
                        sampleValues: dataset.rows.slice(0, 3).map(row => row[index] || '').filter(v => v)
                    })),
                    // Rough estimate: ~50 bytes per cell.
                    estimatedInMemorySizeMB: Math.ceil(dataset.rows.length * dataset.headers.length * 50 / 1024 / 1024)
                },
                fileDetails: {
                    originalFilename: dataset.metadata.filePath.split('/').pop() || 'unknown.csv',
                    fileSizeBytes: dataset.metadata.parserStats?.totalBytesRead || dataset.rows.length * dataset.headers.length * 10,
                    fileSizeMB: (dataset.metadata.parserStats?.totalBytesRead || dataset.rows.length * dataset.headers.length * 10) / 1024 / 1024,
                    lastModified: new Date()
                },
                parsingMetadata: {
                    encoding: { encoding: 'utf-8' }
                }
            }
        };
        const mockSection2 = {
            qualityAudit: {
                cockpit: {
                    compositeScore: { score: 85 }
                }
            }
        };
        const mockSection3 = {
            performanceMetrics: { rowsAnalyzed: dataset.rows.length },
            edaAnalysis: {
                // Safe structure that won't cause crashes in PCA extraction
                multivariateAnalysis: null
            }
        };
        return analyzer.analyze(mockSection1, mockSection2, mockSection3);
    }
    async runSection6Analysis(dataset, options) {
        const analyzer = new modeling_1.Section6Analyzer({
            focusAreas: options.focus || ['regression', 'binary_classification', 'clustering'],
            complexityPreference: options.complexity || 'moderate',
            interpretabilityRequirement: options.interpretability || 'medium',
        });
        // Section6 needs dependencies from previous sections
        const mockSection1 = {
            overview: {
                structuralDimensions: {
                    totalDataRows: dataset.rows.length,
                    totalColumns: dataset.headers.length,
                    columnInventory: dataset.headers.map((header, index) => ({
                        name: header,
                        index: index,
                        originalIndex: index,
                        dataType: 'string',
                        sampleValues: dataset.rows.slice(0, 3).map(row => row[index] || '').filter(v => v)
                    })),
                    estimatedInMemorySizeMB: Math.ceil(dataset.rows.length * dataset.headers.length * 50 / 1024 / 1024)
                },
                fileDetails: {
                    originalFilename: dataset.metadata.filePath.split('/').pop() || 'unknown.csv',
                    fileSizeBytes: dataset.metadata.parserStats?.totalBytesRead || dataset.rows.length * dataset.headers.length * 10,
                    fileSizeMB: (dataset.metadata.parserStats?.totalBytesRead || dataset.rows.length * dataset.headers.length * 10) / 1024 / 1024,
                    lastModified: new Date()
                },
                parsingMetadata: {
                    encoding: { encoding: 'utf-8' }
                }
            },
        };
        const mockSection2 = { qualityAudit: { overallScore: 85 } };
        const mockSection3 = {
            performanceMetrics: { rowsAnalyzed: dataset.rows.length },
            edaAnalysis: {
                univariateAnalysis: [], // Empty array for safe iteration
                bivariateAnalysis: {
                    numericalVsNumerical: {
                        correlationPairs: [] // Empty array for safe iteration
                    }
                },
                multivariateAnalysis: {
                    principalComponentAnalysis: null,
                    clusteringAnalysis: null,
                    outlierAnalysis: {
                        multivariateOutliers: [],
                        outlierSummary: {
                            totalOutliers: 0,
                            outlierPercentage: 0,
                            method: 'IQR',
                            detectionThreshold: 1.5,
                        }
                    },
                    normalityTests: {
                        overallNormality: {
                            isNormal: true,
                            confidence: 0.95,
                            testMethod: 'Shapiro-Wilk',
                        }
                    }
                }
            }
        };
        const mockSection5 = {
            engineeringAnalysis: {
                mlReadiness: {
                    overallScore: 85,
                    enhancingFactors: [
                        {
                            factor: "Clean Data Structure",
                            impact: "high",
                            description: "Well-structured data with consistent formatting"
                        }
                    ],
                    remainingChallenges: [
                        {
                            challenge: "Type Detection",
                            severity: "medium",
                            impact: "May require manual type specification",
                            mitigationStrategy: "Implement enhanced type detection",
                            estimatedEffort: "2-4 hours"
                        }
                    ],
                    featurePreparationMatrix: dataset.headers.map(header => ({
                        featureName: `ml_${header}`,
                        originalColumn: header,
                        finalDataType: "String",
                        keyIssues: ["Type detection needed"],
                        engineeringSteps: ["Type inference", "Encoding if categorical"],
                        finalMLFeatureType: "Categorical",
                        modelingNotes: []
                    })),
                    modelingConsiderations: []
                }
            }
        };
        return analyzer.analyze(mockSection1, mockSection2, mockSection3, mockSection5);
    }
    /**
     * Create async iterable data stream from dataset rows.
     */
    async *createDataStream(dataset) {
        for (const row of dataset.rows) {
            yield row;
        }
    }
    /**
     * Handle analysis errors with enhanced debugging information.
     * Builds a non-throwing error envelope (success: false, exitCode: 1).
     */
    handleAnalysisError(error, filePath, options) {
        const supportedFormats = this.registry.getSupportedFormats();
        const supportedExtensions = this.registry.getSupportedExtensions();
        let errorMessage = 'Analysis failed';
        let enhancedSuggestions = [];
        let errorDetails = {};
        if (error instanceof types_1.DataPilotError) {
            // Enhanced error handling for DataPilotError
            errorMessage = error.getFormattedMessage(options?.verbose || false);
            enhancedSuggestions = error.getEnhancedSuggestions(options?.verbose || false);
            if (options?.verbose && error.verboseInfo) {
                errorDetails = {
                    fullContext: error.verboseInfo.fullContext,
                    performanceMetrics: error.verboseInfo.performanceMetrics,
                    memorySnapshot: error.verboseInfo.memorySnapshot,
                };
            }
        }
        else {
            // Convert generic error to enhanced format
            errorMessage = error instanceof Error ? error.message : 'Unknown analysis error';
            if (options?.verbose) {
                errorMessage += `\n Stack: ${error instanceof Error ? error.stack?.split('\n').slice(0, 3).join('\n ') : 'No stack available'}`;
            }
        }
        // Default suggestions enhanced with debugging context
        const defaultSuggestions = [
            `Check if file format is supported: ${supportedFormats.join(', ')}`,
            `Supported extensions: ${supportedExtensions.join(', ')}`,
            'Try specifying format explicitly: --format json',
            'Verify file is not corrupted',
            'Check file permissions',
        ];
        if (options?.verbose) {
            defaultSuggestions.push('Run with --verbose for more detailed error information', 'Check memory usage with system monitor during analysis', 'Use --maxRows to limit data size for testing');
        }
        else {
            defaultSuggestions.push('Use --verbose for detailed debugging information');
        }
        defaultSuggestions.push('Use --help for more information');
        // Combine enhanced suggestions with defaults
        const allSuggestions = enhancedSuggestions.length > 0 ?
            [...enhancedSuggestions, '---', ...defaultSuggestions] :
            defaultSuggestions;
        return {
            success: false,
            exitCode: 1,
            error: errorMessage,
            suggestions: allSuggestions,
            metadata: {
                filePath,
                supportedFormats,
                supportedExtensions,
                timestamp: new Date().toISOString(),
                errorCategory: error instanceof types_1.DataPilotError ? error.category : 'unknown',
                errorSeverity: error instanceof types_1.DataPilotError ? error.severity : 'medium',
                verboseMode: options?.verbose || false,
                ...(options?.verbose && errorDetails ? { errorDetails } : {}),
            },
        };
    }
    /**
     * Get supported formats for help/error messages.
     */
    getSupportedFormats() {
        this.initializeParsers();
        return this.registry.getSupportedFormats();
    }
    /**
     * Validate file format is supported. Never throws; failures are reported
     * as { supported: false, suggestions }.
     */
    async validateFile(filePath) {
        this.initializeParsers();
        try {
            const validation = await this.registry.validateFile(filePath);
            if (validation.supported && validation.bestMatch) {
                return {
                    supported: true,
                    format: validation.bestMatch.format,
                    confidence: validation.bestMatch.detection.confidence,
                    suggestions: [],
                };
            }
            else {
                return {
                    supported: false,
                    suggestions: [
                        `File format not supported or confidence too low`,
                        `Supported formats: ${this.getSupportedFormats().join(', ')}`,
                        'Try converting to a supported format',
                        'Check if file is corrupted',
                    ],
                };
            }
        }
        catch (error) {
            return {
                supported: false,
                suggestions: [
                    `Error validating file: ${error.message}`,
                    'Check file exists and is readable',
                    'Verify file format is supported',
                ],
            };
        }
    }
}
// Publish the class on the CommonJS exports object (slot declared at top of file).
exports.UniversalAnalyzer = UniversalAnalyzer;
//# sourceMappingURL=universal-analyzer.js.map