UNPKG

datapilot-cli

Version:

Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform

953 lines 40.9 kB
"use strict";
/**
 * Multivariate Analysis Orchestrator
 *
 * Coordinates and integrates all multivariate analyses:
 * - Principal Component Analysis (PCA)
 * - K-means Clustering
 * - Multivariate Outlier Detection
 * - Multivariate Normality Testing
 * - Relationship Analysis
 *
 * Provides comprehensive insights and recommendations
 * based on the combined results of all analyses.
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.MultivariateOrchestrator = void 0;
const types_1 = require("../eda/types");
const pca_analyzer_1 = require("./pca-analyzer");
const clustering_analyzer_1 = require("./clustering-analyzer");
const outlier_analyzer_1 = require("./outlier-analyzer");
const logger_1 = require("../../utils/logger");
/**
 * Normality testing utilities.
 *
 * All methods are static. `data` is always a row-major matrix: an array of
 * observations, each an array of numbers of equal length.
 */
class MultivariateNormalityTester {
    /**
     * Run the simplified Mardia and Royston tests and combine them into an
     * overall assessment.
     *
     * @param {number[][]} data - Observations (rows) by variables (columns).
     * @param {string[]} variableNames - Accepted for interface compatibility; not read.
     * @returns {{mardiasTest: object, roystonTest: object, overallAssessment: object}}
     *   Never throws: any failure is reported through `createFailedNormalityResult`.
     */
    static performTests(data, variableNames) {
        // Empty input would otherwise fail on `data[0].length` and leak a raw
        // TypeError message into the result.
        if (!data || data.length === 0 || !data[0] || data[0].length === 0) {
            return this.createFailedNormalityResult('Normality testing failed: no observations provided');
        }
        try {
            // Simplified Mardia's test implementation
            const mardiasTest = this.mardiasMultivariateNormalityTest(data);
            // Simplified Royston's test (approximation)
            const roystonTest = this.roystonMultivariateNormalityTest(data);
            // Overall assessment
            const overallAssessment = this.assessOverallNormality(mardiasTest, roystonTest);
            return {
                mardiasTest,
                roystonTest,
                overallAssessment,
            };
        }
        catch (error) {
            return this.createFailedNormalityResult(`Normality testing failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
        }
    }
    /**
     * Mardia's multivariate normality test (simplified).
     *
     * Skewness: b1p is the mean over all ordered pairs (i, j) of the cubed
     * bilinear form of the centred rows under the inverse covariance; the
     * statistic n*b1p/6 is compared against a chi-squared distribution with
     * p(p+1)(p+2)/6 degrees of freedom.
     * Kurtosis: b2p is the mean squared Mahalanobis distance, standardised
     * with mean p(p+2) and variance 8p(p+2)/n and compared against a
     * standard normal.
     *
     * The skewness loop is O(n^2 * p^2).
     *
     * @param {number[][]} data - Non-empty row-major matrix.
     * @returns {{skewnessStatistic: number, kurtosisStatistic: number, skewnessPValue: number, kurtosisPValue: number, interpretation: string}}
     *   Neutral values (statistics 0, p-values 1) when the covariance matrix cannot be inverted.
     */
    static mardiasMultivariateNormalityTest(data) {
        const n = data.length;
        const p = data[0].length;
        // Calculate sample mean and covariance
        const mean = this.calculateMean(data);
        const covariance = this.calculateCovariance(data, mean);
        try {
            const covInverse = this.invertMatrix(covariance);
            // Centre every observation once instead of once per pair.
            const centered = data.map((row) => row.map((val, k) => val - mean[k]));
            // Calculate multivariate skewness (b1p)
            let skewnessSum = 0;
            for (let i = 0; i < n; i++) {
                for (let j = 0; j < n; j++) {
                    const crossTerm = this.bilinearForm(centered[i], centered[j], covInverse);
                    skewnessSum += Math.pow(crossTerm, 3);
                }
            }
            const b1p = skewnessSum / (n * n);
            const skewnessStatistic = (n * b1p) / 6;
            // Approximate p-value for skewness (chi-squared with p(p+1)(p+2)/6 df)
            const skewnessDf = (p * (p + 1) * (p + 2)) / 6;
            const skewnessPValue = this.chiSquaredPValue(skewnessStatistic, skewnessDf);
            // Calculate multivariate kurtosis (b2p)
            let kurtosisSum = 0;
            for (let i = 0; i < n; i++) {
                const mahal = this.quadraticForm(centered[i], covInverse);
                kurtosisSum += mahal * mahal;
            }
            const b2p = kurtosisSum / n;
            const expectedKurtosis = p * (p + 2);
            const kurtosisStatistic = (b2p - expectedKurtosis) / Math.sqrt((8 * p * (p + 2)) / n);
            // Approximate p-value for kurtosis (two-sided, standard normal)
            const kurtosisPValue = 2 * (1 - this.standardNormalCdf(Math.abs(kurtosisStatistic)));
            // Interpretation
            const interpretation = this.interpretMardiasTest(skewnessPValue, kurtosisPValue);
            return {
                skewnessStatistic,
                kurtosisStatistic,
                skewnessPValue,
                kurtosisPValue,
                interpretation,
            };
        }
        catch (error) {
            // Deliberate best-effort: a singular covariance matrix yields a
            // neutral result rather than aborting the whole analysis.
            return {
                skewnessStatistic: 0,
                kurtosisStatistic: 0,
                skewnessPValue: 1,
                kurtosisPValue: 1,
                interpretation: 'Mardia test failed due to matrix singularity or computational issues',
            };
        }
    }
    /**
     * Royston's multivariate normality test (simplified approximation).
     *
     * Averages the positive per-column values of `approximateShapiroWilk`,
     * turns the average into `-ln(1 - avgW) * p` and compares it against a
     * chi-squared distribution with p degrees of freedom.
     *
     * @param {number[][]} data - Non-empty row-major matrix.
     * @returns {{statistic: number, pValue: number, interpretation: string}}
     *   Neutral values (statistic 0, p-value 1) when no column produced a usable statistic.
     */
    static roystonMultivariateNormalityTest(data) {
        const p = data[0].length;
        try {
            // Simplified approach: average of univariate Shapiro-Wilk-like statistics
            let sumW = 0;
            let validTests = 0;
            for (let j = 0; j < p; j++) {
                const column = data.map((row) => row[j]);
                const w = this.approximateShapiroWilk(column);
                if (w > 0) {
                    sumW += w;
                    validTests++;
                }
            }
            if (validTests === 0) {
                throw new Error('No valid univariate tests computed');
            }
            const avgW = sumW / validTests;
            const statistic = -Math.log(1 - avgW) * p;
            // Approximate p-value
            const pValue = this.chiSquaredPValue(statistic, p);
            const interpretation = pValue < 0.05
                ? 'Multivariate normality rejected (p < 0.05)'
                : 'Multivariate normality not rejected (p >= 0.05)';
            return {
                statistic,
                pValue,
                interpretation,
            };
        }
        catch (error) {
            return {
                statistic: 0,
                pValue: 1,
                interpretation: 'Royston test failed due to computational issues',
            };
        }
    }
    /**
     * Combine the two test results into an overall verdict.
     *
     * Any p-value below 0.05 counts as a violation; the data is considered
     * multivariate normal only when there are none.
     *
     * @param {{skewnessPValue: number, kurtosisPValue: number}} mardiasTest
     * @param {{pValue: number}} roystonTest
     * @returns {{isMultivariateNormal: boolean, confidence: number, violations: string[], recommendations: string[]}}
     */
    static assessOverallNormality(mardiasTest, roystonTest) {
        const violations = [];
        const recommendations = [];
        // Check violations
        if (mardiasTest.skewnessPValue < 0.05) {
            violations.push('Multivariate skewness detected');
        }
        if (mardiasTest.kurtosisPValue < 0.05) {
            violations.push('Multivariate kurtosis detected');
        }
        if (roystonTest.pValue < 0.05) {
            violations.push('Overall normality rejected');
        }
        // Determine overall assessment
        const isNormal = violations.length === 0;
        // NOTE(review): both branches are algebraically the smallest of the
        // three p-values (1 - max(1 - x) === min(x)); kept as-is so reported
        // values do not shift.
        const confidence = isNormal
            ? Math.min(mardiasTest.skewnessPValue, mardiasTest.kurtosisPValue, roystonTest.pValue)
            : 1 - Math.max(1 - mardiasTest.skewnessPValue, 1 - mardiasTest.kurtosisPValue, 1 - roystonTest.pValue);
        // Generate recommendations
        if (!isNormal) {
            recommendations.push('Consider data transformations (log, Box-Cox) to improve normality');
            recommendations.push('Use non-parametric or robust statistical methods');
            if (violations.includes('Multivariate skewness detected')) {
                recommendations.push('Address skewness through variable transformation');
            }
            if (violations.includes('Multivariate kurtosis detected')) {
                recommendations.push('Consider outlier removal or robust estimation methods');
            }
        }
        else {
            recommendations.push('Multivariate normal assumption satisfied - parametric methods appropriate');
        }
        return {
            isMultivariateNormal: isNormal,
            confidence,
            violations,
            recommendations,
        };
    }
    // Helper methods for normality testing
    /**
     * Column-wise arithmetic mean.
     *
     * @param {number[][]} data - Non-empty row-major matrix.
     * @returns {number[]} One mean per column.
     */
    static calculateMean(data) {
        const n = data.length;
        const p = data[0].length;
        const mean = Array(p).fill(0);
        for (let j = 0; j < p; j++) {
            for (let i = 0; i < n; i++) {
                mean[j] += data[i][j];
            }
            mean[j] /= n;
        }
        return mean;
    }
    /**
     * Sample covariance matrix (divisor n - 1).
     *
     * @param {number[][]} data - Non-empty row-major matrix.
     * @param {number[]} mean - Column means of `data`.
     * @returns {number[][]} p x p covariance matrix.
     */
    static calculateCovariance(data, mean) {
        const n = data.length;
        const p = data[0].length;
        const cov = Array(p)
            .fill(0)
            .map(() => Array(p).fill(0));
        for (let i = 0; i < p; i++) {
            for (let j = 0; j < p; j++) {
                let sum = 0;
                for (let k = 0; k < n; k++) {
                    sum += (data[k][i] - mean[i]) * (data[k][j] - mean[j]);
                }
                cov[i][j] = sum / (n - 1);
            }
        }
        return cov;
    }
    /**
     * Invert a square matrix by Gauss-Jordan elimination with partial
     * pivoting on the augmented matrix [A | I].
     *
     * @param {number[][]} matrix - Square matrix; not modified.
     * @returns {number[][]} The inverse.
     * @throws {Error} 'Matrix is singular' when a pivot is below 1e-10 in magnitude.
     */
    static invertMatrix(matrix) {
        const n = matrix.length;
        const augmented = matrix.map((row, i) => [
            ...row,
            ...Array(n)
                .fill(0)
                .map((_, j) => (i === j ? 1 : 0)),
        ]);
        for (let i = 0; i < n; i++) {
            // Find pivot: the row with the largest magnitude in column i
            let maxRow = i;
            for (let k = i + 1; k < n; k++) {
                if (Math.abs(augmented[k][i]) > Math.abs(augmented[maxRow][i])) {
                    maxRow = k;
                }
            }
            [augmented[i], augmented[maxRow]] = [augmented[maxRow], augmented[i]];
            // Make diagonal element 1
            const pivot = augmented[i][i];
            if (Math.abs(pivot) < 1e-10) {
                throw new Error('Matrix is singular');
            }
            for (let j = 0; j < 2 * n; j++) {
                augmented[i][j] /= pivot;
            }
            // Eliminate column i from every other row
            for (let k = 0; k < n; k++) {
                if (k !== i) {
                    const factor = augmented[k][i];
                    for (let j = 0; j < 2 * n; j++) {
                        augmented[k][j] -= factor * augmented[i][j];
                    }
                }
            }
        }
        // The right half of the augmented matrix now holds the inverse.
        return augmented.map((row) => row.slice(n));
    }
    /**
     * Quadratic form v^T M v.
     *
     * @param {number[]} vector
     * @param {number[][]} matrix
     * @returns {number}
     */
    static quadraticForm(vector, matrix) {
        let result = 0;
        const n = vector.length;
        for (let i = 0; i < n; i++) {
            for (let j = 0; j < n; j++) {
                result += vector[i] * matrix[i][j] * vector[j];
            }
        }
        return result;
    }
    /**
     * Bilinear form v1^T M v2.
     *
     * @param {number[]} vec1
     * @param {number[]} vec2
     * @param {number[][]} matrix
     * @returns {number}
     */
    static bilinearForm(vec1, vec2, matrix) {
        let result = 0;
        const n = vec1.length;
        for (let i = 0; i < n; i++) {
            for (let j = 0; j < n; j++) {
                result += vec1[i] * matrix[i][j] * vec2[j];
            }
        }
        return result;
    }
    /**
     * Range-based stand-in for the Shapiro-Wilk W statistic.
     *
     * Compares the studentised range (range / standard deviation) with
     * sqrt(2 ln n) and maps the relative deviation into [0, 1].
     *
     * @param {number[]} data - One column of values.
     * @returns {number} Value in [0, 1]; 0 when fewer than three values are given.
     */
    static approximateShapiroWilk(data) {
        const n = data.length;
        if (n < 3)
            return 0;
        const sorted = [...data].sort((a, b) => a - b);
        const mean = data.reduce((sum, val) => sum + val, 0) / n;
        // Calculate sample variance
        const variance = data.reduce((sum, val) => sum + (val - mean) ** 2, 0) / (n - 1);
        // Approximate W statistic using range-based estimator
        const range = sorted[n - 1] - sorted[0];
        const expectedRange = variance > 0 ? range / Math.sqrt(variance) : 0;
        // Normalize to [0, 1] range
        return Math.max(0, Math.min(1, 1 - Math.abs(expectedRange - Math.sqrt(2 * Math.log(n))) / Math.sqrt(2 * Math.log(n))));
    }
    /**
     * Upper-tail chi-squared probability via the Wilson-Hilferty cube-root
     * normal approximation.
     *
     * @param {number} x - Test statistic.
     * @param {number} df - Degrees of freedom.
     * @returns {number} 1 when x <= 0, 0 when df <= 0, otherwise the approximate p-value.
     */
    static chiSquaredPValue(x, df) {
        if (x <= 0)
            return 1;
        if (df <= 0)
            return 0;
        // Use Wilson-Hilferty transformation for approximation
        const h = 2 / (9 * df);
        const z = (Math.pow(x / df, 1 / 3) - 1 + h) / Math.sqrt(h);
        return 1 - this.standardNormalCdf(z);
    }
    /**
     * Standard normal cumulative distribution function, built on `erf`.
     *
     * @param {number} z
     * @returns {number}
     */
    static standardNormalCdf(z) {
        return 0.5 * (1 + this.erf(z / Math.sqrt(2)));
    }
    /**
     * Polynomial approximation of the error function (odd-symmetric in x).
     *
     * @param {number} x
     * @returns {number}
     */
    static erf(x) {
        const a1 = 0.254829592;
        const a2 = -0.284496736;
        const a3 = 1.421413741;
        const a4 = -1.453152027;
        const a5 = 1.061405429;
        const p = 0.3275911;
        const sign = x >= 0 ? 1 : -1;
        x = Math.abs(x);
        const t = 1.0 / (1.0 + p * x);
        const y = 1.0 - ((((a5 * t + a4) * t + a3) * t + a2) * t + a1) * t * Math.exp(-x * x);
        return sign * y;
    }
    /**
     * Human-readable verdict for Mardia's test at the 0.05 level.
     *
     * @param {number} skewnessPValue
     * @param {number} kurtosisPValue
     * @returns {string}
     */
    static interpretMardiasTest(skewnessPValue, kurtosisPValue) {
        if (skewnessPValue < 0.05 && kurtosisPValue < 0.05) {
            return 'Multivariate normality rejected due to both skewness and kurtosis';
        }
        else if (skewnessPValue < 0.05) {
            return 'Multivariate normality rejected due to skewness';
        }
        else if (kurtosisPValue < 0.05) {
            return 'Multivariate normality rejected due to kurtosis';
        }
        else {
            return 'Multivariate normality not rejected';
        }
    }
    /**
     * Build a complete, neutral result object describing a failed test run.
     *
     * @param {string} message - Used as the interpretation of both tests and as the single violation.
     * @returns {{mardiasTest: object, roystonTest: object, overallAssessment: object}}
     */
    static createFailedNormalityResult(message) {
        return {
            mardiasTest: {
                skewnessStatistic: 0,
                kurtosisStatistic: 0,
                skewnessPValue: 1,
                kurtosisPValue: 1,
                interpretation: message,
            },
            roystonTest: {
                statistic: 0,
                pValue: 1,
                interpretation: message,
            },
            overallAssessment: {
                isMultivariateNormal: false,
                confidence: 0,
                violations: [message],
                recommendations: ['Address data quality issues before normality testing'],
            },
        };
    }
}
/**
 * Relationship analysis utilities.
 *
 * Everything here is derived from a correlation matrix whose rows and
 * columns follow the order of `variableNames`.
 */
class RelationshipAnalyzer {
    /**
     * Analyze multivariate relationships.
     *
     * @param {number[][]} data - Row-major numeric matrix; only its row count is used.
     * @param {string[]} variableNames - Names in matrix order.
     * @param {number[][]} correlationMatrix - Square correlation matrix.
     * @returns {{variableInteractions: object[], correlationStructure: object, dimensionalityInsights: object}}
     */
    static analyzeRelationships(data, variableNames, correlationMatrix) {
        // Analyze variable interactions
        const variableInteractions = this.analyzeVariableInteractions(correlationMatrix, variableNames);
        // Analyze correlation structure
        const correlationStructure = this.analyzeCorrelationStructure(correlationMatrix, variableNames);
        // Analyze dimensionality
        const dimensionalityInsights = this.analyzeDimensionality(correlationMatrix, data.length);
        return {
            variableInteractions,
            correlationStructure,
            dimensionalityInsights,
        };
    }
    /**
     * List the pairs whose absolute correlation exceeds 0.3, strongest
     * first, capped at ten entries.
     *
     * @param {number[][]} correlationMatrix
     * @param {string[]} variableNames
     * @returns {Array<{variables: string[], interactionType: string, strength: number, significance: number, interpretation: string}>}
     */
    static analyzeVariableInteractions(correlationMatrix, variableNames) {
        const interactions = [];
        const n = variableNames.length;
        // Analyze pairwise correlations (upper triangle only)
        for (let i = 0; i < n; i++) {
            for (let j = i + 1; j < n; j++) {
                const correlation = correlationMatrix[i][j];
                const absCorr = Math.abs(correlation);
                if (absCorr > 0.3) {
                    const interactionType = this.determineInteractionType(correlation);
                    const significance = this.calculateSignificance(absCorr);
                    interactions.push({
                        variables: [variableNames[i], variableNames[j]],
                        interactionType,
                        strength: absCorr,
                        significance,
                        interpretation: this.interpretInteraction(variableNames[i], variableNames[j], correlation, interactionType),
                    });
                }
            }
        }
        // Sort by strength and keep the top 10 interactions
        return interactions.sort((a, b) => b.strength - a.strength).slice(0, 10);
    }
    /**
     * Summarise the correlation structure: groups correlated above 0.7 with
     * a seed variable, variables whose strongest correlation is below 0.3,
     * and pairs correlated above 0.9.
     *
     * @param {number[][]} correlationMatrix
     * @param {string[]} variableNames
     * @returns {{stronglyCorrelatedGroups: object[], independentVariables: string[], redundantVariables: object[]}}
     */
    static analyzeCorrelationStructure(correlationMatrix, variableNames) {
        const n = variableNames.length;
        // Find strongly correlated groups
        const stronglyCorrelatedGroups = [];
        const processed = new Set();
        for (let i = 0; i < n; i++) {
            // A variable already absorbed into a group never seeds a new one.
            if (processed.has(i))
                continue;
            const group = [i];
            for (let j = i + 1; j < n; j++) {
                if (Math.abs(correlationMatrix[i][j]) > 0.7) {
                    group.push(j);
                    processed.add(j);
                }
            }
            if (group.length > 1) {
                const groupVars = group.map((idx) => variableNames[idx]);
                const avgCorrelation = this.calculateAverageCorrelation(group, correlationMatrix);
                stronglyCorrelatedGroups.push({
                    variables: groupVars,
                    avgCorrelation,
                    description: `Highly correlated group (avg r = ${avgCorrelation.toFixed(3)})`,
                });
                group.forEach((idx) => processed.add(idx));
            }
        }
        // Find independent variables
        const independentVariables = [];
        for (let i = 0; i < n; i++) {
            let maxCorrelation = 0;
            for (let j = 0; j < n; j++) {
                if (i !== j) {
                    maxCorrelation = Math.max(maxCorrelation, Math.abs(correlationMatrix[i][j]));
                }
            }
            if (maxCorrelation < 0.3) {
                independentVariables.push(variableNames[i]);
            }
        }
        // Find redundant variables
        const redundantVariables = [];
        for (let i = 0; i < n; i++) {
            for (let j = i + 1; j < n; j++) {
                const correlation = Math.abs(correlationMatrix[i][j]);
                if (correlation > 0.9) {
                    redundantVariables.push({
                        variable: variableNames[j],
                        redundantWith: variableNames[i],
                        correlation,
                    });
                }
            }
        }
        return {
            stronglyCorrelatedGroups,
            independentVariables,
            redundantVariables,
        };
    }
    /**
     * Estimate how many dimensions the data effectively spans and whether
     * dimensionality reduction is worthwhile.
     *
     * `effectiveDimensionality` counts eigenvalues of the correlation
     * matrix greater than 1. `intrinsicDimensionality` scales the variable
     * count by one minus the average strongest off-diagonal correlation.
     * Reduction is recommended when more than 30% of the variables are
     * redundant by the eigenvalue count.
     *
     * @param {number[][]} correlationMatrix
     * @param {number} sampleSize - Number of observations.
     * @returns {{effectiveDimensionality: number, intrinsicDimensionality: number, dimensionalityReduction: {recommended: boolean, methods: string[], expectedVarianceRetention: number}}}
     */
    static analyzeDimensionality(correlationMatrix, sampleSize) {
        const n = correlationMatrix.length;
        // Estimate effective dimensionality based on correlation structure
        const eigenvalues = this.approximateEigenvalues(correlationMatrix);
        const effectiveDimensionality = eigenvalues.filter((val) => val > 1).length;
        // Strongest correlation of each variable with any OTHER variable.
        // The diagonal is skipped: it is always 1 and would pin the average to 1.
        const maxCorrelations = correlationMatrix.map((row, i) => row.reduce((max, value, j) => (i === j ? max : Math.max(max, Math.abs(value))), 0));
        const avgMaxCorrelation = maxCorrelations.reduce((sum, val) => sum + val, 0) / n;
        const intrinsicDimensionality = Math.max(1, Math.floor(n * (1 - avgMaxCorrelation)));
        // Determine if dimensionality reduction is recommended
        const redundancyRatio = (n - effectiveDimensionality) / n;
        const recommended = redundancyRatio > 0.3;
        const methods = [];
        if (recommended) {
            methods.push('Principal Component Analysis (PCA)');
            if (avgMaxCorrelation > 0.7) {
                methods.push('Factor Analysis');
            }
            if (sampleSize > 1000) {
                methods.push('Independent Component Analysis (ICA)');
            }
        }
        const expectedVarianceRetention = recommended ? Math.min(0.95, 0.7 + redundancyRatio * 0.25) : 1.0;
        return {
            effectiveDimensionality,
            intrinsicDimensionality,
            dimensionalityReduction: {
                recommended,
                methods,
                expectedVarianceRetention,
            },
        };
    }
    // Helper methods
    /**
     * Classify an interaction from the magnitude of its correlation:
     * 'synergistic' for 0.5 < |r| <= 0.8, 'linear' otherwise.
     *
     * @param {number} correlation
     * @returns {string}
     */
    static determineInteractionType(correlation) {
        const absCorr = Math.abs(correlation);
        if (absCorr > 0.8) {
            return 'linear';
        }
        else if (absCorr > 0.5) {
            return 'synergistic';
        }
        else {
            return 'linear'; // Default to linear for moderate correlations
        }
    }
    /**
     * Simplified significance score: r squared, capped at 1.
     *
     * @param {number} correlation
     * @returns {number}
     */
    static calculateSignificance(correlation) {
        return Math.min(1, correlation * correlation);
    }
    /**
     * Describe a pairwise relationship in words.
     *
     * @param {string} var1
     * @param {string} var2
     * @param {number} correlation - Signed correlation.
     * @param {string} type - Interaction type label.
     * @returns {string}
     */
    static interpretInteraction(var1, var2, correlation, type) {
        const direction = correlation > 0 ? 'positive' : 'negative';
        const strength = Math.abs(correlation) > 0.7 ? 'strong' : Math.abs(correlation) > 0.5 ? 'moderate' : 'weak';
        return `${strength} ${direction} ${type} relationship between ${var1} and ${var2}`;
    }
    /**
     * Mean absolute correlation over all unordered pairs of `indices`.
     *
     * @param {number[]} indices - Row/column indices into the matrix.
     * @param {number[][]} correlationMatrix
     * @returns {number} 0 when fewer than two indices are given.
     */
    static calculateAverageCorrelation(indices, correlationMatrix) {
        let sum = 0;
        let count = 0;
        for (let i = 0; i < indices.length; i++) {
            for (let j = i + 1; j < indices.length; j++) {
                sum += Math.abs(correlationMatrix[indices[i]][indices[j]]);
                count++;
            }
        }
        return count > 0 ? sum / count : 0;
    }
    /**
     * Eigenvalues of a symmetric matrix by the cyclic Jacobi method.
     *
     * Each rotation zeroes one off-diagonal element; sweeps repeat until
     * the off-diagonal sum of squares is negligible or the sweep limit is
     * hit, at which point the diagonal holds the eigenvalues.
     *
     * @param {number[][]} matrix - Square symmetric matrix; not modified.
     * @returns {number[]} Eigenvalues in descending order.
     */
    static approximateEigenvalues(matrix) {
        const n = matrix.length;
        const a = matrix.map((row) => [...row]);
        const maxSweeps = 50;
        for (let sweep = 0; sweep < maxSweeps; sweep++) {
            let offDiagonal = 0;
            for (let i = 0; i < n; i++) {
                for (let j = i + 1; j < n; j++) {
                    offDiagonal += a[i][j] * a[i][j];
                }
            }
            // Written as a negated ">" so that a NaN sum also ends the loop.
            if (!(offDiagonal > 1e-20)) {
                break;
            }
            for (let p = 0; p < n; p++) {
                for (let q = p + 1; q < n; q++) {
                    if (Math.abs(a[p][q]) < 1e-15) {
                        continue;
                    }
                    // Rotation angle that annihilates a[p][q].
                    const theta = (a[q][q] - a[p][p]) / (2 * a[p][q]);
                    const t = (theta >= 0 ? 1 : -1) / (Math.abs(theta) + Math.sqrt(theta * theta + 1));
                    const c = 1 / Math.sqrt(t * t + 1);
                    const s = t * c;
                    // A <- A * J (columns p and q)
                    for (let k = 0; k < n; k++) {
                        const akp = a[k][p];
                        const akq = a[k][q];
                        a[k][p] = c * akp - s * akq;
                        a[k][q] = s * akp + c * akq;
                    }
                    // A <- J^T * A (rows p and q)
                    for (let k = 0; k < n; k++) {
                        const apk = a[p][k];
                        const aqk = a[q][k];
                        a[p][k] = c * apk - s * aqk;
                        a[q][k] = s * apk + c * aqk;
                    }
                }
            }
        }
        const eigenvalues = a.map((row, i) => row[i]);
        return eigenvalues.sort((x, y) => y - x);
    }
}
/**
 * Main multivariate analysis orchestrator.
 *
 * Runs PCA, clustering, outlier detection, normality testing and
 * relationship analysis over the numerical columns of a dataset and merges
 * the results into one report.
 */
class MultivariateOrchestrator {
    /** Minimum number of numerical columns required to run the analysis. */
    static MIN_VARIABLES = 3;
    /** Minimum number of observations required to run the analysis. */
    static MIN_OBSERVATIONS = 50;
    /**
     * Perform comprehensive multivariate analysis.
     *
     * @param {Array<Array<*>>} data - Row-major dataset; cells may be numbers or numeric strings.
     * @param {string[]} headers - Column names, indexed like the cells of a row.
     * @param {Array<*>} columnTypes - One `EdaDataType` value per column.
     * @param {number} sampleSize - Number of observations reported for the dataset.
     * @returns {Promise<object>} The combined report. Never rejects: failures
     *   are logged and converted into a non-applicable result.
     */
    static async analyze(data, headers, columnTypes, sampleSize) {
        const startTime = Date.now();
        try {
            // Identify numerical columns
            const numericalColumnIndices = this.identifyNumericalColumns(columnTypes);
            const variableNames = numericalColumnIndices.map((i) => headers[i]);
            // Check overall applicability
            const applicabilityAssessment = this.assessApplicability(numericalColumnIndices, sampleSize);
            if (!applicabilityAssessment.applicable) {
                const analysisTime = Date.now() - startTime;
                return this.createNonApplicableResult(applicabilityAssessment.reason, analysisTime);
            }
            // Extract numerical data for correlation analysis
            logger_1.logger.debug('Multivariate analysis setup', {
                operation: 'multivariate-analysis',
                dataSample: data.length,
                numericalColumnIndices,
                headers,
                columnTypes
            });
            const numericData = this.extractNumericData(data, numericalColumnIndices);
            logger_1.logger.debug(`Extracted numeric data length: ${numericData.length}`, { operation: 'multivariate-analysis' });
            if (!numericData || numericData.length === 0) {
                const analysisTime = Date.now() - startTime;
                return this.createNonApplicableResult('No valid numerical data found for multivariate analysis', analysisTime);
            }
            const correlationMatrix = this.calculateCorrelationMatrix(numericData);
            // Perform individual analyses. Each call is wrapped so that both
            // synchronous and promise-returning analyzers are awaited alike.
            const [pcaAnalysis, clusteringAnalysis, outlierAnalysis, normalityTests, relationshipAnalysis,] = await Promise.all([
                Promise.resolve(pca_analyzer_1.PCAAnalyzer.analyze(data, headers, numericalColumnIndices, sampleSize)),
                Promise.resolve(clustering_analyzer_1.ClusteringAnalyzer.analyze(data, headers, numericalColumnIndices, sampleSize)),
                Promise.resolve(outlier_analyzer_1.MultivariateOutlierAnalyzer.analyze(data, headers, numericalColumnIndices, sampleSize)),
                Promise.resolve(MultivariateNormalityTester.performTests(numericData, variableNames)),
                Promise.resolve(RelationshipAnalyzer.analyzeRelationships(numericData, variableNames, correlationMatrix)),
            ]);
            // Generate comprehensive insights
            const insights = this.generateComprehensiveInsights(pcaAnalysis, clusteringAnalysis, outlierAnalysis, normalityTests, relationshipAnalysis);
            const analysisTime = Date.now() - startTime;
            return {
                summary: {
                    analysisPerformed: true,
                    applicabilityAssessment: applicabilityAssessment.reason,
                    numericVariablesCount: numericalColumnIndices.length,
                    variablesAnalyzed: variableNames,
                    sampleSize,
                    analysisLimitations: this.identifyLimitations(numericalColumnIndices.length, sampleSize, pcaAnalysis, clusteringAnalysis),
                },
                principalComponentAnalysis: pcaAnalysis,
                clusteringAnalysis,
                outlierDetection: outlierAnalysis,
                normalityTests,
                relationshipAnalysis,
                insights,
                technicalMetadata: {
                    analysisTime,
                    memoryUsage: this.estimateMemoryUsage(sampleSize, numericalColumnIndices.length),
                    computationalComplexity: this.assessComputationalComplexity(numericalColumnIndices.length, sampleSize),
                    algorithmsUsed: [
                        'Principal Component Analysis (QR eigendecomposition)',
                        "K-means clustering (Lloyd's algorithm with k-means++)",
                        'Mahalanobis distance outlier detection',
                        "Mardia's multivariate normality test",
                        'Correlation structure analysis',
                    ],
                },
                // Backward compatibility
                keyPatterns: insights.keyFindings.slice(0, 3),
                pcaOverview: pcaAnalysis.isApplicable
                    ? {
                        componentsFor85PercentVariance: pcaAnalysis.varianceThresholds.componentsFor85Percent,
                        dominantVariables: pcaAnalysis.dominantVariables.slice(0, 3).map((v) => v.variable),
                    }
                    : undefined,
                clusterAnalysis: clusteringAnalysis.isApplicable
                    ? {
                        optimalClusters: clusteringAnalysis.optimalClusters,
                        clusterProfiles: clusteringAnalysis.finalClustering.clusterProfiles.map((profile) => ({
                            clusterName: profile.clusterName,
                            description: profile.description,
                            keyCharacteristics: profile.centroid,
                        })),
                    }
                    : undefined,
                interactionTerms: relationshipAnalysis.variableInteractions
                    .slice(0, 5)
                    .map((interaction) => interaction.variables.join(' × ')),
            };
        }
        catch (error) {
            console.error('Multivariate analysis failed:', error);
            const analysisTime = Date.now() - startTime;
            return this.createNonApplicableResult(`Analysis failed: ${error instanceof Error ? error.message : 'Unknown error'}`, analysisTime);
        }
    }
    /**
     * Identify numerical columns from column types.
     *
     * @param {Array<*>} columnTypes - One `EdaDataType` value per column.
     * @returns {number[]} Indices of columns typed NUMERICAL_FLOAT or NUMERICAL_INTEGER.
     */
    static identifyNumericalColumns(columnTypes) {
        const numericalTypes = [types_1.EdaDataType.NUMERICAL_FLOAT, types_1.EdaDataType.NUMERICAL_INTEGER];
        return columnTypes
            .map((type, index) => ({ type, index }))
            .filter(({ type }) => numericalTypes.includes(type))
            .map(({ index }) => index);
    }
    /**
     * Assess overall applicability for multivariate analysis.
     *
     * @param {number[]} numericalColumnIndices
     * @param {number} sampleSize
     * @returns {{applicable: boolean, reason: string}} Not applicable below
     *   MIN_VARIABLES columns or MIN_OBSERVATIONS rows; applicable with a
     *   caution when there are fewer than 5 observations per variable.
     */
    static assessApplicability(numericalColumnIndices, sampleSize) {
        if (numericalColumnIndices.length < this.MIN_VARIABLES) {
            return {
                applicable: false,
                reason: `Insufficient numerical variables for multivariate analysis (${numericalColumnIndices.length} < ${this.MIN_VARIABLES})`,
            };
        }
        if (sampleSize < this.MIN_OBSERVATIONS) {
            return {
                applicable: false,
                reason: `Insufficient observations for multivariate analysis (${sampleSize} < ${this.MIN_OBSERVATIONS})`,
            };
        }
        const sampleToVariableRatio = sampleSize / numericalColumnIndices.length;
        if (sampleToVariableRatio < 5) {
            return {
                applicable: true,
                reason: `Limited sample-to-variable ratio (${sampleToVariableRatio.toFixed(1)}) - results should be interpreted cautiously`,
            };
        }
        return {
            applicable: true,
            reason: 'Dataset well-suited for comprehensive multivariate analysis',
        };
    }
    /**
     * Extract numerical data for correlation analysis.
     * Uses listwise deletion - only includes rows where ALL numerical columns have valid values.
     * This ensures consistent matrix dimensions for correlation analysis.
     *
     * Finite numbers are taken as-is; non-empty strings are trimmed and
     * parsed with `parseFloat`. Anything else marks the row as incomplete.
     *
     * @param {Array<Array<*>>} data - Row-major dataset.
     * @param {number[]} numericalColumnIndices - Columns to extract, in output order.
     * @returns {number[][]} Complete rows only, one number per requested column.
     */
    static extractNumericData(data, numericalColumnIndices) {
        const numericData = [];
        logger_1.logger.debug(`Attempting to extract data for ${numericalColumnIndices.length} numerical columns`, { operation: 'multivariate-analysis', numericalColumnIndices });
        let totalRows = 0;
        let validRows = 0;
        let rowsWithMissingValues = 0;
        for (const row of data) {
            totalRows++;
            const numericRow = [];
            let hasAllValidValues = true;
            let missingCount = 0;
            // Extract values from numerical columns only
            for (const colIndex of numericalColumnIndices) {
                // Check bounds before touching the cell
                if (colIndex >= row.length) {
                    logger_1.logger.debug(`Column index ${colIndex} out of bounds for row length ${row.length}`, { operation: 'multivariate-analysis' });
                    hasAllValidValues = false;
                    break;
                }
                const value = row[colIndex];
                // Convert string numbers to actual numbers if needed
                if (typeof value === 'string' && value.trim() !== '') {
                    const numericValue = parseFloat(value.trim());
                    if (Number.isFinite(numericValue)) {
                        numericRow.push(numericValue);
                    }
                    else {
                        hasAllValidValues = false;
                        missingCount++;
                    }
                }
                else if (typeof value === 'number' && Number.isFinite(value)) {
                    numericRow.push(value);
                }
                else {
                    // Missing, null, undefined, or invalid value
                    hasAllValidValues = false;
                    missingCount++;
                }
            }
            if (missingCount > 0) {
                rowsWithMissingValues++;
            }
            // Only include rows with all valid numerical values
            if (hasAllValidValues && numericRow.length === numericalColumnIndices.length) {
                numericData.push(numericRow);
                validRows++;
            }
        }
        logger_1.logger.debug('Multivariate data processing summary', {
            operation: 'multivariate-analysis',
            totalRows,
            rowsWithMissingValues,
            validRows,
            dimensions: `${numericData.length}x${numericData.length > 0 ? numericData[0].length : 0}`
        });
        if (numericData.length > 0) {
            logger_1.logger.debug('Sample extracted row', {
                operation: 'multivariate-analysis',
                sample: numericData[0].slice(0, 3)
            });
        }
        return numericData;
    }
    /**
     * Calculate the Pearson correlation matrix of the columns of `data`.
     *
     * @param {number[][]} data - Row-major numeric matrix.
     * @returns {number[][]} p x p symmetric matrix with a unit diagonal; a
     *   pair involving a zero-variance column gets 0. Returns `[[]]` for
     *   missing or empty input.
     */
    static calculateCorrelationMatrix(data) {
        if (!data || data.length === 0 || !data[0] || data[0].length === 0) {
            return [[]]; // Return empty correlation matrix for invalid data
        }
        const n = data.length;
        const p = data[0].length;
        // Calculate means
        const means = Array(p).fill(0);
        for (let j = 0; j < p; j++) {
            for (let i = 0; i < n; i++) {
                means[j] += data[i][j];
            }
            means[j] /= n;
        }
        // Calculate correlation matrix: upper triangle only, mirrored below
        const correlations = Array(p)
            .fill(0)
            .map(() => Array(p).fill(0));
        for (let i = 0; i < p; i++) {
            correlations[i][i] = 1;
            for (let j = i + 1; j < p; j++) {
                let numerator = 0;
                let sumXi = 0;
                let sumXj = 0;
                for (let k = 0; k < n; k++) {
                    const xi = data[k][i] - means[i];
                    const xj = data[k][j] - means[j];
                    numerator += xi * xj;
                    sumXi += xi * xi;
                    sumXj += xj * xj;
                }
                const denominator = Math.sqrt(sumXi * sumXj);
                const r = denominator > 0 ? numerator / denominator : 0;
                correlations[i][j] = r;
                correlations[j][i] = r;
            }
        }
        return correlations;
    }
    /**
     * Generate comprehensive insights from all analyses.
     *
     * @param {object} pcaAnalysis - Result of `PCAAnalyzer.analyze`.
     * @param {object} clusteringAnalysis - Result of `ClusteringAnalyzer.analyze`.
     * @param {object} outlierAnalysis - Result of `MultivariateOutlierAnalyzer.analyze`.
     * @param {object} normalityTests - Result of `MultivariateNormalityTester.performTests`.
     * @param {object} relationshipAnalysis - Result of `RelationshipAnalyzer.analyzeRelationships`.
     * @returns {{keyFindings: string[], dataQualityIssues: string[], hypothesesGenerated: string[], preprocessingRecommendations: string[], analysisRecommendations: string[]}}
     */
    static generateComprehensiveInsights(pcaAnalysis, clusteringAnalysis, outlierAnalysis, normalityTests, relationshipAnalysis) {
        const keyFindings = [];
        const dataQualityIssues = [];
        const hypothesesGenerated = [];
        const preprocessingRecommendations = [];
        const analysisRecommendations = [];
        // PCA insights
        if (pcaAnalysis.isApplicable) {
            const varianceExplained = pcaAnalysis.varianceThresholds.componentsFor85Percent;
            keyFindings.push(`${varianceExplained} principal components explain 85% of variance`);
            if (varianceExplained < pcaAnalysis.componentsAnalyzed / 2) {
                hypothesesGenerated.push('Strong dimensionality reduction potential suggests underlying structure');
                analysisRecommendations.push('Consider using PCA for feature reduction in modeling');
            }
        }
        // Clustering insights
        if (clusteringAnalysis.isApplicable) {
            const optimalK = clusteringAnalysis.optimalClusters;
            const silhouette = clusteringAnalysis.finalClustering.validation.silhouetteScore;
            keyFindings.push(`${optimalK} natural clusters identified (silhouette: ${silhouette.toFixed(3)})`);
            if (silhouette > 0.5) {
                hypothesesGenerated.push('Strong clustering structure suggests distinct data segments');
                analysisRecommendations.push('Cluster-based analysis may reveal meaningful subgroups');
            }
        }
        // Outlier insights
        if (outlierAnalysis.isApplicable) {
            const outlierPercentage = outlierAnalysis.outlierPercentage;
            keyFindings.push(`${outlierAnalysis.totalOutliers} multivariate outliers detected (${outlierPercentage.toFixed(1)}%)`);
            if (outlierPercentage > 5) {
                dataQualityIssues.push('High multivariate outlier rate may indicate data quality issues');
                preprocessingRecommendations.push('Investigate and potentially remove or transform outliers');
            }
        }
        // Normality insights
        if (!normalityTests.overallAssessment.isMultivariateNormal) {
            dataQualityIssues.push('Multivariate normality assumption violated');
            preprocessingRecommendations.push('Consider data transformations or robust methods');
        }
        // Relationship insights
        const strongRelationships = relationshipAnalysis.variableInteractions.filter((interaction) => interaction.strength > 0.7).length;
        if (strongRelationships > 0) {
            keyFindings.push(`${strongRelationships} strong variable relationships identified`);
            if (relationshipAnalysis.correlationStructure.redundantVariables.length > 0) {
                dataQualityIssues.push('Redundant variables detected');
                preprocessingRecommendations.push('Consider removing highly correlated variables');
            }
        }
        // Dimensionality insights
        if (relationshipAnalysis.dimensionalityInsights.dimensionalityReduction.recommended) {
            analysisRecommendations.push('Dimensionality reduction recommended based on correlation structure');
        }
        return {
            keyFindings,
            dataQualityIssues,
            hypothesesGenerated,
            preprocessingRecommendations,
            analysisRecommendations,
        };
    }
    /**
     * Identify analysis limitations.
     *
     * @param {number} numVariables
     * @param {number} sampleSize
     * @param {{isApplicable: boolean}} pcaAnalysis
     * @param {{isApplicable: boolean}} clusteringAnalysis
     * @returns {string[]} Human-readable caveats; empty when none apply.
     */
    static identifyLimitations(numVariables, sampleSize, pcaAnalysis, clusteringAnalysis) {
        const limitations = [];
        if (sampleSize / numVariables < 10) {
            limitations.push('Low sample-to-variable ratio may affect reliability');
        }
        if (!pcaAnalysis.isApplicable) {
            limitations.push('PCA not applicable due to insufficient variables or observations');
        }
        if (!clusteringAnalysis.isApplicable) {
            limitations.push('Clustering analysis not applicable due to data constraints');
        }
        if (numVariables > 20) {
            limitations.push('High dimensionality may affect some analyses');
        }
        return limitations;
    }
    /**
     * Estimate memory usage as a display string.
     *
     * Counts 8 bytes per cell of the data matrix plus three
     * variables-by-variables matrices.
     *
     * @param {number} sampleSize
     * @param {number} numVariables
     * @returns {string} '< 1MB', '~NMB' below 1024 MB, otherwise '~NGB'.
     */
    static estimateMemoryUsage(sampleSize, numVariables) {
        const baseMemory = sampleSize * numVariables * 8; // 8 bytes per number
        const matrixMemory = numVariables * numVariables * 8; // Covariance matrix
        const totalBytes = baseMemory + matrixMemory * 3; // Multiple matrices
        const totalMB = totalBytes / (1024 * 1024);
        if (totalMB < 1) {
            return `< 1MB`;
        }
        else if (totalMB < 1024) {
            // Stay in MB until a full GB is reached; switching earlier
            // would round values such as 300 MB down to "~0GB".
            return `~${Math.round(totalMB)}MB`;
        }
        else {
            return `~${Math.round(totalMB / 1024)}GB`;
        }
    }
    /**
     * Assess computational complexity from variables^2 * observations.
     *
     * @param {number} numVariables
     * @param {number} sampleSize
     * @returns {string} 'Low' below 1e6, 'Moderate' below 1e8, otherwise 'High'.
     */
    static assessComputationalComplexity(numVariables, sampleSize) {
        const complexity = numVariables * numVariables * sampleSize;
        if (complexity < 1e6) {
            return 'Low';
        }
        else if (complexity < 1e8) {
            return 'Moderate';
        }
        else {
            return 'High';
        }
    }
    /**
     * Create the result returned when the analysis cannot be performed.
     *
     * Each sub-analysis slot is filled by invoking the corresponding
     * analyzer on empty input.
     *
     * @param {string} reason - Why the analysis was skipped or failed.
     * @param {number} analysisTime - Elapsed milliseconds.
     * @returns {object} A report with `summary.analysisPerformed` set to false.
     */
    static createNonApplicableResult(reason, analysisTime) {
        return {
            summary: {
                analysisPerformed: false,
                applicabilityAssessment: reason,
                numericVariablesCount: 0,
                variablesAnalyzed: [],
                sampleSize: 0,
                analysisLimitations: [reason],
            },
            principalComponentAnalysis: pca_analyzer_1.PCAAnalyzer.analyze([], [], [], 0),
            clusteringAnalysis: clustering_analyzer_1.ClusteringAnalyzer.analyze([], [], [], 0),
            outlierDetection: outlier_analyzer_1.MultivariateOutlierAnalyzer.analyze([], [], [], 0),
            normalityTests: MultivariateNormalityTester.performTests([], []),
            relationshipAnalysis: {
                variableInteractions: [],
                correlationStructure: {
                    stronglyCorrelatedGroups: [],
                    independentVariables: [],
                    redundantVariables: [],
                },
                dimensionalityInsights: {
                    effectiveDimensionality: 0,
                    intrinsicDimensionality: 0,
                    dimensionalityReduction: {
                        recommended: false,
                        methods: [],
                        expectedVarianceRetention: 0,
                    },
                },
            },
            insights: {
                keyFindings: [reason],
                dataQualityIssues: [],
                hypothesesGenerated: [],
                preprocessingRecommendations: [],
                analysisRecommendations: [],
            },
            technicalMetadata: {
                analysisTime,
                memoryUsage: '< 1MB',
                computationalComplexity: 'Low',
                algorithmsUsed: [],
            },
        };
    }
}
exports.MultivariateOrchestrator = MultivariateOrchestrator;
//# sourceMappingURL=multivariate-orchestrator.js.map