datapilot-cli
Version:
Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform
1,111 lines • 55.1 kB
JavaScript
"use strict";
/**
* Statistical-Driven Chart Selection Engine
*
* Advanced engine that analyzes statistical properties of data to make
* intelligent chart recommendations based on:
* - Distribution characteristics (normality, skewness, kurtosis)
* - Statistical significance of relationships
* - Data quality metrics and outlier patterns
* - Correlation structures and effect sizes
* - Variance explained and dimensionality
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.StatisticalChartSelector = void 0;
const types_1 = require("../../eda/types");
/**
* Advanced Statistical Chart Selection Engine
*/
class StatisticalChartSelector {
/**
* Generate statistically-informed chart recommendations for univariate data
*/
static recommendUnivariateChart(columnAnalysis) {
const dataType = columnAnalysis.detectedDataType;
const distribution = this.analyzeDistribution(columnAnalysis);
switch (dataType) {
case types_1.EdaDataType.NUMERICAL_FLOAT:
case types_1.EdaDataType.NUMERICAL_INTEGER:
return this.recommendNumericalUnivariate(columnAnalysis, distribution);
case types_1.EdaDataType.CATEGORICAL:
return this.recommendCategoricalUnivariate(columnAnalysis);
case types_1.EdaDataType.DATE_TIME:
return this.recommendTemporalUnivariate(columnAnalysis);
case types_1.EdaDataType.BOOLEAN:
return this.recommendBooleanUnivariate(columnAnalysis);
default:
return this.createFallbackRecommendation(columnAnalysis);
}
}
/**
* Generate statistically-informed chart recommendations for bivariate relationships
*/
static recommendBivariateChart(xColumn, yColumn, correlation) {
const xType = xColumn.detectedDataType;
const yType = yColumn.detectedDataType;
// Numerical vs Numerical
if (this.isNumerical(xType) && this.isNumerical(yType)) {
return this.recommendNumericalBivariate(xColumn, yColumn, correlation);
}
// Categorical vs Numerical
if (this.isCategorical(xType) && this.isNumerical(yType)) {
return this.recommendCategoricalNumerical(xColumn, yColumn);
}
// Numerical vs Categorical (swap for consistency)
if (this.isNumerical(xType) && this.isCategorical(yType)) {
return this.recommendCategoricalNumerical(yColumn, xColumn);
}
// Categorical vs Categorical
if (this.isCategorical(xType) && this.isCategorical(yType)) {
return this.recommendCategoricalBivariate(xColumn, yColumn);
}
// Temporal relationships
if (this.isTemporal(xType) || this.isTemporal(yType)) {
return this.recommendTemporalBivariate(xColumn, yColumn);
}
return this.createFallbackBivariateRecommendation(xColumn, yColumn);
}
/**
* Analyze distribution characteristics for statistical insights
*/
static analyzeDistribution(columnAnalysis) {
const stats = columnAnalysis.descriptiveStats;
if (!stats) {
return {
isNormal: false,
skewness: 0,
kurtosis: 0,
modality: 'unimodal',
outlierSeverity: 'none',
tailBehavior: 'normal',
};
}
// Determine normality based on skewness and kurtosis
const skewness = columnAnalysis.distributionAnalysis?.skewness || 0;
const kurtosis = columnAnalysis.distributionAnalysis?.kurtosis || 0;
const isNormal = Math.abs(skewness) < 0.5 && Math.abs(kurtosis) < 1.0;
// Analyze outlier severity
const outlierCount = columnAnalysis.outlierAnalysis?.summary?.totalOutliers || 0;
const totalCount = columnAnalysis.totalValues || 1;
const outlierRate = outlierCount / totalCount;
let outlierSeverity = 'none';
if (outlierRate > 0.1)
outlierSeverity = 'severe';
else if (outlierRate > 0.05)
outlierSeverity = 'moderate';
else if (outlierRate > 0.01)
outlierSeverity = 'mild';
// Determine tail behavior
let tailBehavior = 'normal';
if (Math.abs(kurtosis) > 2)
tailBehavior = 'heavy';
else if (Math.abs(kurtosis) < -1)
tailBehavior = 'light';
// Suggest transformation if needed
let recommendedTransformation;
if (skewness > 1)
recommendedTransformation = 'log';
else if (skewness < -1)
recommendedTransformation = 'square';
else if (Math.abs(kurtosis) > 3)
recommendedTransformation = 'box-cox';
return {
isNormal,
skewness,
kurtosis,
modality: 'unimodal', // Would need more sophisticated analysis for multimodality
outlierSeverity,
tailBehavior,
recommendedTransformation,
};
}
/**
* Recommend charts for numerical univariate data based on distribution
*/
static recommendNumericalUnivariate(columnAnalysis, distribution) {
// Check for missing statistical data
const hasDistributionData = columnAnalysis.distributionAnalysis !== undefined;
if (!hasDistributionData) {
return this.createFallbackRecommendation(columnAnalysis, 'histogram');
}
const uniqueValues = columnAnalysis.uniqueValues || 0;
const totalValues = columnAnalysis.totalValues || 1;
const cardinality = uniqueValues / totalValues;
let chartType;
let confidence;
let justification;
let encodingStrategy;
// Check for highly skewed data first
if (Math.abs(distribution.skewness) > 1.5) {
chartType = 'box_plot';
confidence = 0.9;
justification = 'Highly skewed data is best visualized with box plot to show distribution and outliers';
}
// High cardinality numerical data
else if (cardinality > 0.8 || uniqueValues > 50) {
if (distribution.isNormal && distribution.outlierSeverity === 'none') {
chartType = 'histogram';
confidence = 0.9;
justification =
'normal distribution with high cardinality best shown with histogram';
}
else if (distribution.outlierSeverity === 'severe') {
chartType = 'violin_plot';
confidence = 0.85;
justification =
'Severe outliers require violin plot to show both distribution and outlier patterns';
}
else {
chartType = 'histogram';
confidence = 0.8;
justification =
'High cardinality numerical data shows distribution patterns best with histogram';
}
}
// Moderate cardinality
else if (cardinality > 0.3) {
chartType = 'histogram';
confidence = 0.85;
justification = 'Moderate cardinality allows for meaningful bin-based distribution analysis';
}
// Low cardinality (discrete-like)
else {
chartType = 'bar_chart';
confidence = 0.9;
justification = 'Low cardinality numerical data treated as discrete categories for clarity';
}
// Generate encoding strategy
encodingStrategy = this.createUnivariateEncodingStrategy(columnAnalysis, chartType, distribution);
const interactions = this.generateUnivariateInteractions(chartType, distribution);
const alternatives = this.generateUnivariateAlternatives(chartType, distribution, cardinality);
const performance = this.generatePerformanceGuidance(totalValues, chartType);
return {
chartType,
confidence,
statisticalJustification: justification,
dataCharacteristics: this.extractDataCharacteristics(distribution, cardinality),
visualEncodingStrategy: encodingStrategy,
interactionRecommendations: interactions,
alternativeOptions: alternatives,
performanceConsiderations: performance,
};
}
/**
* Recommend charts for categorical univariate data
*/
static recommendCategoricalUnivariate(columnAnalysis) {
const uniqueValues = columnAnalysis.uniqueValues || 0;
const uniquePercentage = columnAnalysis.uniquePercentage || 0;
const entropy = this.calculateEntropy(columnAnalysis);
const isOrderedCategories = this.detectOrderedCategories(columnAnalysis);
// Check if this is an identifier column first - avoid meaningless frequency charts
if (columnAnalysis.inferredSemanticType === types_1.SemanticType.IDENTIFIER ||
(uniquePercentage >= 95 && uniqueValues > 10)) {
return this.createIdentifierRecommendation(columnAnalysis);
}
let chartType;
let confidence;
let justification;
if (uniqueValues <= 10) {
chartType = 'bar_chart';
confidence = 0.9;
justification = 'Moderate cardinality categorical data ideal for bar chart comparison';
}
else if (uniqueValues <= 20) {
chartType = 'horizontal_bar_chart';
confidence = 0.8;
justification = 'High cardinality requires horizontal orientation for label readability';
}
else {
chartType = 'treemap';
confidence = 0.75;
justification =
'Very high cardinality categorical data benefits from hierarchical treemap display';
}
const encodingStrategy = this.createCategoricalEncodingStrategy(columnAnalysis, chartType);
const interactions = this.generateCategoricalInteractions(chartType, uniqueValues);
const alternatives = this.generateCategoricalAlternatives(chartType, uniqueValues, entropy);
const performance = this.generatePerformanceGuidance(columnAnalysis.totalValues || 0, chartType);
return {
chartType,
confidence,
statisticalJustification: justification,
dataCharacteristics: [`Cardinality: ${uniqueValues}`, `Entropy: ${entropy.toFixed(2)}`, 'categorical'],
visualEncodingStrategy: encodingStrategy,
interactionRecommendations: interactions,
alternativeOptions: alternatives,
performanceConsiderations: performance,
};
}
/**
* Recommend charts for numerical vs numerical relationships
*/
static recommendNumericalBivariate(xColumn, yColumn, correlation) {
const totalPoints = Math.min(xColumn.totalValues || 0, yColumn.totalValues || 0);
const correlationStrength = correlation?.strength || 'weak';
const relationshipType = correlation?.relationship || 'linear';
let chartType;
let confidence;
let justification;
// Large datasets require different approaches
if (totalPoints > 10000) {
if (correlationStrength === 'very_strong' || correlationStrength === 'strong') {
chartType = 'hexbin_plot';
confidence = 0.9;
justification =
'Strong correlation in large dataset best shown with hexagonal binning to reveal density patterns';
}
else {
chartType = 'density_scatter';
confidence = 0.85;
justification = 'Large dataset requires density-based scatter plot to prevent overplotting';
}
}
// Medium datasets
else if (totalPoints > 1000) {
if (relationshipType === 'non_linear') {
chartType = 'smooth_scatter';
confidence = 0.8;
justification = 'Non-linear relationship benefits from smoothed trend line visualization';
}
else {
chartType = 'scatter_plot';
confidence = 0.9;
justification =
'Medium-sized dataset ideal for traditional scatter plot with trend analysis';
}
}
// Small datasets
else {
chartType = 'scatter_plot';
confidence = 0.95;
justification =
'Small dataset allows for detailed scatter plot analysis with correlation patterns and individual point inspection';
}
const encodingStrategy = this.createBivariateEncodingStrategy(xColumn, yColumn, chartType, correlation);
const interactions = this.generateBivariateInteractions(chartType, correlationStrength);
const alternatives = this.generateBivariateAlternatives(chartType, totalPoints, relationshipType);
const performance = this.generatePerformanceGuidance(totalPoints, chartType);
return {
chartType,
confidence,
statisticalJustification: justification,
dataCharacteristics: [
`Correlation: ${correlationStrength}`,
`Relationship: ${relationshipType}`,
`Sample size: ${totalPoints}`,
],
visualEncodingStrategy: encodingStrategy,
interactionRecommendations: interactions,
alternativeOptions: alternatives,
performanceConsiderations: performance,
};
}
// Helper methods for creating encoding strategies
static createUnivariateEncodingStrategy(columnAnalysis, chartType, distribution) {
const primaryEncoding = {
channel: chartType.includes('bar') ? 'y' : 'x',
dataField: columnAnalysis.columnName,
dataType: 'quantitative',
scale: this.recommendScale(columnAnalysis, distribution),
justification: `Primary ${chartType.includes('bar') ? 'vertical' : 'horizontal'} encoding for ${columnAnalysis.columnName}`,
};
const colorStrategy = {
scheme: 'sequential',
palette: distribution.outlierSeverity === 'severe' ? 'viridis' : 'blues',
accessibility: {
colorBlindnessSafe: true,
contrastRatio: 4.5,
alternativeEncodings: ['pattern', 'texture'],
screenReaderGuidance: `Distribution of ${columnAnalysis.columnName}`,
},
reasoning: 'Sequential color scheme appropriate for continuous numerical data',
};
return {
primaryEncoding,
secondaryEncodings: [],
colorStrategy,
aestheticOptimizations: this.generateAestheticOptimizations(distribution),
};
}
static createBivariateEncodingStrategy(xColumn, yColumn, chartType, correlation) {
const xEncoding = {
channel: 'x',
dataField: xColumn.columnName,
dataType: 'quantitative',
scale: this.recommendScale(xColumn),
justification: `Horizontal axis encoding for ${xColumn.columnName}`,
};
const yEncoding = {
channel: 'y',
dataField: yColumn.columnName,
dataType: 'quantitative',
scale: this.recommendScale(yColumn),
justification: `Vertical axis encoding for ${yColumn.columnName}`,
};
const colorStrategy = {
scheme: correlation?.strength === 'very_strong' ? 'diverging' : 'categorical',
palette: correlation?.direction === 'negative' ? 'rdbu' : 'plasma',
accessibility: {
colorBlindnessSafe: true,
contrastRatio: 4.5,
alternativeEncodings: ['size', 'shape'],
screenReaderGuidance: `Relationship between ${xColumn.columnName} and ${yColumn.columnName}`,
},
reasoning: `${correlation?.strength || 'Unknown'} correlation benefits from ${correlation?.strength === 'very_strong' ? 'diverging' : 'categorical'} color scheme`,
};
return {
primaryEncoding: xEncoding,
secondaryEncodings: [yEncoding],
colorStrategy,
aestheticOptimizations: [],
};
}
// Helper methods for scale recommendations
static recommendScale(columnAnalysis, distribution) {
const stats = columnAnalysis.descriptiveStats;
const min = stats?.minimum || 0;
const max = stats?.maximum || 100;
// Detect if log scale would be beneficial
if (distribution?.recommendedTransformation === 'log' || (max / min > 1000 && min > 0)) {
return {
type: 'log',
domain: [min, max],
nice: true,
reasoning: 'Log scale recommended due to wide range or right-skewed distribution',
};
}
// Standard linear scale
return {
type: 'linear',
domain: [min, max],
nice: true,
zero: min >= 0, // Include zero if all values are positive
reasoning: 'Linear scale appropriate for normal distribution with reasonable range',
};
}
// Utility methods
static isNumerical(dataType) {
return dataType === types_1.EdaDataType.NUMERICAL_FLOAT || dataType === types_1.EdaDataType.NUMERICAL_INTEGER;
}
static isCategorical(dataType) {
return dataType === types_1.EdaDataType.CATEGORICAL;
}
static isTemporal(dataType) {
return dataType === types_1.EdaDataType.DATE_TIME;
}
static calculateEntropy(columnAnalysis) {
// Enhanced entropy calculation using actual frequency data if available
const frequencyData = columnAnalysis.frequencyDistribution;
const totalValues = columnAnalysis.totalValues || 1;
if (!frequencyData) {
// Fallback to simplified calculation
const uniqueValues = columnAnalysis.uniqueValues || 1;
if (uniqueValues === 1)
return 0;
const probability = 1 / uniqueValues;
return -uniqueValues * probability * Math.log2(probability);
}
// Calculate entropy using actual frequencies
let entropy = 0;
for (const freqItem of frequencyData) {
const count = freqItem.count || 0;
const probability = count / totalValues;
if (probability > 0) {
entropy -= probability * Math.log2(probability);
}
}
return entropy;
}
static detectOrderedCategories(columnAnalysis) {
const columnName = columnAnalysis.columnName.toLowerCase();
// Common ordinal indicators in column names
const ordinalKeywords = [
'rating',
'level',
'grade',
'score',
'rank',
'priority',
'stage',
'class',
'tier',
'scale',
'order',
'sequence',
'step',
'phase',
'generation',
'version',
'size',
'magnitude',
'intensity',
'severity',
];
// Check for ordinal keywords
if (ordinalKeywords.some((keyword) => columnName.includes(keyword))) {
return true;
}
// Check for sequential patterns in actual values if available
const frequencyData = columnAnalysis.frequencyDistribution;
if (frequencyData) {
const categories = Object.keys(frequencyData);
// Check for numerical sequences (1,2,3 or A,B,C)
const isNumericSequence = categories.every((cat) => !isNaN(Number(cat)));
if (isNumericSequence && categories.length > 1) {
const numbers = categories.map(Number).sort((a, b) => a - b);
const isConsecutive = numbers.every((num, i) => i === 0 || num === numbers[i - 1] + 1);
return isConsecutive;
}
// Check for alphabetical sequences
if (categories.length <= 10 && categories.every((cat) => cat.length === 1)) {
const sortedCats = [...categories].sort();
return sortedCats.join('') === categories.sort().join('');
}
}
return false;
}
static extractDataCharacteristics(distribution, cardinality) {
const characteristics = [];
characteristics.push(`Distribution: ${distribution.isNormal ? 'Normal' : 'Non-normal'}`);
characteristics.push(`Skewness: ${Math.abs(distribution.skewness) < 0.5 ? 'Symmetric' : distribution.skewness > 0 ? 'Right-skewed' : 'Left-skewed'}`);
characteristics.push(`Outliers: ${distribution.outlierSeverity}`);
if (distribution.outlierSeverity !== 'none') {
characteristics.push('outliers');
}
characteristics.push(`Cardinality: ${cardinality > 0.8 ? 'High' : cardinality > 0.3 ? 'Moderate' : 'Low'}`);
if (distribution.recommendedTransformation) {
characteristics.push(`Suggested transformation: ${distribution.recommendedTransformation}`);
}
return characteristics;
}
// Placeholder methods for generating interactions, alternatives, and performance guidance
static generateUnivariateInteractions(chartType, distribution) {
const interactions = [];
interactions.push({
interactionType: 'hover',
purpose: 'Show exact values and statistics',
implementation: 'Tooltip with value, percentile, and z-score',
priority: 'essential',
statisticalBenefit: 'Allows precise value inspection and statistical context',
});
if (distribution.outlierSeverity !== 'none') {
interactions.push({
interactionType: 'click',
purpose: 'Highlight outliers',
implementation: 'Click to highlight outlier points and show outlier analysis',
priority: 'recommended',
statisticalBenefit: 'Facilitates outlier investigation and data quality assessment',
});
}
return interactions;
}
static generateUnivariateAlternatives(chartType, distribution, cardinality) {
const alternatives = [];
if (chartType !== 'box_plot' && distribution.outlierSeverity !== 'none') {
alternatives.push({
chartType: 'box_plot',
confidence: 0.8,
tradeoffs: 'Less detailed distribution but better outlier visibility',
whenToUse: 'When outlier analysis is primary concern',
statisticalSuitability: 0.85,
});
}
if (chartType !== 'violin_plot' && !distribution.isNormal) {
alternatives.push({
chartType: 'violin_plot',
confidence: 0.75,
tradeoffs: 'More complex but shows full distribution shape',
whenToUse: 'When distribution shape analysis is important',
statisticalSuitability: 0.8,
});
}
// Always provide basic alternatives for numerical data
if (chartType === 'histogram') {
alternatives.push({
chartType: 'density_plot',
confidence: 0.7,
tradeoffs: 'Smooth distribution curve but may hide discrete patterns',
whenToUse: 'When emphasizing overall distribution shape',
statisticalSuitability: 0.75,
});
}
if (chartType !== 'scatter_plot' && cardinality > 0.3) {
alternatives.push({
chartType: 'scatter_plot',
confidence: 0.6,
tradeoffs: 'Shows individual data points but may have overplotting',
whenToUse: 'When examining individual observations',
statisticalSuitability: 0.7,
});
}
return alternatives;
}
static generatePerformanceGuidance(dataSize, chartType) {
let threshold = 10000;
let samplingStrategy;
const aggregationSuggestions = [];
const renderingOptimizations = [];
const memoryConsiderations = [];
// Handle very small datasets
if (dataSize < 10) {
threshold = Math.max(dataSize, 5);
aggregationSuggestions.push('Small dataset allows for detailed analysis');
renderingOptimizations.push('No optimization needed for small dataset');
}
else if (dataSize > 100000) {
threshold = 5000;
samplingStrategy = 'Stratified random sampling maintaining distribution characteristics';
aggregationSuggestions.push('Bin data for histogram display');
renderingOptimizations.push('Use canvas rendering instead of SVG');
memoryConsiderations.push('Stream data processing to avoid loading full dataset');
}
else if (dataSize > 10000) {
aggregationSuggestions.push('Consider binning for performance');
renderingOptimizations.push('Optimize for moderate data size');
}
return {
dataPointThreshold: threshold,
samplingStrategy,
aggregationSuggestions,
renderingOptimizations,
memoryConsiderations,
};
}
static generateAestheticOptimizations(distribution) {
const optimizations = [];
if (distribution.outlierSeverity === 'severe') {
optimizations.push({
property: 'opacity',
value: 0.7,
reasoning: 'Reduced opacity helps manage visual impact of severe outliers',
impact: 'medium',
});
}
if (!distribution.isNormal) {
optimizations.push({
property: 'binning',
value: 'adaptive',
reasoning: 'Adaptive binning better represents non-normal distributions',
impact: 'high',
});
}
return optimizations;
}
// Temporal visualization methods
static createTemporalEncodingStrategy(columnAnalysis, chartType) {
return {
primaryEncoding: {
channel: 'x',
dataField: columnAnalysis.columnName,
dataType: 'temporal',
scale: { type: 'time', domain: [], reasoning: 'Temporal scale for time-series data' },
justification: 'X-axis encoding for temporal progression',
},
secondaryEncodings: [],
colorStrategy: {
scheme: 'categorical',
palette: 'category10',
reasoning: 'Categorical color scheme for temporal data visualization',
accessibility: {
colorBlindnessSafe: true,
contrastRatio: 4.5,
alternativeEncodings: ['shape', 'pattern'],
screenReaderGuidance: 'Time-series data with temporal progression'
},
},
aestheticOptimizations: [],
};
}
static generateTemporalInteractions(chartType) {
return [
{
interactionType: 'zoom',
purpose: 'Temporal range selection',
implementation: 'Brush selection for time periods',
priority: 'essential',
statisticalBenefit: 'Detailed analysis of specific time periods',
},
];
}
static generateTemporalAlternatives(chartType) {
return [
{
chartType: 'area_chart',
confidence: 0.7,
statisticalSuitability: 0.7,
tradeoffs: 'Shows magnitude and trend but may obscure precise values',
whenToUse: 'When emphasizing cumulative trends',
},
];
}
// Categorical-Numerical visualization methods
static createCategoricalNumericalEncodingStrategy(catColumn, numColumn, chartType) {
return {
primaryEncoding: {
channel: 'x',
dataField: catColumn.columnName,
dataType: 'nominal',
scale: { type: 'ordinal', domain: [], reasoning: 'Categorical scale for grouping' },
justification: 'X-axis encoding for categorical groups',
},
secondaryEncodings: [{
channel: 'y',
dataField: numColumn.columnName,
dataType: 'quantitative',
scale: { type: 'linear', domain: [], reasoning: 'Linear scale for numerical values' },
justification: 'Y-axis encoding for numerical comparison',
}],
colorStrategy: {
scheme: 'categorical',
palette: 'category10',
reasoning: 'Categorical color scheme for group distinction',
accessibility: {
colorBlindnessSafe: true,
contrastRatio: 4.5,
alternativeEncodings: ['pattern'],
screenReaderGuidance: 'Grouped categorical data with numerical values'
},
},
aestheticOptimizations: [],
};
}
static generateCategoricalNumericalInteractions(chartType) {
return [
{
interactionType: 'hover',
purpose: 'Group-specific value inspection',
implementation: 'Tooltip with category and value details',
priority: 'essential',
statisticalBenefit: 'Precise value comparison between categories',
},
];
}
static generateCategoricalNumericalAlternatives(chartType) {
return [
{
chartType: 'box_plot',
confidence: 0.8,
statisticalSuitability: 0.85,
tradeoffs: 'Shows distribution but obscures individual values',
whenToUse: 'When analyzing distributions within categories',
},
];
}
// Categorical-Categorical bivariate methods
static createCategoricalBivariateEncodingStrategy(xColumn, yColumn, chartType) {
return {
primaryEncoding: {
channel: 'x',
dataField: xColumn.columnName,
dataType: 'nominal',
scale: { type: 'ordinal', domain: [], reasoning: 'Categorical scale for x-axis grouping' },
justification: 'X-axis encoding for first categorical variable',
},
secondaryEncodings: [{
channel: 'y',
dataField: yColumn.columnName,
dataType: 'nominal',
scale: { type: 'ordinal', domain: [], reasoning: 'Categorical scale for y-axis grouping' },
justification: 'Y-axis encoding for second categorical variable',
}],
colorStrategy: {
scheme: 'sequential',
palette: 'viridis',
reasoning: 'Sequential color scheme for heatmap intensity mapping',
accessibility: {
colorBlindnessSafe: true,
contrastRatio: 4.5,
alternativeEncodings: ['pattern', 'size'],
screenReaderGuidance: 'Heatmap showing categorical associations with intensity'
},
},
aestheticOptimizations: [],
};
}
static generateCategoricalBivariateInteractions(chartType) {
return [
{
interactionType: 'hover',
purpose: 'Cell-specific association inspection',
implementation: 'Tooltip with category combination and association strength',
priority: 'essential',
statisticalBenefit: 'Precise association analysis between category pairs',
},
];
}
static generateCategoricalBivariateAlternatives(chartType) {
return [
{
chartType: 'grouped_bar_chart',
confidence: 0.7,
statisticalSuitability: 0.75,
tradeoffs: 'Easier to read exact values but harder to see overall patterns',
whenToUse: 'When precise comparisons between specific categories are needed',
},
];
}
// Additional placeholder methods
static recommendTemporalUnivariate(columnAnalysis) {
const chartType = 'line_chart';
const confidence = 0.9;
const justification = 'temporal data shows trends and patterns best with line chart visualization';
const encodingStrategy = this.createTemporalEncodingStrategy(columnAnalysis, chartType);
const interactions = this.generateTemporalInteractions(chartType);
const alternatives = this.generateTemporalAlternatives(chartType);
const performance = this.generatePerformanceGuidance(columnAnalysis.totalValues, chartType);
return {
chartType,
confidence,
statisticalJustification: justification,
dataCharacteristics: ['temporal', 'time-series', 'chronological'],
visualEncodingStrategy: encodingStrategy,
interactionRecommendations: interactions,
alternativeOptions: alternatives,
performanceConsiderations: performance,
};
}
static recommendBooleanUnivariate(columnAnalysis) {
return this.createFallbackRecommendation(columnAnalysis, 'pie_chart');
}
static recommendCategoricalNumerical(catColumn, numColumn) {
const chartType = 'grouped_bar_chart';
// For categorical-numerical relationships, we generally have lower confidence
// since the relationship strength is harder to assess without correlation data
const confidence = 0.5;
const justification = 'categorical data with numerical values best shown with grouped bar chart for comparison';
const encodingStrategy = this.createCategoricalNumericalEncodingStrategy(catColumn, numColumn, chartType);
const interactions = this.generateCategoricalNumericalInteractions(chartType);
const alternatives = this.generateCategoricalNumericalAlternatives(chartType);
const performance = this.generatePerformanceGuidance(catColumn.totalValues + numColumn.totalValues, chartType);
return {
chartType,
confidence,
statisticalJustification: justification,
dataCharacteristics: ['categorical', 'numerical', 'comparative'],
visualEncodingStrategy: encodingStrategy,
interactionRecommendations: interactions,
alternativeOptions: alternatives,
performanceConsiderations: performance,
};
}
static recommendCategoricalBivariate(xColumn, yColumn) {
const chartType = 'heatmap';
const confidence = 0.9;
const justification = 'Categorical vs categorical relationships best shown with heatmap to reveal association patterns';
const encodingStrategy = this.createCategoricalBivariateEncodingStrategy(xColumn, yColumn, chartType);
const interactions = this.generateCategoricalBivariateInteractions(chartType);
const alternatives = this.generateCategoricalBivariateAlternatives(chartType);
const performance = this.generatePerformanceGuidance(xColumn.totalValues + yColumn.totalValues, chartType);
return {
chartType,
confidence,
statisticalJustification: justification,
dataCharacteristics: ['categorical', 'bivariate', 'association'],
visualEncodingStrategy: encodingStrategy,
interactionRecommendations: interactions,
alternativeOptions: alternatives,
performanceConsiderations: performance,
};
}
static recommendTemporalBivariate(xColumn, yColumn) {
return this.createFallbackBivariateRecommendation(xColumn, yColumn, 'line_chart');
}
static createCategoricalEncodingStrategy(columnAnalysis, chartType) {
return {
primaryEncoding: {
channel: 'x',
dataField: columnAnalysis.columnName,
dataType: 'nominal',
scale: {
type: 'ordinal',
domain: [],
reasoning: 'Categorical data requires ordinal scale',
},
justification: 'Primary categorical encoding',
},
secondaryEncodings: [],
colorStrategy: {
scheme: 'categorical',
palette: 'category10',
accessibility: {
colorBlindnessSafe: true,
contrastRatio: 4.5,
alternativeEncodings: ['pattern'],
screenReaderGuidance: `Categories of ${columnAnalysis.columnName}`,
},
reasoning: 'Categorical color scheme for distinct categories',
},
aestheticOptimizations: [],
};
}
static generateCategoricalInteractions(chartType, uniqueValues) {
return [
{
interactionType: 'hover',
purpose: 'Show category details',
implementation: 'Tooltip with frequency and percentage',
priority: 'essential',
statisticalBenefit: 'Provides exact frequency information',
},
];
}
static generateCategoricalAlternatives(chartType, uniqueValues, entropy) {
const alternatives = [];
// For bar charts, suggest alternatives based on cardinality and entropy
if (chartType === 'bar_chart' || chartType === 'horizontal_bar_chart') {
if (uniqueValues <= 5 && entropy > 1.0) {
alternatives.push({
chartType: 'pie_chart',
confidence: 0.8,
tradeoffs: 'Shows proportions clearly but harder to compare exact values',
whenToUse: 'When emphasizing part-to-whole relationships',
statisticalSuitability: 0.85,
});
alternatives.push({
chartType: 'donut_chart',
confidence: 0.75,
tradeoffs: 'Better space utilization but harder to compare small segments',
whenToUse: 'When space is limited and proportions are important',
statisticalSuitability: 0.75,
});
}
if (uniqueValues > 10) {
alternatives.push({
chartType: 'treemap',
confidence: 0.7,
tradeoffs: 'Handles many categories well but less precise value comparison',
whenToUse: 'When dealing with high cardinality categorical data',
statisticalSuitability: 0.8,
});
}
}
// For pie charts, suggest bar chart alternative
if (chartType === 'pie_chart') {
alternatives.push({
chartType: 'bar_chart',
confidence: 0.85,
tradeoffs: 'Better for precise value comparison but loses part-to-whole context',
whenToUse: 'When exact value comparison is more important than proportions',
statisticalSuitability: 0.9,
});
}
// For high cardinality, suggest packed bubble chart
if (uniqueValues > 20) {
alternatives.push({
chartType: 'packed_bubble',
confidence: 0.65,
tradeoffs: 'Visually appealing for many categories but imprecise value reading',
whenToUse: 'When visual impact is important and precise values are secondary',
statisticalSuitability: 0.6,
});
}
return alternatives;
}
static generateBivariateInteractions(chartType, correlationStrength) {
const interactions = [];
// Essential hover interaction for all bivariate charts
interactions.push({
interactionType: 'hover',
purpose: 'Show point details and statistical context',
implementation: 'Tooltip with coordinates, residuals, and leverage values',
priority: 'essential',
statisticalBenefit: 'Provides immediate access to point-level statistics',
});
// Brush selection for subset analysis
interactions.push({
interactionType: 'brush',
purpose: 'Select data subset for analysis',
implementation: 'Brush selection with linked summary statistics and correlation update',
priority: 'recommended',
statisticalBenefit: 'Enables subset analysis and outlier investigation',
});
// Zoom for detailed examination
if (chartType.includes('scatter') || chartType.includes('hexbin')) {
interactions.push({
interactionType: 'zoom',
purpose: 'Examine dense regions in detail',
implementation: 'Semantic zoom maintaining statistical context',
priority: 'recommended',
statisticalBenefit: 'Allows detailed examination of high-density regions',
});
}
// Filter for outlier management
interactions.push({
interactionType: 'filter',
purpose: 'Remove outliers or focus on data ranges',
implementation: 'Interactive filtering with real-time correlation updates',
priority: 'optional',
statisticalBenefit: 'Enables robust analysis by handling outliers systematically',
});
// For strong correlations, add trend line interaction
if (correlationStrength === 'strong' || correlationStrength === 'very_strong') {
interactions.push({
interactionType: 'click',
purpose: 'Toggle regression line and confidence intervals',
implementation: 'Click to show/hide trend analysis with R² and confidence bands',
priority: 'recommended',
statisticalBenefit: 'Provides immediate access to regression analysis',
});
}
return interactions;
}
static generateBivariateAlternatives(chartType, totalPoints, relationshipType) {
const alternatives = [];
// For scatter plots
if (chartType === 'scatter_plot') {
if (totalPoints > 5000) {
alternatives.push({
chartType: 'hexbin_plot',
confidence: 0.85,
tradeoffs: 'Better for large datasets but loses individual point detail',
whenToUse: 'When data density patterns are more important than individual points',
statisticalSuitability: 0.9,
});
alternatives.push({
chartType: 'density_scatter',
confidence: 0.8,
tradeoffs: 'Reveals density patterns but may obscure outliers',
whenToUse: 'For very large datasets where overplotting is a concern',
statisticalSuitability: 0.85,
});
}
if (relationshipType === 'non_linear') {
alternatives.push({
chartType: 'smooth_scatter',
confidence: 0.8,
tradeoffs: 'Shows trend clearly but may oversimplify complex relationships',
whenToUse: 'When trend visualization is more important than individual points',
statisticalSuitability: 0.85,
});
}
alternatives.push({
chartType: 'contour_plot',
confidence: 0.7,
tradeoffs: 'Shows density patterns but loses individual point detail',
whenToUse: 'For large datasets where point density is important',
statisticalSuitability: 0.8,
});
}
// For hexbin plots
if (chartType === 'hexbin_plot') {
alternatives.push({
chartType: 'scatter_plot',
confidence: 0.6,
tradeoffs: 'Shows individual points but may have overplotting issues',
whenToUse: 'When individual point analysis is needed despite large dataset',
statisticalSuitability: 0.7,
});
alternatives.push({
chartType: 'heatmap_2d',
confidence: 0.75,
tradeoffs: 'Regular grid may not align well with data distribution',
whenToUse: 'When rectangular binning is preferred over hexagonal',
statisticalSuitability: 0.8,
});
}
// For large datasets, always suggest sampling approach
if (totalPoints > 50000) {
alternatives.push({
chartType: 'sampled_scatter',
confidence: 0.7,
tradeoffs: 'Faster rendering but may miss important data patterns',
whenToUse: 'When performance is critical and patterns are robust to sampling',
statisticalSuitability: 0.75,
});
}
return alternatives;
}
static createFallbackRecommendation(columnAnalysis, defaultChart = 'bar_chart') {
return {
chartType: defaultChart,
confidence: 0.3,
statisticalJustification: 'Default recommendation - statistical analysis incomplete',
dataCharacteristics: ['Insufficient statistical analysis'],
visualEncodingStrategy: {
primaryEncoding: {
channel: 'x',
dataField: columnAnalysis.columnName,
dataType: 'nominal',
scale: { type: 'ordinal', domain: [], reasoning: 'Default scale' },
justification: 'Default encoding',
},
secondaryEncodings: [],
colorStrategy: {
scheme: 'categorical',
palette: 'category10',
accessibility: {
colorBlindnessSafe: true,
contrastRatio: 4.5,
alternativeEncodings: [],
screenReaderGuidance: 'Default chart',
},
reasoning: 'Default color strategy',
},
aestheticOptimizations: [],
},
interactionRecommendations: [],
alternativeOptions: [],
performanceConsiderations: {
dataPointThreshold: 10000,
aggregationSuggestions: [],
renderingOptimizations: [],
memoryConsiderations: [],
},
};
}
static createFallbackBivariateRecommendation(xColumn, yColumn, defaultChart = 'scatter_plot') {
return {
chartType: defaultChart,
confidence: 0.5,
statisticalJustification: 'Default bivariate recommendation',
dataCharacteristics: ['Bivariate relationship analysis incomplete'],
visualEncodingStrategy: {
primaryEncoding: {
channel: 'x',
dataField: xColumn.columnName,
dataType: 'quantitative',
scale: { type: 'linear', domain: [0, 100], reasoning: 'Default linear scale' },
justification: 'Default x-axis encoding',
},
secondaryEncodings: [
{
channel: 'y',
dataField: yColumn.columnName,
dataType: 'quantitative',
scale: { type: 'linear', domain: [0, 100], reasoning: 'Default linear scale' },
justification: 'Default y-axis encoding',
},
],
colorStrategy: {
scheme: 'categorical',
palette: 'category10',
accessibility: {
colorBlindnessSafe: true,
contrastRatio: 4.5,
alternativeEncodings: [],
screenReaderGuidance: 'Default bivariate chart',
},
reasoning: 'Default color strategy',
},
aestheticOptimizations: [],
},
interactionRecommendations: [],
alternativeOptions: [],
performanceConsiderations: {
dataPointThreshold: 10000,
aggregationSuggestions: [],
renderingOptimizations: [],
memoryConsiderations: [],
},
};
}
/**
* Create recommendation for identifier columns - avoid meaningless frequency charts
*/
static createIdentifierRecommendation(columnAnalysis) {
return {
chartType: 'summary_table',
confidence: 0.9,
statisticalJustification: `Column '${columnAnalysis.columnName}' appears to be an identifier with ${columnAnalysis.uniquePercentage?.toFixed(1)}% unique values. Frequency-based visualizations are not meaningful for unique identifiers.`,
dataCharacteristics: