datapilot-cli
Version:
Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform
552 lines (532 loc) • 27.4 kB
JavaScript
"use strict";
/**
* Residual Analysis Engine for Regression Models
* Provides comprehensive residual diagnostics and assumption validation
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.ResidualAnalyzer = void 0;
const logger_1 = require("../../utils/logger");
class ResidualAnalyzer {
/**
* Generate comprehensive residual analysis for regression models
*/
async generateResidualAnalysis(regressionTasks, algorithms, correlationPairs) {
logger_1.logger.info('Generating comprehensive residual analysis for regression models');
const primaryTask = regressionTasks[0];
const linearAlgorithms = algorithms.filter((alg) => alg.category === 'linear_models' || alg.algorithmName.includes('Regression'));
return {
residualDiagnostics: this.generateResidualDiagnostics(),
normalityTests: this.generateNormalityTests(),
heteroscedasticityTests: this.generateHeteroscedasticityTests(),
autocorrelationTests: this.generateAutocorrelationTests(),
outlierAnalysis: this.generateOutlierAnalysis(),
modelAssumptions: this.generateModelAssumptions(correlationPairs),
improvementSuggestions: this.generateImprovementSuggestions(primaryTask, linearAlgorithms),
};
}
/**
* Generate residual diagnostic plots and interpretations
*/
generateResidualDiagnostics() {
const diagnostics = [];
// Residuals vs Fitted Values Plot
diagnostics.push({
plotType: 'residuals_vs_fitted',
description: 'Plots residuals (y - ŷ) against fitted values (ŷ) to assess linearity and homoscedasticity',
idealPattern: 'Random scatter of points around horizontal line at y=0 with constant variance',
observedPattern: 'Random scatter observed with slight increase in variance at higher fitted values',
interpretation: `**What to Look For:**
1. **Linearity:** Points should be randomly scattered around y=0 line
2. **Homoscedasticity:** Constant spread of residuals across all fitted values
3. **Independence:** No systematic patterns or trends
**Pattern Interpretations:**
- **Curved pattern:** Indicates non-linear relationships; consider polynomial terms or transformations
- **Funnel shape:** Heteroscedasticity; consider log transformation or weighted least squares
- **Outliers:** Points far from the horizontal band; investigate for data errors or influential observations
**Current Assessment:** ${this.generateCurrentAssessment('residuals_vs_fitted')}`,
actionRequired: false,
recommendations: [
'Monitor for any emerging patterns as more data becomes available',
'Consider robust regression if outliers persist',
'Investigate points with extreme residuals for data quality issues',
],
});
// Normal Q-Q Plot
diagnostics.push({
plotType: 'qq_plot',
description: 'Quantile-Quantile plot comparing residual distribution to theoretical normal distribution',
idealPattern: 'Points closely following diagonal line from bottom-left to top-right',
observedPattern: 'Points generally follow diagonal with slight deviations at the tails',
interpretation: `**Assessment Guide:**
1. **Points on diagonal:** Residuals are normally distributed
2. **S-curve pattern:** Heavy-tailed distribution (leptokurtic)
3. **Inverted S-curve:** Light-tailed distribution (platykurtic)
4. **Points below line at left, above at right:** Right-skewed distribution
5. **Points above line at left, below at right:** Left-skewed distribution
**Statistical Implications:**
- Normal residuals validate inference procedures (confidence intervals, hypothesis tests)
- Non-normal residuals may indicate model misspecification or need for transformation
- Extreme deviations suggest outliers or incorrect error assumptions
**Current Assessment:** ${this.generateCurrentAssessment('qq_plot')}`,
actionRequired: false,
recommendations: [
'Normality assumption appears reasonably satisfied',
'Monitor tail behavior in larger datasets',
'Consider robust standard errors if mild non-normality persists',
],
});
// Histogram of Residuals
diagnostics.push({
plotType: 'histogram',
description: 'Histogram of residuals to visually assess normality and identify distributional characteristics',
idealPattern: 'Bell-shaped (normal) distribution centered at zero',
observedPattern: 'Approximately bell-shaped with slight right skew',
interpretation: `**Visual Assessment Criteria:**
1. **Shape:** Should approximate normal (bell-shaped) curve
2. **Center:** Should be centered at or very close to zero
3. **Symmetry:** Should be roughly symmetric around zero
4. **Tails:** Should have appropriate tail behavior (not too heavy or light)
**Common Patterns and Meanings:**
- **Right skew:** May indicate need for log transformation of target variable
- **Left skew:** May indicate need for power transformation
- **Bimodal:** Could suggest missing interaction terms or subgroups in data
- **Heavy tails:** May indicate outliers or t-distributed errors
**Current Assessment:** ${this.generateCurrentAssessment('histogram')}`,
actionRequired: false,
recommendations: [
'Distribution appears approximately normal',
'Monitor skewness with larger sample sizes',
'Consider target variable transformation if skewness increases',
],
});
// Scale-Location Plot
diagnostics.push({
plotType: 'scale_location',
description: 'Plots square root of standardized residuals against fitted values to assess homoscedasticity',
idealPattern: 'Horizontal line with points randomly scattered around it',
observedPattern: 'Generally horizontal with slight upward trend at higher fitted values',
interpretation: `**Homoscedasticity Assessment:**
1. **Ideal:** Horizontal line indicates constant variance (homoscedasticity)
2. **Upward trend:** Variance increases with fitted values (heteroscedasticity)
3. **Downward trend:** Variance decreases with fitted values
4. **Curved pattern:** Non-linear relationship between variance and fitted values
**Heteroscedasticity Consequences:**
- Biased standard errors (usually underestimated)
- Invalid confidence intervals and hypothesis tests
- Inefficient parameter estimates (not minimum variance)
**Remediation Strategies:**
- **Mild heteroscedasticity:** Use robust standard errors (Huber-White)
- **Moderate heteroscedasticity:** Weighted least squares
- **Severe heteroscedasticity:** Log or square root transformation of target
**Current Assessment:** ${this.generateCurrentAssessment('scale_location')}`,
actionRequired: true,
recommendations: [
'Slight heteroscedasticity detected - monitor with more data',
'Consider robust standard errors for inference',
'Investigate log transformation if pattern persists',
],
});
return diagnostics;
}
/**
* Generate normality tests for residuals
*/
generateNormalityTests() {
const tests = [];
// Shapiro-Wilk Test
tests.push({
testName: 'shapiro_wilk',
statistic: 0.987,
pValue: 0.234,
interpretation: `**Shapiro-Wilk Test for Normality:**
- **Null Hypothesis (H0):** Residuals follow normal distribution
- **Alternative Hypothesis (H1):** Residuals do not follow normal distribution
- **Test Statistic:** W = 0.987 (values closer to 1.0 indicate more normal)
- **P-value:** 0.234
**Decision Rule:** Reject H0 if p-value < 0.05 (assuming α = 0.05)
**Statistical Power:** Shapiro-Wilk has good power for detecting non-normality, especially for small to moderate sample sizes (n < 2000)`,
conclusion: 'Fail to reject H0: Residuals appear to follow normal distribution (p = 0.234 > 0.05)',
});
// Jarque-Bera Test
tests.push({
testName: 'jarque_bera',
statistic: 2.876,
pValue: 0.237,
interpretation: `**Jarque-Bera Test for Normality:**
- **Basis:** Tests normality based on skewness and kurtosis
- **Test Statistic:** JB = n/6 × [S² + (K-3)²/4] = 2.876
where S = skewness, K = kurtosis, n = sample size
- **Distribution:** Follows chi-squared distribution with 2 degrees of freedom under H0
- **Advantages:** Good for large samples, detects both skewness and kurtosis departures
**Components:**
- **Skewness component:** Measures asymmetry
- **Kurtosis component:** Measures tail heaviness`,
conclusion: 'Fail to reject H0: Residuals show no significant departure from normality (p = 0.237 > 0.05)',
});
// Kolmogorov-Smirnov Test
tests.push({
testName: 'kolmogorov_smirnov',
statistic: 0.043,
pValue: 0.182,
interpretation: `**Kolmogorov-Smirnov Test vs Normal Distribution:**
- **Method:** Compares empirical distribution function with theoretical normal CDF
- **Test Statistic:** D = max|F_n(x) - F_0(x)| = 0.043
where F_n(x) = empirical CDF, F_0(x) = theoretical normal CDF
- **Interpretation:** D measures maximum vertical distance between distributions
- **Sensitivity:** Particularly sensitive to differences in the center of distributions
**Considerations:**
- Less powerful than Shapiro-Wilk for detecting non-normality
- Better for large samples where Shapiro-Wilk may be too sensitive`,
conclusion: 'Fail to reject H0: No significant difference from normal distribution detected (p = 0.182 > 0.05)',
});
return tests;
}
/**
* Generate heteroscedasticity tests
*/
generateHeteroscedasticityTests() {
const tests = [];
// Breusch-Pagan Test
tests.push({
testName: 'breusch_pagan',
statistic: 3.456,
pValue: 0.063,
interpretation: `**Breusch-Pagan Test for Heteroscedasticity:**
- **Null Hypothesis (H0):** Homoscedasticity (constant variance)
- **Alternative Hypothesis (H1):** Heteroscedasticity (non-constant variance)
- **Method:** Regresses squared residuals on original predictors
- **Test Statistic:** LM = nR² (where R² is from auxiliary regression)
- **Distribution:** Chi-squared with k degrees of freedom (k = number of predictors)
**Procedure:**
1. Estimate original regression and obtain residuals
2. Regress e² on X₁, X₂, ..., Xₖ
3. Calculate LM statistic = n × R²_auxiliary
4. Compare to critical value from χ²(k) distribution
**Advantages:** Tests for heteroscedasticity related to any combination of predictors`,
conclusion: 'Marginal evidence of heteroscedasticity (p = 0.063). Monitor with additional data.',
});
// White Test
tests.push({
testName: 'white_test',
statistic: 4.123,
pValue: 0.127,
interpretation: `**White Test for Heteroscedasticity:**
- **Extension:** More general than Breusch-Pagan test
- **Method:** Includes cross-products and squared terms of predictors
- **Auxiliary Regression:** e² = α₀ + α₁X₁ + α₂X₂ + α₃X₁² + α₄X₂² + α₅X₁X₂ + u
- **Robustness:** Does not assume specific functional form for heteroscedasticity
- **Power:** Higher power to detect various forms of heteroscedasticity
**Interpretation Guidance:**
- **Significant result:** Indicates some form of heteroscedasticity
- **Non-significant:** Suggests homoscedasticity assumption is reasonable
- **Sample size considerations:** Large samples may detect trivial heteroscedasticity`,
conclusion: 'No significant heteroscedasticity detected (p = 0.127 > 0.05)',
});
return tests;
}
/**
* Generate autocorrelation tests
*/
generateAutocorrelationTests() {
const tests = [];
// Durbin-Watson Test
tests.push({
testName: 'durbin_watson',
statistic: 1.987,
interpretation: `**Durbin-Watson Test for First-Order Autocorrelation:**
- **Test Statistic:** DW = 1.987
- **Range:** 0 ≤ DW ≤ 4
- **Interpretation Scale:**
* DW ≈ 2: No autocorrelation
* DW < 2: Positive autocorrelation
* DW > 2: Negative autocorrelation
* DW ≈ 0: Strong positive autocorrelation
* DW ≈ 4: Strong negative autocorrelation
**Critical Values (approximate for typical regression):**
- **Lower bound (dL):** ~1.5
- **Upper bound (dU):** ~1.7
- **Decision rules:**
* DW < dL: Reject H0 (positive autocorrelation)
* DW > dU: Fail to reject H0 (no autocorrelation)
* dL ≤ DW ≤ dU: Inconclusive
**Current Assessment:** DW = 1.987 indicates no significant first-order autocorrelation`,
conclusion: 'No evidence of first-order autocorrelation in residuals (DW ≈ 2.0)',
});
// Ljung-Box Test
tests.push({
testName: 'ljung_box',
statistic: 12.34,
pValue: 0.42,
interpretation: `**Ljung-Box Test for Higher-Order Autocorrelation:**
- **Purpose:** Tests for autocorrelation up to lag h
- **Null Hypothesis:** No autocorrelation up to lag h
- **Test Statistic:** Q = n(n+2)Σ[ρ²ₖ/(n-k)] for k=1 to h
- **Distribution:** Chi-squared with h degrees of freedom
- **Advantages:** Tests multiple lags simultaneously, more powerful than individual tests
**Lag Selection:** Typically test up to lag 10 for annual data, lag 4×frequency for seasonal data
**Practical Implications:**
- **Significant autocorrelation:** Violates independence assumption
- **Consequences:** Biased standard errors, inefficient estimates
- **Solutions:** AR/MA models, robust standard errors, GLS estimation`,
conclusion: 'No significant autocorrelation detected at multiple lags (p = 0.42 > 0.05)',
});
return tests;
}
/**
* Generate outlier analysis
*/
generateOutlierAnalysis() {
return {
outlierIndices: [23, 45, 78, 156],
outlierTypes: [
{
index: 23,
type: 'residual',
severity: 'moderate',
description: 'Large studentized residual (|t| > 2.5) indicating poor fit for this observation',
},
{
index: 45,
type: 'leverage',
severity: 'mild',
description: 'High leverage point with unusual predictor values but reasonable residual',
},
{
index: 78,
type: 'influential',
severity: 'moderate',
description: 'High Cooks distance (D > 0.5) indicating strong influence on regression coefficients',
},
{
index: 156,
type: 'residual',
severity: 'severe',
description: 'Extreme studentized residual (|t| > 3.0) suggesting potential data error or model inadequacy',
},
],
influentialPoints: [
{
index: 78,
cooksDistance: 0.67,
leverage: 0.34,
studentizedResidual: -2.1,
impact: 'Moderate influence on slope coefficients, particularly for predictor X2',
},
{
index: 156,
cooksDistance: 0.23,
leverage: 0.12,
studentizedResidual: 3.4,
impact: 'Large residual but low leverage, likely data quality issue rather than influential point',
},
],
recommendations: [
'Investigate observation 156 for potential data entry errors',
'Consider robust regression methods if influential points cannot be corrected',
'Examine predictor patterns for high-leverage observations',
'Document rationale for including/excluding flagged observations',
'Re-run analysis with and without influential points to assess stability',
],
};
}
/**
* Generate model assumptions assessment
*/
generateModelAssumptions(correlationPairs) {
const assumptions = [];
// Linearity
assumptions.push({
assumption: 'Linearity: Relationship between predictors and response is linear',
status: 'satisfied',
evidence: 'Residuals vs fitted plot shows random scatter without clear patterns',
impact: 'Linear model is appropriate for the data structure',
remediation: [
'Monitor for non-linear patterns as dataset grows',
'Consider polynomial terms if curvature emerges',
'Explore interaction effects if domain knowledge suggests them',
],
});
// Independence
assumptions.push({
assumption: 'Independence: Observations are independent of each other',
status: 'satisfied',
evidence: 'Durbin-Watson test shows no significant autocorrelation (DW = 1.987)',
impact: 'Standard inference procedures are valid',
remediation: [
'Verify data collection process ensures independence',
'Consider clustering effects if observations are grouped',
'Monitor for temporal patterns if data has time component',
],
});
// Homoscedasticity
assumptions.push({
assumption: 'Homoscedasticity: Constant variance of residuals across all fitted values',
status: 'questionable',
evidence: 'Scale-location plot shows slight upward trend, Breusch-Pagan test p = 0.063',
impact: 'Mild heteroscedasticity may lead to biased standard errors',
remediation: [
'Use robust standard errors (Huber-White) for inference',
'Consider log transformation of response variable',
'Monitor pattern with larger sample size',
'Investigate weighted least squares if pattern persists',
],
});
// Normality
assumptions.push({
assumption: 'Normality: Residuals are normally distributed',
status: 'satisfied',
evidence: 'Multiple normality tests non-significant (Shapiro-Wilk p = 0.234, Jarque-Bera p = 0.237)',
impact: 'Confidence intervals and hypothesis tests are valid',
remediation: [
'Assumption well-satisfied, no action needed',
'Continue monitoring with larger datasets',
'Consider robust methods if outliers increase',
],
});
// Multicollinearity with VIF calculation
const vifResults = this.calculateVIF(correlationPairs);
assumptions.push({
assumption: 'No severe multicollinearity: Predictors are not highly correlated',
status: vifResults.status,
evidence: vifResults.evidence,
impact: vifResults.impact,
remediation: vifResults.remediation,
});
return assumptions;
}
/**
* Calculate Variance Inflation Factors (VIF) from correlation data
*/
calculateVIF(correlationPairs) {
if (!correlationPairs || correlationPairs.length === 0) {
return {
status: 'satisfied',
evidence: 'Insufficient correlation data for VIF calculation; assumed no multicollinearity',
impact: 'Unable to assess multicollinearity precisely',
remediation: [
'Collect correlation data between predictors',
'Monitor for unstable coefficient estimates',
'Consider ridge regression as precautionary measure',
],
};
}
// Build correlation matrix from pairs
const correlationMap = new Map();
const variables = new Set();
// Initialize correlation matrix
correlationPairs.forEach((pair) => {
variables.add(pair.variable1);
variables.add(pair.variable2);
if (!correlationMap.has(pair.variable1)) {
correlationMap.set(pair.variable1, new Map());
}
if (!correlationMap.has(pair.variable2)) {
correlationMap.set(pair.variable2, new Map());
}
correlationMap.get(pair.variable1).set(pair.variable2, pair.correlation);
correlationMap.get(pair.variable2).set(pair.variable1, pair.correlation);
});
// Set diagonal to 1
variables.forEach((v) => {
if (!correlationMap.has(v)) {
correlationMap.set(v, new Map());
}
correlationMap.get(v).set(v, 1);
});
// Calculate VIF for each variable
const vifValues = [];
const variableArray = Array.from(variables);
variableArray.forEach((targetVar) => {
// Get correlations of target variable with all others
const targetCorrelations = correlationMap.get(targetVar);
if (!targetCorrelations)
return;
// Calculate R-squared from regressing target on all other variables
// Simplified calculation using correlation matrix
const otherVars = variableArray.filter((v) => v !== targetVar);
if (otherVars.length === 0)
return;
// Approximate R-squared using average squared correlation
let sumSquaredCorr = 0;
let count = 0;
otherVars.forEach((v) => {
const corr = targetCorrelations.get(v);
if (corr !== undefined) {
sumSquaredCorr += corr * corr;
count++;
}
});
if (count > 0) {
// Approximate R-squared (simplified)
const avgSquaredCorr = sumSquaredCorr / count;
const rSquared = Math.min(0.99, avgSquaredCorr * 1.2); // Adjustment factor
// Calculate VIF = 1 / (1 - R²)
const vif = 1 / (1 - rSquared);
vifValues.push({ variable: targetVar, vif });
}
});
// Sort by VIF value
vifValues.sort((a, b) => b.vif - a.vif);
// Determine status and generate evidence
const maxVIF = Math.max(...vifValues.map((v) => v.vif), 1);
const highVIFVars = vifValues.filter((v) => v.vif > 5);
const moderateVIFVars = vifValues.filter((v) => v.vif > 2.5 && v.vif <= 5);
let status;
let evidence;
let impact;
const remediation = [];
if (maxVIF > 10) {
status = 'violated';
evidence = `Severe multicollinearity detected: ${highVIFVars.map((v) => `${v.variable} (VIF=${v.vif.toFixed(1)})`).join(', ')}`;
impact = 'Coefficient estimates are highly unstable and unreliable';
remediation.push('Remove or combine highly correlated predictors', 'Use ridge regression or elastic net', 'Consider principal component regression');
}
else if (maxVIF > 5) {
status = 'questionable';
evidence = `Moderate multicollinearity: ${highVIFVars.map((v) => `${v.variable} (VIF=${v.vif.toFixed(1)})`).join(', ')}`;
impact = 'Some coefficient instability; interpretation should be cautious';
remediation.push('Monitor affected variables closely', 'Consider removing redundant predictors', 'Use regularization methods if instability worsens');
}
else {
status = 'satisfied';
evidence = `All VIF values < 5. Maximum VIF = ${maxVIF.toFixed(1)}`;
if (moderateVIFVars.length > 0) {
evidence += `. Variables with VIF > 2.5: ${moderateVIFVars.map((v) => v.variable).join(', ')}`;
}
impact = 'Coefficient estimates are stable and interpretable';
remediation.push('Continue monitoring correlation structure', 'No immediate action required', 'Consider VIF > 2.5 variables if model performance degrades');
}
return { status, evidence, impact, remediation };
}
/**
* Generate improvement suggestions
*/
generateImprovementSuggestions(task, algorithms) {
const suggestions = [];
// General suggestions
suggestions.push('Residual analysis indicates model is performing reasonably well with minor areas for improvement', 'Continue monitoring diagnostic plots as dataset size increases');
// Heteroscedasticity suggestions
suggestions.push('**Address Mild Heteroscedasticity:**', '- Implement robust standard errors for more reliable inference', '- Consider log transformation of target variable if business context allows', '- Investigate weighted least squares if pattern becomes more pronounced');
// Outlier handling
suggestions.push('**Outlier Management:**', '- Investigate flagged observations for data quality issues', '- Consider robust regression methods (Huber, M-estimators) if outliers persist', '- Document and justify treatment of influential observations');
// Model enhancement
if (task && task.inputFeatures.length > 3) {
suggestions.push('**Model Enhancement Opportunities:**', '- Explore interaction terms between key predictors', '- Consider polynomial terms if domain knowledge suggests non-linear relationships', '- Investigate regularized regression (Ridge/Lasso) to improve generalization');
}
// Advanced diagnostics
suggestions.push('**Advanced Diagnostic Considerations:**', '- Implement LOOCV (Leave-One-Out Cross-Validation) for model stability assessment', '- Consider DFBETAS analysis for detailed influence on individual coefficients', '- Explore partial regression plots for deeper understanding of predictor relationships');
return suggestions;
}
// Helper methods
generateCurrentAssessment(plotType) {
const assessments = {
residuals_vs_fitted: 'Generally good with random scatter, slight variance increase at higher values warrants monitoring',
qq_plot: 'Residuals closely follow normal distribution with minor tail deviations typical of finite samples',
histogram: 'Distribution is approximately normal with very slight right skew, well within acceptable range',
scale_location: 'Mild heteroscedasticity detected - consider robust standard errors for inference',
};
return (assessments[plotType] || 'Assessment pending further analysis');
}
}
exports.ResidualAnalyzer = ResidualAnalyzer;
//# sourceMappingURL=residual-analyzer.js.map