datapilot-cli
Version:
Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform
772 lines • 34.3 kB
JavaScript
"use strict";
/**
* Algorithm Recommendation Engine for Section 6
* Provides intelligent algorithm selection based on data characteristics and task requirements
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.AlgorithmRecommender = void 0;
const logger_1 = require("../../utils/logger");
class AlgorithmRecommender {
config;
constructor(config) {
this.config = config;
}
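/*
 * The shape of `config` is not defined in this file; judging from how it is
 * read below, it is assumed to expose at least:
 *   config.complexityPreference        : 'simple' | 'moderate' | 'complex'
 *   config.interpretabilityRequirement : 'high' | 'low' (other levels pass through unfiltered)
 * Both fields drive the suitability scores and the filterByPreferences() step.
 */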
/**
* Generate algorithm recommendations for a specific modeling task
*/
async recommendAlgorithms(task, section1Result, section3Result, section5Result) {
const recommendations = [];
switch (task.taskType) {
case 'regression':
recommendations.push(...this.generateRegressionRecommendations(task, section1Result, section3Result, section5Result));
break;
case 'binary_classification':
case 'multiclass_classification':
recommendations.push(...this.generateClassificationRecommendations(task, section1Result, section3Result, section5Result));
break;
case 'clustering':
recommendations.push(...this.generateClusteringRecommendations(task, section1Result, section3Result, section5Result));
break;
case 'time_series_forecasting':
recommendations.push(...this.generateTimeSeriesRecommendations(task, section1Result, section3Result, section5Result));
break;
case 'anomaly_detection':
recommendations.push(...this.generateAnomalyDetectionRecommendations(task, section1Result, section3Result, section5Result));
break;
default:
logger_1.logger.warn(`Unknown task type: ${task.taskType}`);
}
// Filter recommendations based on configuration preferences
const filteredRecommendations = this.filterByPreferences(recommendations);
// Sort by suitability score and return top recommendations
return filteredRecommendations.sort((a, b) => b.suitabilityScore - a.suitabilityScore).slice(0, 5); // Top 5 recommendations per task
}
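/*
 * Illustrative call (a sketch only; the task object and the section result
 * objects shown here are hypothetical and must match the shapes produced
 * elsewhere in the pipeline, e.g.
 * section1Result.overview.structuralDimensions.totalDataRows):
 *
 *   const recommender = new AlgorithmRecommender({
 *     complexityPreference: 'moderate',
 *     interpretabilityRequirement: 'high',
 *   });
 *   const recs = await recommender.recommendAlgorithms(
 *     { taskType: 'regression', inputFeatures: ['age', 'income', 'tenure'] },
 *     section1Result, section3Result, section5Result);
 *   // recs contains at most 5 entries, sorted by suitabilityScore (descending)
 */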
/**
* Generate regression algorithm recommendations
*/
generateRegressionRecommendations(task, section1Result, section3Result, section5Result) {
const recommendations = [];
const featureCount = task.inputFeatures.length;
const dataSize = section1Result.overview.structuralDimensions.totalDataRows;
// Linear Regression (Baseline)
recommendations.push({
algorithmName: 'Linear Regression',
category: 'linear_models',
suitabilityScore: this.calculateLinearRegressionSuitability(task, section3Result),
complexity: 'simple',
interpretability: 'high',
strengths: [
'Highly interpretable coefficients',
'Fast training and prediction',
'Well-understood statistical properties',
'Good baseline model',
'Works well with linear relationships',
],
weaknesses: [
'Assumes linear relationships',
'Sensitive to outliers',
'Requires feature scaling for optimal performance',
'May underfit complex patterns',
],
dataRequirements: [
'Linear relationship between features and target',
'Normally distributed residuals',
'Independence of observations',
'Homoscedasticity (constant variance)',
],
hyperparameters: [
{
parameterName: 'fit_intercept',
description: 'Whether to calculate intercept',
defaultValue: true,
recommendedRange: 'true/false',
tuningStrategy: 'Based on domain knowledge',
importance: 'important',
},
{
parameterName: 'normalize',
description: 'Whether to normalize features',
defaultValue: false,
recommendedRange: 'true/false',
tuningStrategy: 'Use if features have different scales',
importance: 'optional',
},
],
implementationFrameworks: ['scikit-learn', 'statsmodels', 'R', 'Julia'],
evaluationMetrics: ['R²', 'Adjusted R²', 'RMSE', 'MAE', 'MAPE'],
reasoningNotes: [
'Excellent starting point for regression analysis',
'Provides interpretable baseline for comparison',
'Essential for understanding linear relationships',
],
});
// Decision Tree Regressor (User Interest - CART)
recommendations.push({
algorithmName: 'Decision Tree Regressor (CART)',
category: 'tree_based',
suitabilityScore: this.calculateTreeSuitability(task, section3Result, 'regression'),
complexity: 'moderate',
interpretability: 'high',
strengths: [
'Handles non-linear relationships naturally',
'No assumptions about data distribution',
'Automatic feature selection',
'Robust to outliers',
'Easily interpretable decision rules',
'Can capture feature interactions',
],
weaknesses: [
'Prone to overfitting without pruning',
'Can be unstable (high variance)',
'Biased toward features with many levels',
'May create overly complex trees',
],
dataRequirements: [
'Sufficient sample size for reliable splits',
'Mixed data types acceptable',
'No strict distributional assumptions',
],
hyperparameters: this.getTreeHyperparameters('regression'),
implementationFrameworks: ['scikit-learn', 'R (rpart/tree)', 'Weka', 'XGBoost'],
evaluationMetrics: ['RMSE', 'MAE', 'R²', 'Tree depth', 'Number of leaves'],
reasoningNotes: [
'Excellent for discovering non-linear patterns',
'Provides human-readable decision rules',
'Foundation for ensemble methods',
],
});
// Random Forest (if sufficient data)
if (dataSize > 100 && featureCount >= 3) {
recommendations.push({
algorithmName: 'Random Forest Regressor',
category: 'ensemble_methods',
suitabilityScore: this.calculateEnsembleSuitability(task, section3Result, dataSize),
complexity: 'moderate',
interpretability: 'medium',
strengths: [
'Reduces overfitting compared to single trees',
'Handles large datasets efficiently',
'Provides feature importance scores',
'Robust to noise and outliers',
'Good out-of-box performance',
],
weaknesses: [
'Less interpretable than single trees',
'Can overfit with very noisy data',
'Biased toward categorical features with many categories',
'Memory intensive for large forests',
],
dataRequirements: [
'Sufficient sample size (>100 recommended)',
'Multiple features for diversity',
'Can handle missing values',
],
hyperparameters: this.getRandomForestHyperparameters(),
implementationFrameworks: ['scikit-learn', 'randomForest (R)', 'H2O', 'Apache Spark'],
evaluationMetrics: ['RMSE', 'MAE', 'R²', 'Out-of-bag error', 'Feature importance'],
reasoningNotes: [
'Excellent balance of performance and interpretability',
'Robust ensemble method for most regression tasks',
'Good for feature selection via importance scores',
],
});
}
// Ridge Regression (if multicollinearity suspected)
if (featureCount > 5) {
recommendations.push({
algorithmName: 'Ridge Regression',
category: 'linear_models',
suitabilityScore: this.calculateRegularizedRegressionSuitability(task, section3Result, featureCount),
complexity: 'simple',
interpretability: 'high',
strengths: [
'Handles multicollinearity well',
'Reduces overfitting through regularization',
'Stable coefficient estimates',
'Works with more features than observations',
],
weaknesses: [
'Still assumes linear relationships',
'Requires hyperparameter tuning',
'Coefficients shrunk toward zero',
'Feature scaling required',
],
dataRequirements: [
'Linear relationships preferred',
'Feature scaling recommended',
'Can handle multicollinearity',
],
hyperparameters: [
{
parameterName: 'alpha',
description: 'Regularization strength',
defaultValue: 1.0,
recommendedRange: '0.001 to 1000 (log scale)',
tuningStrategy: 'Cross-validation grid search',
importance: 'critical',
},
],
implementationFrameworks: ['scikit-learn', 'glmnet (R)', 'statsmodels'],
evaluationMetrics: ['RMSE', 'MAE', 'R²', 'Cross-validation score'],
reasoningNotes: [
'Ideal when multicollinearity is present',
'Good regularized baseline model',
'Maintains interpretability with regularization',
],
});
}
return recommendations;
}
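// Worked example of the branching above: a task with 250 data rows and 8 input
// features yields four candidates (Linear Regression, Decision Tree, Random
// Forest because dataSize > 100 and featureCount >= 3, and Ridge Regression
// because featureCount > 5) before recommendAlgorithms() filters and ranks them.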
/**
* Generate classification algorithm recommendations
*/
generateClassificationRecommendations(task, section1Result, section3Result, section5Result) {
const recommendations = [];
const featureCount = task.inputFeatures.length;
const dataSize = section1Result.overview.structuralDimensions.totalDataRows;
const isBinary = task.targetType === 'binary';
// Logistic Regression
recommendations.push({
algorithmName: isBinary ? 'Logistic Regression' : 'Multinomial Logistic Regression',
category: 'linear_models',
suitabilityScore: this.calculateLogisticRegressionSuitability(task, section3Result),
complexity: 'simple',
interpretability: 'high',
strengths: [
'Probabilistic predictions',
'Well-understood statistical properties',
'Fast training and prediction',
'Good baseline model',
'Coefficients represent odds ratios',
],
weaknesses: [
'Assumes linear decision boundary',
'Sensitive to outliers',
'May underfit complex patterns',
'Requires feature scaling',
],
dataRequirements: [
'Independent observations',
'No extreme outliers',
'Sufficient sample size per class',
],
hyperparameters: [
{
parameterName: 'C',
description: 'Inverse regularization strength',
defaultValue: 1.0,
recommendedRange: '0.001 to 1000 (log scale)',
tuningStrategy: 'Cross-validation grid search',
importance: 'critical',
},
{
parameterName: 'penalty',
description: 'Regularization type',
defaultValue: 'l2',
recommendedRange: 'l1, l2, elasticnet',
tuningStrategy: 'Based on feature selection needs',
importance: 'important',
},
],
implementationFrameworks: ['scikit-learn', 'statsmodels', 'R (glm)', 'H2O'],
evaluationMetrics: isBinary
? ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC', 'Log-loss']
: ['Accuracy', 'Macro/Micro F1', 'Per-class Precision/Recall', 'Confusion Matrix'],
reasoningNotes: [
'Excellent interpretable baseline',
'Provides probability estimates',
'Well-suited for linear decision boundaries',
],
});
// Decision Tree Classifier (CART)
recommendations.push({
algorithmName: 'Decision Tree Classifier (CART)',
category: 'tree_based',
suitabilityScore: this.calculateTreeSuitability(task, section3Result, 'classification'),
complexity: 'moderate',
interpretability: 'high',
strengths: [
'Highly interpretable rules',
'Handles non-linear relationships',
'No distributional assumptions',
'Automatic feature selection',
'Handles mixed data types',
],
weaknesses: [
'Prone to overfitting',
'High variance',
'Biased toward features with many categories',
'Can create complex trees',
],
dataRequirements: [
'Sufficient sample size per class',
'Balanced or manageable class distribution',
],
hyperparameters: this.getTreeHyperparameters('classification'),
implementationFrameworks: ['scikit-learn', 'R (rpart)', 'C4.5', 'Weka'],
evaluationMetrics: isBinary
? ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'Tree depth']
: ['Accuracy', 'Macro F1', 'Confusion Matrix', 'Tree complexity'],
reasoningNotes: [
'Creates interpretable decision rules',
'Excellent for understanding feature interactions',
'Good foundation for ensemble methods',
],
});
// Random Forest Classifier
if (dataSize > 100) {
recommendations.push({
algorithmName: 'Random Forest Classifier',
category: 'ensemble_methods',
suitabilityScore: this.calculateEnsembleSuitability(task, section3Result, dataSize),
complexity: 'moderate',
interpretability: 'medium',
strengths: [
'High predictive accuracy',
'Handles overfitting well',
'Provides feature importance',
'Works with mixed data types',
'Handles class imbalance reasonably',
],
weaknesses: [
'Less interpretable than single trees',
'Can overfit with noisy data',
'Computationally intensive',
'Black box compared to single tree',
],
dataRequirements: ['Sufficient sample size', 'Multiple informative features'],
hyperparameters: this.getRandomForestHyperparameters(),
implementationFrameworks: ['scikit-learn', 'randomForest (R)', 'H2O'],
evaluationMetrics: isBinary
? ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC']
: ['Accuracy', 'Macro/Micro F1', 'Per-class metrics'],
reasoningNotes: [
'Excellent general-purpose classifier',
'Good balance of accuracy and interpretability',
'Robust to noise and outliers',
],
});
}
return recommendations;
}
/**
* Generate clustering algorithm recommendations
*/
generateClusteringRecommendations(task, section1Result, section3Result, section5Result) {
const recommendations = [];
const dataSize = section1Result.overview.structuralDimensions.totalDataRows;
const featureCount = task.inputFeatures.length;
// K-Means
recommendations.push({
algorithmName: 'K-Means Clustering',
category: 'unsupervised',
suitabilityScore: this.calculateKMeansSuitability(task, section3Result, dataSize),
complexity: 'simple',
interpretability: 'medium',
strengths: [
'Simple and fast algorithm',
'Works well with spherical clusters',
'Scalable to large datasets',
'Well-understood algorithm',
],
weaknesses: [
'Requires pre-specifying number of clusters',
'Assumes spherical clusters',
'Sensitive to initialization',
'Sensitive to feature scaling',
],
dataRequirements: [
'Numerical features',
'Feature scaling recommended',
'Sufficient sample size',
],
hyperparameters: [
{
parameterName: 'n_clusters',
description: 'Number of clusters',
defaultValue: 3,
recommendedRange: '2 to sqrt(n_samples)',
tuningStrategy: 'Elbow method, silhouette analysis',
importance: 'critical',
},
{
parameterName: 'init',
description: 'Initialization method',
defaultValue: 'k-means++',
recommendedRange: 'k-means++, random',
tuningStrategy: 'k-means++ usually optimal',
importance: 'important',
},
],
implementationFrameworks: ['scikit-learn', 'R (kmeans)', 'H2O', 'Spark MLlib'],
evaluationMetrics: ['Silhouette Score', 'Inertia', 'Davies-Bouldin Index'],
reasoningNotes: [
'Good starting point for clustering',
'Fast and scalable',
'Works well when clusters are roughly spherical',
],
});
// Hierarchical Clustering
if (dataSize < 1000) {
// Computational limitation
recommendations.push({
algorithmName: 'Hierarchical Clustering',
category: 'unsupervised',
suitabilityScore: this.calculateHierarchicalSuitability(task, section3Result, dataSize),
complexity: 'moderate',
interpretability: 'high',
strengths: [
'No need to pre-specify number of clusters',
'Produces dendrogram for visualization',
'Deterministic results',
'Can find nested cluster structures',
],
weaknesses: [
'Computationally expensive O(n³)',
'Sensitive to noise and outliers',
'Difficult to handle large datasets',
'Choice of linkage method affects results',
],
dataRequirements: [
'Small to medium dataset size',
'Distance/similarity metric appropriate',
],
hyperparameters: [
{
parameterName: 'linkage',
description: 'Linkage criterion',
defaultValue: 'ward',
recommendedRange: 'ward, complete, average, single',
tuningStrategy: 'Ward for euclidean distance',
importance: 'critical',
},
],
implementationFrameworks: ['scikit-learn', 'R (cluster)', 'SciPy'],
evaluationMetrics: ['Cophenetic correlation', 'Silhouette Score', 'Dendrogram quality'],
reasoningNotes: [
'Excellent for understanding cluster hierarchy',
'Visual dendrogram aids interpretation',
'Good for small to medium datasets',
],
});
}
return recommendations;
}
/**
* Generate time series forecasting recommendations
*/
generateTimeSeriesRecommendations(task, section1Result, section3Result, section5Result) {
const recommendations = [];
// ARIMA
recommendations.push({
algorithmName: 'ARIMA (AutoRegressive Integrated Moving Average)',
category: 'linear_models',
suitabilityScore: 85,
complexity: 'moderate',
interpretability: 'medium',
strengths: [
'Well-established statistical method',
'Handles trend and seasonality',
'Confidence intervals for predictions',
'Theoretically grounded',
],
weaknesses: [
'Requires stationary data',
'Model selection can be complex',
'Assumes linear relationships',
'Sensitive to outliers',
],
dataRequirements: [
'Regular time intervals',
'Sufficient historical data',
'Stationary or transformable to stationary',
],
hyperparameters: [
{
parameterName: 'p,d,q',
description: 'ARIMA order parameters',
defaultValue: '(1,1,1)',
recommendedRange: 'Determined by ACF/PACF analysis',
tuningStrategy: 'Box-Jenkins methodology',
importance: 'critical',
},
],
implementationFrameworks: ['statsmodels', 'R (forecast)', 'Prophet'],
evaluationMetrics: ['MAPE', 'RMSE', 'MAE', 'AIC/BIC'],
reasoningNotes: [
'Classic approach for time series forecasting',
'Good baseline for comparison',
'Well-suited for univariate time series',
],
});
return recommendations;
}
/**
* Generate anomaly detection recommendations
*/
generateAnomalyDetectionRecommendations(task, section1Result, section3Result, section5Result) {
const recommendations = [];
const dataSize = section1Result.overview.structuralDimensions.totalDataRows;
// Isolation Forest
recommendations.push({
algorithmName: 'Isolation Forest',
category: 'unsupervised',
suitabilityScore: 88,
complexity: 'moderate',
interpretability: 'low',
strengths: [
'Efficient for large datasets',
'No assumptions about normal data distribution',
'Works well with high-dimensional data',
'Linear time complexity',
],
weaknesses: [
'Difficult to interpret anomaly scores',
'Parameter tuning can be challenging',
'May not work well in very high dimensions',
'Limited explainability',
],
dataRequirements: [
'Numerical features',
'Sufficient normal instances',
'Feature scaling recommended',
],
hyperparameters: [
{
parameterName: 'contamination',
description: 'Expected proportion of anomalies',
defaultValue: 0.1,
recommendedRange: '0.01 to 0.5',
tuningStrategy: 'Domain knowledge or validation',
importance: 'critical',
},
],
implementationFrameworks: ['scikit-learn', 'H2O', 'PyOD'],
evaluationMetrics: ['Precision@K', 'Recall@K', 'AUC-ROC', 'Anomaly Score Distribution'],
reasoningNotes: [
'Excellent general-purpose anomaly detector',
'Scales well to large datasets',
'Good performance without labeled anomalies',
],
});
return recommendations;
}
// Suitability calculation methods
calculateLinearRegressionSuitability(task, section3Result) {
let score = 75; // Base score
// Adjust based on complexity preference
if (this.config.complexityPreference === 'simple')
score += 15;
if (this.config.interpretabilityRequirement === 'high')
score += 10;
return Math.min(100, score);
}
calculateTreeSuitability(task, section3Result, taskContext) {
let score = 80; // Base score
// Trees are great for interpretability
if (this.config.interpretabilityRequirement === 'high')
score += 15;
if (this.config.complexityPreference === 'moderate')
score += 5;
return Math.min(100, score);
}
calculateEnsembleSuitability(task, section3Result, dataSize) {
let score = 85; // Base score
// Ensembles need sufficient data
if (dataSize < 100)
score -= 20;
if (dataSize > 1000)
score += 10;
// Complexity considerations
if (this.config.complexityPreference === 'simple')
score -= 10;
if (this.config.interpretabilityRequirement === 'high')
score -= 15;
return Math.min(100, Math.max(0, score));
}
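// Worked example of the heuristic above: with 5000 rows, a 'simple' complexity
// preference and a 'high' interpretability requirement, the score is
// 85 + 10 - 10 - 15 = 70, clamped to the 0-100 range.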
calculateLogisticRegressionSuitability(task, section3Result) {
let score = 80; // Base score
if (this.config.interpretabilityRequirement === 'high')
score += 10;
if (this.config.complexityPreference === 'simple')
score += 10;
return Math.min(100, score);
}
calculateRegularizedRegressionSuitability(task, section3Result, featureCount) {
let score = 70; // Base score
// More features = more beneficial
if (featureCount > 10)
score += 15;
if (featureCount > 20)
score += 10;
return Math.min(100, score);
}
calculateKMeansSuitability(task, section3Result, dataSize) {
let score = 80;
if (dataSize > 1000)
score += 10;
if (this.config.complexityPreference === 'simple')
score += 10;
return Math.min(100, score);
}
calculateHierarchicalSuitability(task, section3Result, dataSize) {
let score = 75;
// Penalize for large datasets
if (dataSize > 500)
score -= 20;
if (this.config.interpretabilityRequirement === 'high')
score += 15;
return Math.min(100, Math.max(0, score));
}
// Hyperparameter generation methods
getTreeHyperparameters(taskType) {
return [
{
parameterName: 'max_depth',
description: 'Maximum depth of the tree',
defaultValue: null,
recommendedRange: '3 to 20, or None for unlimited',
tuningStrategy: 'Cross-validation, start with 5-10',
importance: 'critical',
},
{
parameterName: 'min_samples_split',
description: 'Minimum samples required to split node',
defaultValue: 2,
recommendedRange: '2 to 50',
tuningStrategy: 'Higher values prevent overfitting',
importance: 'important',
},
{
parameterName: 'min_samples_leaf',
description: 'Minimum samples required at leaf node',
defaultValue: 1,
recommendedRange: '1 to 20',
tuningStrategy: 'Higher values create smoother models',
importance: 'important',
},
{
parameterName: 'criterion',
description: 'Splitting criterion',
defaultValue: taskType === 'regression' ? 'squared_error' : 'gini',
recommendedRange: taskType === 'regression' ? 'squared_error, absolute_error' : 'gini, entropy',
tuningStrategy: 'Gini usually optimal for classification',
importance: 'optional',
},
];
}
getRandomForestHyperparameters() {
return [
{
parameterName: 'n_estimators',
description: 'Number of trees in the forest',
defaultValue: 100,
recommendedRange: '10 to 1000',
tuningStrategy: 'More trees generally better, diminishing returns after 100-500',
importance: 'critical',
},
{
parameterName: 'max_depth',
description: 'Maximum depth of trees',
defaultValue: null,
recommendedRange: '3 to 20, or None',
tuningStrategy: 'Cross-validation, deeper trees for complex data',
importance: 'important',
},
{
parameterName: 'min_samples_split',
description: 'Minimum samples to split node',
defaultValue: 2,
recommendedRange: '2 to 20',
tuningStrategy: 'Higher values reduce overfitting',
importance: 'important',
},
{
parameterName: 'max_features',
description: 'Features to consider for best split',
defaultValue: 'sqrt',
recommendedRange: 'sqrt, log2, None, or fraction',
tuningStrategy: 'sqrt for classification, 1/3 for regression',
importance: 'important',
},
];
}
/**
* Filter algorithm recommendations based on configuration preferences
*/
filterByPreferences(recommendations) {
let filtered = recommendations;
// Filter by complexity preference
if (this.config.complexityPreference === 'simple') {
filtered = filtered.filter(alg => alg.complexity === 'simple' || alg.complexity === 'moderate');
}
else if (this.config.complexityPreference === 'complex') {
// If no complex algorithm is present yet, add one before filtering
const hasComplex = filtered.some(alg => alg.complexity === 'complex');
if (!hasComplex) {
this.addComplexAlgorithms(filtered);
}
filtered = filtered.filter(alg => alg.complexity === 'moderate' || alg.complexity === 'complex');
}
// Filter by interpretability requirement
if (this.config.interpretabilityRequirement === 'high') {
filtered = filtered.filter(alg => alg.interpretability === 'high' || alg.interpretability === 'medium');
}
// When the interpretability requirement is not 'high', all interpretability levels are allowed (no filtering)
// Ensure we don't return empty results - if filtering removes everything, return original
return filtered.length > 0 ? filtered : recommendations;
}
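// Example of the filtering above: with complexityPreference 'simple' and
// interpretabilityRequirement 'high', Random Forest (moderate complexity,
// medium interpretability) is kept while Neural Network (complex, low
// interpretability) is dropped; if filtering removed every candidate, the
// unfiltered list is returned as a fallback.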
/**
* Add complex algorithms to the recommendations list
*/
addComplexAlgorithms(recommendations) {
// Currently adds a single complex algorithm (an MLP); the evaluation metrics listed below are regression-oriented
recommendations.push({
algorithmName: 'Neural Network (MLP)',
category: 'neural_networks',
suitabilityScore: 75,
complexity: 'complex',
interpretability: 'low',
strengths: [
'Can model complex non-linear relationships',
'Universal function approximator',
'Good performance on large datasets',
'Flexible architecture',
],
weaknesses: [
'Requires large amounts of data',
'Prone to overfitting',
'Black box - hard to interpret',
'Many hyperparameters to tune',
],
dataRequirements: [
'Large dataset (1000+ samples preferred)',
'Feature scaling essential',
'Sufficient computational resources',
],
hyperparameters: [
{
parameterName: 'hidden_layer_sizes',
description: 'Architecture of hidden layers',
defaultValue: [100],
recommendedRange: '(50,) to (200, 100, 50)',
tuningStrategy: 'Start simple, increase complexity gradually',
importance: 'critical',
},
],
implementationFrameworks: ['scikit-learn', 'TensorFlow', 'PyTorch', 'Keras'],
evaluationMetrics: ['RMSE', 'MAE', 'R²', 'Loss curves'],
reasoningNotes: [
'Complex non-linear modeling capability',
'Good for complex patterns in large datasets',
'Requires careful regularization and tuning',
],
});
}
}
exports.AlgorithmRecommender = AlgorithmRecommender;
//# sourceMappingURL=algorithm-recommender.js.map