datapilot-cli
Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform
"use strict";
/**
* Unsupervised Learning Analyzer - Enhanced unsupervised learning and synthetic target generation
* Addresses GitHub issue #22: Never return "0 modeling tasks" - always find opportunities
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.UnsupervisedAnalyzer = void 0;
class UnsupervisedAnalyzer {
/**
* Generate comprehensive unsupervised learning recommendations when no obvious targets exist
*/
async analyzeUnsupervisedOpportunities(section1Result, section2Result, section3Result, section5Result) {
// Use EDA column analysis for detailed type information, fall back to basic inventory
const edaUnivariate = section3Result.edaAnalysis?.univariateAnalysis;
let edaColumns = [];
// Handle different EDA result structures (array vs object with separate arrays)
if (Array.isArray(edaUnivariate)) {
edaColumns = edaUnivariate;
}
else if (edaUnivariate && typeof edaUnivariate === 'object') {
// Handle test mock structure with separate arrays
const numerical = edaUnivariate.numericalAnalysis || [];
const categorical = edaUnivariate.categoricalAnalysis || [];
const dateTime = edaUnivariate.dateTimeAnalysis || [];
const boolean = edaUnivariate.booleanAnalysis || [];
const text = edaUnivariate.textAnalysis || [];
edaColumns = [
...numerical.map((col) => ({ ...col, detectedDataType: 'numerical_float' })),
...categorical.map((col) => ({ ...col, detectedDataType: 'categorical' })),
...dateTime.map((col) => ({ ...col, detectedDataType: 'date_time' })),
...boolean.map((col) => ({ ...col, detectedDataType: 'boolean' })),
...text.map((col) => ({ ...col, detectedDataType: 'text_general' })),
];
}
// Fall back to basic columns if no EDA data available
if (edaColumns.length === 0) {
edaColumns = section1Result.overview.structuralDimensions.columnInventory.map(col => ({
columnName: col.name,
detectedDataType: 'categorical', // Default assumption
uniqueValues: 10, // Default assumption
}));
}
const basicColumns = section1Result.overview.structuralDimensions.columnInventory;
const qualityScores = section2Result.qualityAudit?.cockpit?.compositeScore?.score || 0.5;
const correlations = section3Result.edaAnalysis?.bivariateAnalysis?.numericalVsNumerical;
// Generate all types of recommendations
const syntheticTargets = await this.generateSyntheticTargets(edaColumns, section2Result, section3Result, section5Result);
const unsupervisedApproaches = await this.generateUnsupervisedRecommendations(edaColumns, section2Result, section3Result);
const fileSizeMB = section1Result.overview.fileDetails?.fileSizeMB || 1; // Safe fallback for missing fileDetails
const autoMLRecommendations = await this.generateAutoMLRecommendations(edaColumns, qualityScores, fileSizeMB);
const featureEngineeringRecipes = await this.generateFeatureEngineeringRecipes(edaColumns, section3Result, section5Result);
const deploymentConsiderations = await this.generateDeploymentConsiderations(edaColumns, fileSizeMB);
return {
syntheticTargets,
unsupervisedApproaches,
autoMLRecommendations,
featureEngineeringRecipes,
deploymentConsiderations,
};
}
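// Example usage (illustrative sketch; assumes the section results are the outputs
// of the corresponding upstream DataPilot section analyzers):
//   const analyzer = new UnsupervisedAnalyzer();
//   const modeling = await analyzer.analyzeUnsupervisedOpportunities(section1Result, section2Result, section3Result, section5Result);
//   modeling.syntheticTargets.forEach((t) => console.log(t.targetName, t.feasibilityScore));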
/**
* Generate synthetic target variable recommendations
*/
async generateSyntheticTargets(columns, qualityResult, edaResult, engineeringResult) {
const targets = [];
// 1. Clustering-based targets
targets.push(...this.generateClusteringBasedTargets(columns, edaResult));
// 2. Outlier-based targets
targets.push(...this.generateOutlierBasedTargets(columns, edaResult));
// 3. Composite targets from feature engineering
targets.push(...this.generateCompositeTargets(columns, edaResult));
// 4. Temporal targets (if temporal data exists)
targets.push(...this.generateTemporalTargets(columns, edaResult));
// 5. Domain-derived targets
targets.push(...this.generateDomainDerivedTargets(columns, qualityResult));
return targets.sort((a, b) => b.feasibilityScore - a.feasibilityScore);
}
/**
* Generate clustering-based synthetic targets
*/
generateClusteringBasedTargets(columns, edaResult) {
const targets = [];
// Find suitable columns for clustering
const numericalColumns = columns.filter(col => col.detectedDataType === 'numerical_float' || col.detectedDataType === 'numerical_integer');
const categoricalColumns = columns.filter(col => col.detectedDataType === 'categorical' && col.uniqueValues <= 50);
if (numericalColumns.length >= 2 || categoricalColumns.length >= 1) {
targets.push({
targetName: 'customer_segment',
targetType: 'clustering_based',
description: 'Customer segmentation based on behavioral and demographic features',
businessValue: 'Enables targeted marketing campaigns, personalized product recommendations, and customer lifetime value analysis',
technicalImplementation: 'K-Means clustering with optimal K selection using elbow method and silhouette analysis',
sourceColumns: [...numericalColumns.slice(0, 4).map(c => c.columnName), ...categoricalColumns.slice(0, 2).map(c => c.columnName)],
expectedCardinality: 5,
feasibilityScore: 85,
codeExample: this.generateClusteringCode(numericalColumns, categoricalColumns),
validationStrategy: 'Silhouette analysis, Davies-Bouldin index, business interpretation validation',
useCases: [
'Marketing campaign targeting',
'Product recommendation systems',
'Customer service optimization',
'Pricing strategy development',
],
});
}
if (categoricalColumns.some(col => col.columnName.toLowerCase().includes('country'))) {
targets.push({
targetName: 'market_tier',
targetType: 'clustering_based',
description: 'Geographic market tiers based on customer density and characteristics',
businessValue: 'Market prioritization, resource allocation, and expansion planning',
technicalImplementation: 'Hierarchical clustering of geographic regions based on customer metrics',
sourceColumns: ['country', ...numericalColumns.slice(0, 2).map(c => c.columnName)],
expectedCardinality: 3,
feasibilityScore: 75,
codeExample: this.generateGeographicClusteringCode(),
validationStrategy: 'Geographic coherence validation, business metric correlation',
useCases: [
'Market expansion strategy',
'Regional sales team allocation',
'Localized marketing campaigns',
'Supply chain optimization',
],
});
}
return targets;
}
/**
* Generate outlier-based synthetic targets
*/
generateOutlierBasedTargets(columns, edaResult) {
const targets = [];
const numericalColumns = columns.filter(col => col.detectedDataType === 'numerical_float' || col.detectedDataType === 'numerical_integer');
if (numericalColumns.length >= 2) {
targets.push({
targetName: 'anomaly_score',
targetType: 'outlier_based',
description: 'Anomaly detection score for identifying unusual records',
businessValue: 'Fraud detection, data quality monitoring, and outlier investigation',
technicalImplementation: 'Isolation Forest algorithm to assign anomaly scores to each record',
sourceColumns: numericalColumns.map(c => c.columnName),
feasibilityScore: 80,
codeExample: this.generateAnomalyDetectionCode(numericalColumns),
validationStrategy: 'Manual inspection of high-scoring anomalies, domain expert validation',
useCases: [
'Fraud detection systems',
'Data quality monitoring',
'Outlier investigation',
'Rare event detection',
],
});
targets.push({
targetName: 'data_quality_flag',
targetType: 'outlier_based',
description: 'Binary flag indicating records with potential data quality issues',
businessValue: 'Automated data quality assessment and cleaning prioritization',
technicalImplementation: 'Binary classification based on anomaly score threshold',
sourceColumns: numericalColumns.map(c => c.columnName),
expectedCardinality: 2,
feasibilityScore: 75,
codeExample: this.generateDataQualityFlagCode(numericalColumns),
validationStrategy: 'Precision/recall analysis against manually identified quality issues',
useCases: [
'Automated data cleaning',
'Data quality dashboards',
'ETL pipeline monitoring',
'Data validation workflows',
],
});
}
return targets;
}
/**
* Generate composite synthetic targets
*/
generateCompositeTargets(columns, edaResult) {
const targets = [];
// Look for email columns to create email domain quality target
const emailColumns = columns.filter(col => col.columnName.toLowerCase().includes('email'));
if (emailColumns.length > 0) {
targets.push({
targetName: 'email_domain_quality',
targetType: 'composite',
description: 'Classification of email domains as corporate vs personal',
businessValue: 'Lead quality assessment, B2B vs B2C customer segmentation',
technicalImplementation: 'Rule-based classification using domain patterns and known corporate/personal email providers',
sourceColumns: [emailColumns[0].columnName],
expectedCardinality: 2,
feasibilityScore: 90,
codeExample: this.generateEmailDomainQualityCode(emailColumns[0].columnName),
validationStrategy: 'Manual validation against known corporate domains, accuracy testing',
useCases: [
'Lead scoring systems',
'B2B marketing targeting',
'Customer acquisition cost optimization',
'Sales pipeline qualification',
],
});
}
// Look for name columns to create completeness score
const nameColumns = columns.filter(col => col.columnName.toLowerCase().includes('name') ||
col.columnName.toLowerCase().includes('first') ||
col.columnName.toLowerCase().includes('last'));
if (nameColumns.length > 0) {
targets.push({
targetName: 'profile_completeness_score',
targetType: 'composite',
description: 'Percentage score of profile completeness based on filled important fields',
businessValue: 'User engagement optimization, onboarding funnel analysis',
technicalImplementation: 'Weighted percentage of non-null values across important profile fields',
sourceColumns: columns.slice(0, 8).map(c => c.columnName),
feasibilityScore: 85,
codeExample: this.generateProfileCompletenessCode(columns),
validationStrategy: 'Correlation analysis with user engagement metrics',
useCases: [
'User onboarding optimization',
'Engagement prediction models',
'Profile completion campaigns',
'Data collection prioritization',
],
});
}
return targets;
}
/**
* Generate temporal synthetic targets
*/
generateTemporalTargets(columns, edaResult) {
const targets = [];
const dateColumns = columns.filter(col => col.detectedDataType === 'date_time' ||
col.columnName.toLowerCase().includes('date') ||
col.columnName.toLowerCase().includes('time'));
if (dateColumns.length > 0) {
targets.push({
targetName: 'customer_lifetime_days',
targetType: 'temporal',
description: 'Days since customer first interaction or subscription',
businessValue: 'Customer lifetime value analysis, churn prediction, retention modeling',
technicalImplementation: 'Calculate days between first recorded date and reference date',
sourceColumns: [dateColumns[0].columnName],
feasibilityScore: 80,
codeExample: this.generateCustomerLifetimeCode(dateColumns[0].columnName),
validationStrategy: 'Business logic validation, outlier analysis',
useCases: [
'Customer lifetime value models',
'Churn prediction systems',
'Retention campaign targeting',
'Customer journey analysis',
],
});
targets.push({
targetName: 'subscription_quarter',
targetType: 'temporal',
description: 'Quarter of year when customer subscribed (seasonal analysis)',
businessValue: 'Seasonal trend analysis, marketing campaign timing optimization',
technicalImplementation: 'Extract quarter from subscription date',
sourceColumns: [dateColumns[0].columnName],
expectedCardinality: 4,
feasibilityScore: 75,
codeExample: this.generateSeasonalityCode(dateColumns[0].columnName),
validationStrategy: 'Seasonal pattern validation, business cycle correlation',
useCases: [
'Seasonal marketing campaigns',
'Resource planning',
'Budget allocation',
'Trend analysis',
],
});
}
return targets;
}
/**
* Generate domain-derived synthetic targets
*/
generateDomainDerivedTargets(columns, qualityResult) {
const targets = [];
// Look for high-cardinality categorical columns that could be grouped
const highCardinalityColumns = columns.filter(col => col.detectedDataType === 'categorical' &&
col.uniqueValues > 20 &&
col.uniqueValues < 1000);
if (highCardinalityColumns.length > 0) {
const col = highCardinalityColumns[0];
targets.push({
targetName: `${col.columnName}_category`,
targetType: 'domain_derived',
description: `Grouped categories from high-cardinality ${col.columnName} field`,
businessValue: 'Simplify analysis by grouping related categories, enable pattern recognition',
technicalImplementation: 'Frequency-based grouping: top N categories + "Other" group',
sourceColumns: [col.columnName],
expectedCardinality: Math.min(10, Math.ceil(col.uniqueValues / 10)),
feasibilityScore: 70,
codeExample: this.generateCategoryGroupingCode(col.columnName),
validationStrategy: 'Business logic review of groupings, frequency distribution analysis',
useCases: [
'Simplified reporting',
'Category-based analysis',
'Market segmentation',
'Product grouping',
],
});
}
return targets;
}
/**
* Generate enhanced unsupervised learning recommendations
*/
async generateUnsupervisedRecommendations(columns, qualityResult, edaResult) {
const recommendations = [];
const numericalColumns = columns.filter(col => col.detectedDataType === 'numerical_float' || col.detectedDataType === 'numerical_integer');
const categoricalColumns = columns.filter(col => col.detectedDataType === 'categorical');
// 1. Clustering recommendations
recommendations.push(...this.generateClusteringRecommendations(numericalColumns, categoricalColumns));
// 2. Dimensionality reduction recommendations
if (numericalColumns.length >= 4) {
recommendations.push(...this.generateDimensionalityReductionRecommendations(numericalColumns));
}
// 3. Association rule mining (for categorical data)
if (categoricalColumns.length >= 3) {
recommendations.push(...this.generateAssociationMiningRecommendations(categoricalColumns));
}
// 4. Anomaly detection recommendations
if (numericalColumns.length >= 2) {
recommendations.push(...this.generateAnomalyDetectionRecommendations(numericalColumns));
}
return recommendations;
}
/**
* Generate clustering recommendations
*/
generateClusteringRecommendations(numericalColumns, categoricalColumns) {
const recommendations = [];
if (numericalColumns.length >= 2) {
// K-Means recommendation
recommendations.push({
approach: 'clustering',
algorithmName: 'K-Means Clustering',
description: 'Partition data into K clusters based on feature similarity',
businessValue: 'Customer segmentation, market analysis, behavioral grouping',
technicalDetails: {
inputFeatures: numericalColumns.slice(0, 6).map(c => c.columnName),
preprocessing: ['StandardScaler normalization', 'Handle missing values', 'Remove outliers'],
hyperparameters: [
{
parameterName: 'n_clusters',
description: 'Number of clusters',
defaultValue: 5,
recommendedRange: '3-8 (use elbow method)',
tuningStrategy: 'Grid search with silhouette score',
importance: 'critical',
},
{
parameterName: 'random_state',
description: 'Random seed for reproducibility',
defaultValue: 42,
recommendedRange: 'Any integer',
tuningStrategy: 'Fixed for reproducibility',
importance: 'important',
},
],
computationalComplexity: 'O(n * k * i * d) where n=samples, k=clusters, i=iterations, d=dimensions',
memoryRequirements: 'Low - scales linearly with data size',
optimalDataSize: '1K-1M records',
},
codeImplementation: this.generateKMeansImplementation(numericalColumns),
evaluationMetrics: ['Silhouette Score', 'Davies-Bouldin Index', 'Calinski-Harabasz Index', 'Inertia'],
interpretationGuidance: [
'Analyze cluster centroids to understand group characteristics',
'Profile each cluster with business metrics',
'Validate clusters make business sense',
'Check cluster stability with different random seeds',
],
scalabilityNotes: [
'Use MiniBatch K-Means for datasets > 100K records',
'Consider feature selection for high-dimensional data',
'Parallel processing available for large datasets',
],
});
// DBSCAN recommendation
recommendations.push({
approach: 'clustering',
algorithmName: 'DBSCAN',
description: 'Density-based clustering that finds clusters of varying shapes and identifies outliers',
businessValue: 'Anomaly detection, flexible cluster shapes, automatic outlier identification',
technicalDetails: {
inputFeatures: numericalColumns.slice(0, 6).map(c => c.columnName),
preprocessing: ['StandardScaler normalization', 'Handle missing values'],
hyperparameters: [
{
parameterName: 'eps',
description: 'Maximum distance between samples in neighborhood',
defaultValue: 0.5,
recommendedRange: 'Use k-distance graph to determine',
tuningStrategy: 'Grid search or k-distance analysis',
importance: 'critical',
},
{
parameterName: 'min_samples',
description: 'Minimum samples in neighborhood to form cluster',
defaultValue: 5,
recommendedRange: '2 * dimensions',
tuningStrategy: 'Start with 2*dimensions, adjust based on results',
importance: 'critical',
},
],
computationalComplexity: 'O(n log n) with spatial indexing',
memoryRequirements: 'Moderate - requires distance matrix',
optimalDataSize: '1K-100K records',
},
codeImplementation: this.generateDBSCANImplementation(numericalColumns),
evaluationMetrics: ['Silhouette Score', 'Number of clusters found', 'Noise points ratio'],
interpretationGuidance: [
'Noise points (label -1) are potential outliers',
'Clusters can have irregular shapes',
'No need to specify number of clusters in advance',
],
scalabilityNotes: [
'Memory usage increases with dataset size',
'Consider sampling for very large datasets',
'Use approximate nearest neighbor algorithms for speedup',
],
});
}
return recommendations;
}
/**
* Generate dimensionality reduction recommendations
*/
generateDimensionalityReductionRecommendations(numericalColumns) {
const recommendations = [];
// PCA recommendation
recommendations.push({
approach: 'dimensionality_reduction',
algorithmName: 'Principal Component Analysis (PCA)',
description: 'Linear dimensionality reduction using orthogonal transformation',
businessValue: 'Feature reduction, visualization, noise reduction, data compression',
technicalDetails: {
inputFeatures: numericalColumns.map(c => c.columnName),
preprocessing: ['StandardScaler normalization', 'Handle missing values'],
hyperparameters: [
{
parameterName: 'n_components',
description: 'Number of components to keep',
defaultValue: 'auto',
recommendedRange: 'Explain 95% of variance',
tuningStrategy: 'Cumulative explained variance analysis',
importance: 'critical',
},
],
computationalComplexity: 'O(n * d^2) where n=samples, d=dimensions',
memoryRequirements: 'Moderate - requires covariance matrix',
optimalDataSize: '1K-1M records',
},
codeImplementation: this.generatePCAImplementation(numericalColumns),
evaluationMetrics: ['Explained Variance Ratio', 'Cumulative Variance Explained'],
interpretationGuidance: [
'Components are linear combinations of original features',
'First component explains most variance',
'Use feature loadings to interpret components',
],
scalabilityNotes: [
'Incremental PCA available for large datasets',
'Randomized PCA for faster computation',
'Sparse PCA for interpretable components',
],
});
// t-SNE recommendation (for visualization)
if (numericalColumns.length >= 3) {
recommendations.push({
approach: 'dimensionality_reduction',
algorithmName: 't-SNE',
description: 'Non-linear dimensionality reduction for visualization',
businessValue: 'Data visualization, cluster visualization, pattern discovery',
technicalDetails: {
inputFeatures: numericalColumns.slice(0, 10).map(c => c.columnName),
preprocessing: ['StandardScaler normalization', 'Handle missing values', 'Consider PCA preprocessing'],
hyperparameters: [
{
parameterName: 'perplexity',
description: 'Balance attention between local and global aspects',
defaultValue: 30,
recommendedRange: '5-50 (depends on dataset size)',
tuningStrategy: 'Try multiple values, visualize results',
importance: 'critical',
},
{
parameterName: 'n_iter',
description: 'Number of iterations for optimization',
defaultValue: 1000,
recommendedRange: '1000-5000',
tuningStrategy: 'Increase until convergence',
importance: 'important',
},
],
computationalComplexity: 'O(n^2) - computationally expensive',
memoryRequirements: 'High - quadratic with number of samples',
optimalDataSize: '100-10K records (sample larger datasets)',
},
codeImplementation: this.generateTSNEImplementation(numericalColumns),
evaluationMetrics: ['Visual clustering quality', 'KL divergence', 'Trustworthiness'],
interpretationGuidance: [
'Use primarily for visualization, not feature extraction',
'Distances in t-SNE space may not reflect original distances',
'Multiple runs may produce different results',
],
scalabilityNotes: [
'Sample large datasets to <10K records',
'Use PCA preprocessing for high dimensions',
'Consider UMAP as faster alternative',
],
});
}
return recommendations;
}
/**
* Generate association mining recommendations
*/
generateAssociationMiningRecommendations(categoricalColumns) {
const recommendations = [];
recommendations.push({
approach: 'association_mining',
algorithmName: 'Apriori Algorithm',
description: 'Find frequent itemsets and association rules in categorical data',
businessValue: 'Market basket analysis, recommendation systems, cross-selling opportunities',
technicalDetails: {
inputFeatures: categoricalColumns.filter(c => c.uniqueValues <= 100).map(c => c.columnName),
preprocessing: ['One-hot encoding', 'Handle missing values', 'Binary transformation'],
hyperparameters: [
{
parameterName: 'min_support',
description: 'Minimum support threshold for frequent itemsets',
defaultValue: 0.01,
recommendedRange: '0.001-0.1',
tuningStrategy: 'Start high, lower until meaningful patterns emerge',
importance: 'critical',
},
{
parameterName: 'min_confidence',
description: 'Minimum confidence for association rules',
defaultValue: 0.5,
recommendedRange: '0.3-0.9',
tuningStrategy: 'Domain-specific threshold selection',
importance: 'critical',
},
],
computationalComplexity: 'Exponential in worst case, depends on data density',
memoryRequirements: 'High - stores all frequent itemsets',
optimalDataSize: '1K-100K transactions',
},
codeImplementation: this.generateAprioriImplementation(categoricalColumns),
evaluationMetrics: ['Support', 'Confidence', 'Lift', 'Conviction'],
interpretationGuidance: [
'High lift values indicate strong associations',
'Support shows frequency of itemset occurrence',
'Confidence measures rule reliability',
],
scalabilityNotes: [
'Use FP-Growth for larger datasets',
'Sample data if memory constraints exist',
'Filter rare items to improve performance',
],
});
return recommendations;
}
/**
* Generate anomaly detection recommendations
*/
generateAnomalyDetectionRecommendations(numericalColumns) {
const recommendations = [];
recommendations.push({
approach: 'anomaly_detection',
algorithmName: 'Isolation Forest',
description: 'Unsupervised anomaly detection using random forest principles',
businessValue: 'Fraud detection, quality control, outlier identification, system monitoring',
technicalDetails: {
inputFeatures: numericalColumns.map(c => c.columnName),
preprocessing: ['Handle missing values', 'Optional: StandardScaler'],
hyperparameters: [
{
parameterName: 'contamination',
description: 'Expected proportion of outliers',
defaultValue: 0.1,
recommendedRange: '0.01-0.2',
tuningStrategy: 'Domain knowledge or exploratory analysis',
importance: 'critical',
},
{
parameterName: 'n_estimators',
description: 'Number of isolation trees',
defaultValue: 100,
recommendedRange: '50-200',
tuningStrategy: 'Balance performance vs accuracy',
importance: 'important',
},
],
computationalComplexity: 'O(n log n) for training, O(log n) for prediction',
memoryRequirements: 'Low - tree-based algorithm',
optimalDataSize: '1K-1M records',
},
codeImplementation: this.generateIsolationForestImplementation(numericalColumns),
evaluationMetrics: ['Anomaly Score', 'Precision', 'Recall', 'F1-Score (if labels available)'],
interpretationGuidance: [
'Negative scores indicate anomalies',
'Score magnitude indicates anomaly strength',
'Validate results with domain expertise',
],
scalabilityNotes: [
'Scales well to large datasets',
'Parallel training available',
'Real-time scoring possible',
],
});
return recommendations;
}
/**
* Generate AutoML platform recommendations
*/
async generateAutoMLRecommendations(columns, qualityScore, fileSizeMB) {
const recommendations = [];
const numericalColumns = columns.filter(col => col.detectedDataType === 'numerical_float' || col.detectedDataType === 'numerical_integer');
const categoricalColumns = columns.filter(col => col.detectedDataType === 'categorical');
const highCardinalityColumns = categoricalColumns.filter(col => col.uniqueValues > 50);
// H2O AutoML recommendation
recommendations.push({
platform: 'H2O_AutoML',
suitabilityScore: this.calculateH2OSuitability(numericalColumns, categoricalColumns, fileSizeMB),
strengths: [
'Excellent handling of mixed data types',
'Automatic feature engineering',
'Built-in model interpretation',
'Scalable to large datasets',
'Free and open source',
],
limitations: [
'Requires Java runtime',
'Learning curve for beginners',
'Limited deep learning options',
],
dataRequirements: [
'Minimum 1000 rows recommended',
'Handles missing values automatically',
'Automatic encoding of categorical variables',
],
estimatedCost: 'Free (open source)',
setupComplexity: 'moderate',
codeExample: this.generateH2OAutoMLCode(columns),
configurationRecommendations: {
max_models: fileSizeMB > 100 ? 10 : 20,
seed: 42,
exclude_algos: highCardinalityColumns.length > 0 ? ['DeepLearning'] : [],
max_runtime_secs: fileSizeMB > 50 ? 7200 : 3600,
stopping_metric: 'AUTO',
stopping_tolerance: 0.001,
},
});
// AutoGluon recommendation
recommendations.push({
platform: 'AutoGluon',
suitabilityScore: this.calculateAutoGluonSuitability(numericalColumns, categoricalColumns, fileSizeMB),
strengths: [
'State-of-the-art ensemble methods',
'Excellent text feature handling',
'Multi-modal learning capabilities',
'Neural network options',
'Easy to use Python API',
],
limitations: [
'Higher computational requirements',
'Longer training times',
'Memory intensive',
],
dataRequirements: [
'Works well with smaller datasets',
'Automatic feature preprocessing',
'Handles text and categorical data excellently',
],
estimatedCost: 'Free (open source)',
setupComplexity: 'simple',
codeExample: this.generateAutoGluonCode(columns),
configurationRecommendations: {
presets: fileSizeMB > 50 ? 'medium_quality_faster_train' : 'best_quality',
time_limit: fileSizeMB > 50 ? 3600 : 7200,
eval_metric: 'auto',
auto_stack: true,
},
});
return recommendations.sort((a, b) => b.suitabilityScore - a.suitabilityScore);
}
/**
* Generate feature engineering recipes
*/
async generateFeatureEngineeringRecipes(columns, edaResult, engineeringResult) {
const recipes = [];
// Temporal feature engineering
const dateColumns = columns.filter(col => col.detectedDataType === 'date_time' ||
col.columnName.toLowerCase().includes('date'));
if (dateColumns.length > 0) {
recipes.push({
recipeName: 'Temporal Feature Extraction',
description: 'Extract meaningful time-based features from date columns',
applicableColumns: dateColumns.map(c => c.columnName),
businessRationale: 'Time-based patterns often drive business outcomes (seasonality, trends, cycles)',
codeImplementation: this.generateTemporalFeatureCode(dateColumns),
expectedImpact: 'Improved model performance through temporal pattern recognition',
prerequisites: ['Valid date format', 'Reasonable date range'],
riskFactors: ['Time zone considerations', 'Missing date handling'],
});
}
// Geographic feature engineering
const countryColumns = columns.filter(col => col.columnName.toLowerCase().includes('country') ||
col.columnName.toLowerCase().includes('region'));
if (countryColumns.length > 0) {
recipes.push({
recipeName: 'Geographic Feature Enrichment',
description: 'Create derived geographic features from location data',
applicableColumns: countryColumns.map(c => c.columnName),
businessRationale: 'Geographic patterns influence customer behavior, market dynamics, and operational efficiency',
codeImplementation: this.generateGeographicFeatureCode(countryColumns),
expectedImpact: 'Enhanced geographic insights and regional pattern recognition',
prerequisites: ['Standardized country/region names', 'Geographic reference data'],
riskFactors: ['Data quality in geographic fields', 'Changing geographic boundaries'],
});
}
// Text feature engineering
const textColumns = columns.filter(col => col.detectedDataType === 'text_general' || col.detectedDataType === 'text_address' ||
(col.detectedDataType === 'categorical' && col.uniqueValues > 100));
if (textColumns.length > 0) {
recipes.push({
recipeName: 'Text Feature Engineering',
description: 'Extract patterns and features from high-cardinality text columns',
applicableColumns: textColumns.map(c => c.columnName),
businessRationale: 'Text data contains rich patterns that can improve predictive accuracy',
codeImplementation: this.generateTextFeatureCode(textColumns),
expectedImpact: 'Better handling of unstructured text information',
prerequisites: ['Clean text data', 'Consistent formatting'],
riskFactors: ['High dimensionality', 'Overfitting risk'],
});
}
return recipes;
}
/**
* Generate deployment considerations
*/
async generateDeploymentConsiderations(columns, fileSizeMB) {
const considerations = [];
// Data pipeline considerations
considerations.push({
aspect: 'data_pipeline',
requirements: [
'Real-time preprocessing for all input features',
'Encoding dictionaries for categorical variables',
'Missing value imputation strategies',
'Data validation and quality checks',
],
recommendations: [
'Use pipeline objects for consistent preprocessing',
'Version control preprocessing steps',
'Implement data quality monitoring',
'Cache frequently used transformations',
],
riskFactors: [
'Data drift affecting preprocessing',
'Missing values in production data',
'Categorical values not seen in training',
],
codeTemplates: this.generateDataPipelineTemplates(columns),
});
// Monitoring considerations
considerations.push({
aspect: 'monitoring',
requirements: [
`Monitor distribution drift for ${columns.length} features`,
'Track prediction confidence scores',
'Alert on unusual input patterns',
'Performance metric tracking',
],
recommendations: [
'Implement statistical drift detection',
'Set up automated retraining triggers',
'Monitor model performance degradation',
'Log all predictions for audit trail',
],
riskFactors: [
'Concept drift affecting model accuracy',
'Data quality degradation over time',
'Unexpected input combinations',
],
});
// API schema considerations
considerations.push({
aspect: 'api_schema',
requirements: [
'Input validation for all features',
'Standardized response format',
'Error handling for invalid inputs',
'Documentation and examples',
],
recommendations: [
'Use JSON schema validation',
'Provide clear error messages',
'Include confidence scores in responses',
'Support batch and single predictions',
],
riskFactors: [
'Breaking changes in API schema',
'Performance bottlenecks',
'Security vulnerabilities',
],
codeTemplates: this.generateAPISchemaTemplates(columns),
});
return considerations;
}
// Helper methods for code generation
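/**
 * Build a Python snippet that encodes categorical features, scales the combined feature set, and derives a K-Means-based customer_segment column.
 */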
generateClusteringCode(numericalColumns, categoricalColumns) {
return `from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import silhouette_score
import pandas as pd
# Prepare features
numerical_features = ${JSON.stringify(numericalColumns.slice(0, 4).map(c => c.columnName))}
categorical_features = ${JSON.stringify(categoricalColumns.slice(0, 2).map(c => c.columnName))}
# Encode categorical variables
le = LabelEncoder()
df_encoded = df.copy()
for col in categorical_features:
    df_encoded[col + '_encoded'] = le.fit_transform(df[col].astype(str))
# Combine numerical features with the encoded categorical features
features = numerical_features + [col + '_encoded' for col in categorical_features]
# Scale features
scaler = StandardScaler()
X = scaler.fit_transform(df_encoded[features])
# Find optimal number of clusters
silhouette_scores = []
K_range = range(2, 9)
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    cluster_labels = kmeans.fit_predict(X)
    silhouette_avg = silhouette_score(X, cluster_labels)
    silhouette_scores.append(silhouette_avg)
# Train final model with optimal K
optimal_k = K_range[silhouette_scores.index(max(silhouette_scores))]
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
df['customer_segment'] = kmeans.fit_predict(X)`;
}
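/**
 * Build a Python snippet that aggregates customer metrics by country and assigns market tiers via hierarchical clustering.
 */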
generateGeographicClusteringCode() {
return `import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
# Aggregate metrics by country
country_metrics = df.groupby('country').agg({
    'customer_id': 'count',
    'company': 'nunique'
}).rename(columns={'customer_id': 'customer_count', 'company': 'company_count'})
# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(country_metrics)
# Hierarchical clustering
clustering = AgglomerativeClustering(n_clusters=3, linkage='ward')
country_metrics['market_tier'] = clustering.fit_predict(X_scaled)
# Map back to original data
df = df.merge(country_metrics[['market_tier']], left_on='country', right_index=True)`;
}
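/**
 * Build a Python snippet that fits an Isolation Forest and adds raw and normalized anomaly score columns.
 */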
generateAnomalyDetectionCode(numericalColumns) {
return `from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
# Prepare numerical features
features = ${JSON.stringify(numericalColumns.map(c => c.columnName))}
X = df[features].fillna(df[features].median())
# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Fit Isolation Forest
iso_forest = IsolationForest(contamination=0.1, random_state=42)
df['anomaly_score'] = iso_forest.fit_predict(X_scaled)
df['anomaly_score_raw'] = iso_forest.decision_function(X_scaled)
# Convert to a 0-1 scale where higher = more anomalous (decision_function returns lower scores for anomalies)
df['anomaly_score_normalized'] = (df['anomaly_score_raw'].max() - df['anomaly_score_raw']) / (df['anomaly_score_raw'].max() - df['anomaly_score_raw'].min())`;
}
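/**
 * Build a Python snippet that derives a binary data quality flag from Isolation Forest anomaly scores.
 */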
generateDataQualityFlagCode(numericalColumns) {
return `from sklearn.ensemble import IsolationForest
import numpy as np
# Use anomaly detection for data quality assessment
features = ${JSON.stringify(numericalColumns.map(c => c.columnName))}
X = df[features].fillna(df[features].median())
iso_forest = IsolationForest(contamination=0.05, random_state=42)
iso_forest.fit(X)
anomaly_scores = iso_forest.decision_function(X)
# Create binary quality flag (1 = good quality, 0 = potential issues)
threshold = np.percentile(anomaly_scores, 5) # Bottom 5% flagged as quality issues
df['data_quality_flag'] = (anomaly_scores > threshold).astype(int)`;
}
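/**
 * Build a Python snippet that classifies email addresses as personal vs corporate based on their domain.
 */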
generateEmailDomainQualityCode(emailColumn) {
return `import re
import pandas as pd
def classify_email_domain(email):
    if pd.isna(email):
        return 'unknown'
    # Extract domain
    domain_match = re.search(r'@([^.]+\\.[^.]+)$', str(email).lower())
    if not domain_match:
        return 'invalid'
    domain = domain_match.group(1)
    # Personal email providers
    personal_domains = {
        'gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com',
        'aol.com', 'icloud.com', 'live.com', 'msn.com'
    }
    if domain in personal_domains:
        return 'personal'
    else:
        return 'corporate'
df['email_domain_quality'] = df['${emailColumn}'].apply(classify_email_domain)
df['is_corporate_email'] = (df['email_domain_quality'] == 'corporate').astype(int)`;
}
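/**
 * Build a Python snippet that computes a weighted profile completeness percentage across important fields.
 */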
generateProfileCompletenessCode(columns) {
const importantColumns = columns.slice(0, 8).map(c => c.columnName);
return `# Define important fields and their weights
important_fields = ${JSON.stringify(importantColumns)}
field_weights = {field: 1.0 for field in important_fields}
def calculate_completeness_score(row):
    total_weight = sum(field_weights.values())
    weighted_score = 0
    for field, weight in field_weights.items():
        if pd.notna(row[field]) and str(row[field]).strip() != '':
            weighted_score += weight
    return (weighted_score / total_weight) * 100
df['profile_completeness_score'] = df.apply(calculate_completeness_score, axis=1)`;
}
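/**
 * Build a Python snippet that computes customer lifetime in days from the given date column.
 */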
generateCustomerLifetimeCode(dateColumn) {
return `import pandas as pd
from datetime import datetime
# Convert to datetime
df['${dateColumn}'] = pd.to_datetime(df['${dateColumn}'])
# Calculate customer lifetime in days
reference_date = pd.Timestamp.now()
df['customer_lifetime_days'] = (reference_date - df['${dateColumn}']).dt.days
# Handle negative values (future dates)
df['customer_lifetime_days'] = df['customer_lifetime_days'].clip(lower=0)`;
}
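/**
 * Build a Python snippet that extracts quarter, month, day-of-week, and weekend-signup features from the given date column.
 */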
generateSeasonalityCode(dateColumn) {
return `# Extract temporal features
df['${dateColumn}'] = pd.to_datetime(df['${dateColumn}'])
df['subscription_quarter'] = df['${dateColumn}'].dt.quarter
df['subscription_month'] = df['${dateColumn}'].dt.month
df['subscription_dayofweek'] = df['${dateColumn}'].dt.dayofweek
df['is_weekend_signup'] = df['subscription_dayofweek'].isin([5, 6]).astype(int)`;
}
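/**
 * Build a Python snippet that groups a high-cardinality categorical column into its top-N categories plus an "Other" bucket.
 */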
generateCategoryGroupingCode(columnName) {
return `# Group high-cardinality categories
def group_categories(series, top_n=10):
    value_counts = series.value_counts()
    top_categories = value_counts.head(top_n).index.tolist()
    def categorize(value):
        if pd.isna(value):
            return 'Missing'
        elif value in top_categories:
            return value
        else:
            return 'Other'
    return series.apply(categorize)
df['${columnName}_category'] = group_categories(df['${columnName}'], top_n=8)`;
}
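/**
 * Assemble the scikit-learn K-Means implementation (imports, preprocessing, and elbow/silhouette-based cluster selection).
 */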
generateKMeansImplementation(numericalColumns) {
return {
framework: 'scikit-learn',
importStatements: [
'from sklearn.cluster import KMeans',
'from sklearn.preprocessing import StandardScaler',
'from sklearn.metrics import silhouette_score',
'import numpy as np',
'import pandas as pd',
'import matplotlib.pyplot as plt'
],
preprocessingCode: [
`features = ${JSON.stringify(numericalColumns.slice(0, 6).map(c => c.columnName))}`,
'X = df[features].fillna(df[features].median())',
'scaler = StandardScaler()',
'X_scaled = scaler.fit_transform(X)'
],
mainImplementation: [
'# Find optimal number of clusters using elbow method',
'inertias = []',
'silhouette_scores = []',
'K_range = range(2, 11)',
'',
'for k in K_range:',
' kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)',
' cluster_labels = kmean