datapilot-cli
Version:
Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform
1,213 lines (1,212 loc) • 70.7 kB
JavaScript
"use strict";
/**
* Domain-Aware Visualization Intelligence Engine
*
* Advanced engine that understands data context and domain to provide:
* - Automatic domain detection and classification
* - Context-sensitive visualization recommendations
* - Domain-specific insight generation
* - Stakeholder-appropriate view customization
* - Semantic understanding of data relationships
* - Industry best practices integration
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.DomainAwareIntelligence = void 0;
/**
* Domain-Aware Visualization Intelligence Engine
*/
class DomainAwareIntelligence {
/**
* Analyze dataset and determine domain context
*/
static analyzeDomainContext(columnNames, dataCharacteristics, sampleValues) {
// Detect primary domain
const primaryDomain = this.detectPrimaryDomain(columnNames, dataCharacteristics, sampleValues);
// Identify subdomains
const subdomains = this.identifySubdomains(columnNames, primaryDomain, dataCharacteristics);
// Extract context clues
const contextClues = this.extractContextClues(columnNames, dataCharacteristics, sampleValues);
// Define stakeholder profiles
const stakeholders = this.defineStakeholderProfiles(primaryDomain, subdomains);
// Build domain knowledge base
const domainKnowledge = this.buildDomainKnowledge(primaryDomain, subdomains);
// Create visualization strategy
const visualizationStrategy = this.createVisualizationStrategy(primaryDomain, stakeholders, domainKnowledge);
// Generate domain-specific insights
const insights = this.generateDomainInsights(primaryDomain, columnNames, dataCharacteristics);
const confidence = this.calculateOverallConfidence(primaryDomain, contextClues, subdomains);
return {
primaryDomain,
confidence,
subdomains,
contextClues,
stakeholders,
domainKnowledge,
visualizationStrategy,
insights,
};
}
/**
* Detect primary domain based on data characteristics
*/
static detectPrimaryDomain(columnNames, dataCharacteristics, sampleValues) {
const domainScores = new Map();
const domainReasons = new Map();
// Education domain detection
const educationScore = this.scoreEducationDomain(columnNames, dataCharacteristics, sampleValues);
domainScores.set('education', educationScore.score);
domainReasons.set('education', educationScore.reasons);
// Healthcare domain detection
const healthcareScore = this.scoreHealthcareDomain(columnNames, dataCharacteristics, sampleValues);
domainScores.set('healthcare', healthcareScore.score);
domainReasons.set('healthcare', healthcareScore.reasons);
// Finance domain detection
const financeScore = this.scoreFinanceDomain(columnNames, dataCharacteristics, sampleValues);
domainScores.set('finance', financeScore.score);
domainReasons.set('finance', financeScore.reasons);
// Marketing domain detection
const marketingScore = this.scoreMarketingDomain(columnNames, dataCharacteristics, sampleValues);
domainScores.set('marketing', marketingScore.score);
domainReasons.set('marketing', marketingScore.reasons);
// Operations domain detection
const operationsScore = this.scoreOperationsDomain(columnNames, dataCharacteristics, sampleValues);
domainScores.set('operations', operationsScore.score);
domainReasons.set('operations', operationsScore.reasons);
// HR domain detection
const hrScore = this.scoreHRDomain(columnNames, dataCharacteristics, sampleValues);
domainScores.set('hr', hrScore.score);
domainReasons.set('hr', hrScore.reasons);
// Find highest scoring domain
let bestDomain = 'generic';
let bestScore = 0;
let bestReasons = [];
for (const [domain, score] of domainScores) {
if (score > bestScore) {
bestScore = score;
bestDomain = domain;
bestReasons = domainReasons.get(domain) || [];
}
}
// If no domain scores well, default to generic
if (bestScore < 0.3) {
bestDomain = 'generic';
bestReasons = ['No strong domain indicators found'];
}
const domainMap = {
education: this.createEducationDomain(bestScore, bestReasons),
healthcare: this.createHealthcareDomain(bestScore, bestReasons),
finance: this.createFinanceDomain(bestScore, bestReasons),
marketing: this.createMarketingDomain(bestScore, bestReasons),
operations: this.createOperationsDomain(bestScore, bestReasons),
hr: this.createHRDomain(bestScore, bestReasons),
generic: this.createGenericDomain(bestScore, bestReasons),
};
return domainMap[bestDomain];
}
/**
* Score likelihood of education domain
*/
static scoreEducationDomain(columnNames, dataCharacteristics, sampleValues) {
let score = 0;
const reasons = [];
const educationKeywords = [
'student',
'grade',
'score',
'exam',
'test',
'assignment',
'course',
'class',
'attendance',
'gpa',
'academic',
'school',
'university',
'college',
'study',
'education',
'learning',
'subject',
'teacher',
'professor',
'semester',
'transcript',
'performance',
'achievement',
'curriculum',
'enrollment',
];
const educationMetrics = [
'hours_studied',
'study_time',
'homework',
'extracurricular',
'sleep_hours',
'social_media_hours',
'screen_time',
'mental_health',
'stress_level',
'parental_education',
'family_income',
'school_type',
];
// Check column names for education keywords
for (const column of columnNames) {
const columnLower = column.toLowerCase();
for (const keyword of educationKeywords) {
if (columnLower.includes(keyword)) {
score += 0.15;
reasons.push(`Column '${column}' contains education keyword '${keyword}'`);
break;
}
}
for (const metric of educationMetrics) {
if (columnLower.includes(metric)) {
score += 0.1;
reasons.push(`Column '${column}' matches education metric pattern '${metric}'`);
break;
}
}
}
// Check for typical education patterns
const hasStudentId = columnNames.some((col) => col.toLowerCase().includes('student') && col.toLowerCase().includes('id'));
if (hasStudentId) {
score += 0.2;
reasons.push('Student identifier column detected');
}
const hasPerformanceMetric = columnNames.some((col) => col.toLowerCase().includes('score') ||
col.toLowerCase().includes('grade') ||
col.toLowerCase().includes('performance'));
if (hasPerformanceMetric) {
score += 0.15;
reasons.push('Academic performance metric detected');
}
const hasLifestyleFactors = columnNames.some((col) => col.toLowerCase().includes('sleep') ||
col.toLowerCase().includes('social') ||
col.toLowerCase().includes('exercise'));
if (hasLifestyleFactors) {
score += 0.1;
reasons.push('Lifestyle factors affecting academic performance detected');
}
return { score: Math.min(score, 1.0), reasons };
}
/**
* Score likelihood of healthcare domain
*/
static scoreHealthcareDomain(columnNames, dataCharacteristics, sampleValues) {
let score = 0;
const reasons = [];
const healthcareKeywords = [
'patient',
'diagnosis',
'treatment',
'medication',
'dose',
'doctor',
'nurse',
'hospital',
'clinic',
'medical',
'health',
'disease',
'symptom',
'vital',
'blood',
'pressure',
'heart',
'weight',
'bmi',
'temperature',
'lab',
'test',
];
for (const column of columnNames) {
const columnLower = column.toLowerCase();
for (const keyword of healthcareKeywords) {
if (columnLower.includes(keyword)) {
score += 0.2;
reasons.push(`Healthcare keyword '${keyword}' found in column '${column}'`);
break;
}
}
}
return { score: Math.min(score, 1.0), reasons };
}
/**
* Score likelihood of finance domain
*/
static scoreFinanceDomain(columnNames, dataCharacteristics, sampleValues) {
let score = 0;
const reasons = [];
const financeKeywords = [
'amount',
'balance',
'transaction',
'payment',
'revenue',
'profit',
'loss',
'price',
'cost',
'expense',
'income',
'salary',
'budget',
'investment',
'portfolio',
'stock',
'bond',
'market',
'currency',
'exchange',
'rate',
];
for (const column of columnNames) {
const columnLower = column.toLowerCase();
for (const keyword of financeKeywords) {
if (columnLower.includes(keyword)) {
score += 0.15;
reasons.push(`Financial keyword '${keyword}' found in column '${column}'`);
break;
}
}
}
return { score: Math.min(score, 1.0), reasons };
}
/**
* Score likelihood of marketing domain
*/
static scoreMarketingDomain(columnNames, dataCharacteristics, sampleValues) {
let score = 0;
const reasons = [];
const marketingKeywords = [
'campaign',
'click',
'impression',
'conversion',
'customer',
'lead',
'funnel',
'engagement',
'reach',
'audience',
'segment',
'target',
'acquisition',
'retention',
'churn',
'lifetime_value',
'roas',
'roi',
'ctr',
'cpm',
'cpc',
];
for (const column of columnNames) {
const columnLower = column.toLowerCase();
for (const keyword of marketingKeywords) {
if (columnLower.includes(keyword)) {
score += 0.15;
reasons.push(`Marketing keyword '${keyword}' found in column '${column}'`);
break;
}
}
}
return { score: Math.min(score, 1.0), reasons };
}
/**
* Score likelihood of operations domain
*/
static scoreOperationsDomain(columnNames, dataCharacteristics, sampleValues) {
let score = 0;
const reasons = [];
const operationsKeywords = [
'production',
'manufacturing',
'inventory',
'supply',
'demand',
'capacity',
'efficiency',
'throughput',
'cycle_time',
'lead_time',
'quality',
'defect',
'yield',
'downtime',
'maintenance',
'schedule',
'resource',
'utilization',
];
for (const column of columnNames) {
const columnLower = column.toLowerCase();
for (const keyword of operationsKeywords) {
if (columnLower.includes(keyword)) {
score += 0.15;
reasons.push(`Operations keyword '${keyword}' found in column '${column}'`);
break;
}
}
}
return { score: Math.min(score, 1.0), reasons };
}
/**
* Score likelihood of HR domain
*/
static scoreHRDomain(columnNames, dataCharacteristics, sampleValues) {
let score = 0;
const reasons = [];
const hrKeywords = [
'employee',
'staff',
'hire',
'termination',
'performance',
'review',
'rating',
'satisfaction',
'engagement',
'training',
'development',
'promotion',
'department',
'manager',
'team',
'skill',
'competency',
'compensation',
'benefits',
'leave',
];
for (const column of columnNames) {
const columnLower = column.toLowerCase();
for (const keyword of hrKeywords) {
if (columnLower.includes(keyword)) {
score += 0.15;
reasons.push(`HR keyword '${keyword}' found in column '${column}'`);
break;
}
}
}
return { score: Math.min(score, 1.0), reasons };
}
/**
* Create education domain definition
*/
static createEducationDomain(confidence, reasons) {
return {
domain: 'education',
confidence,
reasoning: reasons.join('; '),
characteristics: [
{
characteristic: 'Performance-Outcome Relationships',
description: 'Clear relationships between inputs (study habits, lifestyle) and outcomes (grades, performance)',
visualizationImplications: [
'Use scatter plots to show correlations',
'Dashboard showing performance drivers',
],
commonMistakes: ['Confusing correlation with causation', 'Ignoring external factors'],
},
{
characteristic: 'Multi-Stakeholder Perspectives',
description: 'Different stakeholders need different views (students, teachers, administrators)',
visualizationImplications: ['Role-based dashboards', 'Privacy-sensitive displays'],
commonMistakes: ['One-size-fits-all visualizations', 'Exposing sensitive student data'],
},
{
characteristic: 'Temporal Academic Cycles',
description: 'Data follows academic calendar patterns (semesters, terms, school years)',
visualizationImplications: [
'Academic calendar-aware time series',
'Semester comparison charts',
],
commonMistakes: [
'Using fiscal calendar instead of academic',
'Ignoring seasonal variations',
],
},
],
typicalVariables: [
'student_id',
'grade',
'score',
'attendance',
'study_hours',
'extracurricular',
'parental_education',
'socioeconomic_factors',
'learning_style',
'subject_performance',
],
expectedRelationships: [
{
variables: ['study_hours', 'exam_score'],
relationship: 'causal',
strength: 'strong',
domain_specific_meaning: 'Study time directly impacts academic performance through learning consolidation',
},
{
variables: ['attendance', 'performance'],
relationship: 'causal',
strength: 'strong',
domain_specific_meaning: 'Class attendance provides direct learning opportunities affecting performance',
},
{
variables: ['sleep_hours', 'mental_health', 'performance'],
relationship: 'causal',
strength: 'moderate',
domain_specific_meaning: 'Sleep affects cognitive function and mental health, which impact learning capacity',
},
],
};
}
// Helper methods to create other domain definitions
static createHealthcareDomain(confidence, reasons) {
return {
domain: 'healthcare',
confidence,
reasoning: reasons.join('; '),
characteristics: [
{
characteristic: 'Patient Privacy and Ethics',
description: 'Strict privacy requirements and ethical considerations for patient data',
visualizationImplications: [
'Anonymized displays',
'Aggregate-only visualizations',
'HIPAA-compliant dashboards',
],
commonMistakes: [
'Exposing patient identifiers',
'Insufficient data anonymization',
'Unauthorized data sharing',
],
},
{
characteristic: 'Clinical Decision Support',
description: 'Visualizations must support evidence-based clinical decision making',
visualizationImplications: [
'Risk stratification charts',
'Outcome prediction displays',
'Treatment effectiveness comparisons',
],
commonMistakes: [
'Presenting correlation as causation',
'Ignoring clinical context',
'Overcomplicating critical displays',
],
},
{
characteristic: 'Temporal Health Patterns',
description: 'Health data often shows temporal patterns requiring longitudinal analysis',
visualizationImplications: [
'Patient timeline views',
'Trend analysis for vital signs',
'Disease progression tracking',
],
commonMistakes: [
'Missing critical time intervals',
'Inappropriate aggregation periods',
'Ignoring seasonal health patterns',
],
},
{
characteristic: 'Multi-Modal Data Integration',
description: 'Healthcare data comes from diverse sources requiring unified presentation',
visualizationImplications: [
'Integrated patient dashboards',
'Cross-system data correlation',
'Multi-source validation displays',
],
commonMistakes: [
'Data source inconsistencies',
'Conflicting measurement units',
'Missing data context',
],
},
],
typicalVariables: [
'patient_id',
'diagnosis',
'treatment',
'vital_signs',
'lab_results',
'medication',
'dosage',
'blood_pressure',
'heart_rate',
'temperature',
'bmi',
'age',
'gender',
'admission_date',
'discharge_date',
'length_of_stay',
'readmission',
'outcome',
'comorbidities',
'allergies',
'medical_history',
'provider_id',
'facility',
],
expectedRelationships: [
{
variables: ['medication', 'dosage', 'outcome'],
relationship: 'causal',
strength: 'strong',
domain_specific_meaning: 'Medication type and dosage directly affect patient outcomes through pharmacological mechanisms',
},
{
variables: ['vital_signs', 'severity', 'length_of_stay'],
relationship: 'correlated',
strength: 'strong',
domain_specific_meaning: 'Vital sign abnormalities correlate with disease severity and required care duration',
},
{
variables: ['age', 'comorbidities', 'readmission_risk'],
relationship: 'causal',
strength: 'moderate',
domain_specific_meaning: 'Age and existing conditions increase complexity of care and readmission probability',
},
{
variables: ['lab_results', 'diagnosis', 'treatment_plan'],
relationship: 'sequential',
strength: 'strong',
domain_specific_meaning: 'Laboratory findings inform diagnostic decisions which determine treatment protocols',
},
],
};
}
static createFinanceDomain(confidence, reasons) {
return {
domain: 'finance',
confidence,
reasoning: reasons.join('; '),
characteristics: [
{
characteristic: 'Regulatory Compliance',
description: 'Financial data visualization must comply with regulatory requirements',
visualizationImplications: [
'Audit trail capabilities',
'Standardized reporting formats',
'SOX-compliant controls',
],
commonMistakes: [
'Non-compliant reporting',
'Missing audit capabilities',
'Inadequate access controls',
],
},
{
characteristic: 'Risk Management Focus',
description: 'Financial visualizations must highlight risk factors and exposure levels',
visualizationImplications: [
'Risk heat maps',
'Variance analysis charts',
'Stress testing scenarios',
],
commonMistakes: [
'Understating risk exposure',
'Missing risk correlations',
'Inadequate scenario analysis',
],
},
{
characteristic: 'Temporal Financial Cycles',
description: 'Financial data follows reporting cycles, fiscal periods, and market rhythms',
visualizationImplications: [
'Fiscal calendar alignment',
'Period-over-period comparisons',
'Seasonal adjustment displays',
],
commonMistakes: [
'Misaligned reporting periods',
'Ignoring seasonality',
'Inappropriate comparison timeframes',
],
},
{
characteristic: 'Multi-Currency and Scale Complexity',
description: 'Financial data often involves multiple currencies and vastly different scales',
visualizationImplications: [
'Currency conversion displays',
'Logarithmic scales for wide ranges',
'Normalized comparison views',
],
commonMistakes: [
'Currency confusion',
'Scale distortion',
'Missing exchange rate context',
],
},
],
typicalVariables: [
'amount',
'transaction_id',
'account',
'balance',
'revenue',
'expense',
'profit',
'loss',
'cash_flow',
'assets',
'liabilities',
'equity',
'roi',
'margin',
'ebitda',
'transaction_date',
'settlement_date',
'currency',
'exchange_rate',
'cost_center',
'budget',
'forecast',
'variance',
'risk_rating',
'counterparty',
'instrument_type',
],
expectedRelationships: [
{
variables: ['revenue', 'expenses', 'profit'],
relationship: 'causal',
strength: 'strong',
domain_specific_meaning: 'Revenue minus expenses equals profit through fundamental accounting identity',
},
{
variables: ['interest_rates', 'bond_prices', 'portfolio_value'],
relationship: 'causal',
strength: 'strong',
domain_specific_meaning: 'Interest rate changes inversely affect bond prices and portfolio valuations',
},
{
variables: ['market_volatility', 'risk_premium', 'investment_returns'],
relationship: 'correlated',
strength: 'moderate',
domain_specific_meaning: 'Higher market volatility typically correlates with increased risk premiums and variable returns',
},
{
variables: ['cash_flow', 'liquidity', 'operational_efficiency'],
relationship: 'sequential',
strength: 'strong',
domain_specific_meaning: 'Cash flow patterns indicate liquidity health which affects operational capacity',
},
],
};
}
static createMarketingDomain(confidence, reasons) {
return {
domain: 'marketing',
confidence,
reasoning: reasons.join('; '),
characteristics: [
{
characteristic: 'Attribution and Customer Journey',
description: 'Marketing data requires complex attribution modeling across multiple touchpoints',
visualizationImplications: [
'Multi-touch attribution charts',
'Customer journey flow diagrams',
'Funnel conversion analysis',
],
commonMistakes: [
'Single-touch attribution bias',
'Missing journey context',
'Oversimplified funnel models',
],
},
{
characteristic: 'Real-Time Campaign Optimization',
description: 'Marketing campaigns require real-time monitoring and rapid optimization',
visualizationImplications: [
'Live performance dashboards',
'Alert-based monitoring',
'A/B test result displays',
],
commonMistakes: [
'Delayed reaction to poor performance',
'Statistical significance confusion',
'Optimization without context',
],
},
{
characteristic: 'Audience Segmentation Complexity',
description: 'Marketing effectiveness varies dramatically across different audience segments',
visualizationImplications: [
'Segment-specific performance views',
'Cohort analysis displays',
'Persona-based dashboards',
],
commonMistakes: [
'Over-aggregation hiding segment insights',
'Insufficient segment granularity',
'Static segmentation models',
],
},
{
characteristic: 'ROI and Performance Measurement',
description: 'Marketing success requires measuring return on investment across channels and campaigns',
visualizationImplications: [
'ROI comparison charts',
'Performance attribution matrices',
'Cost-effectiveness analysis',
],
commonMistakes: [
'Incomplete cost attribution',
'Short-term ROI focus',
'Missing lifetime value context',
],
},
],
typicalVariables: [
'campaign_id',
'channel',
'impression',
'click',
'conversion',
'cost',
'revenue',
'ctr',
'cpm',
'cpc',
'cpa',
'roas',
'roi',
'audience_segment',
'demographic',
'geographic',
'device_type',
'time_on_site',
'bounce_rate',
'page_views',
'email_open_rate',
'email_click_rate',
'social_engagement',
'brand_awareness',
'customer_acquisition_cost',
'lifetime_value',
'churn_rate',
'retention_rate',
],
expectedRelationships: [
{
variables: ['spend', 'impressions', 'reach'],
relationship: 'causal',
strength: 'strong',
domain_specific_meaning: 'Marketing spend directly determines impression volume and audience reach through media buying',
},
{
variables: ['relevance_score', 'ctr', 'conversion_rate'],
relationship: 'causal',
strength: 'strong',
domain_specific_meaning: 'Higher ad relevance increases click-through rates which improve conversion performance',
},
{
variables: ['audience_targeting', 'engagement', 'cost_efficiency'],
relationship: 'correlated',
strength: 'moderate',
domain_specific_meaning: 'Better audience targeting typically correlates with higher engagement and lower acquisition costs',
},
{
variables: ['touchpoint_sequence', 'attribution_weight', 'conversion_probability'],
relationship: 'sequential',
strength: 'moderate',
domain_specific_meaning: 'Customer touchpoint sequence affects attribution modeling and conversion likelihood',
},
],
};
}
static createOperationsDomain(confidence, reasons) {
return {
domain: 'operations',
confidence,
reasoning: reasons.join('; '),
characteristics: [
{
characteristic: 'Process Optimization Focus',
description: 'Operations data emphasizes efficiency, throughput, and continuous improvement',
visualizationImplications: [
'Process flow diagrams',
'Efficiency trend analysis',
'Bottleneck identification charts',
],
commonMistakes: [
'Optimizing local maxima',
'Ignoring process interdependencies',
'Missing constraint analysis',
],
},
{
characteristic: 'Real-Time Monitoring Requirements',
description: 'Operational processes require real-time monitoring for immediate corrective action',
visualizationImplications: [
'Live production dashboards',
'Alert threshold displays',
'Performance deviation warnings',
],
commonMistakes: [
'Delayed problem detection',
'Information overload',
'Missing actionable alerts',
],
},
{
characteristic: 'Quality and Variation Control',
description: 'Operations data requires statistical process control and quality management',
visualizationImplications: [
'Control charts',
'Capability analysis displays',
'Defect rate tracking',
],
commonMistakes: [
'Ignoring process variation',
'Inadequate quality metrics',
'Missing statistical significance',
],
},
{
characteristic: 'Resource Utilization Optimization',
description: 'Operations focus on maximizing resource efficiency and capacity utilization',
visualizationImplications: [
'Utilization heat maps',
'Capacity planning charts',
'Resource allocation displays',
],
commonMistakes: [
'Over-utilization risks',
'Ignoring maintenance windows',
'Missing demand forecasting',
],
},
],
typicalVariables: [
'production_volume',
'cycle_time',
'lead_time',
'throughput',
'efficiency',
'utilization',
'quality_score',
'defect_rate',
'yield',
'downtime',
'uptime',
'maintenance_cost',
'inventory_level',
'stockout_rate',
'supplier_performance',
'delivery_time',
'cost_per_unit',
'labor_hours',
'machine_hours',
'energy_consumption',
'waste_generation',
'safety_incidents',
'compliance_score',
'customer_satisfaction',
],
expectedRelationships: [
{
variables: ['cycle_time', 'throughput', 'capacity'],
relationship: 'causal',
strength: 'strong',
domain_specific_meaning: 'Shorter cycle times increase throughput within fixed capacity constraints through process efficiency',
},
{
variables: ['quality_investment', 'defect_rate', 'total_cost'],
relationship: 'causal',
strength: 'strong',
domain_specific_meaning: 'Quality investments reduce defect rates, lowering total cost through prevention over correction',
},
{
variables: ['utilization_rate', 'flexibility', 'responsiveness'],
relationship: 'correlated',
strength: 'moderate',
domain_specific_meaning: 'High utilization can reduce operational flexibility and responsiveness to demand changes',
},
{
variables: ['maintenance_schedule', 'equipment_reliability', 'production_stability'],
relationship: 'sequential',
strength: 'strong',
domain_specific_meaning: 'Preventive maintenance schedules affect equipment reliability which determines production stability',
},
],
};
}
static createHRDomain(confidence, reasons) {
return {
domain: 'hr',
confidence,
reasoning: reasons.join('; '),
characteristics: [
{
characteristic: 'Employee Privacy and Confidentiality',
description: 'HR data requires strict privacy protection and confidential handling',
visualizationImplications: [
'Anonymized individual displays',
'Aggregate-only views',
'Role-based access controls',
],
commonMistakes: [
'Exposing individual performance data',
'Insufficient anonymization',
'Unauthorized data access',
],
},
{
characteristic: 'Performance and Development Focus',
description: 'HR analytics emphasize employee development, performance improvement, and career progression',
visualizationImplications: [
'Performance trend analysis',
'Skill gap identification',
'Career pathway visualization',
],
commonMistakes: [
'Punitive performance displays',
'Missing development context',
'One-dimensional performance metrics',
],
},
{
characteristic: 'Diversity and Inclusion Monitoring',
description: 'HR data requires comprehensive diversity, equity, and inclusion analysis',
visualizationImplications: [
'Demographic representation charts',
'Pay equity analysis',
'Promotion pattern displays',
],
commonMistakes: [
'Oversimplified diversity metrics',
'Missing intersectional analysis',
'Inadequate equity measurement',
],
},
{
characteristic: 'Predictive Talent Management',
description: 'HR analytics increasingly focus on predicting employee behavior and retention',
visualizationImplications: [
'Retention risk scoring',
'Succession planning displays',
'Engagement prediction models',
],
commonMistakes: [
'Over-reliance on predictive models',
'Missing human context',
'Algorithmic bias in predictions',
],
},
],
typicalVariables: [
'employee_id',
'department',
'role',
'level',
'hire_date',
'tenure',
'salary',
'performance_rating',
'goal_achievement',
'skill_assessment',
'training_hours',
'engagement_score',
'satisfaction_score',
'retention_risk',
'promotion_history',
'manager_rating',
'peer_feedback',
'customer_feedback',
'attendance_rate',
'overtime_hours',
'leave_taken',
'benefits_utilization',
'diversity_category',
'age',
'gender',
'ethnicity',
'education_level',
'certification_count',
],
expectedRelationships: [
{
variables: ['engagement_score', 'performance_rating', 'retention_likelihood'],
relationship: 'causal',
strength: 'strong',
domain_specific_meaning: 'Employee engagement directly affects performance levels and retention probability through motivation',
},
{
variables: ['training_investment', 'skill_development', 'career_advancement'],
relationship: 'causal',
strength: 'strong',
domain_specific_meaning: 'Training investments develop employee skills which enable career advancement opportunities',
},
{
variables: ['manager_quality', 'team_performance', 'employee_satisfaction'],
relationship: 'correlated',
strength: 'strong',
domain_specific_meaning: 'Manager effectiveness strongly correlates with team performance and employee satisfaction levels',
},
{
variables: ['diversity_initiatives', 'inclusion_metrics', 'organizational_culture'],
relationship: 'sequential',
strength: 'moderate',
domain_specific_meaning: 'Diversity initiatives affect inclusion metrics which contribute to overall organizational culture',
},
],
};
}
static createGenericDomain(confidence, reasons) {
return {
domain: 'generic',
confidence,
reasoning: reasons.join('; '),
characteristics: [],
typicalVariables: [],
expectedRelationships: [],
};
}
/**
* Identify relevant subdomains
*/
static identifySubdomains(columnNames, primaryDomain, dataCharacteristics) {
const subdomains = [];
if (primaryDomain.domain === 'education') {
// Check for specific education subdomains
const hasOnlineLearning = columnNames.some((col) => col.toLowerCase().includes('online') ||
col.toLowerCase().includes('digital') ||
col.toLowerCase().includes('screen'));
if (hasOnlineLearning) {
subdomains.push({
name: 'online_learning',
confidence: 0.8,
indicators: ['online learning indicators detected'],
specializations: [
'digital engagement analysis',
'screen time impact',
'virtual classroom dynamics',
],
});
}
const hasLifestyleFactors = columnNames.some((col) => col.toLowerCase().includes('sleep') ||
col.toLowerCase().includes('exercise') ||
col.toLowerCase().includes('diet'));
if (hasLifestyleFactors) {
subdomains.push({
name: 'holistic_student_wellness',
confidence: 0.9,
indicators: ['lifestyle and wellness factors detected'],
specializations: [
'wellness-performance correlation',
'lifestyle intervention analysis',
'holistic student support',
],
});
}
}
return subdomains;
}
/**
* Extract context clues from data
*/
static extractContextClues(columnNames, dataCharacteristics, sampleValues) {
const clues = [];
// Analyze column name patterns
for (const column of columnNames) {
const columnLower = column.toLowerCase();
// Time-based clues
if (columnLower.includes('hour') || columnLower.includes('time')) {
clues.push({
type: 'column_name',
clue: `Time-based measurement: ${column}`,
strength: 0.7,
domain: 'temporal_analysis',
reasoning: 'Time-based columns suggest temporal or behavioral analysis needs',
});
}
// Rating/score clues
if (columnLower.includes('rating') || columnLower.includes('score')) {
clues.push({
type: 'column_name',
clue: `Performance/quality metric: ${column}`,
strength: 0.8,
domain: 'performance_analysis',
reasoning: 'Rating/score columns indicate performance evaluation context',
});
}
// Percentage clues
if (columnLower.includes('percentage') || columnLower.includes('percent')) {
clues.push({
type: 'column_name',
clue: `Percentage metric: ${column}`,
strength: 0.6,
domain: 'proportion_analysis',
reasoning: 'Percentage columns suggest comparative or achievement analysis',
});
}
}
return clues;
}
/**
* Define stakeholder profiles based on domain
*/
static defineStakeholderProfiles(primaryDomain, subdomains) {
const profiles = [];
if (primaryDomain.domain === 'education') {
profiles.push({
role: 'educator',
expertise: 'domain_expert',
primaryInterests: [
'student performance trends',
'intervention effectiveness',
'learning outcomes',
],
visualizationPreferences: [
{
chartType: 'performance_dashboard',
complexity: 'moderate',
interactivity: 'moderate',
reasoning: 'Teachers need actionable insights without overwhelming complexity',
},
],
informationNeeds: [
{
need: 'identify_at_risk_students',
priority: 'critical',
visualizationApproach: 'alert-based dashboard with performance trends',
metrics: ['attendance', 'assignment_completion', 'performance_trend'],
},
],
decisionContext: 'Immediate intervention and support decisions',
});
profiles.push({
role: 'student',
expertise: 'general_public',
primaryInterests: ['personal performance', 'improvement opportunities', 'peer comparison'],
visualizationPreferences: [
{
chartType: 'personal_progress',
complexity: 'simple',
interactivity: 'moderate',
reasoning: 'Students need clear, motivating visualizations of their progress',
},
],
informationNeeds: [
{
need: 'track_personal_progress',
priority: 'high',
visualizationApproach: 'personal dashboard with clear progress indicators',
metrics: ['grade_trends', 'study_effectiveness', 'goal_progress'],
},
],
decisionContext: 'Study habits and academic planning',
});
profiles.push({
role: 'administrator',
expertise: 'executive',
primaryInterests: [
'program effectiveness',
'resource allocation',