datapilot-cli

Enterprise-grade streaming data analysis for CSV, JSON, Excel, TSV, and Parquet, with comprehensive statistical insights and intelligent relationship detection. Memory-efficient and cross-platform.

confidence-standards.js
"use strict"; /** * Confidence Standards Documentation * Centralized definitions for confidence metrics across all DataPilot sections */ Object.defineProperty(exports, "__esModule", { value: true }); exports.ConfidenceInterpreter = exports.CONFIDENCE_STANDARDS = void 0; exports.CONFIDENCE_STANDARDS = { PARSING_CONFIDENCE: { name: 'Parsing Confidence', section: 'Section 1 - Overview', description: 'Confidence in CSV parsing parameter detection (encoding, delimiter, etc.)', methodology: 'Statistical analysis of character patterns and field consistency', scale: '0-100% with discrete levels: 95% (High), 75% (Medium), 50% (Low)', interpretation: { high: '95%+ - Detection based on strong statistical evidence', medium: '75-94% - Detection based on moderate evidence with some ambiguity', low: '50-74% - Detection based on weak evidence, manual verification recommended', }, factors: ['Character frequency analysis', 'Field consistency scoring', 'Pattern recognition'], }, TYPE_DETECTION_CONFIDENCE: { name: 'Type Detection Confidence', section: 'Section 3 - EDA', description: 'Confidence in data type and semantic type classification', methodology: 'Rule-based classification with pattern matching and semantic analysis', scale: '0.0-1.0 (decimal) where 1.0 = 100% confidence', interpretation: { high: '0.85+ - Strong evidence for type classification', medium: '0.65-0.84 - Moderate evidence, consider manual review', low: '0.0-0.64 - Weak evidence, manual classification recommended', }, factors: [ 'Column name analysis', 'Value pattern matching', 'Statistical distribution analysis', 'Domain knowledge rules', ], }, VISUALIZATION_CONFIDENCE: { name: 'Visualization Confidence', section: 'Section 4 - Visualization', description: 'Confidence in chart type and visualization recommendations', methodology: 'Multi-factor scoring based on data characteristics and visualization best practices', scale: '0.0-1.0 (decimal) with algorithm-specific weights', interpretation: { high: '0.9+ - Chart type strongly recommended based on data characteristics', medium: '0.7-0.89 - Chart type suitable with minor considerations', low: '0.0-0.69 - Chart type may be suitable but requires careful evaluation', }, factors: [ 'Data type compatibility', 'Variable count', 'Statistical distribution', 'Accessibility requirements', 'Performance considerations', ], }, QUALITY_SCORE_CONFIDENCE: { name: 'Quality Score Confidence', section: 'Section 2 - Quality', description: 'Confidence in composite data quality score calculation', methodology: 'Weighted average of validated quality dimensions with uncertainty propagation', scale: 'Implicit confidence based on completeness of quality dimension analysis', interpretation: { high: 'All 10 quality dimensions successfully evaluated with concrete metrics', medium: '7-9 quality dimensions evaluated, some estimated values', low: '<7 quality dimensions evaluated, significant estimation required', }, factors: [ 'Data completeness', 'Sample size', 'Quality dimension coverage', 'Business rule availability', ], }, ML_READINESS_CONFIDENCE: { name: 'ML Readiness Confidence', section: 'Section 5 - Engineering', description: 'Confidence in machine learning readiness assessment', methodology: 'Composite scoring based on data quality, feature engineering potential, and technical constraints', scale: '0-100 score with implicit confidence based on assessment completeness', interpretation: { high: 'Complete assessment with all technical factors evaluated', medium: 'Most factors assessed, some limitations in evaluation 
scope', low: 'Limited assessment due to data constraints or missing information', }, factors: [ 'Feature count and quality', 'Data volume', 'Missing value patterns', 'Feature correlation structure', 'Technical infrastructure', ], }, MODELING_TASK_CONFIDENCE: { name: 'Modeling Task Confidence', section: 'Section 6 - Modeling', description: 'Confidence in modeling task identification and algorithm recommendations', methodology: 'Domain analysis combined with statistical characteristics and business context', scale: 'Categorical: very_high, high, medium, low', interpretation: { high: 'Clear task identification with strong algorithm-data alignment', medium: 'Probable task identification with good algorithm suitability', low: 'Uncertain task identification requiring domain expert validation', }, factors: [ 'Target variable clarity', 'Domain context', 'Data characteristics', 'Problem type recognition', 'Algorithm requirements', ], }, }; /** * Utility functions for confidence interpretation */ class ConfidenceInterpreter { /** * Get confidence explanation for a specific metric */ static explain(metricType, value) { const standard = exports.CONFIDENCE_STANDARDS[metricType]; if (!standard) { return 'Unknown confidence metric'; } let level = 'low'; if (typeof value === 'number') { if (metricType === 'PARSING_CONFIDENCE') { level = value >= 95 ? 'high' : value >= 75 ? 'medium' : 'low'; } else { level = value >= 0.85 ? 'high' : value >= 0.65 ? 'medium' : 'low'; } } return `${standard.interpretation[level]} (${standard.methodology})`; } /** * Get all confidence standards as formatted documentation */ static getDocumentation() { const sections = Object.values(exports.CONFIDENCE_STANDARDS).map((metric) => `**${metric.name}** (${metric.section}): - Description: ${metric.description} - Methodology: ${metric.methodology} - Scale: ${metric.scale} - High: ${metric.interpretation.high} - Medium: ${metric.interpretation.medium} - Low: ${metric.interpretation.low} - Factors: ${metric.factors.join(', ')}`); return `# DataPilot Confidence Metrics Documentation ${sections.join('\n\n')}`; } } exports.ConfidenceInterpreter = ConfidenceInterpreter; //# sourceMappingURL=confidence-standards.js.map
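
A minimal usage sketch of the two exports. The require path is an assumption for illustration; the module's actual location inside the datapilot-cli package may differ.

// Usage sketch. The require path is hypothetical; adjust to where the
// compiled confidence-standards.js lives in your install.
const { CONFIDENCE_STANDARDS, ConfidenceInterpreter } = require('./confidence-standards');

// PARSING_CONFIDENCE is interpreted on the 0-100 scale.
console.log(ConfidenceInterpreter.explain('PARSING_CONFIDENCE', 96));
// -> "95%+ - Detection based on strong statistical evidence (Statistical
//     analysis of character patterns and field consistency)"

// Every other numeric metric is interpreted on the 0.0-1.0 decimal scale.
console.log(ConfidenceInterpreter.explain('TYPE_DETECTION_CONFIDENCE', 0.7));
// -> "0.65-0.84 - Moderate evidence, consider manual review (Rule-based
//     classification with pattern matching and semantic analysis)"

// Unknown metric names fall through to a fixed message.
console.log(ConfidenceInterpreter.explain('NOT_A_METRIC', 1));
// -> "Unknown confidence metric"

// Render Markdown documentation for all six standards at once.
console.log(ConfidenceInterpreter.getDocumentation());

Note that `explain` special-cases only PARSING_CONFIDENCE; metrics documented with other scales (e.g. the 0-100 ML readiness score or the categorical modeling-task levels) are thresholded on the decimal scale, so pass a decimal value or consult the metric's `scale` field before interpreting the result.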