UNPKG

agentic-data-stack-community

Version:

AI Agentic Data Stack Framework - Community Edition. Open source data engineering framework with 4 core agents, essential templates, and 3-dimensional quality validation.

477 lines (416 loc) 16.4 kB
# Data Profiling Template # Standardized template for comprehensive data profiling across the AI Agentic Data Stack Framework metadata: template_id: "data-profiling-tmpl" name: "Data Profiling Template" version: "1.0.0" description: "Comprehensive template for systematic data profiling and quality assessment" category: "data-quality" tags: ["profiling", "quality", "analysis", "assessment", "validation"] created_by: "AI Agentic Data Stack Framework" created_date: "2025-01-23" template: name: "Data Profiling Template" description: "Template for comprehensive data profiling and assessment" version: "1.0.0" sections: - name: "profiling_overview" description: "Profiling objectives and scope definition" required: true - name: "data_source_analysis" description: "Data source characteristics and metadata" required: true - name: "structural_profiling" description: "Data structure and schema analysis" required: true - name: "content_profiling" description: "Data content and value analysis" required: true - name: "quality_assessment" description: "Data quality metrics and issues" required: true - name: "relationship_analysis" description: "Data relationships and dependencies" required: true - name: "profiling_recommendations" description: "Findings and improvement recommendations" required: true - name: "validation_rules" description: "Profiling validation requirements" required: true # Profiling Overview profiling_overview: # Basic Information profiling_id: "${profiling_id}" profiling_name: "${profiling_name}" description: "${profiling_description}" version: "${profiling_version}" # Profiling Objectives objectives: primary_objective: "${primary_objective}" secondary_objectives: ["${secondary_objectives}"] business_questions: ["${business_questions}"] quality_focus_areas: ["${quality_focus_areas}"] # Scope Definition scope: data_sources: ["${data_sources_to_profile}"] tables_views: ["${tables_views_to_profile}"] columns_fields: ["${specific_columns}"] time_period: "${profiling_time_period}" # Profiling Context context: business_context: "${business_context}" data_usage_context: "${data_usage_context}" regulatory_requirements: ["${regulatory_requirements}"] compliance_standards: ["${compliance_standards}"] # Stakeholders stakeholders: profiling_owner: "${profiling_owner}" data_steward: "${data_steward}" business_users: ["${business_users}"] technical_reviewers: ["${technical_reviewers}"] # Data Source Analysis data_source_analysis: # Source Characteristics data_sources: - source_id: "${data_source_id}" source_name: "${data_source_name}" source_type: "${source_type}" # database, file, api, data_lake, data_warehouse # Technical Details technical_details: system_type: "${system_type}" version: "${system_version}" location: "${data_location}" access_method: "${access_method}" # Size and Volume size_metrics: total_size_gb: ${total_size_gb} record_count: ${total_record_count} table_count: ${table_count} column_count: ${total_column_count} # Update Patterns update_characteristics: update_frequency: "${update_frequency}" last_updated: "${last_updated_timestamp}" growth_rate: "${data_growth_rate}" update_pattern: "${update_pattern}" # batch, real_time, mixed # System Metadata system_metadata: database_schema: "${database_schema}" catalog_information: ["${catalog_info}"] data_dictionary_available: ${data_dictionary_exists} documentation_completeness: "${documentation_completeness_score}" # Access Patterns access_patterns: primary_users: ["${primary_data_users}"] usage_frequency: "${usage_frequency}" peak_usage_times: ["${peak_usage_times}"] access_methods: ["${access_methods}"] # Structural Profiling structural_profiling: # Schema Analysis schema_analysis: # Tables/Entities tables: - table_id: "${table_id}" table_name: "${table_name}" table_type: "${table_type}" # fact, dimension, staging, reference # Table Characteristics characteristics: row_count: ${table_row_count} column_count: ${table_column_count} table_size_mb: ${table_size_mb} creation_date: "${table_creation_date}" # Columns/Fields columns: - column_id: "${column_id}" column_name: "${column_name}" # Data Type Information data_type_info: data_type: "${column_data_type}" max_length: ${max_column_length} precision: ${numeric_precision} scale: ${numeric_scale} nullable: ${is_nullable} # Constraints constraints: primary_key: ${is_primary_key} foreign_key: ${is_foreign_key} unique_constraint: ${has_unique_constraint} check_constraints: ["${check_constraints}"] default_value: "${default_value}" # Key Relationships key_relationships: primary_keys: - table_name: "${pk_table_name}" primary_key_columns: ["${pk_columns}"] uniqueness_verified: ${pk_uniqueness_verified} foreign_keys: - relationship_id: "${fk_relationship_id}" source_table: "${fk_source_table}" source_columns: ["${fk_source_columns}"] target_table: "${fk_target_table}" target_columns: ["${fk_target_columns}"] referential_integrity: ${referential_integrity_score} # Index Analysis index_analysis: - index_name: "${index_name}" index_type: "${index_type}" # clustered, non_clustered, unique, partial indexed_columns: ["${indexed_columns}"] index_usage_stats: "${index_usage_frequency}" # Content Profiling content_profiling: # Column-Level Analysis column_analysis: # Numeric Columns numeric_columns: - column_name: "${numeric_column_name}" # Basic Statistics basic_stats: count: ${value_count} null_count: ${null_count} null_percentage: ${null_percentage} distinct_count: ${distinct_value_count} distinct_percentage: ${distinct_percentage} # Descriptive Statistics descriptive_stats: mean: ${column_mean} median: ${column_median} mode: ${column_mode} standard_deviation: ${column_std_dev} variance: ${column_variance} # Distribution Analysis distribution: minimum: ${column_minimum} maximum: ${column_maximum} range: ${column_range} quartile_1: ${q1} quartile_3: ${q3} interquartile_range: ${iqr} skewness: ${distribution_skewness} kurtosis: ${distribution_kurtosis} # Outlier Detection outliers: outlier_count: ${outlier_count} outlier_percentage: ${outlier_percentage} outlier_detection_method: "${outlier_method}" outlier_threshold: ${outlier_threshold} # Categorical Columns categorical_columns: - column_name: "${categorical_column_name}" # Basic Statistics basic_stats: count: ${category_count} null_count: ${category_null_count} null_percentage: ${category_null_percentage} distinct_count: ${category_distinct_count} cardinality: "${cardinality_level}" # low, medium, high # Value Distribution value_distribution: most_frequent_value: "${most_frequent_value}" most_frequent_count: ${most_frequent_count} most_frequent_percentage: ${most_frequent_percentage} least_frequent_value: "${least_frequent_value}" value_frequency_distribution: ["${frequency_distribution}"] # Pattern Analysis patterns: common_patterns: ["${common_value_patterns}"] irregular_values: ["${irregular_values}"] format_consistency: ${format_consistency_score} # Date/Time Columns datetime_columns: - column_name: "${datetime_column_name}" # Basic Statistics basic_stats: count: ${datetime_count} null_count: ${datetime_null_count} null_percentage: ${datetime_null_percentage} distinct_count: ${datetime_distinct_count} # Temporal Analysis temporal_analysis: earliest_date: "${earliest_date}" latest_date: "${latest_date}" date_range_days: ${date_range_days} most_common_date: "${most_common_date}" # Format Analysis format_analysis: date_formats: ["${detected_date_formats}"] format_consistency: ${datetime_format_consistency} timezone_information: "${timezone_info}" # Text Columns text_columns: - column_name: "${text_column_name}" # Basic Statistics basic_stats: count: ${text_count} null_count: ${text_null_count} null_percentage: ${text_null_percentage} distinct_count: ${text_distinct_count} # Text Characteristics text_characteristics: min_length: ${min_text_length} max_length: ${max_text_length} avg_length: ${avg_text_length} empty_string_count: ${empty_string_count} whitespace_only_count: ${whitespace_only_count} # Content Analysis content_analysis: character_set: "${detected_character_set}" language_detected: "${detected_language}" common_words: ["${common_words}"] special_characters: ["${special_characters}"] # Quality Assessment quality_assessment: # Completeness Assessment completeness: # Overall Completeness overall_completeness: ${overall_completeness_percentage} # Column-Level Completeness column_completeness: - column_name: "${completeness_column_name}" completeness_percentage: ${column_completeness_percentage} missing_value_count: ${missing_count} completeness_rating: "${completeness_rating}" # excellent, good, fair, poor # Accuracy Assessment accuracy: # Data Type Accuracy data_type_accuracy: - column_name: "${accuracy_column_name}" expected_data_type: "${expected_data_type}" actual_data_type: "${actual_data_type}" type_conformity_percentage: ${type_conformity_percentage} type_violations: ${type_violation_count} # Business Rule Accuracy business_rule_accuracy: - rule_name: "${business_rule_name}" rule_description: "${rule_description}" conformity_percentage: ${rule_conformity_percentage} violation_count: ${rule_violation_count} # Consistency Assessment consistency: # Format Consistency format_consistency: - column_name: "${consistency_column_name}" expected_format: "${expected_format}" format_compliance_percentage: ${format_compliance_percentage} format_variations: ["${format_variations}"] # Cross-Table Consistency cross_table_consistency: - relationship_name: "${consistency_relationship}" table_1: "${consistency_table_1}" table_2: "${consistency_table_2}" consistency_percentage: ${cross_table_consistency_percentage} inconsistency_count: ${inconsistency_count} # Validity Assessment validity: # Domain Validity domain_validity: - column_name: "${validity_column_name}" valid_domain: ["${valid_domain_values}"] domain_compliance_percentage: ${domain_compliance_percentage} invalid_values: ["${invalid_values}"] # Range Validity range_validity: - column_name: "${range_column_name}" valid_range_min: ${valid_range_min} valid_range_max: ${valid_range_max} range_compliance_percentage: ${range_compliance_percentage} out_of_range_count: ${out_of_range_count} # Uniqueness Assessment uniqueness: # Column Uniqueness column_uniqueness: - column_name: "${uniqueness_column_name}" expected_uniqueness: ${expected_uniqueness} actual_uniqueness_percentage: ${actual_uniqueness_percentage} duplicate_count: ${duplicate_count} # Timeliness Assessment timeliness: data_freshness: "${data_freshness_status}" last_update_lag: "${update_lag_hours}" timeliness_rating: "${timeliness_rating}" # excellent, good, fair, poor # Relationship Analysis relationship_analysis: # Intra-Table Relationships intra_table_relationships: # Column Correlations correlations: - column_pair: ["${correlation_column_1}", "${correlation_column_2}"] correlation_coefficient: ${correlation_coefficient} correlation_strength: "${correlation_strength}" # weak, moderate, strong correlation_type: "${correlation_type}" # positive, negative # Functional Dependencies functional_dependencies: - dependency_id: "${dependency_id}" determinant_columns: ["${determinant_columns}"] dependent_columns: ["${dependent_columns}"] dependency_strength: ${dependency_strength} # Inter-Table Relationships inter_table_relationships: # Join Analysis join_analysis: - join_relationship: "${join_relationship_name}" left_table: "${left_table}" right_table: "${right_table}" join_columns: ["${join_columns}"] join_success_rate: ${join_success_rate} unmatched_records: ${unmatched_record_count} # Data Lineage data_lineage: - lineage_path: ["${lineage_path}"] source_system: "${lineage_source}" target_system: "${lineage_target}" transformation_steps: ["${transformation_steps}"] # Profiling Recommendations profiling_recommendations: # Data Quality Improvements quality_improvements: - issue_category: "${quality_issue_category}" issue_description: "${quality_issue_description}" severity: "${issue_severity}" # critical, high, medium, low affected_columns: ["${affected_columns}"] # Recommended Actions recommendations: immediate_actions: ["${immediate_actions}"] long_term_improvements: ["${long_term_improvements}"] estimated_effort: "${estimated_effort}" priority: "${recommendation_priority}" # Performance Optimizations performance_optimizations: - optimization_area: "${optimization_area}" current_performance: "${current_performance_metric}" target_performance: "${target_performance_metric}" optimization_strategies: ["${optimization_strategies}"] # Monitoring Recommendations monitoring_recommendations: ongoing_monitoring: ["${ongoing_monitoring_needs}"] quality_alerts: ["${recommended_quality_alerts}"] review_frequency: "${recommended_review_frequency}" # Validation Rules validation_rules: required_fields: - profiling_id - profiling_name - data_sources - structural_profiling - content_profiling - quality_assessment quality_thresholds: min_completeness_percentage: 90 max_null_percentage: 10 min_accuracy_percentage: 95 max_duplicate_percentage: 5 profiling_standards: - sample_size: "Minimum 10% of data or 10,000 records" - statistical_confidence: "95% confidence level required" - profiling_coverage: "All critical columns must be profiled" - documentation_completeness: "All findings must be documented" # Template Metadata template_metadata: author: "AI Agentic Data Stack Framework" maintainer: "Data Quality Engineer" last_updated: "2025-01-23" changelog: - version: "1.0.0" date: "2025-01-23" changes: "Initial template creation with comprehensive data profiling configuration"