agentic-data-stack-community
Version:
AI Agentic Data Stack Framework - Community Edition. Open source data engineering framework with 4 core agents, essential templates, and 3-dimensional quality validation.
477 lines (416 loc) • 16.4 kB
YAML
# Data Profiling Template
# Standardized template for comprehensive data profiling across the AI Agentic Data Stack Framework
metadata:
template_id: "data-profiling-tmpl"
name: "Data Profiling Template"
version: "1.0.0"
description: "Comprehensive template for systematic data profiling and quality assessment"
category: "data-quality"
tags: ["profiling", "quality", "analysis", "assessment", "validation"]
created_by: "AI Agentic Data Stack Framework"
created_date: "2025-01-23"
template:
name: "Data Profiling Template"
description: "Template for comprehensive data profiling and assessment"
version: "1.0.0"
sections:
- name: "profiling_overview"
description: "Profiling objectives and scope definition"
required: true
- name: "data_source_analysis"
description: "Data source characteristics and metadata"
required: true
- name: "structural_profiling"
description: "Data structure and schema analysis"
required: true
- name: "content_profiling"
description: "Data content and value analysis"
required: true
- name: "quality_assessment"
description: "Data quality metrics and issues"
required: true
- name: "relationship_analysis"
description: "Data relationships and dependencies"
required: true
- name: "profiling_recommendations"
description: "Findings and improvement recommendations"
required: true
- name: "validation_rules"
description: "Profiling validation requirements"
required: true
# Profiling Overview
profiling_overview:
# Basic Information
profiling_id: "${profiling_id}"
profiling_name: "${profiling_name}"
description: "${profiling_description}"
version: "${profiling_version}"
# Profiling Objectives
objectives:
primary_objective: "${primary_objective}"
secondary_objectives: ["${secondary_objectives}"]
business_questions: ["${business_questions}"]
quality_focus_areas: ["${quality_focus_areas}"]
# Scope Definition
scope:
data_sources: ["${data_sources_to_profile}"]
tables_views: ["${tables_views_to_profile}"]
columns_fields: ["${specific_columns}"]
time_period: "${profiling_time_period}"
# Profiling Context
context:
business_context: "${business_context}"
data_usage_context: "${data_usage_context}"
regulatory_requirements: ["${regulatory_requirements}"]
compliance_standards: ["${compliance_standards}"]
# Stakeholders
stakeholders:
profiling_owner: "${profiling_owner}"
data_steward: "${data_steward}"
business_users: ["${business_users}"]
technical_reviewers: ["${technical_reviewers}"]
# Data Source Analysis
data_source_analysis:
# Source Characteristics
data_sources:
- source_id: "${data_source_id}"
source_name: "${data_source_name}"
source_type: "${source_type}" # database, file, api, data_lake, data_warehouse
# Technical Details
technical_details:
system_type: "${system_type}"
version: "${system_version}"
location: "${data_location}"
access_method: "${access_method}"
# Size and Volume
size_metrics:
total_size_gb: ${total_size_gb}
record_count: ${total_record_count}
table_count: ${table_count}
column_count: ${total_column_count}
# Update Patterns
update_characteristics:
update_frequency: "${update_frequency}"
last_updated: "${last_updated_timestamp}"
growth_rate: "${data_growth_rate}"
update_pattern: "${update_pattern}" # batch, real_time, mixed
# System Metadata
system_metadata:
database_schema: "${database_schema}"
catalog_information: ["${catalog_info}"]
data_dictionary_available: ${data_dictionary_exists}
documentation_completeness: "${documentation_completeness_score}"
# Access Patterns
access_patterns:
primary_users: ["${primary_data_users}"]
usage_frequency: "${usage_frequency}"
peak_usage_times: ["${peak_usage_times}"]
access_methods: ["${access_methods}"]
# Structural Profiling
structural_profiling:
# Schema Analysis
schema_analysis:
# Tables/Entities
tables:
- table_id: "${table_id}"
table_name: "${table_name}"
table_type: "${table_type}" # fact, dimension, staging, reference
# Table Characteristics
characteristics:
row_count: ${table_row_count}
column_count: ${table_column_count}
table_size_mb: ${table_size_mb}
creation_date: "${table_creation_date}"
# Columns/Fields
columns:
- column_id: "${column_id}"
column_name: "${column_name}"
# Data Type Information
data_type_info:
data_type: "${column_data_type}"
max_length: ${max_column_length}
precision: ${numeric_precision}
scale: ${numeric_scale}
nullable: ${is_nullable}
# Constraints
constraints:
primary_key: ${is_primary_key}
foreign_key: ${is_foreign_key}
unique_constraint: ${has_unique_constraint}
check_constraints: ["${check_constraints}"]
default_value: "${default_value}"
# Key Relationships
key_relationships:
primary_keys:
- table_name: "${pk_table_name}"
primary_key_columns: ["${pk_columns}"]
uniqueness_verified: ${pk_uniqueness_verified}
foreign_keys:
- relationship_id: "${fk_relationship_id}"
source_table: "${fk_source_table}"
source_columns: ["${fk_source_columns}"]
target_table: "${fk_target_table}"
target_columns: ["${fk_target_columns}"]
referential_integrity: ${referential_integrity_score}
# Index Analysis
index_analysis:
- index_name: "${index_name}"
index_type: "${index_type}" # clustered, non_clustered, unique, partial
indexed_columns: ["${indexed_columns}"]
index_usage_stats: "${index_usage_frequency}"
# Content Profiling
content_profiling:
# Column-Level Analysis
column_analysis:
# Numeric Columns
numeric_columns:
- column_name: "${numeric_column_name}"
# Basic Statistics
basic_stats:
count: ${value_count}
null_count: ${null_count}
null_percentage: ${null_percentage}
distinct_count: ${distinct_value_count}
distinct_percentage: ${distinct_percentage}
# Descriptive Statistics
descriptive_stats:
mean: ${column_mean}
median: ${column_median}
mode: ${column_mode}
standard_deviation: ${column_std_dev}
variance: ${column_variance}
# Distribution Analysis
distribution:
minimum: ${column_minimum}
maximum: ${column_maximum}
range: ${column_range}
quartile_1: ${q1}
quartile_3: ${q3}
interquartile_range: ${iqr}
skewness: ${distribution_skewness}
kurtosis: ${distribution_kurtosis}
# Outlier Detection
outliers:
outlier_count: ${outlier_count}
outlier_percentage: ${outlier_percentage}
outlier_detection_method: "${outlier_method}"
outlier_threshold: ${outlier_threshold}
# Categorical Columns
categorical_columns:
- column_name: "${categorical_column_name}"
# Basic Statistics
basic_stats:
count: ${category_count}
null_count: ${category_null_count}
null_percentage: ${category_null_percentage}
distinct_count: ${category_distinct_count}
cardinality: "${cardinality_level}" # low, medium, high
# Value Distribution
value_distribution:
most_frequent_value: "${most_frequent_value}"
most_frequent_count: ${most_frequent_count}
most_frequent_percentage: ${most_frequent_percentage}
least_frequent_value: "${least_frequent_value}"
value_frequency_distribution: ["${frequency_distribution}"]
# Pattern Analysis
patterns:
common_patterns: ["${common_value_patterns}"]
irregular_values: ["${irregular_values}"]
format_consistency: ${format_consistency_score}
# Date/Time Columns
datetime_columns:
- column_name: "${datetime_column_name}"
# Basic Statistics
basic_stats:
count: ${datetime_count}
null_count: ${datetime_null_count}
null_percentage: ${datetime_null_percentage}
distinct_count: ${datetime_distinct_count}
# Temporal Analysis
temporal_analysis:
earliest_date: "${earliest_date}"
latest_date: "${latest_date}"
date_range_days: ${date_range_days}
most_common_date: "${most_common_date}"
# Format Analysis
format_analysis:
date_formats: ["${detected_date_formats}"]
format_consistency: ${datetime_format_consistency}
timezone_information: "${timezone_info}"
# Text Columns
text_columns:
- column_name: "${text_column_name}"
# Basic Statistics
basic_stats:
count: ${text_count}
null_count: ${text_null_count}
null_percentage: ${text_null_percentage}
distinct_count: ${text_distinct_count}
# Text Characteristics
text_characteristics:
min_length: ${min_text_length}
max_length: ${max_text_length}
avg_length: ${avg_text_length}
empty_string_count: ${empty_string_count}
whitespace_only_count: ${whitespace_only_count}
# Content Analysis
content_analysis:
character_set: "${detected_character_set}"
language_detected: "${detected_language}"
common_words: ["${common_words}"]
special_characters: ["${special_characters}"]
# Quality Assessment
quality_assessment:
# Completeness Assessment
completeness:
# Overall Completeness
overall_completeness: ${overall_completeness_percentage}
# Column-Level Completeness
column_completeness:
- column_name: "${completeness_column_name}"
completeness_percentage: ${column_completeness_percentage}
missing_value_count: ${missing_count}
completeness_rating: "${completeness_rating}" # excellent, good, fair, poor
# Accuracy Assessment
accuracy:
# Data Type Accuracy
data_type_accuracy:
- column_name: "${accuracy_column_name}"
expected_data_type: "${expected_data_type}"
actual_data_type: "${actual_data_type}"
type_conformity_percentage: ${type_conformity_percentage}
type_violations: ${type_violation_count}
# Business Rule Accuracy
business_rule_accuracy:
- rule_name: "${business_rule_name}"
rule_description: "${rule_description}"
conformity_percentage: ${rule_conformity_percentage}
violation_count: ${rule_violation_count}
# Consistency Assessment
consistency:
# Format Consistency
format_consistency:
- column_name: "${consistency_column_name}"
expected_format: "${expected_format}"
format_compliance_percentage: ${format_compliance_percentage}
format_variations: ["${format_variations}"]
# Cross-Table Consistency
cross_table_consistency:
- relationship_name: "${consistency_relationship}"
table_1: "${consistency_table_1}"
table_2: "${consistency_table_2}"
consistency_percentage: ${cross_table_consistency_percentage}
inconsistency_count: ${inconsistency_count}
# Validity Assessment
validity:
# Domain Validity
domain_validity:
- column_name: "${validity_column_name}"
valid_domain: ["${valid_domain_values}"]
domain_compliance_percentage: ${domain_compliance_percentage}
invalid_values: ["${invalid_values}"]
# Range Validity
range_validity:
- column_name: "${range_column_name}"
valid_range_min: ${valid_range_min}
valid_range_max: ${valid_range_max}
range_compliance_percentage: ${range_compliance_percentage}
out_of_range_count: ${out_of_range_count}
# Uniqueness Assessment
uniqueness:
# Column Uniqueness
column_uniqueness:
- column_name: "${uniqueness_column_name}"
expected_uniqueness: ${expected_uniqueness}
actual_uniqueness_percentage: ${actual_uniqueness_percentage}
duplicate_count: ${duplicate_count}
# Timeliness Assessment
timeliness:
data_freshness: "${data_freshness_status}"
last_update_lag: "${update_lag_hours}"
timeliness_rating: "${timeliness_rating}" # excellent, good, fair, poor
# Relationship Analysis
relationship_analysis:
# Intra-Table Relationships
intra_table_relationships:
# Column Correlations
correlations:
- column_pair: ["${correlation_column_1}", "${correlation_column_2}"]
correlation_coefficient: ${correlation_coefficient}
correlation_strength: "${correlation_strength}" # weak, moderate, strong
correlation_type: "${correlation_type}" # positive, negative
# Functional Dependencies
functional_dependencies:
- dependency_id: "${dependency_id}"
determinant_columns: ["${determinant_columns}"]
dependent_columns: ["${dependent_columns}"]
dependency_strength: ${dependency_strength}
# Inter-Table Relationships
inter_table_relationships:
# Join Analysis
join_analysis:
- join_relationship: "${join_relationship_name}"
left_table: "${left_table}"
right_table: "${right_table}"
join_columns: ["${join_columns}"]
join_success_rate: ${join_success_rate}
unmatched_records: ${unmatched_record_count}
# Data Lineage
data_lineage:
- lineage_path: ["${lineage_path}"]
source_system: "${lineage_source}"
target_system: "${lineage_target}"
transformation_steps: ["${transformation_steps}"]
# Profiling Recommendations
profiling_recommendations:
# Data Quality Improvements
quality_improvements:
- issue_category: "${quality_issue_category}"
issue_description: "${quality_issue_description}"
severity: "${issue_severity}" # critical, high, medium, low
affected_columns: ["${affected_columns}"]
# Recommended Actions
recommendations:
immediate_actions: ["${immediate_actions}"]
long_term_improvements: ["${long_term_improvements}"]
estimated_effort: "${estimated_effort}"
priority: "${recommendation_priority}"
# Performance Optimizations
performance_optimizations:
- optimization_area: "${optimization_area}"
current_performance: "${current_performance_metric}"
target_performance: "${target_performance_metric}"
optimization_strategies: ["${optimization_strategies}"]
# Monitoring Recommendations
monitoring_recommendations:
ongoing_monitoring: ["${ongoing_monitoring_needs}"]
quality_alerts: ["${recommended_quality_alerts}"]
review_frequency: "${recommended_review_frequency}"
# Validation Rules
validation_rules:
required_fields:
- profiling_id
- profiling_name
- data_sources
- structural_profiling
- content_profiling
- quality_assessment
quality_thresholds:
min_completeness_percentage: 90
max_null_percentage: 10
min_accuracy_percentage: 95
max_duplicate_percentage: 5
profiling_standards:
- sample_size: "Minimum 10% of data or 10,000 records"
- statistical_confidence: "95% confidence level required"
- profiling_coverage: "All critical columns must be profiled"
- documentation_completeness: "All findings must be documented"
# Template Metadata
template_metadata:
author: "AI Agentic Data Stack Framework"
maintainer: "Data Quality Engineer"
last_updated: "2025-01-23"
changelog:
- version: "1.0.0"
date: "2025-01-23"
changes: "Initial template creation with comprehensive data profiling configuration"