agentic-data-stack-community
AI Agentic Data Stack Framework - Community Edition: an open-source data engineering framework with 4 core agents, essential templates, and 3-dimensional quality validation.
# Quality Checks Template
# Purpose: Comprehensive data quality validation and monitoring framework
# Version: 1.0.0
# Last Updated: 2025-01-23
metadata:
template_id: "quality-checks-tmpl"
version: "1.0.0"
name: "Quality Checks Template"
description: "Systematic framework for implementing data quality checks and validation"
category: "data-engineering"
tags:
- data-quality
- validation
- testing
- monitoring
- data-integrity
owner: "Data Engineering Team"
created_date: "2025-01-23"
last_modified: "2025-01-23"
compliance:
- ISO-27001
- SOC2
- GDPR
dependencies:
- quality-rules-tmpl
- monitoring-tmpl
template:
structure:
- quality_framework
- data_profiling_checks
- schema_validation_checks
- business_rule_checks
- statistical_anomaly_checks
- referential_integrity_checks
- completeness_checks
- consistency_checks
- timeliness_checks
- automated_testing
- monitoring_alerting
sections:
quality_framework:
quality_dimensions:
accuracy:
definition: "Data correctly represents real-world entities"
measurement: "Percentage of accurate records"
examples:
- "Email addresses are valid"
- "Phone numbers follow correct format"
- "Geographic coordinates are valid"
validation_approaches:
- "Format validation using regex patterns"
- "Cross-reference with authoritative sources"
- "Manual review of sample data"
- "Business user feedback validation"
completeness:
definition: "Required data fields are populated"
measurement: "Percentage of complete records"
examples:
- "Mandatory fields are not null"
- "All required attributes present"
- "No missing values in key columns"
validation_approaches:
- "Null value detection"
- "Empty string identification"
- "Required field validation"
- "Record count verification"
consistency:
definition: "Data values are uniform across datasets"
measurement: "Percentage of consistent records"
examples:
- "Same customer has identical data across systems"
- "Date formats are standardized"
- "Unit of measurement consistency"
validation_approaches:
- "Cross-system data comparison"
- "Standardization rule enforcement"
- "Format consistency checks"
- "Reference data validation"
timeliness:
definition: "Data is available when needed"
measurement: "Data freshness and latency metrics"
examples:
- "Data updated within SLA timeframes"
- "Real-time data processing delays"
- "Historical data availability"
validation_approaches:
- "Timestamp validation"
- "Processing lag monitoring"
- "SLA compliance tracking"
- "Data availability checks"
validity:
definition: "Data conforms to defined formats and constraints"
measurement: "Percentage of valid records"
examples:
- "Data types match schema definitions"
- "Values fall within acceptable ranges"
- "Enumerated values are from valid lists"
validation_approaches:
- "Schema validation"
- "Range checking"
- "Pattern matching"
- "Constraint validation"
uniqueness:
definition: "No unwanted duplicate records exist"
measurement: "Percentage of unique records"
examples:
- "Primary keys are unique"
- "No duplicate customer records"
- "Unique constraint enforcement"
validation_approaches:
- "Duplicate detection algorithms"
- "Key uniqueness validation"
- "Fuzzy matching for near-duplicates"
- "Deduplication processes"
quality_check_categories:
preventive_checks:
description: "Prevent bad data from entering the system"
implementation_stage: "Data ingestion"
examples:
- "Input validation at source"
- "Schema enforcement"
- "Format validation"
- "Business rule validation"
detective_checks:
description: "Identify quality issues in existing data"
implementation_stage: "Data processing and storage"
examples:
- "Anomaly detection"
- "Statistical profiling"
- "Pattern recognition"
- "Drift detection"
corrective_checks:
description: "Fix identified data quality issues"
implementation_stage: "Data cleansing and transformation"
examples:
- "Data standardization"
- "Missing value imputation"
- "Duplicate removal"
- "Error correction"
quality_governance:
quality_ownership:
data_stewards:
responsibilities:
- "Define quality standards"
- "Monitor quality metrics"
- "Resolve quality issues"
- "Approve quality rules"
data_engineers:
responsibilities:
- "Implement quality checks"
- "Build quality monitoring"
- "Automate quality processes"
- "Maintain quality infrastructure"
business_users:
responsibilities:
- "Report quality issues"
- "Validate business rules"
- "Define quality requirements"
- "Accept quality standards"
quality_standards:
- standard: "Data accuracy target"
threshold: "99.5% accuracy for critical data"
measurement: "Monthly validation"
- standard: "Completeness requirement"
threshold: "95% completeness for mandatory fields"
measurement: "Daily monitoring"
- standard: "Timeliness SLA"
threshold: "Data available within 4 hours"
measurement: "Continuous monitoring"
data_profiling_checks:
statistical_profiling:
descriptive_statistics:
numerical_columns:
metrics:
- "Count of non-null values"
- "Mean, median, mode"
- "Standard deviation and variance"
- "Min, max, and percentiles"
- "Skewness and kurtosis"
anomaly_detection:
- "Outlier identification using IQR method"
- "Z-score based anomaly detection"
- "Distribution drift monitoring"
categorical_columns:
metrics:
- "Distinct value count"
- "Value frequency distribution"
- "Most/least common values"
- "Cardinality analysis"
quality_checks:
- "Unexpected category values"
- "Frequency distribution changes"
- "New category emergence"
temporal_columns:
metrics:
- "Date range analysis"
- "Temporal distribution patterns"
- "Seasonality detection"
- "Trend analysis"
quality_checks:
- "Future date validation"
- "Logical date sequences"
- "Business calendar compliance"
data_distribution_analysis:
distribution_tests:
- test: "Kolmogorov-Smirnov test"
purpose: "Compare distributions between datasets"
implementation: "Statistical comparison of cumulative distributions"
- test: "Chi-square goodness of fit"
purpose: "Test if data follows expected distribution"
implementation: "Compare observed vs expected frequencies"
drift_detection:
- method: "Population Stability Index (PSI)"
threshold: "PSI > 0.1 indicates significant drift"
monitoring_frequency: "Daily for critical datasets"
- method: "Kullback-Leibler divergence"
purpose: "Measure distribution differences"
alert_threshold: "KL divergence > 0.5"
implementation_example: |
```python
import pandas as pd
import numpy as np
from scipy import stats
class DataProfiler:
def __init__(self, dataset):
self.dataset = dataset
self.profile_results = {}
def profile_numerical_column(self, column):
data = self.dataset[column].dropna()
profile = {
'count': len(data),
'null_count': self.dataset[column].isnull().sum(),
'mean': data.mean(),
'std': data.std(),
'min': data.min(),
'max': data.max(),
'percentiles': {
'25th': data.quantile(0.25),
'50th': data.quantile(0.50),
'75th': data.quantile(0.75),
'95th': data.quantile(0.95)
},
'outliers': self.detect_outliers(data),
'distribution': self.analyze_distribution(data)
}
return profile
def detect_outliers(self, data):
Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = data[(data < lower_bound) | (data > upper_bound)]
return {
'count': len(outliers),
'percentage': len(outliers) / len(data) * 100,
'values': outliers.tolist()
}
    def analyze_distribution(self, data):
        # Shape statistics referenced by profile_numerical_column
        return {
            'skewness': float(stats.skew(data)),
            'kurtosis': float(stats.kurtosis(data))
        }
    def calculate_psi(self, expected, actual, buckets=10):
        # Derive bin edges from the expected (baseline) data and reuse them for the
        # actual data; histograms built on different edges are not comparable
        _, bin_edges = np.histogram(expected, bins=buckets)
        expected_percents = np.histogram(expected, bins=bin_edges)[0] / len(expected)
        actual_percents = np.histogram(actual, bins=bin_edges)[0] / len(actual)
        # Add small value to avoid division by zero
        expected_percents = np.where(expected_percents == 0, 0.0001, expected_percents)
        actual_percents = np.where(actual_percents == 0, 0.0001, actual_percents)
        psi = np.sum((actual_percents - expected_percents) *
                     np.log(actual_percents / expected_percents))
        return psi
```
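# Complementary sketch for the Kullback-Leibler divergence drift method listed under
# drift_detection; the bucket count and the 0.5 alert threshold mirror the values above,
# everything else is an assumption for illustration.
kl_divergence_sketch: |
  ```python
  import numpy as np
  from scipy.stats import entropy

  def kl_divergence_drift(expected, actual, buckets=10, alert_threshold=0.5):
      """Histogram-based KL divergence between a baseline and a current sample."""
      # Use shared bin edges so the two histograms are comparable
      _, bin_edges = np.histogram(expected, bins=buckets)
      expected_probs = np.histogram(expected, bins=bin_edges)[0] / len(expected)
      actual_probs = np.histogram(actual, bins=bin_edges)[0] / len(actual)
      # Avoid zero probabilities before taking logarithms
      expected_probs = np.where(expected_probs == 0, 1e-4, expected_probs)
      actual_probs = np.where(actual_probs == 0, 1e-4, actual_probs)
      kl = entropy(actual_probs, expected_probs)  # KL(actual || expected)
      return {'kl_divergence': float(kl), 'drift_alert': kl > alert_threshold}
  ```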
schema_validation_checks:
structure_validation:
schema_compliance:
column_existence:
validation: "Verify all required columns are present"
implementation: "Compare actual vs expected column lists"
error_handling: "Fail pipeline if critical columns missing"
data_type_validation:
validation: "Ensure columns match expected data types"
implementation: "Type checking and conversion validation"
error_handling: "Attempt conversion or flag for review"
constraint_validation:
validation: "Enforce column constraints (NOT NULL, UNIQUE, etc.)"
implementation: "Database constraint checking"
error_handling: "Quarantine violating records"
schema_evolution:
backward_compatibility:
- "Allow optional new columns"
- "Maintain existing column names"
- "Preserve data type compatibility"
- "Handle deprecated columns gracefully"
forward_compatibility:
- "Version schema definitions"
- "Support multiple schema versions"
- "Gradual migration strategies"
- "Schema registry integration"
implementation_example: |
```python
from pydantic import BaseModel, ValidationError
from typing import Optional, List
import pandas as pd
class CustomerSchema(BaseModel):
customer_id: int
first_name: str
last_name: str
email: str
phone: Optional[str] = None
registration_date: str
is_active: bool = True
class SchemaValidator:
def __init__(self, schema_class):
self.schema_class = schema_class
def validate_dataframe(self, df: pd.DataFrame):
validation_results = {
'valid_records': [],
'invalid_records': [],
'validation_errors': []
}
for index, row in df.iterrows():
try:
validated_record = self.schema_class(**row.to_dict())
validation_results['valid_records'].append(validated_record.dict())
except ValidationError as e:
validation_results['invalid_records'].append({
'row_index': index,
'data': row.to_dict(),
'errors': e.errors()
})
validation_results['validation_errors'].extend(e.errors())
return validation_results
    def summarize_errors(self, errors):
        # Count validation failures by offending field and error type
        summary = {}
        for error in errors:
            field = error['loc'][0] if error.get('loc') else 'unknown'
            key = f"{field}: {error.get('type', 'unknown')}"
            summary[key] = summary.get(key, 0) + 1
        return summary
    def get_schema_compliance_report(self, df: pd.DataFrame):
        results = self.validate_dataframe(df)
        total_records = len(df)
        valid_records = len(results['valid_records'])
        return {
            'total_records': total_records,
            'valid_records': valid_records,
            'invalid_records': total_records - valid_records,
            'compliance_rate': (valid_records / total_records * 100) if total_records else 0.0,
            'error_summary': self.summarize_errors(results['validation_errors'])
        }
```
business_rule_checks:
rule_definition_framework:
rule_categories:
domain_rules:
description: "Business domain specific validations"
examples:
- "Age must be between 0 and 150"
- "Order amount must be positive"
- "Product category must be from approved list"
implementation_approach:
- "Rule engine integration"
- "Custom validation functions"
- "Lookup table validation"
relationship_rules:
description: "Rules governing relationships between data elements"
examples:
- "Order date must be before shipping date"
- "Employee salary must be within grade range"
- "Child records must have valid parent references"
implementation_approach:
- "Cross-column validation"
- "Temporal consistency checks"
- "Referential integrity validation"
business_logic_rules:
description: "Complex business process validations"
examples:
- "Account balance calculations"
- "Pricing rule compliance"
- "Eligibility criteria validation"
implementation_approach:
- "Stored procedure validation"
- "External service validation"
- "Rule engine processing"
rule_implementation:
declarative_rules:
format: "YAML or JSON configuration"
example: |
```yaml
rules:
- rule_id: "AGE_VALIDATION"
description: "Age must be reasonable"
condition: "age >= 0 AND age <= 150"
severity: "ERROR"
action: "REJECT"
- rule_id: "EMAIL_FORMAT"
description: "Email must be valid format"
condition: "email MATCHES '^[\\w\\.-]+@[\\w\\.-]+\\.[a-zA-Z]{2,}$'"
severity: "WARNING"
action: "FLAG"
```
programmatic_rules:
format: "Code-based rule definitions"
example: |
```python
from dataclasses import dataclass

@dataclass
class ValidationResult:
    valid: bool
    message: str = ""
    severity: str = "INFO"

class BusinessRules:
    @staticmethod
    def validate_age(age):
        if not (0 <= age <= 150):
            return ValidationResult(
                valid=False,
                message="Age must be between 0 and 150",
                severity="ERROR"
            )
        return ValidationResult(valid=True)
    @staticmethod
    def validate_order_consistency(order_date, ship_date):
        if ship_date < order_date:
            return ValidationResult(
                valid=False,
                message="Ship date cannot be before order date",
                severity="ERROR"
            )
        return ValidationResult(valid=True)
```
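# Minimal sketch of executing declarative rules against a DataFrame. It assumes rule
# conditions are written as pandas `query` expressions (e.g. `and` instead of the
# SQL-style AND/MATCHES shown above), so it is an illustration, not a full rule engine.
declarative_rule_sketch: |
  ```python
  import pandas as pd

  def apply_declarative_rules(df: pd.DataFrame, rules: list) -> dict:
      """Evaluate each rule's condition and report violation counts."""
      results = {}
      for rule in rules:
          passing = df.query(rule['condition'])          # rows that satisfy the rule
          failing = df.drop(passing.index)               # everything else violates it
          results[rule['rule_id']] = {
              'severity': rule.get('severity', 'WARNING'),
              'action': rule.get('action', 'FLAG'),
              'violations': len(failing),
              'violation_rate': len(failing) / len(df) * 100 if len(df) else 0.0,
          }
      return results

  # Example usage with a pandas-style condition
  rules = [{'rule_id': 'AGE_VALIDATION',
            'condition': 'age >= 0 and age <= 150',
            'severity': 'ERROR', 'action': 'REJECT'}]
  ```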
rule_execution_engine:
execution_strategies:
synchronous_validation:
description: "Validate data immediately during processing"
advantages: "Immediate feedback, prevents bad data propagation"
disadvantages: "Slower processing, blocking operations"
use_cases: "Critical data validation, real-time systems"
asynchronous_validation:
description: "Validate data after initial processing"
advantages: "Faster processing, non-blocking operations"
disadvantages: "Delayed feedback, potential bad data propagation"
use_cases: "Batch processing, non-critical validations"
batch_validation:
description: "Validate data in scheduled batches"
advantages: "Efficient resource utilization"
disadvantages: "Delayed validation results"
use_cases: "Large dataset validation, periodic checks"
rule_performance_optimization:
- "Rule execution order optimization"
- "Parallel rule execution"
- "Rule result caching"
- "Incremental validation"
- "Rule complexity analysis"
statistical_anomaly_checks:
anomaly_detection_methods:
statistical_methods:
z_score_analysis:
description: "Identify values beyond standard deviations"
formula: "z = (x - μ) / σ"
threshold: "Typically |z| > 3 indicates anomaly"
use_case: "Normally distributed numerical data"
implementation: |
```python
import numpy as np
from scipy import stats

def detect_zscore_anomalies(data, threshold=3):
z_scores = np.abs(stats.zscore(data))
anomalies = data[z_scores > threshold]
return {
'anomaly_count': len(anomalies),
'anomaly_percentage': len(anomalies) / len(data) * 100,
'anomalous_values': anomalies.tolist(),
'z_scores': z_scores[z_scores > threshold].tolist()
}
```
iqr_method:
description: "Use interquartile range to identify outliers"
formula: "Outliers: x < Q1 - 1.5*IQR or x > Q3 + 1.5*IQR"
advantage: "Robust to distribution shape"
use_case: "Non-normally distributed data"
isolation_forest:
description: "Machine learning approach for anomaly detection"
characteristics: "Unsupervised, handles high-dimensional data"
use_case: "Complex multivariate anomaly detection"
time_series_anomalies:
seasonal_decomposition:
description: "Separate trend, seasonal, and residual components"
anomaly_detection: "Identify anomalies in residual component"
implementation: "STL decomposition or X-13ARIMA-SEATS"
change_point_detection:
description: "Identify significant changes in time series"
methods: "CUSUM, Bayesian change point detection"
use_case: "Detect shifts in data patterns"
forecasting_based:
description: "Compare actual vs predicted values"
models: "ARIMA, Prophet, LSTM"
anomaly_threshold: "Prediction error exceeds confidence interval"
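# Illustrative sketch of residual-based anomaly detection after STL decomposition
# (one of the methods listed above); the period and z-score cutoff are assumptions.
seasonal_decomposition_sketch: |
  ```python
  import pandas as pd
  from statsmodels.tsa.seasonal import STL

  def detect_seasonal_anomalies(series: pd.Series, period: int, z_threshold: float = 3.0):
      """Decompose a regularly spaced series and flag unusually large residuals."""
      result = STL(series, period=period).fit()
      residuals = result.resid
      z_scores = (residuals - residuals.mean()) / residuals.std()
      anomalies = series[z_scores.abs() > z_threshold]
      return {
          'anomaly_count': len(anomalies),
          'anomaly_index': anomalies.index.tolist(),
      }
  ```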
multivariate_anomaly_detection:
correlation_analysis:
description: "Detect anomalies in variable relationships"
method: "Mahalanobis distance"
threshold: "Chi-square distribution based"
principal_component_analysis:
description: "Reduce dimensionality and detect outliers"
approach: "Reconstruct data and measure reconstruction error"
advantage: "Handles high-dimensional data effectively"
referential_integrity_checks:
foreign_key_validation:
parent_child_relationships:
validation_rules:
- "Child records must have valid parent references"
- "Parent records cannot be deleted if children exist"
- "Referential consistency across distributed systems"
implementation_strategies:
database_constraints:
description: "Use database foreign key constraints"
advantages: "Automatic enforcement, transaction safety"
limitations: "Single database scope, performance impact"
application_validation:
description: "Validate references in application code"
advantages: "Flexible logic, cross-system validation"
limitations: "Potential consistency gaps, complex implementation"
batch_validation:
description: "Periodic validation of referential integrity"
advantages: "Performance optimization, bulk checking"
limitations: "Delayed error detection"
lookup_table_validation:
validation_scenarios:
- "Code values exist in reference tables"
- "Enumerated values are from valid sets"
- "Master data references are current"
implementation_example: |
```python
class ReferentialValidator:
def __init__(self, reference_data):
self.reference_data = reference_data
def validate_foreign_keys(self, dataset, foreign_key_mappings):
validation_results = {}
for table, fk_config in foreign_key_mappings.items():
fk_column = fk_config['foreign_key']
ref_table = fk_config['reference_table']
ref_column = fk_config['reference_key']
# Get unique foreign key values
fk_values = set(dataset[fk_column].dropna().unique())
# Get valid reference values
ref_values = set(self.reference_data[ref_table][ref_column].unique())
# Find invalid references
invalid_fks = fk_values - ref_values
validation_results[table] = {
'total_fk_values': len(fk_values),
'valid_fk_values': len(fk_values - invalid_fks),
'invalid_fk_values': invalid_fks,
'validity_percentage': (len(fk_values - invalid_fks) / len(fk_values) * 100) if fk_values else 100.0
}
return validation_results
```
cross_system_validation:
distributed_referential_integrity:
challenges:
- "Network latency and availability"
- "Eventual consistency models"
- "System synchronization timing"
- "Partial failure handling"
solutions:
- "Asynchronous validation with compensation"
- "Event-driven consistency maintenance"
- "Cached reference data with TTL"
- "Graceful degradation strategies"
completeness_checks:
mandatory_field_validation:
null_value_detection:
validation_levels:
strict_validation:
description: "No null values allowed in mandatory fields"
action: "Reject records with null mandatory fields"
use_case: "Critical business data"
conditional_validation:
description: "Null validation based on business rules"
action: "Context-dependent null handling"
use_case: "Complex business scenarios"
implementation_strategies:
- "Database NOT NULL constraints"
- "Application-level validation"
- "ETL pipeline validation"
- "Data entry form validation"
record_completeness_scoring:
scoring_methodology:
simple_scoring:
formula: "Completeness = (Non-null fields / Total fields) * 100"
use_case: "Basic completeness measurement"
weighted_scoring:
formula: "Completeness = Σ(Field_weight * Field_completeness) / Σ(Field_weights)"
use_case: "Business-critical field prioritization"
implementation_example: |
```python
import numpy as np
import pandas as pd

class CompletenessValidator:
def __init__(self, field_weights=None):
self.field_weights = field_weights or {}
def calculate_record_completeness(self, record):
total_fields = len(record)
non_null_fields = sum(1 for value in record.values() if pd.notna(value))
if not self.field_weights:
return (non_null_fields / total_fields) * 100
# Weighted completeness calculation
weighted_completeness = 0
total_weight = 0
for field, value in record.items():
weight = self.field_weights.get(field, 1)
total_weight += weight
if pd.notna(value):
weighted_completeness += weight
return (weighted_completeness / total_weight) * 100
def generate_completeness_report(self, dataset):
completeness_scores = []
field_completeness = {}
# Calculate per-record completeness
for _, record in dataset.iterrows():
score = self.calculate_record_completeness(record.to_dict())
completeness_scores.append(score)
# Calculate per-field completeness
for column in dataset.columns:
non_null_count = dataset[column].notna().sum()
field_completeness[column] = (non_null_count / len(dataset)) * 100
return {
'overall_completeness': np.mean(completeness_scores),
'record_completeness_distribution': {
'min': min(completeness_scores),
'max': max(completeness_scores),
'mean': np.mean(completeness_scores),
'std': np.std(completeness_scores)
},
'field_completeness': field_completeness,
'incomplete_records': len([s for s in completeness_scores if s < 100])
}
```
data_availability_checks:
expected_data_presence:
validation_scenarios:
- "Daily data feeds are received"
- "All expected data sources are present"
- "Minimum record counts are met"
- "Data coverage across time periods"
monitoring_approaches:
- "Automated data arrival detection"
- "Record count trend analysis"
- "Data freshness monitoring"
- "Source system availability checks"
consistency_checks:
cross_dataset_consistency:
data_synchronization_validation:
validation_scenarios:
same_entity_different_systems:
description: "Validate same entity across multiple systems"
example: "Customer data in CRM vs Billing system"
validation_method: "Key-based record matching and comparison"
aggregation_consistency:
description: "Validate aggregated values match detail records"
example: "Sum of line items equals order total"
validation_method: "Recalculation and comparison"
temporal_consistency:
description: "Validate data consistency over time"
example: "Balance changes match transaction history"
validation_method: "Audit trail validation"
format_standardization:
standardization_rules:
- "Date formats are consistent (ISO 8601)"
- "Currency amounts use standard precision"
- "Text fields use consistent case"
- "Enumerated values follow standard lists"
implementation_example: |
```python
import pandas as pd

class ConsistencyValidator:
def __init__(self):
self.standardization_rules = {
'date_format': '%Y-%m-%d',
'currency_precision': 2,
'text_case': 'upper',
'phone_format': r'^\+?1?[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}$'
}
def validate_cross_system_consistency(self, system1_data, system2_data, key_field):
# Match records by key
merged_data = pd.merge(
system1_data, system2_data,
on=key_field,
suffixes=('_sys1', '_sys2')
)
consistency_results = {}
# Compare common fields
common_fields = [col.replace('_sys1', '') for col in merged_data.columns
if col.endswith('_sys1') and col.replace('_sys1', '_sys2') in merged_data.columns]
for field in common_fields:
sys1_col = f"{field}_sys1"
sys2_col = f"{field}_sys2"
matches = merged_data[sys1_col] == merged_data[sys2_col]
consistency_results[field] = {
'total_comparisons': len(merged_data),
'matches': matches.sum(),
'consistency_rate': (matches.sum() / len(merged_data)) * 100,
'discrepancies': merged_data[~matches][[key_field, sys1_col, sys2_col]]
}
return consistency_results
```
timeliness_checks:
data_freshness_validation:
freshness_requirements:
real_time_data:
max_latency: "< 1 minute"
validation_method: "Timestamp comparison"
alert_threshold: "Latency > 2 minutes"
near_real_time_data:
max_latency: "< 15 minutes"
validation_method: "Processing lag monitoring"
alert_threshold: "Latency > 30 minutes"
batch_data:
max_latency: "< 4 hours from source update"
validation_method: "ETL completion monitoring"
alert_threshold: "Batch processing delay > 6 hours"
sla_compliance_monitoring:
sla_definitions:
- sla_name: "Customer data synchronization"
target: "Data updated within 1 hour"
measurement: "Time from source change to availability"
penalty: "Business process delays"
- sla_name: "Financial reporting data"
target: "Daily data available by 6 AM"
measurement: "ETL completion time"
penalty: "Regulatory reporting delays"
monitoring_implementation: |
```python
import pandas as pd

class TimelinessValidator:
def __init__(self, sla_definitions):
self.sla_definitions = sla_definitions
def validate_data_freshness(self, dataset, timestamp_column):
current_time = pd.Timestamp.now()
data_timestamps = pd.to_datetime(dataset[timestamp_column])
# Calculate data age
data_age = current_time - data_timestamps
freshness_report = {
'total_records': len(dataset),
'avg_age_minutes': data_age.dt.total_seconds().mean() / 60,
'max_age_minutes': data_age.dt.total_seconds().max() / 60,
'stale_data_count': 0,
'freshness_distribution': {}
}
# Check against SLA thresholds
for sla in self.sla_definitions:
threshold_minutes = sla['threshold_minutes']
stale_records = data_age.dt.total_seconds() > (threshold_minutes * 60)
freshness_report['freshness_distribution'][sla['name']] = {
'threshold_minutes': threshold_minutes,
'compliant_records': (~stale_records).sum(),
'non_compliant_records': stale_records.sum(),
'compliance_rate': (~stale_records).sum() / len(dataset) * 100
}
return freshness_report
```
automated_testing:
test_automation_framework:
test_categories:
unit_tests:
description: "Test individual quality check functions"
scope: "Single quality rule or validation function"
execution: "Continuous integration pipeline"
examples:
- "Test email validation regex"
- "Test statistical outlier detection"
- "Test foreign key validation logic"
integration_tests:
description: "Test quality checks in data pipeline context"
scope: "End-to-end quality validation process"
execution: "Pipeline deployment validation"
examples:
- "Test quality checks with sample datasets"
- "Test error handling and recovery"
- "Test quality report generation"
regression_tests:
description: "Ensure quality checks maintain accuracy over time"
scope: "Historical quality validation results"
execution: "Periodic regression testing"
examples:
- "Compare quality scores over time"
- "Validate consistent anomaly detection"
- "Test schema evolution handling"
test_data_management:
synthetic_test_data:
description: "Generate test data with known quality issues"
benefits: "Controlled testing scenarios, privacy protection"
generation_strategies:
- "Statistical distribution sampling"
- "Rule-based data generation"
- "ML-based synthetic data creation"
anonymized_production_data:
description: "Use real data with sensitive information removed"
benefits: "Realistic data patterns, actual quality issues"
considerations:
- "Data privacy compliance"
- "Anonymization quality"
- "Data freshness management"
test_automation_example: |
```python
import unittest
import pandas as pd
from data_quality_framework import QualityValidator
class TestDataQualityChecks(unittest.TestCase):
def setUp(self):
self.validator = QualityValidator()
# Create test datasets
self.valid_data = pd.DataFrame({
'customer_id': [1, 2, 3, 4, 5],
'email': ['user1@test.com', 'user2@test.com', 'user3@test.com',
'user4@test.com', 'user5@test.com'],
'age': [25, 30, 45, 35, 28],
'registration_date': ['2023-01-01', '2023-01-02', '2023-01-03',
'2023-01-04', '2023-01-05']
})
self.invalid_data = pd.DataFrame({
'customer_id': [1, 2, None, 4, 5],
'email': ['user1@test.com', 'invalid-email', 'user3@test.com',
'', 'user5@test.com'],
'age': [25, 200, 45, -5, 28],
'registration_date': ['2023-01-01', '2023-13-02', '2023-01-03',
'2023-01-04', 'invalid-date']
})
def test_email_validation(self):
# Test valid emails
valid_results = self.validator.validate_email_format(self.valid_data['email'])
self.assertEqual(valid_results['valid_count'], 5)
self.assertEqual(valid_results['invalid_count'], 0)
# Test invalid emails
invalid_results = self.validator.validate_email_format(self.invalid_data['email'])
self.assertEqual(invalid_results['valid_count'], 2)
self.assertEqual(invalid_results['invalid_count'], 3)
def test_age_range_validation(self):
# Test valid ages
valid_results = self.validator.validate_age_range(self.valid_data['age'])
self.assertEqual(valid_results['valid_count'], 5)
# Test invalid ages
invalid_results = self.validator.validate_age_range(self.invalid_data['age'])
self.assertEqual(invalid_results['valid_count'], 2)
self.assertIn(200, invalid_results['invalid_values'])
self.assertIn(-5, invalid_results['invalid_values'])
def test_completeness_validation(self):
# Test complete data
valid_completeness = self.validator.calculate_completeness(self.valid_data)
self.assertEqual(valid_completeness['overall_completeness'], 100.0)
# Test incomplete data
invalid_completeness = self.validator.calculate_completeness(self.invalid_data)
self.assertLess(invalid_completeness['overall_completeness'], 100.0)
if __name__ == '__main__':
unittest.main()
```
monitoring_alerting:
real_time_monitoring:
monitoring_architecture:
data_quality_dashboard:
components:
- "Real-time quality metrics visualization"
- "Quality trend analysis charts"
- "Anomaly detection alerts"
- "Data lineage impact visualization"
key_metrics:
- "Data quality score (overall and by dimension)"
- "Quality rule pass/fail rates"
- "Data volume and freshness metrics"
- "Error rates and types"
automated_monitoring:
monitoring_frequency:
- "Continuous: Critical data streams"
- "Hourly: Near real-time data"
- "Daily: Batch processed data"
- "Weekly: Historical trend analysis"
monitoring_scope:
- "Data ingestion quality"
- "Transformation accuracy"
- "Output data validation"
- "End-to-end pipeline health"
alerting_configuration:
alert_severity_levels:
critical:
conditions:
- "Data quality score drops below 90%"
- "Critical business rules fail"
- "Data pipeline failure"
response:
- "Immediate notification to on-call team"
- "Automatic pipeline pause (if configured)"
- "Escalation to management after 30 minutes"
warning:
conditions:
- "Data quality score drops below 95%"
- "Anomaly detection threshold exceeded"
- "SLA near breach"
response:
- "Notification to data team"
- "Log detailed information"
- "Schedule review within 4 hours"
info:
conditions:
- "Quality improvement detected"
- "New data patterns identified"
- "Successful quality remediation"
response:
- "Log information"
- "Update quality metrics"
- "Include in daily reports"
notification_channels:
- channel: "Email"
recipients: ["data-quality-team@company.com"]
alert_types: ["Critical", "Warning"]
- channel: "Slack"
recipients: ["#data-quality-alerts"]
alert_types: ["Critical", "Warning", "Info"]
- channel: "PagerDuty"
recipients: ["Data Engineering On-Call"]
alert_types: ["Critical"]
- channel: "JIRA"
project: "Data Quality Issues"
alert_types: ["Critical", "Warning"]
quality_reporting:
executive_reporting:
monthly_quality_scorecard:
metrics:
- "Overall data quality score trend"
- "Quality improvement initiatives impact"
- "Cost of poor data quality"
- "Quality SLA compliance rates"
quality_incident_summary:
content:
- "Number and severity of quality incidents"
- "Mean time to detection and resolution"
- "Root cause analysis summary"
- "Prevention measures implemented"
operational_reporting:
daily_quality_reports:
content:
- "Quality check results by dataset"
- "Anomaly detection findings"
- "Data freshness compliance"
- "Outstanding quality issues"
quality_trend_analysis:
content:
- "Quality score trends over time"
- "Seasonal patterns in data quality"
- "Quality improvement effectiveness"
- "Predictive quality forecasting"
template_guidance:
implementation_strategy:
phase_1_foundation:
duration: "4-6 weeks"
focus: "Basic quality checks implementation"
deliverables:
- "Schema validation checks"
- "Completeness validation"
- "Basic business rule validation"
- "Simple monitoring dashboard"
phase_2_expansion:
duration: "6-8 weeks"
focus: "Advanced quality checks and automation"
deliverables:
- "Statistical anomaly detection"
- "Referential integrity validation"
- "Automated testing framework"
- "Enhanced monitoring and alerting"
phase_3_optimization:
duration: "4-6 weeks"
focus: "Performance optimization and advanced features"
deliverables:
- "Machine learning based anomaly detection"
- "Predictive quality analytics"
- "Advanced reporting and visualization"
- "Quality governance processes"
best_practices:
- "Start with critical business data first"
- "Implement checks incrementally"
- "Balance thoroughness with performance"
- "Involve business stakeholders in rule definition"
- "Maintain comprehensive documentation"
- "Regular review and update of quality rules"
integration_points:
- data_governance: "Align with governance policies"
- monitoring_setup: "Integrate with monitoring infrastructure"
- incident_management: "Connect to incident response processes"
- data_lineage: "Track quality through data lineage"
template_metadata:
recommended_review_cycle: "Monthly quality framework assessment"
minimum_fields:
- quality_framework
- data_profiling_checks
- schema_validation_checks
- business_rule_checks
- automated_testing
- monitoring_alerting
automation_potential:
- "Automated test generation from schemas"
- "ML-based anomaly detection"
- "Self-healing data quality processes"
- "Dynamic quality rule adjustment"
success_metrics:
- "Data quality score improvement"
- "Reduction in quality incidents"
- "Mean time to quality issue detection"
- "Business user satisfaction with data quality"