agentic-data-stack-community

AI Agentic Data Stack Framework - Community Edition. An open-source data engineering framework with 4 core agents, essential templates, and 3-dimensional quality validation.

# Data Pipeline Template
# Standardized template for data pipeline design and implementation across the AI Agentic Data Stack Framework

metadata:
  template_id: "data-pipeline-tmpl"
  name: "Data Pipeline Template"
  version: "1.0.0"
  description: "Comprehensive template for building scalable and reliable data pipelines"
  category: "data-engineering"
  tags: ["pipeline", "etl", "data-flow", "orchestration", "automation"]
  created_by: "AI Agentic Data Stack Framework"
  created_date: "2025-01-23"

# Pipeline Configuration
pipeline_config:
  pipeline_id: "${data_pipeline_id}"
  pipeline_name: "${data_pipeline_name}"
  description: "${pipeline_description}"
  version: "${pipeline_version}"
  pipeline_type: "${pipeline_type}"  # batch, streaming, hybrid, micro_batch
  business_domain: "${business_domain}"

# Data Sources
data_sources:
  - source_id: "${source_id}"
    source_name: "${source_name}"
    source_type: "${source_type}"  # database, file, api, stream, queue
    connection_config:
      connection_string: "${connection_string}"
      authentication: "${source_authentication}"
      timeout: ${connection_timeout}
    extraction_config:
      extraction_method: "${extraction_method}"  # full, incremental, cdc
      schedule: "${extraction_schedule}"
      batch_size: ${extraction_batch_size}

# Data Targets
data_targets:
  - target_id: "${target_id}"
    target_name: "${target_name}"
    target_type: "${target_type}"  # database, file, api, stream
    loading_strategy: "${loading_strategy}"  # full_refresh, append, upsert, merge
    connection_config:
      connection_string: "${target_connection_string}"
      authentication: "${target_authentication}"

# Pipeline Stages
pipeline_stages:
  # Data Extraction
  extraction_stage:
    stage_name: "Data Extraction"
    stage_order: 1
    execution_config:
      parallel_execution: ${extraction_parallel_execution}
      timeout: ${extraction_timeout}
      retry_policy: "${extraction_retry_policy}"
    quality_checks:
      - check_name: "Source Data Availability"
        check_type: "availability"

  # Data Transformation
  transformation_stage:
    stage_name: "Data Transformation"
    stage_order: 2
    transformations:
      - transformation_id: "${transformation_id}"
        transformation_name: "${transformation_name}"
        transformation_type: "${transformation_type}"  # cleansing, enrichment, aggregation
        business_rules: ["${business_rules}"]

  # Data Loading
  loading_stage:
    stage_name: "Data Loading"
    stage_order: 3
    loading_config:
      loading_method: "${loading_method}"
      conflict_resolution: "${conflict_resolution_strategy}"
      post_load_validation: ${post_load_validation_enabled}

# Orchestration Configuration
orchestration_config:
  orchestration_tool: "${orchestration_tool}"  # airflow, prefect, dagster, azure_data_factory
  scheduling:
    schedule_type: "${schedule_type}"  # cron, event_driven, manual
    schedule_expression: "${schedule_expression}"
    timezone: "${schedule_timezone}"
  dependencies:
    upstream_dependencies: ["${upstream_dependencies}"]
    downstream_dependencies: ["${downstream_dependencies}"]
  execution_config:
    max_parallel_tasks: ${max_parallel_tasks}
    task_timeout: ${task_timeout_minutes}
    retry_attempts: ${pipeline_retry_attempts}

# Data Quality Framework
data_quality:
  quality_gates:
    - gate_id: "${quality_gate_id}"
      gate_name: "${quality_gate_name}"
      gate_type: "${gate_type}"  # pre_processing, post_processing, final
      quality_rules:
        - rule_name: "${quality_rule_name}"
          rule_type: "${quality_rule_type}"
          threshold: ${quality_rule_threshold}
          action_on_failure: "${failure_action}"  # stop, warn, continue
  data_profiling:
    profiling_enabled: ${data_profiling_enabled}
    profiling_frequency: "${profiling_frequency}"
    profile_storage: "${profile_storage_location}"
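The quality gates above pair each rule's threshold with an action_on_failure of stop, warn, or continue. Below is a minimal Python sketch of how a gate runner might apply that contract; the "completeness" rule type, the field names, and the sample records are assumptions for illustration, not part of the framework.

```python
# Sketch of a quality-gate runner for the data_quality section above.
# Assumption: a "completeness" rule measures the fraction of records with a
# non-null value in rule["field"]; the framework's real rule types are not
# defined in this template.
import logging

logger = logging.getLogger("quality_gate")

def evaluate_quality_gate(gate: dict, records: list[dict]) -> bool:
    """Apply each quality rule in a gate; return False if the pipeline must stop."""
    for rule in gate.get("quality_rules", []):
        if rule["rule_type"] == "completeness":
            field = rule["field"]
            non_null = sum(1 for r in records if r.get(field) is not None)
            score = non_null / len(records) if records else 0.0
        else:
            continue  # unknown rule types are skipped in this sketch

        if score >= rule["threshold"]:
            continue
        action = rule.get("action_on_failure", "warn")
        logger.warning("Rule %s failed: %.2f < %.2f (action=%s)",
                       rule["rule_name"], score, rule["threshold"], action)
        if action == "stop":
            return False  # "continue" and "warn" both let the pipeline proceed
    return True

# Hypothetical post-processing gate: require 95% non-null customer_id values.
gate = {
    "gate_id": "gate-001",
    "gate_type": "post_processing",
    "quality_rules": [{
        "rule_name": "customer_id completeness",
        "rule_type": "completeness",
        "field": "customer_id",
        "threshold": 0.95,
        "action_on_failure": "stop",
    }],
}
records = [{"customer_id": 1}, {"customer_id": None}, {"customer_id": 3}]
print(evaluate_quality_gate(gate, records))  # False: 2/3 < 0.95, action=stop
```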
# Monitoring and Alerting
monitoring_alerting:
  monitoring_config:
    metrics_collection: ${metrics_collection_enabled}
    log_aggregation: ${log_aggregation_enabled}
    performance_tracking: ${performance_tracking_enabled}
  alerting_rules:
    - alert_name: "${alert_name}"
      alert_condition: "${alert_condition}"
      alert_severity: "${alert_severity}"  # low, medium, high, critical
      notification_channels: ["${notification_channels}"]
  sla_configuration:
    execution_time_sla: ${execution_time_sla_minutes}
    data_freshness_sla: ${data_freshness_sla_hours}
    success_rate_sla: ${success_rate_sla_percentage}

# Error Handling and Recovery
error_handling:
  error_strategy: "${error_handling_strategy}"
  recovery_procedures:
    - error_type: "${error_type}"
      recovery_action: "${recovery_action}"
      escalation_required: ${escalation_required}
  backup_and_restore:
    backup_enabled: ${backup_enabled}
    backup_frequency: "${backup_frequency}"
    retention_period: "${backup_retention_period}"

# Security Configuration
security_config:
  data_encryption:
    encryption_at_rest: ${encryption_at_rest}
    encryption_in_transit: ${encryption_in_transit}
    key_management: "${key_management_service}"
  access_control:
    authentication_method: "${pipeline_authentication}"
    authorization_model: "${pipeline_authorization}"
    audit_logging: ${pipeline_audit_logging}

# Performance Optimization
performance_config:
  resource_allocation:
    cpu_cores: ${pipeline_cpu_cores}
    memory_gb: ${pipeline_memory_gb}
    storage_gb: ${pipeline_storage_gb}
  optimization_techniques:
    parallel_processing: ${parallel_processing_enabled}
    caching: "${caching_strategy}"
    compression: "${compression_algorithm}"
    partitioning: "${partitioning_strategy}"

# Testing Configuration
testing_config:
  unit_testing:
    test_coverage_target: ${test_coverage_target}
    test_data_location: "${test_data_location}"
  integration_testing:
    end_to_end_testing: ${e2e_testing_enabled}
    performance_testing: ${performance_testing_enabled}
  data_testing:
    data_validation_tests: ${data_validation_tests_enabled}
    schema_evolution_tests: ${schema_evolution_tests_enabled}

# Validation Rules
validation_rules:
  required_fields:
    - data_pipeline_id
    - pipeline_name
    - pipeline_type
    - data_sources
    - data_targets
    - pipeline_stages

template:
  name: "Data Pipeline Template"
  description: "Comprehensive template for building and managing data pipelines"
  version: "1.0.0"
  sections:
    - name: "pipeline_metadata"
      description: "Pipeline identification and metadata"
      required: true
    - name: "data_sources"
      description: "Input data source configurations"
      required: true
    - name: "data_targets"
      description: "Output data target configurations"
      required: true
    - name: "pipeline_stages"
      description: "Transformation and processing stages"
      required: true
    - name: "quality_assurance"
      description: "Data quality checks and monitoring"
      required: true
    - name: "validation_rules"
      description: "Template validation and quality rules"
      required: true

# Template Metadata
template_metadata:
  author: "AI Agentic Data Stack Framework"
  maintainer: "Data Engineer"
  last_updated: "2025-01-23"
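The ${variable} placeholders used throughout the template match Python's string.Template syntax, so rendering a concrete pipeline config and checking it against validation_rules.required_fields can be sketched in a few lines. The file path, the sample variable values, and the simplified field lookup below are assumptions for illustration, not the framework's actual loader.

```python
# Sketch of rendering this template and flagging unfilled required fields.
# Assumptions: the template is saved locally as data-pipeline-tmpl.yaml, and a
# required field may live either at the top level or under pipeline_config.
from string import Template
import yaml  # PyYAML

def render_template(raw: str, variables: dict) -> dict:
    """Substitute ${...} placeholders and parse the result as YAML."""
    # safe_substitute leaves unresolved placeholders in place rather than
    # raising, so a partially filled template still parses.
    return yaml.safe_load(Template(raw).safe_substitute(variables))

def missing_required(config: dict) -> list[str]:
    """Required entries (per validation_rules) that are absent or unfilled."""
    required = config.get("validation_rules", {}).get("required_fields", [])
    out = []
    for name in required:
        value = config.get(name, config.get("pipeline_config", {}).get(name))
        if value is None or "${" in str(value):  # missing or still a placeholder
            out.append(name)
    return out

raw = open("data-pipeline-tmpl.yaml").read()  # hypothetical local copy
config = render_template(raw, {
    "data_pipeline_name": "Orders Daily Batch",
    "pipeline_type": "batch",
    "pipeline_version": "1.0.0",
})
print(missing_required(config))  # e.g. ['data_pipeline_id', 'data_sources', ...]
```

Using safe_substitute rather than substitute is deliberate: a template this large is typically filled in stages, and the check above then reports exactly which required sections still carry placeholders.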