# UNPKG
#
# agentic-data-stack-community
#
# Version:
#
# AI Agentic Data Stack Framework - Community Edition. Open source data engineering framework with 4 core agents, essential templates, and 3-dimensional quality validation.
#
# 460 lines (393 loc) 16.9 kB
---
# Monitoring Template
# Standardized template for comprehensive system monitoring and observability
# across the AI Agentic Data Stack Framework
#
# NOTE(review): this document was restored from a copy whose line breaks and
# indentation had been collapsed. Keys, values, placeholders, and comment text
# are unchanged and in their original order; the nesting was reconstructed from
# the section comments and key names. Please confirm against the upstream
# template, in particular:
#   - whether the sections from monitoring_config onward are top-level (as
#     written here) or children of `template`;
#   - whether the cpu/memory/storage/network metric groups, the
#     response_time/throughput/error metric groups, and the
#     cpu/memory/storage/performance/error alert groups sit beside the list
#     that precedes them (as written here) or inside that list's item.
#
# Placeholders use the form ${name}. Quoted placeholders render as strings;
# unquoted ones are presumably meant to render as booleans or numbers after
# substitution (TODO confirm with the rendering tool).

metadata:
  template_id: "monitoring-tmpl"
  name: "Monitoring Template"
  version: "1.0.0"
  description: "Comprehensive template for system monitoring, alerting, and observability implementation"
  category: "operations-maintenance"
  tags: ["monitoring", "observability", "alerting", "metrics", "logging"]
  created_by: "AI Agentic Data Stack Framework"
  created_date: "2025-01-23"

template:
  name: "Monitoring Template"
  description: "Template for comprehensive system monitoring and observability"
  version: "1.0.0"

# Monitoring Configuration
monitoring_config:
  monitoring_framework_id: "${monitoring_framework_id}"
  framework_name: "${monitoring_framework_name}"
  monitored_system: "${monitored_system_name}"
  monitoring_scope: "${monitoring_scope}"  # application, infrastructure, business, security
  monitoring_strategy: "${monitoring_strategy}"  # reactive, proactive, predictive
  monitoring_owner: "${monitoring_owner}"

# Metrics Collection
metrics_collection:
  # Metrics Platform
  metrics_platform: "${metrics_platform}"  # prometheus, datadog, new_relic, cloudwatch

  # System Metrics
  system_metrics:
    # Infrastructure Metrics
    infrastructure_metrics:
      - metric_name: "${infrastructure_metric_name}"
        metric_type: "${infrastructure_metric_type}"  # counter, gauge, histogram, summary
        metric_description: "${infrastructure_metric_description}"
        collection_interval: ${infrastructure_metric_interval_seconds}
        retention_period: "${infrastructure_metric_retention}"

    # CPU Metrics
    cpu_metrics:
      cpu_utilization: ${cpu_utilization_enabled}
      cpu_load_average: ${cpu_load_average_enabled}
      cpu_cores: ${cpu_cores_monitoring_enabled}

    # Memory Metrics
    memory_metrics:
      memory_usage: ${memory_usage_enabled}
      heap_usage: ${heap_usage_enabled}
      garbage_collection: ${gc_monitoring_enabled}

    # Storage Metrics
    storage_metrics:
      disk_usage: ${disk_usage_enabled}
      disk_io: ${disk_io_enabled}
      inode_usage: ${inode_usage_enabled}

    # Network Metrics
    network_metrics:
      network_throughput: ${network_throughput_enabled}
      network_latency: ${network_latency_enabled}
      connection_count: ${connection_count_enabled}

  # Application Metrics
  application_metrics:
    # Performance Metrics
    performance_metrics:
      - metric_name: "${app_performance_metric_name}"
        metric_endpoint: "${app_metric_endpoint}"
        collection_method: "${app_metric_collection_method}"  # pull, push, scrape

    # Response Time Metrics
    response_time:
      average_response_time: ${avg_response_time_enabled}
      percentile_response_times: ["${response_time_percentiles}"]  # p50, p95, p99
      max_response_time: ${max_response_time_enabled}

    # Throughput Metrics
    throughput:
      requests_per_second: ${requests_per_second_enabled}
      transactions_per_second: ${transactions_per_second_enabled}
      concurrent_users: ${concurrent_users_enabled}

    # Error Metrics
    error_metrics:
      error_rate: ${error_rate_enabled}
      error_count: ${error_count_enabled}
      error_types: ["${monitored_error_types}"]

  # Business Metrics
  business_metrics:
    - metric_name: "${business_metric_name}"
      metric_description: "${business_metric_description}"
      business_value: "${business_metric_value}"
      calculation_method: "${business_metric_calculation}"
      data_source: "${business_metric_data_source}"
      update_frequency: "${business_metric_update_frequency}"

# Logging Strategy
logging_strategy:
  # Log Aggregation
  log_aggregation:
    log_platform: "${log_aggregation_platform}"  # elk, splunk, fluentd, loki
    log_forwarding: "${log_forwarding_method}"  # agent, sidecar, direct

  # Log Configuration
  log_config:
    # Application Logs
    application_logs:
      log_level: "${application_log_level}"  # trace, debug, info, warn, error, fatal
      log_format: "${application_log_format}"  # json, logfmt, plain
      log_rotation: "${application_log_rotation}"
      log_retention: "${application_log_retention_period}"

    # System Logs
    system_logs:
      system_log_collection: ${system_log_collection_enabled}
      kernel_logs: ${kernel_logs_enabled}
      audit_logs: ${audit_logs_enabled}
      security_logs: ${security_logs_enabled}

    # Access Logs
    access_logs:
      web_server_logs: ${web_server_logs_enabled}
      api_access_logs: ${api_access_logs_enabled}
      database_access_logs: ${database_access_logs_enabled}

  # Structured Logging
  structured_logging:
    structured_format: "${structured_log_format}"  # json, logstash
    correlation_ids: ${correlation_ids_enabled}
    context_propagation: ${log_context_propagation_enabled}

  # Log Analysis
  log_analysis:
    log_parsing: "${log_parsing_rules}"
    log_enrichment: ${log_enrichment_enabled}
    log_indexing: "${log_indexing_strategy}"
    search_capabilities: ["${log_search_capabilities}"]

# Distributed Tracing
distributed_tracing:
  # Tracing Configuration
  tracing_config:
    tracing_enabled: ${distributed_tracing_enabled}
    tracing_platform: "${tracing_platform}"  # jaeger, zipkin, x_ray, datadog_apm
    sampling_strategy: "${tracing_sampling_strategy}"  # probabilistic, rate_limiting, adaptive
    sampling_rate: ${tracing_sampling_rate}

  # Trace Collection
  trace_collection:
    instrumentation_method: "${instrumentation_method}"  # auto, manual, hybrid
    trace_exporters: ["${trace_exporters}"]
    trace_processors: ["${trace_processors}"]

  # Service Mapping
  service_mapping:
    service_discovery: "${service_discovery_method}"
    dependency_mapping: ${dependency_mapping_enabled}
    service_graph_visualization: ${service_graph_enabled}

  # Performance Analysis
  performance_analysis:
    bottleneck_detection: ${bottleneck_detection_enabled}
    latency_analysis: ${latency_analysis_enabled}
    error_correlation: ${error_correlation_enabled}

# Alerting Configuration
alerting_config:
  # Alerting Platform
  alerting_platform: "${alerting_platform}"  # prometheus_alertmanager, pagerduty, opsgenie

  # Alert Rules
  alert_rules:
    # Infrastructure Alerts
    infrastructure_alerts:
      - alert_name: "${infrastructure_alert_name}"
        alert_condition: "${infrastructure_alert_condition}"
        severity: "${infrastructure_alert_severity}"  # critical, warning, info
        threshold_value: ${infrastructure_alert_threshold}
        evaluation_interval: ${infrastructure_alert_evaluation_interval}

    # CPU Alerts
    cpu_alerts:
      high_cpu_usage:
        threshold: ${high_cpu_threshold_percentage}
        duration: "${high_cpu_duration}"

    # Memory Alerts
    memory_alerts:
      high_memory_usage:
        threshold: ${high_memory_threshold_percentage}
        duration: "${high_memory_duration}"

    # Storage Alerts
    storage_alerts:
      disk_space_low:
        threshold: ${low_disk_space_threshold_percentage}
        duration: "${low_disk_space_duration}"

    # Application Alerts
    application_alerts:
      - alert_name: "${app_alert_name}"
        alert_description: "${app_alert_description}"

    # Performance Alerts
    performance_alerts:
      high_response_time:
        threshold: ${high_response_time_threshold_ms}
        percentile: "${response_time_percentile}"  # p95, p99
      low_throughput:
        threshold: ${low_throughput_threshold}
        measurement_window: "${throughput_measurement_window}"

    # Error Alerts
    error_alerts:
      high_error_rate:
        threshold: ${high_error_rate_threshold_percentage}
        measurement_window: "${error_rate_measurement_window}"

    # Business Alerts
    business_alerts:
      - alert_name: "${business_alert_name}"
        business_impact: "${business_alert_impact}"
        alert_condition: "${business_alert_condition}"

  # Notification Configuration
  notification_config:
    # Notification Channels
    notification_channels:
      - channel_name: "${notification_channel_name}"
        channel_type: "${notification_channel_type}"  # email, sms, slack, webhook
        channel_endpoint: "${notification_channel_endpoint}"
        channel_priority: "${notification_channel_priority}"

    # Escalation Policies
    escalation_policies:
      - policy_name: "${escalation_policy_name}"
        escalation_levels:
          - level: ${escalation_level}
            escalation_time: ${escalation_time_minutes}
            notification_targets: ["${escalation_notification_targets}"]

  # Alert Suppression
  alert_suppression:
    suppression_rules: ["${alert_suppression_rules}"]
    maintenance_windows: ["${maintenance_windows}"]
    alert_correlation: ${alert_correlation_enabled}

# Dashboard Configuration
dashboard_config:
  # Dashboard Platform
  dashboard_platform: "${dashboard_platform}"  # grafana, kibana, datadog, new_relic

  # Dashboard Categories
  dashboard_categories:
    # Infrastructure Dashboards
    infrastructure_dashboards:
      - dashboard_name: "${infrastructure_dashboard_name}"
        dashboard_description: "${infrastructure_dashboard_description}"
        refresh_interval: "${infrastructure_dashboard_refresh}"

        # Dashboard Panels
        panels:
          - panel_name: "${infrastructure_panel_name}"
            panel_type: "${infrastructure_panel_type}"  # graph, table, heatmap, stat
            data_source: "${infrastructure_panel_data_source}"
            query: "${infrastructure_panel_query}"

    # Application Dashboards
    application_dashboards:
      - dashboard_name: "${app_dashboard_name}"
        dashboard_scope: "${app_dashboard_scope}"  # service, component, endpoint

        # Performance Panels
        performance_panels:
          response_time_panel: "${response_time_panel_config}"
          throughput_panel: "${throughput_panel_config}"
          error_rate_panel: "${error_rate_panel_config}"

    # Business Dashboards
    business_dashboards:
      - dashboard_name: "${business_dashboard_name}"
        stakeholder_audience: ["${business_dashboard_audience}"]
        update_frequency: "${business_dashboard_update_frequency}"

  # Dashboard Access Control
  dashboard_access:
    authentication_required: ${dashboard_authentication_required}
    role_based_access: ${dashboard_rbac_enabled}
    public_dashboards: ["${public_dashboards}"]

# Health Checks
health_checks:
  # Health Check Configuration
  health_check_config:
    health_check_enabled: ${health_checks_enabled}
    health_check_endpoint: "${health_check_endpoint}"
    health_check_interval: ${health_check_interval_seconds}

  # Application Health Checks
  application_health:
    - service_name: "${health_check_service_name}"
      health_check_type: "${health_check_type}"  # http, tcp, command, database
      endpoint_url: "${health_check_endpoint_url}"
      timeout: ${health_check_timeout_seconds}
      success_criteria: ["${health_check_success_criteria}"]

  # Infrastructure Health Checks
  infrastructure_health:
    - component_name: "${infrastructure_component_name}"
      component_type: "${infrastructure_component_type}"  # server, database, network
      health_indicators: ["${component_health_indicators}"]

  # Dependency Health Checks
  dependency_health:
    - dependency_name: "${dependency_name}"
      dependency_type: "${dependency_type}"  # external_api, database, queue
      health_check_method: "${dependency_health_check_method}"
      circuit_breaker_enabled: ${dependency_circuit_breaker_enabled}

# Performance Monitoring
performance_monitoring:
  # Performance Baselines
  performance_baselines:
    - metric_name: "${baseline_metric_name}"
      baseline_value: ${baseline_metric_value}
      baseline_period: "${baseline_measurement_period}"
      deviation_threshold: ${baseline_deviation_threshold}

  # Performance Testing Integration
  performance_testing:
    load_testing_integration: ${load_testing_integration_enabled}
    performance_regression_detection: ${performance_regression_detection}
    automated_performance_alerts: ${automated_performance_alerts}

  # Capacity Planning
  capacity_planning:
    capacity_metrics: ["${capacity_planning_metrics}"]
    growth_projections: ["${capacity_growth_projections}"]
    scaling_recommendations: ["${scaling_recommendations}"]

# Incident Management Integration
incident_management:
  # Incident Detection
  incident_detection:
    automated_incident_creation: ${automated_incident_creation_enabled}
    incident_correlation: ${incident_correlation_enabled}
    incident_prioritization: "${incident_prioritization_method}"

  # Incident Response
  incident_response:
    response_team_notification: ["${incident_response_team}"]
    incident_escalation: "${incident_escalation_procedure}"
    incident_documentation: "${incident_documentation_template}"

  # Post-Incident Analysis
  post_incident_analysis:
    root_cause_analysis: "${root_cause_analysis_process}"
    lessons_learned: "${lessons_learned_process}"
    improvement_actions: ["${post_incident_improvement_actions}"]

# Compliance Monitoring
compliance_monitoring:
  # Regulatory Compliance
  regulatory_compliance:
    - regulation: "${regulation_name}"  # gdpr, hipaa, sox, pci_dss
      compliance_metrics: ["${regulation_compliance_metrics}"]
      monitoring_requirements: ["${regulation_monitoring_requirements}"]
      reporting_frequency: "${regulation_reporting_frequency}"

  # Audit Trail
  audit_trail:
    audit_logging_enabled: ${audit_logging_enabled}
    audit_log_retention: "${audit_log_retention_period}"
    audit_log_integrity: ${audit_log_integrity_protection}

  # Security Monitoring
  security_monitoring:
    security_events: ["${monitored_security_events}"]
    threat_detection: ${threat_detection_enabled}
    security_dashboards: ["${security_dashboards}"]

# Data Retention and Archival
data_retention:
  # Metrics Retention
  metrics_retention:
    short_term_retention: "${metrics_short_term_retention}"
    long_term_retention: "${metrics_long_term_retention}"
    data_compression: ${metrics_data_compression_enabled}

  # Log Retention
  log_retention:
    application_log_retention: "${application_log_retention_period}"
    system_log_retention: "${system_log_retention_period}"
    archive_strategy: "${log_archive_strategy}"

  # Data Lifecycle Management
  data_lifecycle:
    hot_data_period: "${hot_data_retention_period}"
    warm_data_period: "${warm_data_retention_period}"
    cold_data_period: "${cold_data_retention_period}"
    data_deletion_policy: "${data_deletion_policy}"

# Integration Configuration
integration_config:
  # External Integrations
  external_integrations:
    - integration_name: "${monitoring_integration_name}"
      integration_type: "${monitoring_integration_type}"  # webhook, api, message_queue
      endpoint_url: "${monitoring_integration_endpoint}"
      authentication: "${monitoring_integration_auth}"

  # ITSM Integration
  itsm_integration:
    itsm_platform: "${itsm_platform}"  # servicenow, jira_service_desk, remedy
    ticket_creation: ${automated_ticket_creation_enabled}
    ticket_correlation: ${ticket_correlation_enabled}

  # Communication Platform Integration
  communication_integration:
    chat_platform: "${monitoring_chat_platform}"  # slack, teams, discord
    notification_formatting: "${chat_notification_format}"
    channel_routing: ["${chat_channel_routing_rules}"]

# Validation Rules
validation_rules:
  required_fields:
    - monitoring_framework_id
    - framework_name
    - monitored_system
    - metrics_collection
    - logging_strategy
    - alerting_config
    - dashboard_config

# Template Metadata
template_metadata:
  author: "AI Agentic Data Stack Framework"
  maintainer: "Site Reliability Engineer"
  last_updated: "2025-01-23"
  sections:
    - name: "monitoring_overview"
      description: "Monitoring strategy and objectives"
      required: true
    - name: "metrics"
      description: "Key metrics and KPIs"
      required: true
    - name: "alerting"
      description: "Alerting rules and notifications"
      required: true
    - name: "dashboards"
      description: "Monitoring dashboards"
      required: false