agentic-data-stack-community
Version:
AI Agentic Data Stack Framework - Community Edition. Open source data engineering framework with 4 core agents, essential templates, and 3-dimensional quality validation.
460 lines (393 loc) • 16.9 kB
YAML
# Monitoring Template
# Standardized template for comprehensive system monitoring and observability across the AI Agentic Data Stack Framework
# Catalog entry for this template: identifier, display name, version,
# description, category, search tags, and authorship.
metadata:
  template_id: "monitoring-tmpl"
  name: "Monitoring Template"
  version: "1.0.0"
  description: "Comprehensive template for system monitoring, alerting, and observability implementation"
  category: "operations-maintenance"
  tags:
    - "monitoring"
    - "observability"
    - "alerting"
    - "metrics"
    - "logging"
  created_by: "AI Agentic Data Stack Framework"
  # Quoted so the date stays a string rather than being parsed as a timestamp.
  created_date: "2025-01-23"
# Short descriptor of the template (name, description, version).
# The name and version repeat the values given under `metadata`.
template:
  name: "Monitoring Template"
  description: "Template for comprehensive system monitoring and observability"
  version: "1.0.0"
# Monitoring Configuration
# Identifies the monitoring framework instance, the system it watches, and
# who owns it. Every value is a quoted placeholder.
monitoring_config:
  monitoring_framework_id: "${monitoring_framework_id}"
  framework_name: "${monitoring_framework_name}"
  monitored_system: "${monitored_system_name}"
  monitoring_scope: "${monitoring_scope}"  # application, infrastructure, business, security
  monitoring_strategy: "${monitoring_strategy}"  # reactive, proactive, predictive
  monitoring_owner: "${monitoring_owner}"
# Metrics Collection
# Names the metrics platform and lists the system, application, and business
# metrics to gather.
# NOTE(review): the sub-groups (cpu_metrics, response_time, ...) are placed
# inside the list item that precedes them — confirm against the consumer's
# schema.
# NOTE(review): unquoted ${...} placeholders presumably render to booleans or
# numbers — confirm with the template renderer.
metrics_collection:
  # Metrics Platform
  metrics_platform: "${metrics_platform}"  # prometheus, datadog, new_relic, cloudwatch

  # System Metrics
  system_metrics:
    # Infrastructure Metrics
    infrastructure_metrics:
      - metric_name: "${infrastructure_metric_name}"
        metric_type: "${infrastructure_metric_type}"  # counter, gauge, histogram, summary
        metric_description: "${infrastructure_metric_description}"
        collection_interval: ${infrastructure_metric_interval_seconds}
        retention_period: "${infrastructure_metric_retention}"

        # CPU Metrics
        cpu_metrics:
          cpu_utilization: ${cpu_utilization_enabled}
          cpu_load_average: ${cpu_load_average_enabled}
          cpu_cores: ${cpu_cores_monitoring_enabled}

        # Memory Metrics
        memory_metrics:
          memory_usage: ${memory_usage_enabled}
          heap_usage: ${heap_usage_enabled}
          garbage_collection: ${gc_monitoring_enabled}

        # Storage Metrics
        storage_metrics:
          disk_usage: ${disk_usage_enabled}
          disk_io: ${disk_io_enabled}
          inode_usage: ${inode_usage_enabled}

        # Network Metrics
        network_metrics:
          network_throughput: ${network_throughput_enabled}
          network_latency: ${network_latency_enabled}
          connection_count: ${connection_count_enabled}

  # Application Metrics
  application_metrics:
    # Performance Metrics
    performance_metrics:
      - metric_name: "${app_performance_metric_name}"
        metric_endpoint: "${app_metric_endpoint}"
        collection_method: "${app_metric_collection_method}"  # pull, push, scrape

        # Response Time Metrics
        response_time:
          average_response_time: ${avg_response_time_enabled}
          percentile_response_times: ["${response_time_percentiles}"]  # p50, p95, p99
          max_response_time: ${max_response_time_enabled}

        # Throughput Metrics
        throughput:
          requests_per_second: ${requests_per_second_enabled}
          transactions_per_second: ${transactions_per_second_enabled}
          concurrent_users: ${concurrent_users_enabled}

        # Error Metrics
        error_metrics:
          error_rate: ${error_rate_enabled}
          error_count: ${error_count_enabled}
          error_types: ["${monitored_error_types}"]

  # Business Metrics
  business_metrics:
    - metric_name: "${business_metric_name}"
      metric_description: "${business_metric_description}"
      business_value: "${business_metric_value}"
      calculation_method: "${business_metric_calculation}"
      data_source: "${business_metric_data_source}"
      update_frequency: "${business_metric_update_frequency}"
# Logging Strategy
# Covers log aggregation, per-source log settings (application, system,
# access), structured logging, and log analysis.
logging_strategy:
  # Log Aggregation
  log_aggregation:
    log_platform: "${log_aggregation_platform}"  # elk, splunk, fluentd, loki
    log_forwarding: "${log_forwarding_method}"  # agent, sidecar, direct

  # Log Configuration
  log_config:
    # Application Logs
    application_logs:
      log_level: "${application_log_level}"  # trace, debug, info, warn, error, fatal
      log_format: "${application_log_format}"  # json, logfmt, plain
      log_rotation: "${application_log_rotation}"
      log_retention: "${application_log_retention_period}"

    # System Logs
    system_logs:
      system_log_collection: ${system_log_collection_enabled}
      kernel_logs: ${kernel_logs_enabled}
      audit_logs: ${audit_logs_enabled}
      security_logs: ${security_logs_enabled}

    # Access Logs
    access_logs:
      web_server_logs: ${web_server_logs_enabled}
      api_access_logs: ${api_access_logs_enabled}
      database_access_logs: ${database_access_logs_enabled}

  # Structured Logging
  structured_logging:
    structured_format: "${structured_log_format}"  # json, logstash
    correlation_ids: ${correlation_ids_enabled}
    context_propagation: ${log_context_propagation_enabled}

  # Log Analysis
  log_analysis:
    log_parsing: "${log_parsing_rules}"
    log_enrichment: ${log_enrichment_enabled}
    log_indexing: "${log_indexing_strategy}"
    search_capabilities: ["${log_search_capabilities}"]
# Distributed Tracing
# Tracing platform and sampling settings, trace collection, service mapping,
# and trace-based performance analysis switches.
distributed_tracing:
  # Tracing Configuration
  tracing_config:
    tracing_enabled: ${distributed_tracing_enabled}
    tracing_platform: "${tracing_platform}"  # jaeger, zipkin, x_ray, datadog_apm
    sampling_strategy: "${tracing_sampling_strategy}"  # probabilistic, rate_limiting, adaptive
    sampling_rate: ${tracing_sampling_rate}

  # Trace Collection
  trace_collection:
    instrumentation_method: "${instrumentation_method}"  # auto, manual, hybrid
    trace_exporters: ["${trace_exporters}"]
    trace_processors: ["${trace_processors}"]

  # Service Mapping
  service_mapping:
    service_discovery: "${service_discovery_method}"
    dependency_mapping: ${dependency_mapping_enabled}
    service_graph_visualization: ${service_graph_enabled}

  # Performance Analysis
  performance_analysis:
    bottleneck_detection: ${bottleneck_detection_enabled}
    latency_analysis: ${latency_analysis_enabled}
    error_correlation: ${error_correlation_enabled}
# Alerting Configuration
# Alerting platform, alert rules (infrastructure, application, business),
# notification channels with escalation policies, and alert suppression.
# NOTE(review): the per-resource alert groups (cpu_alerts, performance_alerts,
# ...) are placed inside the list item that precedes them — confirm against
# the consumer's schema.
alerting_config:
  # Alerting Platform
  alerting_platform: "${alerting_platform}"  # prometheus_alertmanager, pagerduty, opsgenie

  # Alert Rules
  alert_rules:
    # Infrastructure Alerts
    infrastructure_alerts:
      - alert_name: "${infrastructure_alert_name}"
        alert_condition: "${infrastructure_alert_condition}"
        severity: "${infrastructure_alert_severity}"  # critical, warning, info
        threshold_value: ${infrastructure_alert_threshold}
        evaluation_interval: ${infrastructure_alert_evaluation_interval}

        # CPU Alerts
        cpu_alerts:
          high_cpu_usage:
            threshold: ${high_cpu_threshold_percentage}
            duration: "${high_cpu_duration}"

        # Memory Alerts
        memory_alerts:
          high_memory_usage:
            threshold: ${high_memory_threshold_percentage}
            duration: "${high_memory_duration}"

        # Storage Alerts
        storage_alerts:
          disk_space_low:
            threshold: ${low_disk_space_threshold_percentage}
            duration: "${low_disk_space_duration}"

    # Application Alerts
    application_alerts:
      - alert_name: "${app_alert_name}"
        alert_description: "${app_alert_description}"

        # Performance Alerts
        performance_alerts:
          high_response_time:
            threshold: ${high_response_time_threshold_ms}
            percentile: "${response_time_percentile}"  # p95, p99
          low_throughput:
            threshold: ${low_throughput_threshold}
            measurement_window: "${throughput_measurement_window}"

        # Error Alerts
        error_alerts:
          high_error_rate:
            threshold: ${high_error_rate_threshold_percentage}
            measurement_window: "${error_rate_measurement_window}"

    # Business Alerts
    business_alerts:
      - alert_name: "${business_alert_name}"
        business_impact: "${business_alert_impact}"
        alert_condition: "${business_alert_condition}"

  # Notification Configuration
  notification_config:
    # Notification Channels
    notification_channels:
      - channel_name: "${notification_channel_name}"
        channel_type: "${notification_channel_type}"  # email, sms, slack, webhook
        channel_endpoint: "${notification_channel_endpoint}"
        channel_priority: "${notification_channel_priority}"

    # Escalation Policies
    escalation_policies:
      - policy_name: "${escalation_policy_name}"
        escalation_levels:
          - level: ${escalation_level}
            escalation_time: ${escalation_time_minutes}
            notification_targets: ["${escalation_notification_targets}"]

  # Alert Suppression
  alert_suppression:
    suppression_rules: ["${alert_suppression_rules}"]
    maintenance_windows: ["${maintenance_windows}"]
    alert_correlation: ${alert_correlation_enabled}
# Dashboard Configuration
# Dashboard platform, dashboards grouped by category (infrastructure,
# application, business), and dashboard access control.
dashboard_config:
  # Dashboard Platform
  dashboard_platform: "${dashboard_platform}"  # grafana, kibana, datadog, new_relic

  # Dashboard Categories
  dashboard_categories:
    # Infrastructure Dashboards
    infrastructure_dashboards:
      - dashboard_name: "${infrastructure_dashboard_name}"
        dashboard_description: "${infrastructure_dashboard_description}"
        refresh_interval: "${infrastructure_dashboard_refresh}"

        # Dashboard Panels
        panels:
          - panel_name: "${infrastructure_panel_name}"
            panel_type: "${infrastructure_panel_type}"  # graph, table, heatmap, stat
            data_source: "${infrastructure_panel_data_source}"
            query: "${infrastructure_panel_query}"

    # Application Dashboards
    application_dashboards:
      - dashboard_name: "${app_dashboard_name}"
        dashboard_scope: "${app_dashboard_scope}"  # service, component, endpoint

        # Performance Panels
        performance_panels:
          response_time_panel: "${response_time_panel_config}"
          throughput_panel: "${throughput_panel_config}"
          error_rate_panel: "${error_rate_panel_config}"

    # Business Dashboards
    business_dashboards:
      - dashboard_name: "${business_dashboard_name}"
        stakeholder_audience: ["${business_dashboard_audience}"]
        update_frequency: "${business_dashboard_update_frequency}"

  # Dashboard Access Control
  dashboard_access:
    authentication_required: ${dashboard_authentication_required}
    role_based_access: ${dashboard_rbac_enabled}
    public_dashboards: ["${public_dashboards}"]
# Health Checks
# Global health-check settings followed by per-service, per-component, and
# per-dependency check definitions.
health_checks:
  # Health Check Configuration
  health_check_config:
    health_check_enabled: ${health_checks_enabled}
    health_check_endpoint: "${health_check_endpoint}"
    health_check_interval: ${health_check_interval_seconds}

  # Application Health Checks
  application_health:
    - service_name: "${health_check_service_name}"
      health_check_type: "${health_check_type}"  # http, tcp, command, database
      endpoint_url: "${health_check_endpoint_url}"
      timeout: ${health_check_timeout_seconds}
      success_criteria: ["${health_check_success_criteria}"]

  # Infrastructure Health Checks
  infrastructure_health:
    - component_name: "${infrastructure_component_name}"
      component_type: "${infrastructure_component_type}"  # server, database, network
      health_indicators: ["${component_health_indicators}"]

  # Dependency Health Checks
  dependency_health:
    - dependency_name: "${dependency_name}"
      dependency_type: "${dependency_type}"  # external_api, database, queue
      health_check_method: "${dependency_health_check_method}"
      circuit_breaker_enabled: ${dependency_circuit_breaker_enabled}
# Performance Monitoring
# Metric baselines with deviation thresholds, performance-testing
# integration switches, and capacity-planning inputs.
performance_monitoring:
  # Performance Baselines
  performance_baselines:
    - metric_name: "${baseline_metric_name}"
      baseline_value: ${baseline_metric_value}
      baseline_period: "${baseline_measurement_period}"
      deviation_threshold: ${baseline_deviation_threshold}

  # Performance Testing Integration
  performance_testing:
    load_testing_integration: ${load_testing_integration_enabled}
    performance_regression_detection: ${performance_regression_detection}
    automated_performance_alerts: ${automated_performance_alerts}

  # Capacity Planning
  capacity_planning:
    capacity_metrics: ["${capacity_planning_metrics}"]
    growth_projections: ["${capacity_growth_projections}"]
    scaling_recommendations: ["${scaling_recommendations}"]
# Incident Management Integration
# Settings for incident detection, incident response, and post-incident
# analysis.
incident_management:
  # Incident Detection
  incident_detection:
    automated_incident_creation: ${automated_incident_creation_enabled}
    incident_correlation: ${incident_correlation_enabled}
    incident_prioritization: "${incident_prioritization_method}"

  # Incident Response
  incident_response:
    response_team_notification: ["${incident_response_team}"]
    incident_escalation: "${incident_escalation_procedure}"
    incident_documentation: "${incident_documentation_template}"

  # Post-Incident Analysis
  post_incident_analysis:
    root_cause_analysis: "${root_cause_analysis_process}"
    lessons_learned: "${lessons_learned_process}"
    improvement_actions: ["${post_incident_improvement_actions}"]
# Compliance Monitoring
# Per-regulation compliance entries, audit-trail settings, and security
# monitoring settings.
compliance_monitoring:
  # Regulatory Compliance
  regulatory_compliance:
    - regulation: "${regulation_name}"  # gdpr, hipaa, sox, pci_dss
      compliance_metrics: ["${regulation_compliance_metrics}"]
      monitoring_requirements: ["${regulation_monitoring_requirements}"]
      reporting_frequency: "${regulation_reporting_frequency}"

  # Audit Trail
  audit_trail:
    audit_logging_enabled: ${audit_logging_enabled}
    audit_log_retention: "${audit_log_retention_period}"
    audit_log_integrity: ${audit_log_integrity_protection}

  # Security Monitoring
  security_monitoring:
    security_events: ["${monitored_security_events}"]
    threat_detection: ${threat_detection_enabled}
    security_dashboards: ["${security_dashboards}"]
# Data Retention and Archival
# Retention settings for metrics and logs, plus hot/warm/cold lifecycle
# periods and the deletion policy.
data_retention:
  # Metrics Retention
  metrics_retention:
    short_term_retention: "${metrics_short_term_retention}"
    long_term_retention: "${metrics_long_term_retention}"
    data_compression: ${metrics_data_compression_enabled}

  # Log Retention
  log_retention:
    # Same placeholder as application_logs.log_retention under
    # logging_strategy, so both receive the same substituted value.
    application_log_retention: "${application_log_retention_period}"
    system_log_retention: "${system_log_retention_period}"
    archive_strategy: "${log_archive_strategy}"

  # Data Lifecycle Management
  data_lifecycle:
    hot_data_period: "${hot_data_retention_period}"
    warm_data_period: "${warm_data_retention_period}"
    cold_data_period: "${cold_data_retention_period}"
    data_deletion_policy: "${data_deletion_policy}"
# Integration Configuration
# External integrations, ITSM ticketing integration, and chat-platform
# integration.
integration_config:
  # External Integrations
  external_integrations:
    - integration_name: "${monitoring_integration_name}"
      integration_type: "${monitoring_integration_type}"  # webhook, api, message_queue
      endpoint_url: "${monitoring_integration_endpoint}"
      authentication: "${monitoring_integration_auth}"

  # ITSM Integration
  itsm_integration:
    itsm_platform: "${itsm_platform}"  # servicenow, jira_service_desk, remedy
    ticket_creation: ${automated_ticket_creation_enabled}
    ticket_correlation: ${ticket_correlation_enabled}

  # Communication Platform Integration
  communication_integration:
    chat_platform: "${monitoring_chat_platform}"  # slack, teams, discord
    notification_formatting: "${chat_notification_format}"
    channel_routing: ["${chat_channel_routing_rules}"]
# Validation Rules
# Names that must be present in a filled-in copy of this template.
# NOTE(review): the first three names are keys of monitoring_config, while the
# remaining four are section names — confirm how the validator resolves them.
validation_rules:
  required_fields:
    - monitoring_framework_id
    - framework_name
    - monitored_system
    - metrics_collection
    - logging_strategy
    - alerting_config
    - dashboard_config
# Template Metadata
# Authorship and maintenance record for the template itself.
template_metadata:
  author: "AI Agentic Data Stack Framework"
  maintainer: "Site Reliability Engineer"
  # Quoted so the date stays a string rather than being parsed as a timestamp.
  last_updated: "2025-01-23"
# Document sections, each with a name, a description, and a `required` flag.
# NOTE(review): these section names do not match any key in the visible part
# of this file — confirm what they refer to.
sections:
  - name: "monitoring_overview"
    description: "Monitoring strategy and objectives"
    required: true

  - name: "metrics"
    description: "Key metrics and KPIs"
    required: true

  - name: "alerting"
    description: "Alerting rules and notifications"
    required: true

  - name: "dashboards"
    description: "Monitoring dashboards"
    required: false