claude-flow-novice

Claude Flow Novice - Advanced orchestration platform for multi-agent AI workflows with CFN Loop architecture. Includes the Local RuVector Accelerator and all CFN skills for complete functionality.

global:
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alerts@cfn-loop.local'
  resolve_timeout: 5m

# Route alerts based on severity and other labels
route:
  group_by: ['alertname', 'team', 'severity']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  receiver: 'default'
  routes:
    # Critical (P0) alerts -> PagerDuty + Slack critical channel
    - match:
        severity: critical
      receiver: 'pagerduty-critical'
      group_wait: 10s
      repeat_interval: 30m
      routes:
        - match:
            alertname: 'CriticalAgentFailureRate'
          receiver: 'pagerduty-critical'
        - match:
            alertname: 'CriticalHealthCheckFailure'
          receiver: 'pagerduty-critical'
        - match:
            alertname: 'CFNLoopStuck'
          receiver: 'pagerduty-critical'
        - match:
            alertname: 'RedisConnectionLoss'
          receiver: 'pagerduty-critical'
        - match:
            alertname: 'PostgreSQLConnectionLoss'
          receiver: 'pagerduty-critical'
        - match:
            alertname: 'DockerDaemonUnavailable'
          receiver: 'pagerduty-critical'
        - match:
            alertname: 'DiskSpaceExhaustion'
          receiver: 'pagerduty-critical'

    # Warning (P1) alerts -> Slack warning channel with escalation
    - match:
        severity: warning
      receiver: 'slack-warning'
      group_wait: 30s
      repeat_interval: 2h
      routes:
        - match:
            alertname: 'HighCostPerHour'
          receiver: 'slack-warning-escalate'
          repeat_interval: 1h
        - match:
            alertname: 'SlowAgentExecution'
          receiver: 'slack-warning-escalate'
          repeat_interval: 1h
        - match:
            alertname: 'HighAgentMemoryUsage'
          receiver: 'slack-warning-escalate'
          repeat_interval: 1h

    # Info (P2) alerts -> Slack info channel only
    - match:
        severity: info
      receiver: 'slack-info'
      group_wait: 1m
      repeat_interval: 4h

    # Health check specific routing
    - match:
        alertname: 'HealthCheckFailure'
      receiver: 'slack-health'
      group_wait: 15s
      repeat_interval: 30m

    # Cost alerts with team-specific routing
    - match:
        alertname: 'HighCostPerHour'
      receiver: 'slack-cost'
      group_wait: 1m
      repeat_interval: 1h

# Inhibition rules to prevent alert spam
inhibit_rules:
  # Inhibit warning alerts if a critical alert is firing for the same instance
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']

  # Inhibit info alerts if a warning alert is firing for the same service
  - source_match:
      severity: 'warning'
    target_match:
      severity: 'info'
    equal: ['service']

  # Inhibit all lower-severity alerts if the CFN Loop is stuck
  - source_match:
      alertname: 'CFNLoopStuck'
    target_match_re:
      alertname: '(HighAgentFailureRate|SlowAgentExecution|LowConsensusScore|LowTestPassRate)'

  # Inhibit agent-specific alerts if infrastructure is down
  - source_match_re:
      alertname: '(RedisConnectionLoss|PostgreSQLConnectionLoss|DockerDaemonUnavailable)'
    target_match_re:
      alertname: '(HighAgentFailureRate|SlowAgentExecution|CFNLoopStuck)'

  # Inhibit cost alerts during system-wide outages
  - source_match_re:
      alertname: '(CriticalHealthCheckFailure|DiskSpaceExhaustion)'
    target_match:
      alertname: 'HighCostPerHour'

# Alert receivers
receivers:
  # Default receiver (fallback)
  - name: 'default'
    email_configs:
      - to: 'admin@cfn-loop.local'
        subject: '[CFN Loop] {{ .GroupLabels.alertname }}'
        body: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          Labels: {{ range .Labels.SortedPairs }}{{ .Name }}={{ .Value }} {{ end }}
          {{ end }}

  # PagerDuty integration for critical alerts
  # (annotations are read from CommonAnnotations because this context is the
  # whole alert group, not a single alert; toLower is the built-in function)
  - name: 'pagerduty-critical'
    pagerduty_configs:
      - routing_key: '${PAGERDUTY_SERVICE_KEY}'
        description: '{{ .GroupLabels.alertname }}'
        details:
          firing: '{{ .Alerts.Firing | len }}'
          severity: critical
          summary: '{{ .CommonAnnotations.summary }}'
          description: '{{ .CommonAnnotations.description }}'
          team: '{{ .GroupLabels.team }}'
          runbook_url: 'https://docs.cfn-loop.local/runbooks/{{ .GroupLabels.alertname | toLower }}.html'
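  # NOTE (editor's sketch, not part of the original file): Alertmanager does
  # not expand ${...} environment variables in its configuration. Placeholders
  # such as ${PAGERDUTY_SERVICE_KEY} above and ${SLACK_WEBHOOK_URL} below must
  # be substituted before startup, e.g. with envsubst (file names illustrative):
  #
  #   envsubst < alertmanager.yml.tpl > /etc/alertmanager/alertmanager.yml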
  # Slack critical channel
  # (templates use the built-in 'or' for fallbacks; Alertmanager does not ship
  # a sprig-style 'default' function)
  - name: 'slack-critical'
    slack_configs:
      - api_url: '${SLACK_WEBHOOK_URL}'
        channel: '#cfn-critical'
        title: '🚨 CRITICAL: {{ .GroupLabels.alertname }}'
        text: |
          *Team:* {{ or .GroupLabels.team "Unknown" }}
          *Summary:* {{ .CommonAnnotations.summary }}
          *Description:* {{ .CommonAnnotations.description }}
          *Actions:* <https://docs.cfn-loop.local/runbooks/{{ .GroupLabels.alertname | toLower }}.md|Runbook> | <https://grafana.cfn-loop.local/d/cfn-overview|Dashboard>
          {{ range .Alerts }}
          {{ .Labels.instance }} - {{ .Annotations.description }}
          {{ end }}
        color: 'danger'
        send_resolved: true
        icon_emoji: ':rotating_light:'
        username: 'CFN Loop Alertmanager'

  # Slack warning channel
  - name: 'slack-warning'
    slack_configs:
      - api_url: '${SLACK_WEBHOOK_URL}'
        channel: '#cfn-warnings'
        title: '⚠️ WARNING: {{ .GroupLabels.alertname }}'
        text: |
          *Team:* {{ or .GroupLabels.team "Unknown" }}
          *Summary:* {{ .CommonAnnotations.summary }}
          *Description:* {{ .CommonAnnotations.description }}
          *Actions:* <https://docs.cfn-loop.local/runbooks/{{ .GroupLabels.alertname | toLower }}.md|Runbook> | <https://grafana.cfn-loop.local/d/cfn-overview|Dashboard>
          {{ range .Alerts }}
          {{ .Labels.instance }} - {{ .Annotations.description }}
          {{ end }}
        color: 'warning'
        send_resolved: true
        icon_emoji: ':warning:'
        username: 'CFN Loop Alertmanager'

  # Slack warning channel with escalation
  - name: 'slack-warning-escalate'
    slack_configs:
      - api_url: '${SLACK_WEBHOOK_URL}'
        channel: '#cfn-warnings'
        title: '⚠️ WARNING (Escalating): {{ .GroupLabels.alertname }}'
        text: |
          *Team:* {{ or .GroupLabels.team "Unknown" }}
          *Summary:* {{ .CommonAnnotations.summary }}
          *Description:* {{ .CommonAnnotations.description }}
          *Escalation:* This alert has been active for over 30 minutes and requires attention.
          *Actions:* <https://docs.cfn-loop.local/runbooks/{{ .GroupLabels.alertname | toLower }}.md|Runbook> | <https://grafana.cfn-loop.local/d/cfn-overview|Dashboard>
          {{ range .Alerts }}
          {{ .Labels.instance }} - {{ .Annotations.description }}
          {{ end }}
        color: 'warning'
        send_resolved: true
        icon_emoji: ':rotating_light:'
        username: 'CFN Loop Alertmanager'
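  # NOTE (editor's sketch, not part of the original file): the Slack `text`
  # bodies above repeat the same layout. They could be factored into a named
  # template under /etc/alertmanager/templates/ (loaded via `templates:` at
  # the bottom of this file), e.g. a hypothetical cfn.tmpl containing:
  #
  #   {{ define "cfn.slack.text" }}
  #   *Team:* {{ or .GroupLabels.team "Unknown" }}
  #   *Summary:* {{ .CommonAnnotations.summary }}
  #   *Description:* {{ .CommonAnnotations.description }}
  #   {{ end }}
  #
  # and referenced from each receiver as: text: '{{ template "cfn.slack.text" . }}'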
  # Slack info channel
  - name: 'slack-info'
    slack_configs:
      - api_url: '${SLACK_WEBHOOK_URL}'
        channel: '#cfn-info'
        title: 'ℹ️ INFO: {{ .GroupLabels.alertname }}'
        text: |
          *Team:* {{ or .GroupLabels.team "Unknown" }}
          *Summary:* {{ .CommonAnnotations.summary }}
          *Description:* {{ .CommonAnnotations.description }}
          *Actions:* <https://docs.cfn-loop.local/runbooks/{{ .GroupLabels.alertname | toLower }}.md|Runbook> | <https://grafana.cfn-loop.local/d/cfn-overview|Dashboard>
          {{ range .Alerts }}
          {{ .Labels.instance }} - {{ .Annotations.description }}
          {{ end }}
        color: 'good'
        send_resolved: true
        icon_emoji: ':information_source:'
        username: 'CFN Loop Alertmanager'

  # Slack health check channel
  # (check_type and error_type are not in group_by, so they are read from
  # CommonLabels rather than GroupLabels)
  - name: 'slack-health'
    slack_configs:
      - api_url: '${SLACK_WEBHOOK_URL}'
        channel: '#cfn-health'
        title: '🏥 Health Check: {{ .GroupLabels.alertname }}'
        text: |
          *Check Type:* {{ or .CommonLabels.check_type "Unknown" }}
          *Error Type:* {{ or .CommonLabels.error_type "Unknown" }}
          *Summary:* {{ .CommonAnnotations.summary }}
          *Description:* {{ .CommonAnnotations.description }}
          *Actions:* <https://docs.cfn-loop.local/runbooks/{{ .GroupLabels.alertname | toLower }}.md|Runbook> | <https://grafana.cfn-loop.local/d/health-checks|Health Dashboard>
          {{ range .Alerts }}
          {{ .Labels.instance }} - {{ .Annotations.description }}
          {{ end }}
        color: '#36a64f'
        send_resolved: true
        icon_emoji: ':hospital:'
        username: 'CFN Loop Health Monitor'

  # Slack cost alerts channel
  - name: 'slack-cost'
    slack_configs:
      - api_url: '${SLACK_WEBHOOK_URL}'
        channel: '#cfn-cost'
        title: '💰 Cost Alert: {{ .GroupLabels.alertname }}'
        text: |
          *Team:* {{ or .GroupLabels.team "Unknown" }}
          *Summary:* {{ .CommonAnnotations.summary }}
          *Description:* {{ .CommonAnnotations.description }}
          *Cost Impact:* Review current spending patterns and consider optimization.
          *Actions:* <https://docs.cfn-loop.local/runbooks/{{ .GroupLabels.alertname | toLower }}.md|Runbook> | <https://grafana.cfn-loop.local/d/cfn-cost|Cost Dashboard>
          {{ range .Alerts }}
          {{ .Labels.instance }} - {{ .Annotations.description }}
          {{ end }}
        color: '#ff9800'
        send_resolved: true
        icon_emoji: ':money_with_wings:'
        username: 'CFN Loop Cost Monitor'

# Time intervals for different alerting behaviors
time_intervals:
  # Business hours for cost alerts (weekdays 9-5)
  - name: 'business-hours'
    time_intervals:
      - times:
          - start_time: '09:00'
            end_time: '17:00'
        weekdays: ['monday:friday']

  # After hours for critical alerts only
  # (a single time range may not cross midnight, so the overnight window is split)
  - name: 'after-hours'
    time_intervals:
      - times:
          - start_time: '17:01'
            end_time: '24:00'
        weekdays: ['monday:friday']
      - times:
          - start_time: '00:00'
            end_time: '08:59'
        weekdays: ['monday:friday']
      - weekdays: ['saturday', 'sunday']

# Notification templates
templates:
  - '/etc/alertmanager/templates/*.tmpl'
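# NOTE (editor's sketch, not part of the original file): 'business-hours' and
# 'after-hours' are defined above but never referenced by any route. On
# Alertmanager v0.24+ they could be attached with active_time_intervals, e.g.:
#
#   - match:
#       alertname: 'HighCostPerHour'
#     receiver: 'slack-cost'
#     active_time_intervals: ['business-hours']
#
# The file can be sanity-checked, and the routing tree exercised, with amtool:
#
#   amtool check-config alertmanager.yml
#   amtool config routes test --config.file=alertmanager.yml severity=critical alertname=CFNLoopStuck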