UNPKG

claude-flow-novice

Version:

Claude Flow Novice - Advanced orchestration platform for multi-agent AI workflows with CFN Loop architecture. Includes Local RuVector Accelerator and all CFN skills for complete functionality.

226 lines (211 loc) 7.87 kB
---
# Prometheus alerting rules: Z.ai rate limits and failures, API cost,
# coordinator health, request performance, and SLO tracking.
#
# Notes that apply to several rules below:
# - A static `team` label under `labels:` overwrites any `team` label carried
#   by the evaluated series. `$labels` inside annotations still refers to the
#   series' own labels. NOTE(review): the static value is presumably the
#   owning/routing team - confirm against the Alertmanager routes.
# - Rules whose expression is aggregated `by (team)` copy the series' team
#   into `source_team` with label_replace(). Without it, every firing team
#   would end up with the same label set once the static `team` label is
#   applied, and Prometheus rejects the evaluation with "vector contains
#   metrics with the same labelset after applying alert labels".
groups:
  # Usage of the Z.ai rate limit as a fraction of the total allowance.
  - name: zai_rate_limits
    interval: 30s
    rules:
      # Used/total ratio above 80% for 5 minutes.
      - alert: ZaiRateLimitHigh
        expr: (zai_rate_limit_used / zai_rate_limit_total) > 0.80
        for: 5m
        labels:
          severity: warning
          team: platform
          component: zai-provider
        annotations:
          summary: "Z.ai rate limit usage above 80%"
          description: "Z.ai rate limit is at {{ $value | humanizePercentage }} for {{ $labels.team }}"
          runbook_url: "https://docs.example.com/runbooks/zai-rate-limit"

      # Used/total ratio above 90% for 2 minutes.
      - alert: ZaiRateLimitCritical
        expr: (zai_rate_limit_used / zai_rate_limit_total) > 0.90
        for: 2m
        labels:
          severity: critical
          team: platform
          component: zai-provider
        annotations:
          summary: "Z.ai rate limit CRITICAL - above 90%"
          description: "Z.ai rate limit is at {{ $value | humanizePercentage }} for {{ $labels.team }}. Throttling imminent."
          runbook_url: "https://docs.example.com/runbooks/zai-rate-limit-critical"

  # Request failures and exporter availability.
  - name: zai_failures
    interval: 30s
    rules:
      # Share of requests with status="error" above 5% for 5 minutes.
      - alert: ZaiHighErrorRate
        # Both operands are summed without `status`. A plain division uses
        # one-to-one label matching, which pairs the status="error" series
        # only with itself and therefore always yields 1.
        expr: |
          (
            sum without (status) (rate(zai_requests_total{status="error"}[5m]))
            /
            sum without (status) (rate(zai_requests_total[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "High Z.ai error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} for {{ $labels.team }}"
          runbook_url: "https://docs.example.com/runbooks/zai-high-errors"

      # The scrape target of job "zai-exporter" reports up == 0 for 2 minutes.
      - alert: ZaiProviderDown
        expr: up{job="zai-exporter"} == 0
        for: 2m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "Z.ai provider unavailable"
          description: "Z.ai metrics exporter has been down for more than 2 minutes"
          runbook_url: "https://docs.example.com/runbooks/zai-provider-down"

      # Share of requests with status="timeout" above 10% for 10 minutes.
      - alert: ZaiRequestTimeoutHigh
        # Summed without `status` for the same label-matching reason as
        # ZaiHighErrorRate.
        expr: |
          (
            sum without (status) (rate(zai_requests_total{status="timeout"}[5m]))
            /
            sum without (status) (rate(zai_requests_total[5m]))
          ) > 0.10
        for: 10m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "High Z.ai timeout rate"
          description: "Timeout rate is {{ $value | humanizePercentage }} for {{ $labels.team }}"

  # API spend, per team and overall.
  - name: cost_anomalies
    interval: 60s
    rules:
      # Per-team 1h cost rate more than 1.2x the rate 24 hours earlier.
      - alert: CostAnomalyDetected
        # $value is the ratio now / 24h-ago, so 1.25 renders as "125%".
        expr: |
          label_replace(
            sum by (team) (rate(zai_api_cost_usd[1h]))
            /
            sum by (team) (rate(zai_api_cost_usd[1h] offset 24h)),
            "source_team", "$1", "team", "(.*)"
          ) > 1.20
        for: 15m
        labels:
          severity: warning
          team: finance
          component: cost-tracking
        annotations:
          summary: "Cost anomaly detected for {{ $labels.team }}"
          description: "Hourly cost is {{ $value | humanizePercentage }} of the level 24h ago"
          runbook_url: "https://docs.example.com/runbooks/cost-anomaly"

      # Per-team cost increase over the trailing 24h above 100.
      - alert: DailyCostBudgetExceeded
        expr: |
          label_replace(
            sum by (team) (increase(zai_api_cost_usd[24h])),
            "source_team", "$1", "team", "(.*)"
          ) > 100
        for: 1h
        labels:
          severity: critical
          team: finance
        annotations:
          summary: "Daily cost budget exceeded for {{ $labels.team }}"
          description: "24h cost is ${{ $value }} (budget: $100)"
          runbook_url: "https://docs.example.com/runbooks/budget-exceeded"

      # Overall cost per request more than 1.5x the value 24 hours earlier.
      - alert: CostPerRequestAnomaly
        # Uses increase() over 1h on both counters rather than their raw
        # values: a ratio of raw counters is a lifetime average that is
        # distorted by counter resets and barely moves within 24h.
        # The 1h window mirrors CostAnomalyDetected.
        expr: |
          (
            (
              sum(increase(zai_api_cost_usd[1h]))
              /
              sum(increase(zai_requests_total[1h]))
            )
            /
            (
              sum(increase(zai_api_cost_usd[1h] offset 24h))
              /
              sum(increase(zai_requests_total[1h] offset 24h))
            )
          ) > 1.50
        for: 30m
        labels:
          severity: warning
          team: finance
        annotations:
          summary: "Cost per request anomaly detected"
          description: "Cost per request is {{ $value | humanizePercentage }} of the value 24h ago"

  # Coordinator liveness and backlog.
  - name: coordinator_health
    interval: 30s
    rules:
      # Any coordinator_health_status series whose status is not "healthy"
      # has value 1 for 5 minutes.
      - alert: CoordinatorUnhealthy
        expr: coordinator_health_status{status!="healthy"} == 1
        for: 5m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "Coordinator {{ $labels.coordinator }} unhealthy"
          description: "Coordinator {{ $labels.coordinator }} has been unhealthy for 5+ minutes"
          runbook_url: "https://docs.example.com/runbooks/coordinator-unhealthy"

      # More than 50 pending tasks for 15 minutes.
      - alert: CoordinatorTaskQueueHigh
        expr: coordinator_tasks_pending > 50
        for: 15m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "High task queue for {{ $labels.coordinator }}"
          description: "Task queue has {{ $value }} pending tasks for 15+ minutes"

      # Last heartbeat timestamp is more than 300 seconds old for 5 minutes.
      - alert: CoordinatorNoHeartbeat
        expr: (time() - coordinator_last_heartbeat_timestamp_seconds) > 300
        for: 5m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "Coordinator {{ $labels.coordinator }} no heartbeat"
          description: "No heartbeat received from {{ $labels.coordinator }} for 5+ minutes"
          runbook_url: "https://docs.example.com/runbooks/coordinator-heartbeat-missing"

  # Latency and success rate per team.
  - name: performance_degradation
    interval: 30s
    rules:
      # Per-team 95th percentile request duration above 5 for 10 minutes.
      - alert: HighLatencyP95
        expr: |
          label_replace(
            histogram_quantile(
              0.95,
              sum by (team, le) (rate(zai_request_duration_seconds_bucket[5m]))
            ),
            "source_team", "$1", "team", "(.*)"
          ) > 5
        for: 10m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "High P95 latency for {{ $labels.team }}"
          description: "P95 latency is {{ $value }}s (threshold: 5s)"

      # Per-team share of status="success" requests below 95% for 15 minutes.
      - alert: LowSuccessRate
        expr: |
          label_replace(
            sum by (team) (rate(zai_requests_total{status="success"}[10m]))
            /
            sum by (team) (rate(zai_requests_total[10m])),
            "source_team", "$1", "team", "(.*)"
          ) < 0.95
        for: 15m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "Low success rate for {{ $labels.team }}"
          description: "Success rate is {{ $value | humanizePercentage }} (threshold: 95%)"
          runbook_url: "https://docs.example.com/runbooks/low-success-rate"

  # 30-day availability objective (99.9%).
  # NOTE(review): both rules below evaluate a 30d range every 60s and test
  # the same condition (availability < 0.999 is equivalent to
  # 1 - availability > 0.001); they differ only in `for`, severity and
  # annotations. Consider a recording rule for the 30d ratio - confirm the
  # query cost is acceptable.
  - name: slo_violations
    interval: 60s
    rules:
      # Overall 30-day success ratio below 99.9% for 1 hour.
      - alert: AvailabilitySLOViolation
        expr: |
          (
            sum(rate(zai_requests_total{status="success"}[30d]))
            /
            sum(rate(zai_requests_total[30d]))
          ) < 0.999
        for: 1h
        labels:
          severity: critical
          team: sre
          slo: availability
        annotations:
          summary: "Availability SLO violation"
          description: "30-day availability is {{ $value | humanizePercentage }} (SLO: 99.9%)"
          runbook_url: "https://docs.example.com/runbooks/slo-availability"

      # Overall 30-day non-success ratio above 0.1% for 2 hours.
      - alert: ErrorBudgetExhausted
        expr: |
          (
            1 - (
              sum(rate(zai_requests_total{status="success"}[30d]))
              /
              sum(rate(zai_requests_total[30d]))
            )
          ) > 0.001
        for: 2h
        labels:
          severity: warning
          team: sre
          slo: error-budget
        annotations:
          summary: "Error budget exhausted"
          description: "Monthly error budget exceeded - current error rate: {{ $value | humanizePercentage }}"
          runbook_url: "https://docs.example.com/runbooks/error-budget"