UNPKG

claude-flow-novice

Version:

Claude Flow Novice - Advanced orchestration platform for multi-agent AI workflows with CFN Loop architecture. Includes CodeSearch (hybrid SQLite + pgvector), mem0/memgraph specialists, and all CFN skills.

316 lines (284 loc) 11.7 kB
# Prometheus rule file for the CFN Loop.
#
# Two groups are defined:
#   * cfn_loop_recording_rules - pre-computed series (evaluated every 30s)
#   * cfn_loop_alerts          - alerting rules, ordered P0 -> P2
groups:
  - name: cfn_loop_recording_rules
    interval: 30s
    rules:
      # Agent spawn rate: per-second rate averaged over a 1m window
      - record: cfn:agent_spawn_rate:1m
        expr: rate(cfn_agent_spawns_total[1m])

      # Agent execution success ratio (0..1) per team and agent type
      - record: cfn:agent_success_rate:5m
        expr: |
          sum(rate(cfn_agent_executions_total{status="success"}[5m])) by (team, agent_type)
          /
          sum(rate(cfn_agent_executions_total[5m])) by (team, agent_type)

      # Agent execution duration quantiles (P50, P95, P99), in seconds
      - record: cfn:agent_duration:p50
        expr: histogram_quantile(0.50, rate(cfn_agent_execution_duration_seconds_bucket[5m]))

      - record: cfn:agent_duration:p95
        expr: histogram_quantile(0.95, rate(cfn_agent_execution_duration_seconds_bucket[5m]))

      - record: cfn:agent_duration:p99
        expr: histogram_quantile(0.99, rate(cfn_agent_execution_duration_seconds_bucket[5m]))

      # Cost aggregation by team (dollars spent over the last hour)
      - record: cfn:cost_by_team:1h
        expr: sum(increase(cfn_agent_cost_dollars_total[1h])) by (team)

      # Cost aggregation by project (dollars spent over the last hour)
      - record: cfn:cost_by_project:1h
        expr: sum(increase(cfn_agent_cost_dollars_total[1h])) by (project)

      # Token usage by provider and token type over the last hour
      - record: cfn:tokens_by_provider:1h
        expr: sum(increase(cfn_agent_tokens_total[1h])) by (provider, token_type)

  - name: cfn_loop_alerts
    rules:
      # High agent failure rate (> 10% of executions failing)
      - alert: HighAgentFailureRate
        expr: |
          (
            sum(rate(cfn_agent_executions_total{status="failure"}[5m])) by (team, agent_type)
            /
            sum(rate(cfn_agent_executions_total[5m])) by (team, agent_type)
          ) > 0.10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High agent failure rate detected"
          description: "Agent {{ $labels.agent_type }} in team {{ $labels.team }} has failure rate above 10% (current: {{ $value | humanizePercentage }})"

      # Critical agent failure rate (> 25% of executions failing)
      - alert: CriticalAgentFailureRate
        expr: |
          (
            sum(rate(cfn_agent_executions_total{status="failure"}[5m])) by (team, agent_type)
            /
            sum(rate(cfn_agent_executions_total[5m])) by (team, agent_type)
          ) > 0.25
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Critical agent failure rate detected"
          description: "Agent {{ $labels.agent_type }} in team {{ $labels.team }} has failure rate above 25% (current: {{ $value | humanizePercentage }})"

      # Slow agent execution (P95 above 5 minutes)
      - alert: SlowAgentExecution
        expr: cfn:agent_duration:p95 > 300
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Slow agent execution detected"
          description: "P95 agent execution time is above 5 minutes (current: {{ $value | humanizeDuration }})"

      # Health check failures
      - alert: HealthCheckFailure
        expr: rate(cfn_health_check_failure_total[5m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Health check failures detected"
          description: "Health check {{ $labels.check_type }} is failing (error: {{ $labels.error_type }})"

      # Critical health check failures (two or more failing series at once)
      - alert: CriticalHealthCheckFailure
        expr: count(rate(cfn_health_check_failure_total[5m]) > 0) >= 2
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Multiple health checks failing"
          description: "Multiple health checks are failing - system may be unhealthy"

      # High cost per hour.
      # increase() yields dollars spent over the 1h window; rate() would
      # yield dollars per *second* and make the $10 threshold meaningless.
      - alert: HighCostPerHour
        expr: sum(increase(cfn_agent_cost_dollars_total[1h])) by (team) > 10
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "High hourly cost detected"
          description: "Team {{ $labels.team }} is spending more than $10/hour (current: ${{ $value | humanize }})"

      # Docker operation failures.
      # Expressed as a failure *ratio* per operation so that the 0.05
      # threshold and the humanizePercentage annotation agree (5%).
      - alert: DockerOperationFailures
        expr: |
          (
            sum(rate(cfn_docker_operations_total{status="failure"}[5m])) by (operation)
            /
            sum(rate(cfn_docker_operations_total[5m])) by (operation)
          ) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Docker operation failures detected"
          description: "Docker {{ $labels.operation }} operations are failing at {{ $value | humanizePercentage }}"

      # High memory usage by agents (threshold: 2147483648 bytes = 2 GiB)
      - alert: HighAgentMemoryUsage
        expr: cfn_agent_memory_usage_bytes > 2147483648
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High agent memory usage"
          description: "Agent {{ $labels.agent_id }} ({{ $labels.agent_type }}) is using {{ $value | humanize1024 }} of memory"

      # CFN Loop stuck (no progress for 30 minutes).
      # Fires when the iteration counter has not increased in the window.
      # NOTE(review): assumes cfn_loop_iterations_total is a monotonically
      # increasing counter (not a timestamp gauge) - confirm with the exporter.
      - alert: CFNLoopStuck
        expr: max(increase(cfn_loop_iterations_total[30m])) by (task_id) == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "CFN Loop appears stuck"
          description: "Task {{ $labels.task_id }} has not progressed in over 30 minutes"

      # Low consensus score
      - alert: LowConsensusScore
        expr: cfn_loop_consensus_score < 0.7
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Low CFN Loop consensus score"
          description: "Task {{ $labels.task_id }} iteration {{ $labels.iteration }} has low consensus ({{ $value | humanizePercentage }})"

      # Low test pass rate
      - alert: LowTestPassRate
        expr: cfn_loop_test_pass_rate < 0.95
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Low CFN Loop test pass rate"
          description: "Task {{ $labels.task_id }} iteration {{ $labels.iteration }} has low test pass rate ({{ $value | humanizePercentage }})"

      # P0 Critical Infrastructure Alerts
      - alert: RedisConnectivityLost
        expr: up{job="redis"} == 0
        for: 1m
        labels:
          severity: critical
          priority: P0
        annotations:
          summary: "Redis connectivity lost"
          description: "Redis instance {{ $labels.instance }} is unreachable for 1 minute. Immediate action required."
          runbook: "docs/runbooks/redis-connection-loss.md"

      - alert: PostgreSQLConnectivityLost
        expr: up{job="postgres"} == 0
        for: 1m
        labels:
          severity: critical
          priority: P0
        annotations:
          summary: "PostgreSQL connectivity lost"
          description: "PostgreSQL instance {{ $labels.instance }} is unreachable for 1 minute. Immediate action required."
          runbook: "docs/runbooks/postgres-connection-loss.md"

      - alert: DockerDaemonUnavailable
        expr: up{job="docker"} == 0
        for: 2m
        labels:
          severity: critical
          priority: P0
        annotations:
          summary: "Docker daemon unavailable"
          description: "Docker daemon on {{ $labels.instance }} is unavailable for 2 minutes."
          runbook: "docs/runbooks/docker-daemon-unavailable.md"

      - alert: DiskSpaceCritical
        expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.10
        for: 5m
        labels:
          severity: critical
          priority: P0
        annotations:
          summary: "Disk space critical on {{ $labels.instance }}"
          description: "Less than 10% disk space available on {{ $labels.mountpoint }} ({{ $value | humanizePercentage }} free)"
          runbook: "docs/runbooks/disk-space-exhaustion.md"

      # Counts the subquery evaluation points in the last 15m at which the
      # 5m failure rate was above zero; fires at three or more.
      # NOTE(review): this counts failing points in the window, which are
      # not necessarily consecutive - confirm this matches the intent.
      - alert: HealthCheckConsecutiveFailures
        expr: count_over_time((rate(cfn_health_check_failure_total[5m]) > 0)[15m:]) >= 3
        for: 1m
        labels:
          severity: critical
          priority: P0
        annotations:
          summary: "Three consecutive health check failures"
          description: "System health checks have failed 3 or more times consecutively"

      # Same check as CFNLoopStuck, over a 1h window.
      # NOTE(review): assumes cfn_loop_iterations_total is a monotonically
      # increasing counter (not a timestamp gauge) - confirm with the exporter.
      - alert: CFNLoopStuckCritical
        expr: max(increase(cfn_loop_iterations_total[1h])) by (task_id) == 0
        for: 5m
        labels:
          severity: critical
          priority: P0
        annotations:
          summary: "CFN Loop critically stuck for over 1 hour"
          description: "Task {{ $labels.task_id }} has not progressed in over 1 hour"
          runbook: "docs/runbooks/cfn-loop-stuck.md"

      # P1 Warning Alerts
      - alert: BackupFailure
        expr: up{job="backup"} == 0 or increase(backup_failure_total[1h]) > 0
        for: 5m
        labels:
          severity: warning
          priority: P1
        annotations:
          summary: "Backup operation failed"
          description: "Backup job on {{ $labels.instance }} has failed"
          runbook: "docs/runbooks/backup-failure.md"

      # NOTE(review): presumably ssl_cert_expiry_seconds is the expiry time
      # as a Unix timestamp (it is compared against time()) - verify.
      - alert: CertificateExpiringSoon
        expr: (ssl_cert_expiry_seconds - time()) < (7 * 24 * 3600)
        for: 1h
        labels:
          severity: warning
          priority: P1
        annotations:
          summary: "SSL certificate expiring within 7 days"
          description: "Certificate {{ $labels.cn }} expires in less than 7 days"
          runbook: "docs/runbooks/certificate-expiration.md"

      # Agent memory above 3221225472 bytes = 3 GiB.
      # humanize1024 is the Prometheus template function for base-1024
      # quantities; "humanizeBytes" does not exist and breaks expansion.
      - alert: AgentMemoryCritical
        expr: cfn_agent_memory_usage_bytes > 3221225472
        for: 5m
        labels:
          severity: warning
          priority: P1
        annotations:
          summary: "Agent memory usage exceeds 3GB"
          description: "Agent {{ $labels.agent_type }} on team {{ $labels.team }} using {{ $value | humanize1024 }}B"
          runbook: "docs/runbooks/memory-exhaustion.md"

      # P2 Info Alerts
      # Spawn rate more than 2 standard deviations from its 1h mean.
      # Written as a z-score so that $value really is the number of
      # standard deviations quoted in the description.
      - alert: UnusualAgentSpawnRate
        expr: |
          (
            abs(cfn:agent_spawn_rate:1m - avg_over_time(cfn:agent_spawn_rate:1m[1h]))
            /
            stddev_over_time(cfn:agent_spawn_rate:1m[1h])
          ) > 2
        for: 10m
        labels:
          severity: info
          priority: P2
        annotations:
          summary: "Unusual agent spawn rate detected"
          description: "Agent spawn rate is {{ $value | humanize }} standard deviations from normal"

      - alert: DiskSpaceWarning
        expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.20
        for: 10m
        labels:
          severity: info
          priority: P2
        annotations:
          summary: "Disk space warning on {{ $labels.instance }}"
          description: "Less than 20% disk space available on {{ $labels.mountpoint }} ({{ $value | humanizePercentage }} free)"
          runbook: "docs/runbooks/disk-space-exhaustion.md"

      # NOTE(review): divides two raw series; if api_requests_total is a
      # lifetime counter this ratio only ever grows - verify both metrics
      # describe the same rate-limit window.
      - alert: APIRateLimitApproaching
        expr: (api_requests_total / api_rate_limit_total) > 0.80
        for: 5m
        labels:
          severity: info
          priority: P2
        annotations:
          summary: "API rate limit approaching for {{ $labels.provider }}"
          description: "{{ $labels.provider }} API usage at {{ $value | humanizePercentage }} of rate limit"

      # Consensus in the [0.70, 0.90) band; below 0.70 is covered by
      # LowConsensusScore.
      - alert: ConsensusScoreLow
        expr: cfn_loop_consensus_score < 0.90 and cfn_loop_consensus_score >= 0.70
        for: 5m
        labels:
          severity: info
          priority: P2
        annotations:
          summary: "CFN Loop consensus score below optimal"
          description: "Task {{ $labels.task_id }} has consensus score {{ $value | humanizePercentage }} (below 0.90 threshold)"