claude-flow-novice
Version:
Claude Flow Novice - Advanced orchestration platform for multi-agent AI workflows with CFN Loop architecture. Includes CodeSearch (hybrid SQLite + pgvector), mem0/memgraph specialists, and all CFN skills.
316 lines (284 loc) • 11.7 kB
YAML
groups:
- name: cfn_loop_recording_rules
  # Every rule in this group is re-evaluated every 30 seconds.
  interval: 30s
  rules:
  # Agent spawn rate over a 1-minute window.
  # rate() always returns a per-second value, so this series is spawns/second.
  - record: cfn:agent_spawn_rate:1m
    expr: rate(cfn_agent_spawns_total[1m])

  # Share of executions with status="success" over 5 minutes,
  # grouped by (team, agent_type).
  - record: cfn:agent_success_rate:5m
    expr: |
      sum(rate(cfn_agent_executions_total{status="success"}[5m])) by (team, agent_type)
      /
      sum(rate(cfn_agent_executions_total[5m])) by (team, agent_type)

  # Execution-duration quantiles (P50 / P95 / P99), in seconds, from the
  # duration histogram. No sum() is applied, so each quantile is computed
  # per input label set rather than across the whole fleet.
  - record: cfn:agent_duration:p50
    expr: histogram_quantile(0.50, rate(cfn_agent_execution_duration_seconds_bucket[5m]))
  - record: cfn:agent_duration:p95
    expr: histogram_quantile(0.95, rate(cfn_agent_execution_duration_seconds_bucket[5m]))
  - record: cfn:agent_duration:p99
    expr: histogram_quantile(0.99, rate(cfn_agent_execution_duration_seconds_bucket[5m]))

  # Dollars accumulated over the trailing hour, per team.
  - record: cfn:cost_by_team:1h
    expr: sum(increase(cfn_agent_cost_dollars_total[1h])) by (team)

  # Dollars accumulated over the trailing hour, per project.
  - record: cfn:cost_by_project:1h
    expr: sum(increase(cfn_agent_cost_dollars_total[1h])) by (project)

  # Tokens consumed over the trailing hour, per (provider, token_type).
  - record: cfn:tokens_by_provider:1h
    expr: sum(increase(cfn_agent_tokens_total[1h])) by (provider, token_type)
- name: cfn_loop_alerts
rules:
# Warning tier: more than 10% of a (team, agent_type) pair's executions ended
# with status="failure" over the last 5 minutes, sustained for 5 minutes.
- alert: HighAgentFailureRate
  expr: |
    (
      sum(rate(cfn_agent_executions_total{status="failure"}[5m])) by (team, agent_type)
      /
      sum(rate(cfn_agent_executions_total[5m])) by (team, agent_type)
    ) > 0.10
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "High agent failure rate detected"
    description: "Agent {{ $labels.agent_type }} in team {{ $labels.team }} has failure rate above 10% (current: {{ $value | humanizePercentage }})"
# Critical tier: same failure ratio as the 10% warning, but with a 25%
# threshold and a shorter 2-minute hold before firing.
- alert: CriticalAgentFailureRate
  expr: |
    (
      sum(rate(cfn_agent_executions_total{status="failure"}[5m])) by (team, agent_type)
      /
      sum(rate(cfn_agent_executions_total[5m])) by (team, agent_type)
    ) > 0.25
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: "Critical agent failure rate detected"
    description: "Agent {{ $labels.agent_type }} in team {{ $labels.team }} has failure rate above 25% (current: {{ $value | humanizePercentage }})"
# Slow executions: the recorded P95 duration (cfn:agent_duration:p95, seconds)
# stays above 300 s (5 minutes) for 10 minutes.
- alert: SlowAgentExecution
  expr: cfn:agent_duration:p95 > 300
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: "Slow agent execution detected"
    description: "P95 agent execution time is above 5 minutes (current: {{ $value | humanizeDuration }})"
# Any health-check failure counter series that has been increasing
# (non-zero 5-minute rate) continuously for 5 minutes.
- alert: HealthCheckFailure
  expr: |
    rate(cfn_health_check_failure_total[5m]) > 0
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Health check failures detected"
    description: "Health check {{ $labels.check_type }} is failing (error: {{ $labels.error_type }})"
# Escalation: at least two distinct failure-counter series have a non-zero
# 5-minute rate at the same time, held for 2 minutes.
# count() drops all labels, so the annotations cannot name the failing checks.
- alert: CriticalHealthCheckFailure
  expr: |
    count(rate(cfn_health_check_failure_total[5m]) > 0) >= 2
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: "Multiple health checks failing"
    description: "Multiple health checks are failing - system may be unhealthy"
# High cost per hour: a team's spend over the trailing hour exceeds $10,
# sustained for 1 hour.
# increase() returns the dollars accumulated inside the 1h window. The previous
# rate() form returned dollars per *second*, so "> 10" meant $36,000/hour and
# the reported "current" value was not an hourly figure.
- alert: HighCostPerHour
  expr: |
    sum(increase(cfn_agent_cost_dollars_total[1h])) by (team) > 10
  for: 1h
  labels:
    severity: warning
  annotations:
    summary: "High hourly cost detected"
    description: "Team {{ $labels.team }} is spending more than $10/hour (current: ${{ $value | humanize }})"
# Docker operation failures: more than 5% of the operations of a given type
# failed over the last 5 minutes, sustained for 5 minutes.
# The value is a failure *ratio* (failures / all operations) per `operation`
# label, which is what the 0.05 threshold and the humanizePercentage formatting
# in the description expect. A bare rate() would be failures per second.
- alert: DockerOperationFailures
  expr: |
    (
      sum(rate(cfn_docker_operations_total{status="failure"}[5m])) by (operation)
      /
      sum(rate(cfn_docker_operations_total[5m])) by (operation)
    ) > 0.05
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Docker operation failures detected"
    description: "Docker {{ $labels.operation }} operations are failing at {{ $value | humanizePercentage }}"
# Per-agent memory gauge above 2147483648 bytes (2 * 1024^3) for 5 minutes.
# The trailing "# 2GB" sits inside the block scalar, so it is a PromQL comment
# that travels with the expression, not a YAML comment.
- alert: HighAgentMemoryUsage
  expr: |
    cfn_agent_memory_usage_bytes > 2147483648 # 2GB
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "High agent memory usage"
    description: "Agent {{ $labels.agent_id }} ({{ $labels.agent_type }}) is using {{ $value | humanize1024 }} of memory"
# CFN Loop stuck: a task's iteration counter has not increased at all during
# the last 30 minutes (held for a further 5 minutes before firing).
# The counter's value is an iteration count, not a timestamp, so the previous
# "time() - max(counter)" form was always far above the threshold and matched
# every task unconditionally.
# NOTE(review): a finished task whose series is still exported will also show
# zero increase - confirm the exporter drops series for completed tasks.
- alert: CFNLoopStuck
  expr: |
    sum(increase(cfn_loop_iterations_total[30m])) by (task_id) == 0
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: "CFN Loop appears stuck"
    description: "Task {{ $labels.task_id }} has not progressed in over 30 minutes"
# Consensus score gauge below 0.7 for 5 minutes.
- alert: LowConsensusScore
  expr: |
    cfn_loop_consensus_score < 0.7
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Low CFN Loop consensus score"
    description: "Task {{ $labels.task_id }} iteration {{ $labels.iteration }} has low consensus ({{ $value | humanizePercentage }})"
# Test pass rate gauge below 0.95 for 2 minutes.
- alert: LowTestPassRate
  expr: |
    cfn_loop_test_pass_rate < 0.95
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "Low CFN Loop test pass rate"
    description: "Task {{ $labels.task_id }} iteration {{ $labels.iteration }} has low test pass rate ({{ $value | humanizePercentage }})"
# P0 Critical Infrastructure Alerts
# Scrape target of job "redis" reports up == 0 for 1 minute.
- alert: RedisConnectivityLost
  expr: up{job="redis"} == 0
  for: 1m
  labels:
    severity: critical
    priority: P0
  annotations:
    summary: "Redis connectivity lost"
    description: "Redis instance {{ $labels.instance }} is unreachable for 1 minute. Immediate action required."
    runbook: "docs/runbooks/redis-connection-loss.md"
# Scrape target of job "postgres" reports up == 0 for 1 minute.
- alert: PostgreSQLConnectivityLost
  expr: up{job="postgres"} == 0
  for: 1m
  labels:
    severity: critical
    priority: P0
  annotations:
    summary: "PostgreSQL connectivity lost"
    description: "PostgreSQL instance {{ $labels.instance }} is unreachable for 1 minute. Immediate action required."
    runbook: "docs/runbooks/postgres-connection-loss.md"
# Scrape target of job "docker" reports up == 0 for 2 minutes.
- alert: DockerDaemonUnavailable
  expr: up{job="docker"} == 0
  for: 2m
  labels:
    severity: critical
    priority: P0
  annotations:
    summary: "Docker daemon unavailable"
    description: "Docker daemon on {{ $labels.instance }} is unavailable for 2 minutes."
    runbook: "docs/runbooks/docker-daemon-unavailable.md"
# Available-to-total byte ratio of a filesystem below 0.10 for 5 minutes.
# No fstype/mountpoint filter is applied, so every exported filesystem is checked.
- alert: DiskSpaceCritical
  expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.10
  for: 5m
  labels:
    severity: critical
    priority: P0
  annotations:
    summary: "Disk space critical on {{ $labels.instance }}"
    description: "Less than 10% disk space available on {{ $labels.mountpoint }} ({{ $value | humanizePercentage }} free)"
    runbook: "docs/runbooks/disk-space-exhaustion.md"
# Health-check failures seen in each of the last three 5-minute windows.
# The subquery step is pinned to 5m so the inner condition is sampled once per
# rate window. Without an explicit step the subquery is sampled at the global
# evaluation interval, and a single failure (which keeps rate[5m] > 0 for about
# five minutes) produced enough samples on its own to satisfy ">= 3".
- alert: HealthCheckConsecutiveFailures
  expr: |
    count_over_time((rate(cfn_health_check_failure_total[5m]) > 0)[15m:5m]) >= 3
  for: 1m
  labels:
    severity: critical
    priority: P0
  annotations:
    summary: "Three consecutive health check failures"
    description: "System health checks have failed 3 or more times consecutively"
# P0 escalation of the stuck-loop check: a task's iteration counter has not
# increased at all during the last hour (held for a further 5 minutes).
# The counter's value is an iteration count, not a timestamp, so the previous
# "time() - max(counter)" form was always far above the threshold and matched
# every task unconditionally.
# NOTE(review): a finished task whose series is still exported will also show
# zero increase - confirm the exporter drops series for completed tasks.
- alert: CFNLoopStuckCritical
  expr: |
    sum(increase(cfn_loop_iterations_total[1h])) by (task_id) == 0
  for: 5m
  labels:
    severity: critical
    priority: P0
  annotations:
    summary: "CFN Loop critically stuck for over 1 hour"
    description: "Task {{ $labels.task_id }} has not progressed in over 1 hour"
    runbook: "docs/runbooks/cfn-loop-stuck.md"
# P1 Warning Alerts
# Fires when either the "backup" scrape target is down, or the backup failure
# counter increased during the last hour; the condition must hold for 5 minutes.
- alert: BackupFailure
  expr: up{job="backup"} == 0 or increase(backup_failure_total[1h]) > 0
  for: 5m
  labels:
    severity: warning
    priority: P1
  annotations:
    summary: "Backup operation failed"
    description: "Backup job on {{ $labels.instance }} has failed"
    runbook: "docs/runbooks/backup-failure.md"
# Remaining certificate lifetime below 7 * 24 * 3600 seconds (7 days), held for 1 hour.
# NOTE(review): subtracting time() presumes ssl_cert_expiry_seconds is the
# expiry instant as a Unix timestamp - confirm against the exporter.
- alert: CertificateExpiringSoon
  expr: (ssl_cert_expiry_seconds - time()) < (7 * 24 * 3600)
  for: 1h
  labels:
    severity: warning
    priority: P1
  annotations:
    summary: "SSL certificate expiring within 7 days"
    description: "Certificate {{ $labels.cn }} expires in less than 7 days"
    runbook: "docs/runbooks/certificate-expiration.md"
# Per-agent memory gauge above 3221225472 bytes (3 * 1024^3) for 5 minutes.
# The description formats the value with humanize1024, which is a built-in
# Prometheus template function; "humanizeBytes" is not defined, and an
# undefined template function makes the rule fail template validation.
- alert: AgentMemoryCritical
  expr: |
    cfn_agent_memory_usage_bytes > 3221225472 # 3GB
  for: 5m
  labels:
    severity: warning
    priority: P1
  annotations:
    summary: "Agent memory usage exceeds 3GB"
    description: "Agent {{ $labels.agent_type }} on team {{ $labels.team }} using {{ $value | humanize1024 }}B"
    runbook: "docs/runbooks/memory-exhaustion.md"
# P2 Info Alerts
# Spawn rate more than 2 standard deviations away from its 1-hour mean,
# sustained for 10 minutes.
# The deviation is divided by the standard deviation so that the alert value
# is the z-score the description reports. Previously the value was the raw
# absolute deviation, which the description mislabelled as a number of
# standard deviations. The firing condition is unchanged: a flat series gives
# 0/0 = NaN, which the comparison filters out, as before.
- alert: UnusualAgentSpawnRate
  expr: |
    (
      abs(cfn:agent_spawn_rate:1m - avg_over_time(cfn:agent_spawn_rate:1m[1h]))
      /
      stddev_over_time(cfn:agent_spawn_rate:1m[1h])
    ) > 2
  for: 10m
  labels:
    severity: info
    priority: P2
  annotations:
    summary: "Unusual agent spawn rate detected"
    description: "Agent spawn rate is {{ $value }} standard deviations from normal"
# Available-to-total byte ratio of a filesystem below 0.20 for 10 minutes.
# There is no lower bound, so this also matches filesystems already under 0.10.
- alert: DiskSpaceWarning
  expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.20
  for: 10m
  labels:
    severity: info
    priority: P2
  annotations:
    summary: "Disk space warning on {{ $labels.instance }}"
    description: "Less than 20% disk space available on {{ $labels.mountpoint }} ({{ $value | humanizePercentage }} free)"
    runbook: "docs/runbooks/disk-space-exhaustion.md"
# Ratio of api_requests_total to api_rate_limit_total above 0.80 for 5 minutes.
# NOTE(review): the "_total" suffix suggests api_requests_total is a cumulative
# counter; if so, this ratio only ever grows and a windowed increase()/rate()
# is probably what was intended - confirm how both metrics are exported.
- alert: APIRateLimitApproaching
  expr: |
    (api_requests_total / api_rate_limit_total) > 0.80
  for: 5m
  labels:
    severity: info
    priority: P2
  annotations:
    summary: "API rate limit approaching for {{ $labels.provider }}"
    description: "{{ $labels.provider }} API usage at {{ $value | humanizePercentage }} of rate limit"
# Informational band: consensus score in [0.70, 0.90) for 5 minutes.
# The lower bound keeps this alert from overlapping the "< 0.7" warning.
- alert: ConsensusScoreLow
  expr: |
    cfn_loop_consensus_score < 0.90 and cfn_loop_consensus_score >= 0.70
  for: 5m
  labels:
    severity: info
    priority: P2
  annotations:
    summary: "CFN Loop consensus score below optimal"
    description: "Task {{ $labels.task_id }} has consensus score {{ $value | humanizePercentage }} (below 0.90 threshold)"