claude-flow-novice
Version:
Claude Flow Novice - Advanced orchestration platform for multi-agent AI workflows with CFN Loop architecture. Includes Local RuVector Accelerator and all CFN skills for complete functionality.
117 lines (107 loc) • 4.25 kB
YAML
---
# Prometheus alerting rules for error-rate conditions.
#
# Every rule in this group evaluates a 5-minute rate() window. Ratio rules
# divide a failure counter by its matching total counter; absolute rules
# compare a summed per-second failure rate against a fixed threshold.
#
# NOTE(review): three severity values appear below (critical, high, warning);
# confirm that alert routing handles `high` as intended.
# NOTE(review): the runbook_url values are relative paths rather than absolute
# URLs; confirm that consumers of the annotation resolve them correctly.
groups:
  - name: error_rate_alerts
    # Rules in this group are evaluated every 30 seconds.
    interval: 30s
    rules:
      # HTTP 5xx responses exceed 1% of all HTTP requests for 5 minutes.
      - alert: HighErrorRate
        expr: |
          (sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))) > 0.01
        for: 5m
        labels:
          severity: critical
          component: integration
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} (threshold: 1%)"
          runbook_url: "docs/INCIDENT_RESPONSE.md"

      # HTTP 5xx responses exceed 0.1% of all HTTP requests for 10 minutes.
      # NOTE(review): this condition is also true whenever the 1% condition of
      # HighErrorRate is true, so both alerts can be active at the same time;
      # confirm that this overlap is handled downstream (e.g. by inhibition).
      - alert: ErrorRateWarning
        expr: |
          (sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))) > 0.001
        for: 10m
        labels:
          severity: warning
          component: integration
        annotations:
          summary: "Elevated error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} (threshold: 0.1%)"

      # Failed database queries exceed 1% of all database queries for 5 minutes.
      - alert: DatabaseErrorRate
        expr: |
          (sum(rate(db_queries_failed_total[5m])) / sum(rate(db_queries_total[5m]))) > 0.01
        for: 5m
        labels:
          severity: critical
          component: database
        annotations:
          summary: "High database error rate"
          description: "Database error rate is {{ $value | humanizePercentage }}"
          runbook_url: "docs/ROLLBACK_RUNBOOK.md"

      # Coordination protocol errors exceed 5% of coordination messages for
      # 5 minutes.
      - alert: CoordinationProtocolErrors
        expr: |
          (sum(rate(coordination_protocol_errors_total[5m])) / sum(rate(coordination_messages_total[5m]))) > 0.05
        for: 5m
        labels:
          severity: high
          component: coordination
        annotations:
          summary: "High coordination protocol error rate"
          description: "Protocol error rate is {{ $value | humanizePercentage }} (threshold: 5%)"

      # Summed integration point failure rate stays above 5 per second for
      # 5 minutes (absolute threshold, not a ratio).
      - alert: IntegrationPointFailures
        expr: |
          sum(rate(integration_point_failures_total[5m])) > 5
        for: 5m
        labels:
          severity: high
          component: integration
        annotations:
          summary: "Integration point failures detected"
          description: "{{ $value }} integration point failures per second"

      # Failed skill executions exceed 5% of all skill executions for
      # 10 minutes.
      - alert: SkillExecutionFailures
        expr: |
          (sum(rate(skill_executions_failed_total[5m])) / sum(rate(skill_executions_total[5m]))) > 0.05
        for: 10m
        labels:
          severity: warning
          component: skill_deployment
        annotations:
          summary: "Elevated skill execution failure rate"
          description: "Skill failure rate is {{ $value | humanizePercentage }}"

      # Summed data validation error rate stays above 10 per second for
      # 5 minutes (absolute threshold, not a ratio).
      - alert: DataValidationErrors
        expr: |
          sum(rate(data_validation_errors_total[5m])) > 10
        for: 5m
        labels:
          severity: critical
          component: data_integrity
        annotations:
          summary: "Data validation errors detected"
          description: "{{ $value }} validation errors per second - possible data corruption"
          runbook_url: "docs/INCIDENT_RESPONSE.md"

      # Summed transaction rollback rate stays above 50 per second for
      # 10 minutes (absolute threshold, not a ratio).
      - alert: TransactionRollbacks
        expr: |
          sum(rate(db_transactions_rolled_back_total[5m])) > 50
        for: 10m
        labels:
          severity: high
          component: database
        annotations:
          summary: "High transaction rollback rate"
          description: "{{ $value }} transactions rolled back per second"

      # Artifact storage errors exceed 1% of all artifact storage operations
      # for 5 minutes.
      - alert: ArtifactStorageErrors
        expr: |
          (sum(rate(artifact_storage_errors_total[5m])) / sum(rate(artifact_storage_operations_total[5m]))) > 0.01
        for: 5m
        labels:
          severity: high
          component: artifact_storage
        annotations:
          summary: "Artifact storage errors detected"
          description: "{{ $value | humanizePercentage }} of operations failed"

      # Summed metrics collection error rate stays above 100 per second for
      # 10 minutes (absolute threshold, not a ratio).
      - alert: MetricCollectionErrors
        expr: |
          sum(rate(metrics_collection_errors_total[5m])) > 100
        for: 10m
        labels:
          severity: warning
          component: metrics
        annotations:
          summary: "Metrics collection failures"
          description: "{{ $value }} metric collection errors per second"