UNPKG

claude-flow-novice

Version:

Claude Flow Novice - Advanced orchestration platform for multi-agent AI workflows with CFN Loop architecture. Includes Local RuVector Accelerator and all CFN skills for complete functionality.

163 lines (149 loc) 5.78 kB
# Prometheus alerting rules for resource saturation: CPU, memory, disk,
# disk I/O, network, file descriptors, PostgreSQL and Redis.
#
# Conventions used by every rule in this group:
#   - `severity` is either `warning` or `critical`.
#   - Critical rules carry a `runbook_url` annotation; warning rules do not.
#   - Rules whose description uses `humanize` followed by `%` compute a 0-100
#     value; rules using `humanizePercentage` compute a 0-1 ratio.
groups:
  - name: resource_alerts
    interval: 30s
    rules:
      # ---------------------------------------------------------------- CPU
      # Mean per-series rate of process_cpu_seconds_total, scaled by 100.
      - alert: HighCPUUsage
        expr: |
          (sum(rate(process_cpu_seconds_total[5m])) / count(process_cpu_seconds_total) * 100) > 80
        for: 10m
        labels:
          severity: warning
          resource: cpu
        annotations:
          summary: "High CPU usage detected"
          description: "Average CPU usage is {{ $value | humanize }}% (threshold: 80%)"

      # Same expression as HighCPUUsage with a higher threshold and a shorter
      # hold time.
      - alert: CriticalCPUUsage
        expr: |
          (sum(rate(process_cpu_seconds_total[5m])) / count(process_cpu_seconds_total) * 100) > 90
        for: 5m
        labels:
          severity: critical
          resource: cpu
        annotations:
          summary: "Critical CPU usage"
          description: "Average CPU usage is {{ $value | humanize }}%"
          runbook_url: "docs/INCIDENT_RESPONSE.md"

      # ------------------------------------------------------------- Memory
      # Total resident memory of all scraped processes as a percentage of the
      # total memory of all scraped nodes.
      - alert: HighMemoryUsage
        expr: |
          (sum(process_resident_memory_bytes) / sum(node_memory_MemTotal_bytes) * 100) > 85
        for: 10m
        labels:
          severity: warning
          resource: memory
        annotations:
          summary: "High memory usage detected"
          description: "Memory usage is {{ $value | humanize }}% (threshold: 85%)"

      - alert: CriticalMemoryUsage
        expr: |
          (sum(process_resident_memory_bytes) / sum(node_memory_MemTotal_bytes) * 100) > 95
        for: 5m
        labels:
          severity: critical
          resource: memory
        annotations:
          summary: "Critical memory usage"
          description: "Memory usage is {{ $value | humanize }}%"
          runbook_url: "docs/INCIDENT_RESPONSE.md"

      # --------------------------------------------------------------- Disk
      # Fires on LOW free space: the value is the percentage still available,
      # evaluated per filesystem series.
      - alert: HighDiskUsage
        expr: |
          (node_filesystem_avail_bytes / node_filesystem_size_bytes * 100) < 15
        for: 15m
        labels:
          severity: warning
          resource: disk
        annotations:
          summary: "Low disk space"
          description: "Available disk space is {{ $value | humanize }}% (threshold: 15%)"

      - alert: CriticalDiskSpace
        expr: |
          (node_filesystem_avail_bytes / node_filesystem_size_bytes * 100) < 5
        for: 5m
        labels:
          severity: critical
          resource: disk
        annotations:
          summary: "Critical disk space condition"
          description: "Available disk space is {{ $value | humanize }}%"
          runbook_url: "docs/INCIDENT_RESPONSE.md"

      # rate() of a "seconds spent doing I/O" counter is seconds busy per
      # second, i.e. already a 0-1 utilisation fraction per device, so no
      # divisor is needed. The previous expression divided by
      # node_disk_io_time_ms_total; when that series is absent the division
      # returns nothing and the alert can never fire.
      # NOTE(review): node_disk_io_time_ms_total looks like a blend of the
      # legacy millisecond metric name and the current one - confirm that no
      # exporter in this deployment actually exposes it.
      - alert: HighDiskIOUsage
        expr: |
          rate(node_disk_io_time_seconds_total[5m]) > 0.8
        for: 10m
        labels:
          severity: warning
          resource: disk_io
        annotations:
          summary: "High disk I/O utilization"
          description: "Disk I/O utilization is {{ $value | humanizePercentage }}"

      # ------------------------------------------------------------ Network
      # Combined transmit + receive byte rate summed over all interfaces,
      # compared against 1e9 bytes/sec.
      - alert: HighNetworkBandwidth
        expr: |
          sum(rate(node_network_transmit_bytes_total[5m]) + rate(node_network_receive_bytes_total[5m])) > 1000000000
        for: 10m
        labels:
          severity: warning
          resource: network
        annotations:
          summary: "High network bandwidth usage"
          description: "Network bandwidth is {{ $value | humanize }} bytes/sec"

      # ----------------------------------------------------------- Database
      # Both sides are aggregated down to the `instance` label before the
      # division. A bare `a / b` only pairs series whose label sets are
      # identical, so if the activity metric carries extra labels (e.g. per
      # database or per state) the unaggregated form matches nothing.
      # NOTE(review): assumes one PostgreSQL server per `instance` label -
      # confirm against the exporter configuration.
      - alert: DatabaseConnectionPoolExhaustion
        expr: |
          (sum by (instance) (pg_stat_activity_count) / max by (instance) (pg_settings_max_connections)) > 0.85
        for: 5m
        labels:
          severity: critical
          component: database
        annotations:
          summary: "Database connection pool near exhaustion"
          description: "{{ $value | humanizePercentage }} of connections in use"
          runbook_url: "docs/ROLLBACK_RUNBOOK.md"

      # -------------------------------------------------------------- Redis
      # The trailing `and ... > 0` guard drops series whose limit is 0.
      # Dividing by zero yields +Inf in PromQL, which would satisfy `> 0.8`
      # and fire the alert (presumably 0 means "no maxmemory configured" -
      # confirm against the Redis exporter in use).
      - alert: RedisMemoryUsageHigh
        expr: |
          (redis_memory_used_bytes / redis_memory_max_bytes) > 0.8
          and redis_memory_max_bytes > 0
        for: 10m
        labels:
          severity: warning
          component: redis
        annotations:
          summary: "Redis memory usage high"
          description: "Redis memory usage is {{ $value | humanizePercentage }}"

      # -------------------------------------------------- Node-level memory
      # Fires when less than 5% of a node's total memory is available.
      - alert: NodeOutOfMemory
        expr: |
          (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.05
        for: 5m
        labels:
          severity: critical
          resource: memory
        annotations:
          summary: "Node running out of memory"
          description: "Available memory is {{ $value | humanizePercentage }}"
          runbook_url: "docs/INCIDENT_RESPONSE.md"

      # --------------------------------------------------- File descriptors
      # Per-process ratio of open file descriptors to the process limit.
      - alert: FileDescriptorExhaustion
        expr: |
          (process_open_fds / process_max_fds) > 0.85
        for: 10m
        labels:
          severity: warning
          resource: file_descriptors
        annotations:
          summary: "File descriptor limit near exhaustion"
          description: "{{ $value | humanizePercentage }} of file descriptors in use"

      # -------------------------------------------------- Temporary storage
      # Fires when a tmpfs filesystem has less than 10% of its size available.
      - alert: TemporaryStorageUsage
        expr: |
          (node_filesystem_avail_bytes{fstype="tmpfs"} / node_filesystem_size_bytes{fstype="tmpfs"}) < 0.1
        for: 10m
        labels:
          severity: warning
          resource: tmp_storage
        annotations:
          summary: "Temporary storage usage high"
          description: "Temporary storage available: {{ $value | humanizePercentage }}"

      # ------------------------------------------------- Database disk size
      # NOTE(review): this divides a database metric by a filesystem metric
      # with default one-to-one label matching. If the two series do not
      # share identical label sets the result is empty and the alert will
      # never fire. Left unchanged because the correct join (which
      # filesystem holds the database, and which labels the two exporters
      # share) cannot be determined from this file - confirm and add an
      # explicit `on(...)` / aggregation.
      - alert: DatabaseDiskSpaceUsage
        expr: |
          (pg_database_size_bytes / node_filesystem_size_bytes) > 0.7
        for: 15m
        labels:
          severity: warning
          component: database
        annotations:
          summary: "Database disk usage growing"
          description: "Database size is {{ $value | humanizePercentage }} of available space"