@clduab11/gemini-flow
Version:
Revolutionary AI agent swarm coordination platform with Google Services integration, multimedia processing, and production-ready monitoring. Features 8 Google AI services, quantum computing capabilities, and enterprise-grade security.
617 lines (580 loc) • 20.2 kB
YAML
---
# ConfigMap holding the Prometheus server configuration (prometheus.yml) and
# the alerting rules (alert_rules.yml). The Prometheus Deployment in this file
# mounts the whole ConfigMap at /etc/prometheus, so both keys appear there as
# files.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: gemini-flow-monitoring
  labels:
    app.kubernetes.io/name: prometheus
    app.kubernetes.io/component: monitoring
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      scrape_timeout: 10s
      evaluation_interval: 15s
      external_labels:
        cluster: 'gemini-flow-gke'
        environment: 'production'
        region: 'us-central1'
    rule_files:
      # alert_rules.yml from this ConfigMap lands directly in /etc/prometheus,
      # so it has to be listed explicitly; the rules/ glob alone never matched it.
      - "/etc/prometheus/alert_rules.yml"
      # Kept so that rule files mounted under rules/ are still picked up.
      - "/etc/prometheus/rules/*.yml"
    alerting:
      alertmanagers:
        - static_configs:
            - targets:
                - alertmanager:9093
    scrape_configs:
      # Prometheus self-monitoring
      - job_name: 'prometheus'
        scrape_interval: 5s
        static_configs:
          - targets: ['localhost:9090']
      # Kubernetes API server
      - job_name: 'kubernetes-apiservers'
        kubernetes_sd_configs:
          - role: endpoints
            namespaces:
              names:
                - default
        scheme: https
        # The API server certificate is verified against the service-account CA;
        # insecure_skip_verify would make ca_file meaningless, so it is not set.
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          # Keep only the https port of the default/kubernetes service.
          - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
            action: keep
            regex: default;kubernetes;https
      # Kubernetes nodes (kubelet metrics, scraped directly on each node)
      - job_name: 'kubernetes-nodes'
        kubernetes_sd_configs:
          - role: node
        scheme: https
        # NOTE(review): verification stays disabled here because kubelet serving
        # certificates are frequently not signed by the service-account CA.
        # Confirm for this cluster, or scrape through the API-server proxy as
        # the cAdvisor job below does.
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
      # Kubernetes cAdvisor (per-container metrics, via the API-server node proxy)
      - job_name: 'kubernetes-nodes-cadvisor'
        kubernetes_sd_configs:
          - role: node
        scheme: https
        # Every request goes to kubernetes.default.svc:443 (see relabeling), so
        # the certificate can be verified against the service-account CA.
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          # Send the scrape to the API server instead of the node itself ...
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          # ... and address the node's cAdvisor endpoint through the proxy path.
          - source_labels: [__meta_kubernetes_node_name]
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
      # Gemini-Flow application metrics
      - job_name: 'gemini-flow-api'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - gemini-flow
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
            action: keep
            regex: gemini-flow
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
            action: keep
            regex: api
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: 'true'
          # Honour prometheus.io/path when the annotation is present.
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          # Replace the discovered port with the prometheus.io/port annotation.
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            target_label: __address__
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: kubernetes_pod_name
      # Gemini-Flow worker metrics
      - job_name: 'gemini-flow-worker'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - gemini-flow
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
            action: keep
            regex: gemini-flow
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
            action: keep
            regex: worker
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: 'true'
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            target_label: __address__
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: kubernetes_pod_name
      # Redis metrics
      - job_name: 'redis'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - gemini-flow
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app]
            action: keep
            regex: redis
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: 'true'
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            target_label: __address__
      # PostgreSQL metrics
      - job_name: 'postgresql'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - gemini-flow
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app]
            action: keep
            regex: postgresql
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: 'true'
          # Same path/port handling as the other pod jobs; both rules are no-ops
          # when the corresponding annotation is missing.
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            target_label: __address__
      # Istio mesh metrics
      - job_name: 'istio-mesh'
        kubernetes_sd_configs:
          - role: endpoints
            namespaces:
              names:
                - istio-system
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
            action: keep
            regex: istio-proxy;http-monitoring
      # Istio pilot metrics
      - job_name: 'istio-pilot'
        kubernetes_sd_configs:
          - role: endpoints
            namespaces:
              names:
                - istio-system
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
            action: keep
            regex: istiod;http-monitoring
      # Annotation-driven discovery of service endpoints in all namespaces
      # (services opt in with prometheus.io/scrape: "true")
      - job_name: 'kubernetes-service-endpoints'
        kubernetes_sd_configs:
          - role: endpoints
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
            action: keep
            regex: 'true'
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
            action: replace
            target_label: __scheme__
            regex: (https?)
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
            action: replace
            target_label: __address__
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_service_name]
            action: replace
            target_label: kubernetes_name
      # Google Cloud Monitoring (for GCP resources), probed through a blackbox
      # exporter.
      # NOTE(review): PROJECT_ID below is a literal placeholder and must be
      # substituted before deployment; the blackbox-exporter:9115 service is not
      # defined in this file.
      - job_name: 'gcp-monitoring'
        metrics_path: /probe
        params:
          module: [http_2xx]
        static_configs:
          - targets:
              - https://monitoring.googleapis.com/v1/projects/PROJECT_ID/metricDescriptors
        relabel_configs:
          # Pass the target URL as ?target=..., keep it as the instance label,
          # and send the actual scrape to the exporter.
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
          - target_label: __address__
            replacement: blackbox-exporter:9115
      # Argo Rollouts metrics
      - job_name: 'argo-rollouts'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - argo-rollouts
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
            action: keep
            regex: argo-rollouts
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: 'true'
          # Same path/port handling as the other pod jobs; both rules are no-ops
          # when the corresponding annotation is missing.
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            target_label: __address__
      # Unleash feature flags metrics
      - job_name: 'unleash'
        metrics_path: /internal-backstage/prometheus
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - gemini-flow
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app]
            action: keep
            regex: unleash
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: 'true'
          # Use the annotated port when present; the metrics path is fixed above.
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            target_label: __address__
  alert_rules.yml: |
    groups:
      - name: gemini-flow.rules
        rules:
          # Both sides are aggregated with sum(): dividing the raw per-series
          # rates would only match each 5xx series with itself and always
          # yield a ratio of 1.
          - alert: GeminiFlowHighErrorRate
            expr: sum(rate(http_requests_total{job="gemini-flow-api",status=~"5.."}[5m])) / sum(rate(http_requests_total{job="gemini-flow-api"}[5m])) > 0.05
            for: 5m
            labels:
              severity: warning
              service: gemini-flow
            annotations:
              summary: "High error rate detected for Gemini-Flow API"
              description: "Error rate is {{ $value | humanizePercentage }} for the last 5 minutes"
          - alert: GeminiFlowHighLatency
            expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="gemini-flow-api"}[5m])) > 1.0
            for: 5m
            labels:
              severity: warning
              service: gemini-flow
            annotations:
              summary: "High latency detected for Gemini-Flow API"
              description: "95th percentile latency is {{ $value }}s for the last 5 minutes"
          - alert: GeminiFlowPodCrashLooping
            expr: increase(kube_pod_container_status_restarts_total{namespace="gemini-flow"}[15m]) > 0
            for: 5m
            labels:
              severity: critical
              service: gemini-flow
            annotations:
              summary: "Gemini-Flow pod is crash looping"
              description: "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} has restarted {{ $value }} times in the last 15 minutes"
          # The threshold is in CPU cores (0.8 cores), not a share of a limit.
          - alert: GeminiFlowHighCPUUsage
            expr: rate(container_cpu_usage_seconds_total{namespace="gemini-flow",container="gemini-flow-api"}[5m]) > 0.8
            for: 10m
            labels:
              severity: warning
              service: gemini-flow
            annotations:
              summary: "High CPU usage for Gemini-Flow"
              description: "CPU usage is {{ $value | humanizePercentage }} for pod {{ $labels.pod }}"
          # The "> 0" filter on the limit drops containers without a memory
          # limit, which would otherwise divide by zero and alert on +Inf.
          - alert: GeminiFlowHighMemoryUsage
            expr: container_memory_usage_bytes{namespace="gemini-flow",container="gemini-flow-api"} / (container_spec_memory_limit_bytes{namespace="gemini-flow",container="gemini-flow-api"} > 0) > 0.85
            for: 10m
            labels:
              severity: warning
              service: gemini-flow
            annotations:
              summary: "High memory usage for Gemini-Flow"
              description: "Memory usage is {{ $value | humanizePercentage }} for pod {{ $labels.pod }}"
          - alert: GeminiFlowServiceDown
            expr: up{job="gemini-flow-api"} == 0
            for: 1m
            labels:
              severity: critical
              service: gemini-flow
            annotations:
              summary: "Gemini-Flow service is down"
              description: "Gemini-Flow API service has been down for more than 1 minute"
          - alert: RedisConnectionFailure
            expr: redis_up{job="redis"} == 0
            for: 1m
            labels:
              severity: critical
              service: redis
            annotations:
              summary: "Redis connection failure"
              description: "Cannot connect to Redis instance for more than 1 minute"
          - alert: PostgreSQLConnectionFailure
            expr: pg_up{job="postgresql"} == 0
            for: 1m
            labels:
              severity: critical
              service: postgresql
            annotations:
              summary: "PostgreSQL connection failure"
              description: "Cannot connect to PostgreSQL instance for more than 1 minute"
          - alert: GeminiFlowDeploymentRolloutStuck
            expr: kube_deployment_status_condition{namespace="gemini-flow",condition="Progressing",status="false"} == 1
            for: 15m
            labels:
              severity: warning
              service: gemini-flow
            annotations:
              summary: "Gemini-Flow deployment rollout is stuck"
              description: "Deployment {{ $labels.deployment }} rollout is stuck for more than 15 minutes"
          - alert: VertexAIQuotaExceeded
            expr: increase(vertex_ai_quota_exceeded_total[5m]) > 0
            for: 1m
            labels:
              severity: warning
              service: vertex-ai
            annotations:
              summary: "Vertex AI quota exceeded"
              description: "Vertex AI quota has been exceeded {{ $value }} times in the last 5 minutes"
          - alert: FeatureFlagServiceDown
            expr: up{job="unleash"} == 0
            for: 2m
            labels:
              severity: warning
              service: feature-flags
            annotations:
              summary: "Feature flag service is down"
              description: "Unleash feature flag service has been down for more than 2 minutes"
      - name: kubernetes.rules
        rules:
          - alert: KubernetesNodeDown
            expr: up{job="kubernetes-nodes"} == 0
            for: 5m
            labels:
              severity: critical
              service: kubernetes
            annotations:
              summary: "Kubernetes node is down"
              description: "Node {{ $labels.instance }} has been down for more than 5 minutes"
          - alert: KubernetesNodeHighCPU
            expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
            for: 10m
            labels:
              severity: warning
              service: kubernetes
            annotations:
              summary: "High CPU usage on Kubernetes node"
              description: "CPU usage is {{ $value }}% on node {{ $labels.instance }}"
          - alert: KubernetesNodeHighMemory
            expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
            for: 10m
            labels:
              severity: warning
              service: kubernetes
            annotations:
              summary: "High memory usage on Kubernetes node"
              description: "Memory usage is {{ $value }}% on node {{ $labels.instance }}"
          - alert: KubernetesPodPending
            expr: kube_pod_status_phase{phase="Pending"} == 1
            for: 5m
            labels:
              severity: warning
              service: kubernetes
            annotations:
              summary: "Pod is stuck in pending state"
              description: "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} has been pending for more than 5 minutes"
---
# Single-replica Prometheus server. Configuration comes from the
# prometheus-config ConfigMap (mounted at /etc/prometheus) and the TSDB is kept
# on the prometheus-storage PersistentVolumeClaim (mounted at /prometheus).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: gemini-flow-monitoring
  labels:
    app.kubernetes.io/name: prometheus
    app.kubernetes.io/component: monitoring
spec:
  replicas: 1
  # The data volume is ReadWriteOnce and the TSDB directory is locked by the
  # running server, so the old pod must be gone before its replacement starts.
  # A rolling update would leave the new pod waiting on the volume.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/name: prometheus
  template:
    metadata:
      labels:
        app.kubernetes.io/name: prometheus
        app.kubernetes.io/component: monitoring
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "9090"
    spec:
      serviceAccountName: prometheus
      # Run as the unprivileged "nobody" uid/gid; fsGroup makes the mounted
      # data volume writable for that group.
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
        fsGroup: 65534
      containers:
        - name: prometheus
          image: prom/prometheus:v2.47.0
          args:
            - '--config.file=/etc/prometheus/prometheus.yml'
            - '--storage.tsdb.path=/prometheus/'
            - '--web.console.libraries=/etc/prometheus/console_libraries'
            - '--web.console.templates=/etc/prometheus/consoles'
            # Enables the /-/reload and /-/quit HTTP endpoints.
            - '--web.enable-lifecycle'
            # --web.enable-admin-api is intentionally not set: it exposes
            # unauthenticated delete/snapshot endpoints to anything that can
            # reach the Service.
            - '--storage.tsdb.retention.time=30d'
            # Kept below the 50Gi volume size (about 80%) so the TSDB still has
            # room for the WAL and compaction before size-based retention
            # kicks in.
            - '--storage.tsdb.retention.size=40GB'
          ports:
            - name: http
              containerPort: 9090
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
          resources:
            requests:
              cpu: 500m
              memory: 2Gi
            limits:
              cpu: 2000m
              memory: 8Gi
          volumeMounts:
            - name: config
              mountPath: /etc/prometheus
            - name: storage
              mountPath: /prometheus
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9090
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9090
            initialDelaySeconds: 5
            periodSeconds: 5
      volumes:
        - name: config
          configMap:
            name: prometheus-config
        - name: storage
          persistentVolumeClaim:
            claimName: prometheus-storage
---
# Service in front of the Prometheus pods. It selects pods labelled
# app.kubernetes.io/name=prometheus and forwards port 9090 to container port
# 9090. No service type is set, so the Kubernetes default applies.
apiVersion: v1
kind: Service
metadata:
  namespace: gemini-flow-monitoring
  name: prometheus
  labels:
    app.kubernetes.io/component: monitoring
    app.kubernetes.io/name: prometheus
spec:
  ports:
    - name: http
      port: 9090
      targetPort: 9090
  selector:
    app.kubernetes.io/name: prometheus
---
# Claim for the Prometheus data volume: 50Gi, single-node read/write, from the
# fast-ssd storage class. The prometheus Deployment references it by name.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  namespace: gemini-flow-monitoring
  name: prometheus-storage
  labels:
    app.kubernetes.io/component: storage
    app.kubernetes.io/name: prometheus
spec:
  storageClassName: fast-ssd
  accessModes: [ReadWriteOnce]
  resources:
    requests:
      storage: 50Gi
---
# Identity the Prometheus pods run as; the ClusterRoleBinding named prometheus
# attaches its cluster-wide read permissions to this account.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: gemini-flow-monitoring
  name: prometheus
  labels:
    app.kubernetes.io/component: monitoring
    app.kubernetes.io/name: prometheus
---
# Cluster-wide read-only permissions for Prometheus: get/list/watch on the
# objects listed below, plus GET on the /metrics non-resource URL.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
  labels:
    app.kubernetes.io/component: monitoring
    app.kubernetes.io/name: prometheus
rules:
  # Core API group: nodes (including their proxy and metrics subresources),
  # services, endpoints and pods.
  - apiGroups:
      - ""
    resources: [nodes, nodes/proxy, nodes/metrics, services, endpoints, pods]
    verbs:
      - get
      - list
      - watch
  # Workload controllers.
  - apiGroups:
      - extensions
      - apps
    resources: [deployments, replicasets, statefulsets, daemonsets]
    verbs:
      - get
      - list
      - watch
  # Ingress objects.
  - apiGroups:
      - networking.k8s.io
    resources: [ingresses]
    verbs:
      - get
      - list
      - watch
  # Non-resource endpoint: the /metrics URL.
  - nonResourceURLs:
      - /metrics
    verbs:
      - get
---
# Grants the prometheus ClusterRole to the prometheus ServiceAccount in the
# gemini-flow-monitoring namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
  labels:
    app.kubernetes.io/component: monitoring
    app.kubernetes.io/name: prometheus
subjects:
  - kind: ServiceAccount
    namespace: gemini-flow-monitoring
    name: prometheus
roleRef:
  kind: ClusterRole
  name: prometheus
  apiGroup: rbac.authorization.k8s.io