UNPKG

@clduab11/gemini-flow

Version:

Revolutionary AI agent swarm coordination platform with Google Services integration, multimedia processing, and production-ready monitoring. Features 8 Google AI services, quantum computing capabilities, and enterprise-grade security.

617 lines (580 loc) 20.2 kB
# Prometheus monitoring stack for the gemini-flow cluster.
#
# Resources, in order:
#   1. ConfigMap            - prometheus.yml (scrape config) and alert_rules.yml
#   2. Deployment           - single Prometheus replica backed by a PVC
#   3. Service              - exposes the Prometheus HTTP port (9090)
#   4. PersistentVolumeClaim - TSDB storage
#   5. ServiceAccount / ClusterRole / ClusterRoleBinding - read-only access
#      used by Kubernetes service discovery
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: gemini-flow-monitoring
  labels:
    app.kubernetes.io/name: prometheus
    app.kubernetes.io/component: monitoring
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      scrape_timeout: 10s
      evaluation_interval: 15s
      external_labels:
        cluster: 'gemini-flow-gke'
        environment: 'production'
        region: 'us-central1'

    # This ConfigMap is mounted at /etc/prometheus by the Deployment below, so
    # the rules file lives next to prometheus.yml rather than under rules/.
    rule_files:
      - "/etc/prometheus/alert_rules.yml"

    alerting:
      alertmanagers:
        - static_configs:
            - targets:
                - alertmanager:9093

    scrape_configs:
      # Prometheus self-monitoring
      - job_name: 'prometheus'
        static_configs:
          - targets: ['localhost:9090']
        scrape_interval: 5s

      # Kubernetes API server
      - job_name: 'kubernetes-apiservers'
        kubernetes_sd_configs:
          - role: endpoints
            namespaces:
              names:
                - default
        scheme: https
        # The API server certificate is verified against the service-account
        # CA bundle, so certificate verification stays enabled.
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          # Keep only the default/kubernetes service's https endpoint.
          - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
            action: keep
            regex: default;kubernetes;https

      # Kubernetes nodes (kubelet scraped directly on each node)
      - job_name: 'kubernetes-nodes'
        kubernetes_sd_configs:
          - role: node
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          # NOTE(review): verification is skipped here because the kubelet
          # serving certificate may not be signed by the cluster CA - confirm
          # for this cluster and remove if the kubelet certs are verifiable.
          insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)

      # Kubernetes cAdvisor (container metrics), proxied through the API server
      - job_name: 'kubernetes-nodes-cadvisor'
        kubernetes_sd_configs:
          - role: node
        scheme: https
        # Requests go to kubernetes.default.svc:443 (see relabeling below), so
        # the service-account CA bundle is sufficient to verify the peer.
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels: [__meta_kubernetes_node_name]
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor

      # Gemini-Flow application metrics
      - job_name: 'gemini-flow-api'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - gemini-flow
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
            action: keep
            regex: gemini-flow
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
            action: keep
            regex: api
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: 'true'
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          # Rewrite host[:port] to host:<prometheus.io/port annotation>.
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: '([^:]+)(?::\d+)?;(\d+)'
            replacement: '$1:$2'
            target_label: __address__
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: kubernetes_pod_name

      # Gemini-Flow worker metrics
      - job_name: 'gemini-flow-worker'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - gemini-flow
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
            action: keep
            regex: gemini-flow
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
            action: keep
            regex: worker
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: 'true'
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: '([^:]+)(?::\d+)?;(\d+)'
            replacement: '$1:$2'
            target_label: __address__
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_pod_name]
            action: replace
            target_label: kubernetes_pod_name

      # Redis metrics
      - job_name: 'redis'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - gemini-flow
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app]
            action: keep
            regex: redis
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: 'true'
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: '([^:]+)(?::\d+)?;(\d+)'
            replacement: '$1:$2'
            target_label: __address__

      # PostgreSQL metrics
      - job_name: 'postgresql'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - gemini-flow
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app]
            action: keep
            regex: postgresql
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: 'true'

      # Istio mesh metrics
      - job_name: 'istio-mesh'
        kubernetes_sd_configs:
          - role: endpoints
            namespaces:
              names:
                - istio-system
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
            action: keep
            regex: istio-proxy;http-monitoring

      # Istio pilot metrics
      - job_name: 'istio-pilot'
        kubernetes_sd_configs:
          - role: endpoints
            namespaces:
              names:
                - istio-system
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
            action: keep
            regex: istiod;http-monitoring

      # Annotation-driven discovery: scrapes the endpoints of any Service
      # annotated with prometheus.io/scrape=true (scheme, path and port are
      # taken from the matching prometheus.io/* annotations when present).
      - job_name: 'kubernetes-service-endpoints'
        kubernetes_sd_configs:
          - role: endpoints
        relabel_configs:
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
            action: keep
            regex: 'true'
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
            action: replace
            target_label: __scheme__
            regex: (https?)
          - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
            action: replace
            target_label: __address__
            regex: '([^:]+)(?::\d+)?;(\d+)'
            replacement: '$1:$2'
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: kubernetes_namespace
          - source_labels: [__meta_kubernetes_service_name]
            action: replace
            target_label: kubernetes_name

      # Google Cloud Monitoring (for GCP resources), probed via blackbox-exporter
      # NOTE(review): PROJECT_ID in the target URL is a placeholder and must be
      # replaced with the real project before this job is useful.
      - job_name: 'gcp-monitoring'
        metrics_path: /probe
        params:
          module: [http_2xx]
        static_configs:
          - targets:
              - https://monitoring.googleapis.com/v1/projects/PROJECT_ID/metricDescriptors
        relabel_configs:
          # Pass the original target as ?target=..., keep it as the instance
          # label, and send the actual scrape to the blackbox exporter.
          - source_labels: [__address__]
            target_label: __param_target
          - source_labels: [__param_target]
            target_label: instance
          - target_label: __address__
            replacement: blackbox-exporter:9115

      # Argo Rollouts metrics
      - job_name: 'argo-rollouts'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - argo-rollouts
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
            action: keep
            regex: argo-rollouts
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: 'true'

      # Unleash feature flags metrics
      - job_name: 'unleash'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - gemini-flow
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app]
            action: keep
            regex: unleash
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: 'true'
        metrics_path: /internal-backstage/prometheus

  alert_rules.yml: |
    groups:
      - name: gemini-flow.rules
        rules:
          - alert: GeminiFlowHighErrorRate
            # Both sides are aggregated with sum(): without it the 5xx series
            # only match denominator series carrying the same status label,
            # which makes the ratio 1 whenever any 5xx response is seen.
            expr: |
              sum(rate(http_requests_total{job="gemini-flow-api",status=~"5.."}[5m]))
                /
              sum(rate(http_requests_total{job="gemini-flow-api"}[5m]))
                > 0.05
            for: 5m
            labels:
              severity: warning
              service: gemini-flow
            annotations:
              summary: "High error rate detected for Gemini-Flow API"
              description: "Error rate is {{ $value | humanizePercentage }} for the last 5 minutes"

          - alert: GeminiFlowHighLatency
            expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="gemini-flow-api"}[5m])) > 1.0
            for: 5m
            labels:
              severity: warning
              service: gemini-flow
            annotations:
              summary: "High latency detected for Gemini-Flow API"
              description: "95th percentile latency is {{ $value }}s for the last 5 minutes"

          - alert: GeminiFlowPodCrashLooping
            expr: increase(kube_pod_container_status_restarts_total{namespace="gemini-flow"}[15m]) > 0
            for: 5m
            labels:
              severity: critical
              service: gemini-flow
            annotations:
              summary: "Gemini-Flow pod is crash looping"
              description: "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} has restarted {{ $value }} times in the last 15 minutes"

          - alert: GeminiFlowHighCPUUsage
            # NOTE(review): this compares CPU cores used (not a fraction of the
            # container limit) against 0.8 - confirm that is the intent, since
            # the description renders the value as a percentage.
            expr: rate(container_cpu_usage_seconds_total{namespace="gemini-flow",container="gemini-flow-api"}[5m]) > 0.8
            for: 10m
            labels:
              severity: warning
              service: gemini-flow
            annotations:
              summary: "High CPU usage for Gemini-Flow"
              description: "CPU usage is {{ $value | humanizePercentage }} for pod {{ $labels.pod }}"

          - alert: GeminiFlowHighMemoryUsage
            # The "> 0" filter drops containers without a memory limit; their
            # limit is reported as 0, which would otherwise yield +Inf and a
            # permanently firing alert.
            expr: |
              container_memory_usage_bytes{namespace="gemini-flow",container="gemini-flow-api"}
                /
              (container_spec_memory_limit_bytes{namespace="gemini-flow",container="gemini-flow-api"} > 0)
                > 0.85
            for: 10m
            labels:
              severity: warning
              service: gemini-flow
            annotations:
              summary: "High memory usage for Gemini-Flow"
              description: "Memory usage is {{ $value | humanizePercentage }} for pod {{ $labels.pod }}"

          - alert: GeminiFlowServiceDown
            expr: up{job="gemini-flow-api"} == 0
            for: 1m
            labels:
              severity: critical
              service: gemini-flow
            annotations:
              summary: "Gemini-Flow service is down"
              description: "Gemini-Flow API service has been down for more than 1 minute"

          - alert: RedisConnectionFailure
            expr: redis_up{job="redis"} == 0
            for: 1m
            labels:
              severity: critical
              service: redis
            annotations:
              summary: "Redis connection failure"
              description: "Cannot connect to Redis instance for more than 1 minute"

          - alert: PostgreSQLConnectionFailure
            expr: pg_up{job="postgresql"} == 0
            for: 1m
            labels:
              severity: critical
              service: postgresql
            annotations:
              summary: "PostgreSQL connection failure"
              description: "Cannot connect to PostgreSQL instance for more than 1 minute"

          - alert: GeminiFlowDeploymentRolloutStuck
            expr: kube_deployment_status_condition{namespace="gemini-flow",condition="Progressing",status="false"} == 1
            for: 15m
            labels:
              severity: warning
              service: gemini-flow
            annotations:
              summary: "Gemini-Flow deployment rollout is stuck"
              description: "Deployment {{ $labels.deployment }} rollout is stuck for more than 15 minutes"

          - alert: VertexAIQuotaExceeded
            expr: increase(vertex_ai_quota_exceeded_total[5m]) > 0
            for: 1m
            labels:
              severity: warning
              service: vertex-ai
            annotations:
              summary: "Vertex AI quota exceeded"
              description: "Vertex AI quota has been exceeded {{ $value }} times in the last 5 minutes"

          - alert: FeatureFlagServiceDown
            expr: up{job="unleash"} == 0
            for: 2m
            labels:
              severity: warning
              service: feature-flags
            annotations:
              summary: "Feature flag service is down"
              description: "Unleash feature flag service has been down for more than 2 minutes"

      - name: kubernetes.rules
        rules:
          - alert: KubernetesNodeDown
            expr: up{job="kubernetes-nodes"} == 0
            for: 5m
            labels:
              severity: critical
              service: kubernetes
            annotations:
              summary: "Kubernetes node is down"
              description: "Node {{ $labels.instance }} has been down for more than 5 minutes"

          - alert: KubernetesNodeHighCPU
            expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
            for: 10m
            labels:
              severity: warning
              service: kubernetes
            annotations:
              summary: "High CPU usage on Kubernetes node"
              description: "CPU usage is {{ $value }}% on node {{ $labels.instance }}"

          - alert: KubernetesNodeHighMemory
            expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
            for: 10m
            labels:
              severity: warning
              service: kubernetes
            annotations:
              summary: "High memory usage on Kubernetes node"
              description: "Memory usage is {{ $value }}% on node {{ $labels.instance }}"

          - alert: KubernetesPodPending
            expr: kube_pod_status_phase{phase="Pending"} == 1
            for: 5m
            labels:
              severity: warning
              service: kubernetes
            annotations:
              summary: "Pod is stuck in pending state"
              description: "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} has been pending for more than 5 minutes"
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus
  namespace: gemini-flow-monitoring
  labels:
    app.kubernetes.io/name: prometheus
    app.kubernetes.io/component: monitoring
spec:
  replicas: 1
  # The TSDB volume is a ReadWriteOnce PVC used by a single replica. Recreate
  # stops the old pod before starting the new one, so the replacement pod is
  # never blocked waiting for a volume the old pod still holds.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/name: prometheus
  template:
    metadata:
      labels:
        app.kubernetes.io/name: prometheus
        app.kubernetes.io/component: monitoring
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "9090"
    spec:
      serviceAccountName: prometheus
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
        fsGroup: 65534
      containers:
        - name: prometheus
          image: prom/prometheus:v2.47.0
          args:
            - '--config.file=/etc/prometheus/prometheus.yml'
            - '--storage.tsdb.path=/prometheus/'
            # NOTE(review): the ConfigMap volume is mounted over
            # /etc/prometheus, so these console directories only exist if the
            # ConfigMap provides them - confirm whether consoles are needed.
            - '--web.console.libraries=/etc/prometheus/console_libraries'
            - '--web.console.templates=/etc/prometheus/consoles'
            - '--web.enable-lifecycle'
            # NOTE(review): the admin API allows deleting series and taking
            # snapshots over HTTP; confirm it is required and that access to
            # port 9090 is restricted.
            - '--web.enable-admin-api'
            - '--storage.tsdb.retention.time=30d'
            # Kept below the 50Gi volume size so the WAL and in-progress
            # compactions have headroom and the disk does not fill up.
            - '--storage.tsdb.retention.size=40GB'
          ports:
            - containerPort: 9090
              name: http
          resources:
            requests:
              cpu: 500m
              memory: 2Gi
            limits:
              cpu: 2000m
              memory: 8Gi
          volumeMounts:
            - name: config
              mountPath: /etc/prometheus
            - name: storage
              mountPath: /prometheus
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9090
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9090
            initialDelaySeconds: 5
            periodSeconds: 5
      volumes:
        - name: config
          configMap:
            name: prometheus-config
        - name: storage
          persistentVolumeClaim:
            claimName: prometheus-storage
---
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: gemini-flow-monitoring
  labels:
    app.kubernetes.io/name: prometheus
    app.kubernetes.io/component: monitoring
spec:
  selector:
    app.kubernetes.io/name: prometheus
  ports:
    - port: 9090
      targetPort: 9090
      name: http
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus-storage
  namespace: gemini-flow-monitoring
  labels:
    app.kubernetes.io/name: prometheus
    app.kubernetes.io/component: storage
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 50Gi
  storageClassName: fast-ssd
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: gemini-flow-monitoring
  labels:
    app.kubernetes.io/name: prometheus
    app.kubernetes.io/component: monitoring
---
# Read-only, cluster-wide access used by the kubernetes_sd_configs in
# prometheus.yml and by the node/cAdvisor proxy scrapes.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
  labels:
    app.kubernetes.io/name: prometheus
    app.kubernetes.io/component: monitoring
rules:
  - apiGroups: [""]
    resources:
      - nodes
      - nodes/proxy
      - nodes/metrics
      - services
      - endpoints
      - pods
    verbs: ["get", "list", "watch"]
  - apiGroups: ["extensions", "apps"]
    resources:
      - deployments
      - replicasets
      - statefulsets
      - daemonsets
    verbs: ["get", "list", "watch"]
  - apiGroups: ["networking.k8s.io"]
    resources:
      - ingresses
    verbs: ["get", "list", "watch"]
  - nonResourceURLs: ["/metrics"]
    verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
  labels:
    app.kubernetes.io/name: prometheus
    app.kubernetes.io/component: monitoring
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: gemini-flow-monitoring