UNPKG

openclaw-grafana-lens

Version:

OpenClaw plugin that gives AI agents full Grafana access — 18 composable tools for PromQL/LogQL/TraceQL queries, dashboard creation, alerting, SRE investigation, security monitoring, data collection pipeline management via Grafana Alloy (29 recipes), and

481 lines (480 loc) 25.1 kB
/**
 * grafana_investigate tool
 *
 * Multi-signal investigation triage: gathers metrics, logs, traces, and context
 * signals in parallel, then generates hypothesis suggestions with specific
 * tool+params for follow-up.
 *
 * Design: This is a "first step" accelerator — use it to quickly gather
 * evidence across all signal types, then follow up with individual tools
 * (grafana_query, grafana_query_logs, grafana_query_traces) for deep-dives.
 *
 * Follows the same resilient pattern as grafana_security_check:
 * Promise.allSettled for graceful degradation when signal sources are unavailable.
 */
import { jsonResult, readStringParam } from "../sdk-compat.js";
import { instanceProperties } from "./instance-param.js";

/** Lookback windows mapped to step sizes for range queries. */
const WINDOW_CONFIG = {
  "1h": { seconds: 3600, step: "60" },
  "6h": { seconds: 21600, step: "300" },
  "24h": { seconds: 86400, step: "900" },
};

/** Matches a bare metric name (no selectors, functions, or whitespace). */
const PLAIN_METRIC_RE = /^[a-zA-Z_:][a-zA-Z0-9_:]*$/;

/**
 * Escape a value for embedding inside a double-quoted LogQL/TraceQL string
 * literal. Backslashes are escaped first so that the backslashes introduced
 * for quotes are not escaped a second time.
 *
 * @param {string} value - Raw caller-supplied text.
 * @returns {string} Text safe to place between double quotes in a query.
 */
function escapeQueryString(value) {
  return String(value).replace(/\\/g, "\\\\").replace(/"/g, '\\"');
}

/**
 * Turn a rejection reason into a human-readable message.
 *
 * @param {unknown} reason - Value a promise was rejected with.
 * @returns {string} `reason.message` for Error instances, otherwise `String(reason)`.
 */
function describeReason(reason) {
  return reason instanceof Error ? reason.message : String(reason);
}

/**
 * Build the factory for the `grafana_investigate` tool.
 *
 * @param registry - Object whose `get(instanceName)` returns the Grafana client
 *   used for all queries.
 * @param store - Object whose `getPendingAlerts()` returns the currently pending
 *   alerts (read for focus matching and for the context signals).
 * @returns {Function} Factory that, given a context, returns the tool definition
 *   (name, label, description, JSON-schema parameters and `execute`).
 */
export function createInvestigateToolFactory(registry, store) {
  return (_ctx) => ({
    name: "grafana_investigate",
    label: "Investigate",
    description: [
      "WORKFLOW: Use as the FIRST step for investigating alerts, errors, anomalies, or any 'what's wrong?' question.",
      "Gathers multi-signal evidence in parallel (metrics, logs, traces, annotations, active alerts) and generates hypothesis suggestions with specific tool+params for follow-up.",
      "Each hypothesis includes a `testWith` field with the exact tool name and parameters to test it — use these for deep-dives.",
      "Focus can be an alert UID, metric name, or free-text symptom description.",
      "Gracefully degrades when Loki or Tempo datasources are unavailable — you still get metrics and context signals.",
      "After investigating, use grafana_annotate to mark findings and grafana_check_alerts to acknowledge alerts.",
    ].join(" "),
    parameters: {
      type: "object",
      properties: {
        ...instanceProperties(registry),
        focus: {
          type: "string",
          description:
            "Alert UID, metric name, or symptom description (e.g., 'alert-abc', 'openclaw_lens_daily_cost_usd', 'high error rate')",
        },
        timeWindow: {
          type: "string",
          enum: Object.keys(WINDOW_CONFIG),
          description:
            "Lookback window. Default: '1h'. Use '6h' for broader context, '24h' for daily patterns.",
        },
        service: {
          type: "string",
          description: "Filter logs/traces to a specific service name. Default: 'openclaw'.",
        },
      },
      required: ["focus"],
    },

    /**
     * Run the investigation and return a JSON result containing the gathered
     * signals, suggested hypotheses and a list of limitations. Failures after
     * parameter parsing are returned as `{ error }` results rather than thrown.
     */
    async execute(_toolCallId, params) {
      const client = registry.get(readStringParam(params, "instance"));
      const focus = readStringParam(params, "focus", { required: true, label: "Focus" });
      const timeWindow = readStringParam(params, "timeWindow") ?? "1h";
      const service = readStringParam(params, "service") ?? "openclaw";

      // Own-property check: a plain lookup would also accept inherited keys
      // such as "constructor" and yield NaN timestamps further down.
      const windowCfg = Object.prototype.hasOwnProperty.call(WINDOW_CONFIG, timeWindow)
        ? WINDOW_CONFIG[timeWindow]
        : undefined;
      if (!windowCfg) {
        return jsonResult({ error: `Invalid timeWindow '${timeWindow}' — use '1h', '6h', or '24h'` });
      }

      const nowMs = Date.now();
      const fromMs = nowMs - windowCfg.seconds * 1000;
      const nowSec = Math.floor(nowMs / 1000);
      const fromSec = nowSec - windowCfg.seconds;

      try {
        // ── Step 1: Auto-discover datasources ──────────────────────────
        const datasources = await client.listDatasources();
        const promDs = datasources.find((d) => d.type === "prometheus");
        const lokiDs = datasources.find((d) => d.type === "loki");
        const tempoDs = datasources.find((d) => d.type === "tempo");
        if (!promDs) {
          return jsonResult({
            error:
              "No Prometheus datasource found — investigation requires at least Prometheus. Use grafana_explore_datasources to verify.",
          });
        }

        // ── Step 2: Resolve focus ────────────────────────────────────
        const isMetricFocus = PLAIN_METRIC_RE.test(focus);
        // Check if focus matches an alert in the store
        const pendingAlerts = store.getPendingAlerts();
        const matchingAlert = pendingAlerts.find((a) => a.id === focus);
        const focusExpr = isMetricFocus ? focus : undefined;
        const logSearchTerm = isMetricFocus ? undefined : focus;

        // ── Step 3: Gather all signals in parallel ────────────────────
        const metricPromises = gatherMetricSignals(client, promDs.uid, focusExpr, windowCfg, nowSec, fromSec);
        const logPromise = lokiDs
          ? gatherLogSignals(client, lokiDs.uid, service, logSearchTerm, timeWindow, fromMs, nowMs)
          : Promise.resolve(null);
        const tracePromise = tempoDs
          ? gatherTraceSignals(client, tempoDs.uid, service, timeWindow, fromMs, nowMs)
          : Promise.resolve(null);
        const contextPromise = gatherContextSignals(client, fromMs, nowMs);

        const [metricResult, logResult, traceResult, contextResult] = await Promise.allSettled([
          metricPromises,
          logPromise,
          tracePromise,
          contextPromise,
        ]);

        // ── Step 4: Extract results with graceful degradation ────────
        const metricSignals = metricResult.status === "fulfilled" ? metricResult.value : undefined;
        const logSignals = logResult.status === "fulfilled" ? logResult.value : undefined;
        const traceSignals = traceResult.status === "fulfilled" ? traceResult.value : undefined;
        const contextSignals = contextResult.status === "fulfilled" ? contextResult.value : undefined;

        // ── Step 5: Generate hypotheses ──────────────────────────────
        const hypotheses = generateHypotheses({
          focus,
          focusExpr,
          matchingAlert,
          metricSignals,
          logSignals,
          traceSignals,
          contextSignals,
          promDsUid: promDs.uid,
          lokiDsUid: lokiDs?.uid,
          tempoDsUid: tempoDs?.uid,
          service,
        });

        // ── Step 6: Build limitations list ───────────────────────────
        const limitations = [];
        if (!lokiDs) limitations.push("No Loki datasource found — log signals unavailable");
        if (!tempoDs) limitations.push("No Tempo datasource found — trace signals unavailable");
        if (metricResult.status === "rejected") {
          limitations.push(`Metric queries failed: ${describeReason(metricResult.reason)}`);
        }
        if (logResult.status === "rejected") {
          limitations.push(`Log queries failed: ${describeReason(logResult.reason)}`);
        }
        if (traceResult.status === "rejected") {
          limitations.push(`Trace queries failed: ${describeReason(traceResult.reason)}`);
        }
        // Context failures are reported like every other signal source so a
        // missing `contextSignals` section is never silent.
        if (contextResult.status === "rejected") {
          limitations.push(`Context queries failed: ${describeReason(contextResult.reason)}`);
        }

        return jsonResult({
          timeWindow: {
            from: new Date(fromMs).toISOString(),
            to: new Date(nowMs).toISOString(),
            duration: timeWindow,
          },
          focus,
          ...(metricSignals ? { metricSignals } : {}),
          ...(logSignals ? { logSignals } : {}),
          ...(traceSignals ? { traceSignals } : {}),
          ...(contextSignals ? { contextSignals } : {}),
          suggestedHypotheses: hypotheses,
          limitations,
        });
      } catch (err) {
        return jsonResult({ error: `Investigation failed: ${describeReason(err)}` });
      }
    },
  });

  // ── Signal gathering functions ─────────────────────────────────────
  // Function declarations are hoisted, so `execute` above can call them even
  // though they appear after the `return`.

  /**
   * Query Prometheus for the focused metric (current value, down-sampled trend,
   * anomaly score against a 7d baseline) and for the general RED signals.
   * Individual query failures are tolerated; the affected fields are omitted.
   *
   * @returns {Promise<object>} Object with optional `focus` and `red` sections.
   */
  async function gatherMetricSignals(client, promDsUid, focusExpr, windowCfg, nowSec, fromSec) {
    const signals = {};

    if (focusExpr) {
      // Query the focused metric
      const [instantResult, rangeResult] = await Promise.allSettled([
        client.queryPrometheus(promDsUid, focusExpr),
        client.queryPrometheusRange(promDsUid, focusExpr, String(fromSec), String(nowSec), windowCfg.step),
      ]);

      const focusData = {};
      if (instantResult.status === "fulfilled") {
        const first = instantResult.value.data.result[0];
        if (first) focusData.current = first.value[1];
      }
      if (rangeResult.status === "fulfilled") {
        const series = rangeResult.value.data.result[0];
        if (series && series.values.length > 0) {
          // Down-sample to roughly 10 points for context efficiency; the last
          // point is always kept so the trend ends at the newest sample.
          const step = Math.max(1, Math.floor(series.values.length / 10));
          focusData.trend = series.values
            .filter((_, i) => i % step === 0 || i === series.values.length - 1)
            .map(([ts, v]) => ({
              time: new Date(ts * 1000).toISOString(),
              value: v,
            }));
        }
      }

      // Anomaly scoring against 7d baseline
      const [avgResult, stddevResult] = await Promise.allSettled([
        client.queryPrometheus(promDsUid, `avg_over_time(${focusExpr}[7d])`),
        client.queryPrometheus(promDsUid, `stddev_over_time(${focusExpr}[7d])`),
      ]);
      if (avgResult.status === "fulfilled" && stddevResult.status === "fulfilled") {
        const avgVal = avgResult.value.data.result[0]?.value[1];
        const stddevVal = stddevResult.value.data.result[0]?.value[1];
        if (avgVal && stddevVal && focusData.current) {
          const avg = parseFloat(avgVal);
          const stddev = parseFloat(stddevVal);
          const current = parseFloat(focusData.current);
          // Skip scoring when any sample is non-numeric (e.g. "NaN"); a NaN
          // score would otherwise serialize as null next to a "normal" severity.
          if (Number.isFinite(avg) && Number.isFinite(stddev) && Number.isFinite(current)) {
            // The small epsilon avoids division by zero for a flat baseline.
            const zScore = Math.abs((current - avg) / (stddev + 1e-10));
            focusData.anomalyScore = parseFloat(zScore.toFixed(2));
            focusData.anomalySeverity =
              zScore >= 3 ? "critical" : zScore >= 2 ? "significant" : zScore >= 1.5 ? "mild" : "normal";
          }
        }
      }

      if (Object.keys(focusData).length > 0) signals.focus = focusData;
    }

    // RED signals (always query these for general health context)
    const [rateResult, errorRateResult, p95Result] = await Promise.allSettled([
      client.queryPrometheus(promDsUid, "sum(rate(openclaw_lens_messages_processed_total[5m])) or vector(0)"),
      client.queryPrometheus(
        promDsUid,
        "sum(rate(openclaw_lens_messages_processed_total{outcome=\"error\"}[5m])) / (sum(rate(openclaw_lens_messages_processed_total[5m])) + 0.001)",
      ),
      client.queryPrometheus(
        promDsUid,
        "histogram_quantile(0.95, sum(rate(gen_ai_client_operation_duration_seconds_bucket[5m])) by (le))",
      ),
    ]);

    const red = {};
    if (rateResult.status === "fulfilled") {
      red.rate = rateResult.value.data.result[0]?.value[1];
    }
    if (errorRateResult.status === "fulfilled") {
      red.errorRate = errorRateResult.value.data.result[0]?.value[1];
    }
    if (p95Result.status === "fulfilled") {
      red.p95Latency = p95Result.value.data.result[0]?.value[1];
    }
    if (Object.keys(red).length > 0) signals.red = red;

    return signals;
  }

  /**
   * Query Loki for total log volume, a per-level breakdown, and up to five
   * recent error lines for the given service (optionally filtered by a search
   * term). Individual query failures leave the corresponding defaults in place.
   *
   * @returns {Promise<object>} `{ totalVolume, errorCount, bySeverity, topPatterns, sampleErrors }`.
   */
  async function gatherLogSignals(client, lokiDsUid, service, searchTerm, timeWindow, fromMs, nowMs) {
    const fromNs = String(fromMs * 1_000_000);
    const toNs = String(nowMs * 1_000_000);

    // Statistics-first: volume, severity breakdown, top patterns, then samples.
    // Caller-supplied text is escaped so quotes/backslashes cannot break out
    // of the string literal in the selector or line filter.
    const baseSelector = `{service_name="${escapeQueryString(service)}"}`;
    const errorFilter = searchTerm
      ? `${baseSelector} | json | level="ERROR" |= "${escapeQueryString(searchTerm)}"`
      : `${baseSelector} | json | level="ERROR"`;

    const [volumeResult, severityResult, sampleResult] = await Promise.allSettled([
      client.queryLokiRange(lokiDsUid, `sum(count_over_time(${baseSelector}[${timeWindow}]))`, fromNs, toNs, {
        limit: 1,
      }),
      client.queryLokiRange(
        lokiDsUid,
        `sum by (level) (count_over_time(${baseSelector} | json [${timeWindow}]))`,
        fromNs,
        toNs,
        { limit: 10 },
      ),
      client.queryLokiRange(lokiDsUid, errorFilter, fromNs, toNs, { limit: 5, direction: "backward" }),
    ]);

    const signals = {
      totalVolume: 0,
      errorCount: 0,
      bySeverity: {},
      topPatterns: [],
      sampleErrors: [],
    };

    // Volume
    if (volumeResult.status === "fulfilled") {
      const firstStream = volumeResult.value?.data?.result?.[0];
      if (firstStream?.values?.length) {
        signals.totalVolume = parseInt(firstStream.values[firstStream.values.length - 1][1], 10) || 0;
      }
    }

    // Severity breakdown
    if (severityResult.status === "fulfilled") {
      const streams = severityResult.value?.data?.result;
      if (streams) {
        for (const stream of streams) {
          const level = stream.metric?.level ?? "unknown";
          const lastVal = stream.values?.length
            ? parseInt(stream.values[stream.values.length - 1][1], 10)
            : 0;
          signals.bySeverity[level] = lastVal || 0;
          if (level === "ERROR" || level === "error") {
            // Accumulate: both spellings may be present as separate streams.
            signals.errorCount += lastVal || 0;
          }
        }
      }
    }

    // Sample errors
    if (sampleResult.status === "fulfilled") {
      const streams = sampleResult.value?.data?.result;
      if (streams) {
        for (const stream of streams) {
          if (!stream.values) continue;
          for (const [ts, line] of stream.values) {
            signals.sampleErrors.push({
              // Timestamps arrive as nanosecond strings; Date wants milliseconds.
              timestamp: new Date(parseInt(ts, 10) / 1_000_000).toISOString(),
              line: line.length > 200 ? `${line.slice(0, 200)}…` : line,
            });
          }
        }
        signals.sampleErrors = signals.sampleErrors.slice(0, 5);
      }
    }

    return signals;
  }

  /**
   * Search Tempo for up to five error traces and up to five traces slower
   * than 10s for the given service. A failed search yields an empty list.
   *
   * @returns {Promise<{errorTraces: object[], slowTraces: object[]}>}
   */
  async function gatherTraceSignals(client, tempoDsUid, service, _timeWindow, fromMs, nowMs) {
    const fromSec = String(Math.floor(fromMs / 1000));
    const toSec = String(Math.floor(nowMs / 1000));
    const safeService = escapeQueryString(service);

    const [errorResult, slowResult] = await Promise.allSettled([
      client.searchTraces(tempoDsUid, `{ resource.service.name = "${safeService}" && status = error }`, {
        start: fromSec,
        end: toSec,
        limit: 5,
      }),
      client.searchTraces(tempoDsUid, `{ resource.service.name = "${safeService}" && duration > 10s }`, {
        start: fromSec,
        end: toSec,
        limit: 5,
      }),
    ]);

    const mapTraces = (result) => {
      if (result.status !== "fulfilled") return [];
      return result.value.traces.map((t) => ({
        traceId: t.traceID,
        rootService: t.rootServiceName,
        rootSpan: t.rootTraceName,
        durationMs: t.durationMs,
      }));
    };

    return {
      errorTraces: mapTraces(errorResult),
      slowTraces: mapTraces(slowResult),
    };
  }

  /**
   * Collect context: up to ten annotations in the window (empty list when the
   * annotation query fails) plus the pending alerts held by `store`.
   *
   * @returns {Promise<{recentAnnotations: object[], alertsActive: object[]}>}
   */
  async function gatherContextSignals(client, fromMs, nowMs) {
    const [annotationsResult] = await Promise.allSettled([
      client.getAnnotations({ from: fromMs, to: nowMs, limit: 10 }),
    ]);

    const recentAnnotations =
      annotationsResult.status === "fulfilled"
        ? annotationsResult.value.map((a) => ({
            text: a.text,
            tags: a.tags,
            time: new Date(a.time).toISOString(),
          }))
        : [];

    const pending = store.getPendingAlerts();
    const alertsActive = pending.map((a) => ({
      title: a.title,
      status: a.status,
      since: new Date(a.receivedAt).toISOString(),
    }));

    return { recentAnnotations, alertsActive };
  }
}

/**
 * Derive follow-up hypotheses from the gathered signals. Each hypothesis
 * carries `hypothesis`, `evidence`, `confidence` and a `testWith` object naming
 * the tool and parameters to use for a deep-dive. When no rule matches, a single
 * generic low-confidence hypothesis is returned, so the result is never empty.
 *
 * @param {object} ctx - Signals and identifiers: `focus`, `focusExpr`,
 *   `matchingAlert`, `metricSignals`, `logSignals`, `traceSignals`,
 *   `contextSignals`, `promDsUid`, `lokiDsUid`, `tempoDsUid`, `service`.
 * @returns {object[]} Ordered list of hypotheses.
 */
function generateHypotheses(ctx) {
  const hypotheses = [];

  // H1: Error spike + recent annotation → deployment may have caused issues
  if (
    ctx.logSignals?.errorCount &&
    ctx.logSignals.errorCount > 0 &&
    ctx.contextSignals?.recentAnnotations?.length
  ) {
    const recentAnnotation = ctx.contextSignals.recentAnnotations[0];
    hypotheses.push({
      hypothesis: `Recent event "${recentAnnotation.text}" may have caused the error increase`,
      evidence: `${ctx.logSignals.errorCount} errors found, annotation "${recentAnnotation.text}" at ${recentAnnotation.time}`,
      confidence: "medium",
      testWith: {
        tool: "grafana_query",
        params: {
          datasourceUid: ctx.promDsUid,
          expr: `sum(rate(openclaw_lens_messages_processed_total{outcome="error"}[5m]))`,
          queryType: "range",
          start: recentAnnotation.time,
        },
      },
    });
  }

  // H2: High latency + context pressure → context window saturation
  const p95 = ctx.metricSignals?.red?.p95Latency ? parseFloat(ctx.metricSignals.red.p95Latency) : 0;
  if (p95 > 10) {
    hypotheses.push({
      hypothesis: "High LLM latency may indicate context window saturation or provider throttling",
      evidence: `P95 latency is ${p95.toFixed(1)}s (>10s threshold)`,
      confidence: "medium",
      testWith: {
        tool: "grafana_query",
        params: {
          datasourceUid: ctx.promDsUid,
          expr: "openclaw_lens_context_tokens{type=\"used\"} / openclaw_lens_context_tokens{type=\"limit\"} * 100",
        },
      },
    });
  }

  // H3: Error rate is elevated
  const errorRate = ctx.metricSignals?.red?.errorRate ? parseFloat(ctx.metricSignals.red.errorRate) : 0;
  if (errorRate > 0.01) {
    hypotheses.push({
      hypothesis: `Error rate is elevated at ${(errorRate * 100).toFixed(1)}%`,
      evidence: `Error rate: ${(errorRate * 100).toFixed(1)}%, message rate: ${ctx.metricSignals?.red?.rate ?? "unknown"}`,
      confidence: "high",
      testWith: ctx.lokiDsUid
        ? {
            tool: "grafana_query_logs",
            params: {
              datasourceUid: ctx.lokiDsUid,
              // Service name is escaped so the suggested query stays valid.
              expr: `topk(10, sum by (event_name) (count_over_time({service_name="${escapeQueryString(ctx.service)}"} | json | level="ERROR" [1h])))`,
            },
          }
        : {
            tool: "grafana_query",
            params: {
              datasourceUid: ctx.promDsUid,
              expr: `sum by (outcome) (rate(openclaw_lens_messages_processed_total[5m]))`,
            },
          },
    });
  }

  // H4: Anomaly detected on focus metric
  if (ctx.metricSignals?.focus?.anomalyScore && ctx.metricSignals.focus.anomalyScore >= 2) {
    hypotheses.push({
      hypothesis: `Focus metric shows ${ctx.metricSignals.focus.anomalySeverity} anomaly (${ctx.metricSignals.focus.anomalyScore}σ from baseline)`,
      evidence: `Current: ${ctx.metricSignals.focus.current}, z-score: ${ctx.metricSignals.focus.anomalyScore}σ`,
      confidence: ctx.metricSignals.focus.anomalyScore >= 3 ? "high" : "medium",
      testWith: {
        tool: "grafana_explain_metric",
        params: {
          datasourceUid: ctx.promDsUid,
          expr: ctx.focusExpr ?? ctx.focus,
          period: "24h",
        },
      },
    });
  }

  // H5: Error traces available → specific request paths failing
  if (ctx.traceSignals?.errorTraces && ctx.traceSignals.errorTraces.length > 0) {
    const trace = ctx.traceSignals.errorTraces[0];
    hypotheses.push({
      hypothesis: `Specific request paths are failing — ${ctx.traceSignals.errorTraces.length} error trace(s) found`,
      evidence: `Error trace: ${trace.rootSpan} (${trace.durationMs}ms) in ${trace.rootService}`,
      confidence: "high",
      testWith: ctx.tempoDsUid
        ? {
            tool: "grafana_query_traces",
            params: {
              datasourceUid: ctx.tempoDsUid,
              traceId: trace.traceId,
              queryType: "get",
            },
          }
        : {
            tool: "grafana_query",
            params: {
              datasourceUid: ctx.promDsUid,
              expr: `sum by (tool) (rate(openclaw_lens_tool_error_classes_total[5m]))`,
            },
          },
    });
  }

  // H6: Slow traces → performance degradation (skipped once 5 hypotheses exist)
  if (ctx.traceSignals?.slowTraces && ctx.traceSignals.slowTraces.length > 0 && hypotheses.length < 5) {
    const trace = ctx.traceSignals.slowTraces[0];
    hypotheses.push({
      hypothesis: `Performance degradation — ${ctx.traceSignals.slowTraces.length} slow trace(s) found (>10s)`,
      evidence: `Slow trace: ${trace.rootSpan} (${trace.durationMs}ms) in ${trace.rootService}`,
      confidence: "medium",
      testWith: ctx.tempoDsUid
        ? {
            tool: "grafana_query_traces",
            params: {
              datasourceUid: ctx.tempoDsUid,
              traceId: trace.traceId,
              queryType: "get",
            },
          }
        : {
            tool: "grafana_query",
            params: {
              datasourceUid: ctx.promDsUid,
              expr: "histogram_quantile(0.95, sum by (le, gen_ai_request_model) (rate(gen_ai_client_operation_duration_seconds_bucket[5m])))",
            },
          },
    });
  }

  // H7: Active alerts suggest ongoing issues (only when focus is not itself a known alert)
  if (ctx.contextSignals?.alertsActive && ctx.contextSignals.alertsActive.length > 0 && !ctx.matchingAlert) {
    hypotheses.push({
      hypothesis: `${ctx.contextSignals.alertsActive.length} active alert(s) may be related to the investigation`,
      evidence: ctx.contextSignals.alertsActive.map((a) => `${a.title} (${a.status} since ${a.since})`).join("; "),
      confidence: "low",
      testWith: {
        tool: "grafana_check_alerts",
        params: { action: "list" },
      },
    });
  }

  // If no hypotheses generated, provide generic investigation path
  if (hypotheses.length === 0) {
    hypotheses.push({
      hypothesis: "No clear anomaly detected — investigate further with focused queries",
      evidence: "All metric signals within normal range, no error traces found",
      confidence: "low",
      testWith: {
        tool: "grafana_explain_metric",
        params: {
          datasourceUid: ctx.promDsUid,
          expr: ctx.focusExpr ?? "openclaw_lens_daily_cost_usd",
          period: "24h",
        },
      },
    });
  }

  return hypotheses;
}