openclaw-grafana-lens
Version:
OpenClaw plugin that gives AI agents full Grafana access — 18 composable tools for PromQL/LogQL/TraceQL queries, dashboard creation, alerting, SRE investigation, security monitoring, data collection pipeline management via Grafana Alloy (29 recipes), and
481 lines (480 loc) • 25.1 kB
JavaScript
/**
* grafana_investigate tool
*
* Multi-signal investigation triage: gathers metrics, logs, traces, and context
* signals in parallel, then generates hypothesis suggestions with specific
* tool+params for follow-up.
*
* Design: This is a "first step" accelerator — use it to quickly gather
* evidence across all signal types, then follow up with individual tools
* (grafana_query, grafana_query_logs, grafana_query_traces) for deep-dives.
*
* Follows the same resilient pattern as grafana_security_check:
* Promise.allSettled for graceful degradation when signal sources are unavailable.
*/
import { jsonResult, readStringParam } from "../sdk-compat.js";
import { instanceProperties } from "./instance-param.js";
/** Lookback windows mapped to step sizes for range queries. */
const WINDOW_CONFIG = {
"1h": { seconds: 3600, step: "60" },
"6h": { seconds: 21600, step: "300" },
"24h": { seconds: 86400, step: "900" },
};
const PLAIN_METRIC_RE = /^[a-zA-Z_:][a-zA-Z0-9_:]*$/;
export function createInvestigateToolFactory(registry, store) {
return (_ctx) => ({
name: "grafana_investigate",
label: "Investigate",
description: [
"WORKFLOW: Use as the FIRST step for investigating alerts, errors, anomalies, or any 'what's wrong?' question.",
"Gathers multi-signal evidence in parallel (metrics, logs, traces, annotations, active alerts) and generates hypothesis suggestions with specific tool+params for follow-up.",
"Each hypothesis includes a `testWith` field with the exact tool name and parameters to test it — use these for deep-dives.",
"Focus can be an alert UID, metric name, or free-text symptom description.",
"Gracefully degrades when Loki or Tempo datasources are unavailable — you still get metrics and context signals.",
"After investigating, use grafana_annotate to mark findings and grafana_check_alerts to acknowledge alerts.",
].join(" "),
parameters: {
type: "object",
properties: {
...instanceProperties(registry),
focus: {
type: "string",
description: "Alert UID, metric name, or symptom description (e.g., 'alert-abc', 'openclaw_lens_daily_cost_usd', 'high error rate')",
},
timeWindow: {
type: "string",
enum: Object.keys(WINDOW_CONFIG),
description: "Lookback window. Default: '1h'. Use '6h' for broader context, '24h' for daily patterns.",
},
service: {
type: "string",
description: "Filter logs/traces to a specific service name. Default: 'openclaw'.",
},
},
required: ["focus"],
},
async execute(_toolCallId, params) {
const client = registry.get(readStringParam(params, "instance"));
const focus = readStringParam(params, "focus", { required: true, label: "Focus" });
const timeWindow = readStringParam(params, "timeWindow") ?? "1h";
const service = readStringParam(params, "service") ?? "openclaw";
const windowCfg = WINDOW_CONFIG[timeWindow];
if (!windowCfg) {
return jsonResult({ error: `Invalid timeWindow '${timeWindow}' — use '1h', '6h', or '24h'` });
}
const nowMs = Date.now();
const fromMs = nowMs - windowCfg.seconds * 1000;
const nowSec = Math.floor(nowMs / 1000);
const fromSec = nowSec - windowCfg.seconds;
try {
// ── Step 1: Auto-discover datasources ──────────────────────────
const datasources = await client.listDatasources();
const promDs = datasources.find((d) => d.type === "prometheus");
const lokiDs = datasources.find((d) => d.type === "loki");
const tempoDs = datasources.find((d) => d.type === "tempo");
if (!promDs) {
return jsonResult({
error: "No Prometheus datasource found — investigation requires at least Prometheus. Use grafana_explore_datasources to verify.",
});
}
// ── Step 2: Resolve focus ────────────────────────────────────
const isMetricFocus = PLAIN_METRIC_RE.test(focus);
// Check if focus matches an alert in the store
const pendingAlerts = store.getPendingAlerts();
const matchingAlert = pendingAlerts.find((a) => a.id === focus);
const focusExpr = isMetricFocus ? focus : undefined;
const logSearchTerm = isMetricFocus ? undefined : focus;
// ── Step 3: Gather all signals in parallel ────────────────────
const metricPromises = gatherMetricSignals(client, promDs.uid, focusExpr, windowCfg, nowSec, fromSec);
const logPromise = lokiDs
? gatherLogSignals(client, lokiDs.uid, service, logSearchTerm, timeWindow, fromMs, nowMs)
: Promise.resolve(null);
const tracePromise = tempoDs
? gatherTraceSignals(client, tempoDs.uid, service, timeWindow, fromMs, nowMs)
: Promise.resolve(null);
const contextPromise = gatherContextSignals(client, fromMs, nowMs);
const [metricResult, logResult, traceResult, contextResult] = await Promise.allSettled([
metricPromises,
logPromise,
tracePromise,
contextPromise,
]);
// ── Step 4: Extract results with graceful degradation ────────
const metricSignals = metricResult.status === "fulfilled" ? metricResult.value : undefined;
const logSignals = logResult.status === "fulfilled" ? logResult.value : undefined;
const traceSignals = traceResult.status === "fulfilled" ? traceResult.value : undefined;
const contextSignals = contextResult.status === "fulfilled" ? contextResult.value : undefined;
// ── Step 5: Generate hypotheses ──────────────────────────────
const hypotheses = generateHypotheses({
focus,
focusExpr,
matchingAlert,
metricSignals,
logSignals,
traceSignals,
contextSignals,
promDsUid: promDs.uid,
lokiDsUid: lokiDs?.uid,
tempoDsUid: tempoDs?.uid,
service,
});
// ── Step 6: Build limitations list ───────────────────────────
const limitations = [];
if (!lokiDs)
limitations.push("No Loki datasource found — log signals unavailable");
if (!tempoDs)
limitations.push("No Tempo datasource found — trace signals unavailable");
if (metricResult.status === "rejected")
limitations.push(`Metric queries failed: ${metricResult.reason instanceof Error ? metricResult.reason.message : String(metricResult.reason)}`);
if (logResult.status === "rejected")
limitations.push(`Log queries failed: ${logResult.reason instanceof Error ? logResult.reason.message : String(logResult.reason)}`);
if (traceResult.status === "rejected")
limitations.push(`Trace queries failed: ${traceResult.reason instanceof Error ? traceResult.reason.message : String(traceResult.reason)}`);
return jsonResult({
timeWindow: {
from: new Date(fromMs).toISOString(),
to: new Date(nowMs).toISOString(),
duration: timeWindow,
},
focus,
...(metricSignals ? { metricSignals } : {}),
...(logSignals ? { logSignals } : {}),
...(traceSignals ? { traceSignals } : {}),
...(contextSignals ? { contextSignals } : {}),
suggestedHypotheses: hypotheses,
limitations,
});
}
catch (err) {
const reason = err instanceof Error ? err.message : String(err);
return jsonResult({ error: `Investigation failed: ${reason}` });
}
},
});
// ── Signal gathering functions ─────────────────────────────────────
async function gatherMetricSignals(client, promDsUid, focusExpr, windowCfg, nowSec, fromSec) {
const signals = {};
if (focusExpr) {
// Query the focused metric
const [instantResult, rangeResult] = await Promise.allSettled([
client.queryPrometheus(promDsUid, focusExpr),
client.queryPrometheusRange(promDsUid, focusExpr, String(fromSec), String(nowSec), windowCfg.step),
]);
const focusData = {};
if (instantResult.status === "fulfilled") {
const first = instantResult.value.data.result[0];
if (first)
focusData.current = first.value[1];
}
if (rangeResult.status === "fulfilled") {
const series = rangeResult.value.data.result[0];
if (series && series.values.length > 0) {
// Sample max 10 points for context efficiency
const step = Math.max(1, Math.floor(series.values.length / 10));
focusData.trend = series.values
.filter((_, i) => i % step === 0 || i === series.values.length - 1)
.map(([ts, v]) => ({
time: new Date(ts * 1000).toISOString(),
value: v,
}));
}
}
// Anomaly scoring against 7d baseline
const [avgResult, stddevResult] = await Promise.allSettled([
client.queryPrometheus(promDsUid, `avg_over_time(${focusExpr}[7d])`),
client.queryPrometheus(promDsUid, `stddev_over_time(${focusExpr}[7d])`),
]);
if (avgResult.status === "fulfilled" && stddevResult.status === "fulfilled") {
const avgVal = avgResult.value.data.result[0]?.value[1];
const stddevVal = stddevResult.value.data.result[0]?.value[1];
if (avgVal && stddevVal && focusData.current) {
const avg = parseFloat(avgVal);
const stddev = parseFloat(stddevVal);
const current = parseFloat(focusData.current);
const zScore = Math.abs((current - avg) / (stddev + 1e-10));
focusData.anomalyScore = parseFloat(zScore.toFixed(2));
focusData.anomalySeverity = zScore >= 3 ? "critical" : zScore >= 2 ? "significant" : zScore >= 1.5 ? "mild" : "normal";
}
}
if (Object.keys(focusData).length > 0)
signals.focus = focusData;
}
// RED signals (always query these for general health context)
const [rateResult, errorRateResult, p95Result] = await Promise.allSettled([
client.queryPrometheus(promDsUid, "sum(rate(openclaw_lens_messages_processed_total[5m])) or vector(0)"),
client.queryPrometheus(promDsUid, "sum(rate(openclaw_lens_messages_processed_total{outcome=\"error\"}[5m])) / (sum(rate(openclaw_lens_messages_processed_total[5m])) + 0.001)"),
client.queryPrometheus(promDsUid, "histogram_quantile(0.95, sum(rate(gen_ai_client_operation_duration_seconds_bucket[5m])) by (le))"),
]);
const red = {};
if (rateResult.status === "fulfilled") {
red.rate = rateResult.value.data.result[0]?.value[1];
}
if (errorRateResult.status === "fulfilled") {
red.errorRate = errorRateResult.value.data.result[0]?.value[1];
}
if (p95Result.status === "fulfilled") {
red.p95Latency = p95Result.value.data.result[0]?.value[1];
}
if (Object.keys(red).length > 0)
signals.red = red;
return signals;
}
async function gatherLogSignals(client, lokiDsUid, service, searchTerm, timeWindow, fromMs, nowMs) {
const fromNs = String(fromMs * 1_000_000);
const toNs = String(nowMs * 1_000_000);
// Statistics-first: volume, severity breakdown, top patterns, then samples
const baseSelector = `{service_name="${service}"}`;
const errorFilter = searchTerm
? `${baseSelector} | json | level="ERROR" |= "${searchTerm.replace(/"/g, '\\"')}"`
: `${baseSelector} | json | level="ERROR"`;
const [volumeResult, severityResult, sampleResult] = await Promise.allSettled([
client.queryLokiRange(lokiDsUid, `sum(count_over_time(${baseSelector}[${timeWindow}]))`, fromNs, toNs, { limit: 1 }),
client.queryLokiRange(lokiDsUid, `sum by (level) (count_over_time(${baseSelector} | json [${timeWindow}]))`, fromNs, toNs, { limit: 10 }),
client.queryLokiRange(lokiDsUid, errorFilter, fromNs, toNs, { limit: 5, direction: "backward" }),
]);
const signals = {
totalVolume: 0,
errorCount: 0,
bySeverity: {},
topPatterns: [],
sampleErrors: [],
};
// Volume
if (volumeResult.status === "fulfilled") {
const result = volumeResult.value;
const firstStream = result?.data?.result?.[0];
if (firstStream?.values?.length) {
signals.totalVolume = parseInt(firstStream.values[firstStream.values.length - 1][1], 10) || 0;
}
}
// Severity breakdown
if (severityResult.status === "fulfilled") {
const result = severityResult.value;
if (result?.data?.result) {
for (const stream of result.data.result) {
const level = stream.metric?.level ?? "unknown";
const lastVal = stream.values?.length ? parseInt(stream.values[stream.values.length - 1][1], 10) : 0;
signals.bySeverity[level] = lastVal || 0;
if (level === "ERROR" || level === "error") {
signals.errorCount = lastVal || 0;
}
}
}
}
// Sample errors
if (sampleResult.status === "fulfilled") {
const result = sampleResult.value;
if (result?.data?.result) {
for (const stream of result.data.result) {
if (stream.values) {
for (const [ts, line] of stream.values) {
signals.sampleErrors.push({
timestamp: new Date(parseInt(ts, 10) / 1_000_000).toISOString(),
line: line.length > 200 ? line.slice(0, 200) + "…" : line,
});
}
}
}
signals.sampleErrors = signals.sampleErrors.slice(0, 5);
}
}
return signals;
}
async function gatherTraceSignals(client, tempoDsUid, service, _timeWindow, fromMs, nowMs) {
const fromSec = String(Math.floor(fromMs / 1000));
const toSec = String(Math.floor(nowMs / 1000));
const [errorResult, slowResult] = await Promise.allSettled([
client.searchTraces(tempoDsUid, `{ resource.service.name = "${service}" && status = error }`, { start: fromSec, end: toSec, limit: 5 }),
client.searchTraces(tempoDsUid, `{ resource.service.name = "${service}" && duration > 10s }`, { start: fromSec, end: toSec, limit: 5 }),
]);
const mapTraces = (result) => {
if (result.status !== "fulfilled")
return [];
return result.value.traces.map((t) => ({
traceId: t.traceID,
rootService: t.rootServiceName,
rootSpan: t.rootTraceName,
durationMs: t.durationMs,
}));
};
return {
errorTraces: mapTraces(errorResult),
slowTraces: mapTraces(slowResult),
};
}
async function gatherContextSignals(client, fromMs, nowMs) {
const [annotationsResult] = await Promise.allSettled([
client.getAnnotations({ from: fromMs, to: nowMs, limit: 10 }),
]);
const recentAnnotations = annotationsResult.status === "fulfilled"
? annotationsResult.value.map((a) => ({
text: a.text,
tags: a.tags,
time: new Date(a.time).toISOString(),
}))
: [];
const pending = store.getPendingAlerts();
const alertsActive = pending.map((a) => ({
title: a.title,
status: a.status,
since: new Date(a.receivedAt).toISOString(),
}));
return { recentAnnotations, alertsActive };
}
}
function generateHypotheses(ctx) {
const hypotheses = [];
// H1: Error spike + recent annotation → deployment may have caused issues
if (ctx.logSignals?.errorCount && ctx.logSignals.errorCount > 0 && ctx.contextSignals?.recentAnnotations?.length) {
const recentAnnotation = ctx.contextSignals.recentAnnotations[0];
hypotheses.push({
hypothesis: `Recent event "${recentAnnotation.text}" may have caused the error increase`,
evidence: `${ctx.logSignals.errorCount} errors found, annotation "${recentAnnotation.text}" at ${recentAnnotation.time}`,
confidence: "medium",
testWith: {
tool: "grafana_query",
params: {
datasourceUid: ctx.promDsUid,
expr: `sum(rate(openclaw_lens_messages_processed_total{outcome="error"}[5m]))`,
queryType: "range",
start: recentAnnotation.time,
},
},
});
}
// H2: High latency + context pressure → context window saturation
const p95 = ctx.metricSignals?.red?.p95Latency ? parseFloat(ctx.metricSignals.red.p95Latency) : 0;
if (p95 > 10) {
hypotheses.push({
hypothesis: "High LLM latency may indicate context window saturation or provider throttling",
evidence: `P95 latency is ${p95.toFixed(1)}s (>10s threshold)`,
confidence: "medium",
testWith: {
tool: "grafana_query",
params: {
datasourceUid: ctx.promDsUid,
expr: "openclaw_lens_context_tokens{type=\"used\"} / openclaw_lens_context_tokens{type=\"limit\"} * 100",
},
},
});
}
// H3: Error rate is elevated
const errorRate = ctx.metricSignals?.red?.errorRate ? parseFloat(ctx.metricSignals.red.errorRate) : 0;
if (errorRate > 0.01) {
hypotheses.push({
hypothesis: `Error rate is elevated at ${(errorRate * 100).toFixed(1)}%`,
evidence: `Error rate: ${(errorRate * 100).toFixed(1)}%, message rate: ${ctx.metricSignals?.red?.rate ?? "unknown"}`,
confidence: "high",
testWith: ctx.lokiDsUid
? {
tool: "grafana_query_logs",
params: {
datasourceUid: ctx.lokiDsUid,
expr: `topk(10, sum by (event_name) (count_over_time({service_name="${ctx.service}"} | json | level="ERROR" [1h])))`,
},
}
: {
tool: "grafana_query",
params: {
datasourceUid: ctx.promDsUid,
expr: `sum by (outcome) (rate(openclaw_lens_messages_processed_total[5m]))`,
},
},
});
}
// H4: Anomaly detected on focus metric
if (ctx.metricSignals?.focus?.anomalyScore && ctx.metricSignals.focus.anomalyScore >= 2) {
hypotheses.push({
hypothesis: `Focus metric shows ${ctx.metricSignals.focus.anomalySeverity} anomaly (${ctx.metricSignals.focus.anomalyScore}σ from baseline)`,
evidence: `Current: ${ctx.metricSignals.focus.current}, z-score: ${ctx.metricSignals.focus.anomalyScore}σ`,
confidence: ctx.metricSignals.focus.anomalyScore >= 3 ? "high" : "medium",
testWith: {
tool: "grafana_explain_metric",
params: {
datasourceUid: ctx.promDsUid,
expr: ctx.focusExpr ?? ctx.focus,
period: "24h",
},
},
});
}
// H5: Error traces available → specific request paths failing
if (ctx.traceSignals?.errorTraces && ctx.traceSignals.errorTraces.length > 0) {
const trace = ctx.traceSignals.errorTraces[0];
hypotheses.push({
hypothesis: `Specific request paths are failing — ${ctx.traceSignals.errorTraces.length} error trace(s) found`,
evidence: `Error trace: ${trace.rootSpan} (${trace.durationMs}ms) in ${trace.rootService}`,
confidence: "high",
testWith: ctx.tempoDsUid
? {
tool: "grafana_query_traces",
params: {
datasourceUid: ctx.tempoDsUid,
traceId: trace.traceId,
queryType: "get",
},
}
: {
tool: "grafana_query",
params: {
datasourceUid: ctx.promDsUid,
expr: `sum by (tool) (rate(openclaw_lens_tool_error_classes_total[5m]))`,
},
},
});
}
// H6: Slow traces → performance degradation
if (ctx.traceSignals?.slowTraces && ctx.traceSignals.slowTraces.length > 0 && hypotheses.length < 5) {
const trace = ctx.traceSignals.slowTraces[0];
hypotheses.push({
hypothesis: `Performance degradation — ${ctx.traceSignals.slowTraces.length} slow trace(s) found (>10s)`,
evidence: `Slow trace: ${trace.rootSpan} (${trace.durationMs}ms) in ${trace.rootService}`,
confidence: "medium",
testWith: ctx.tempoDsUid
? {
tool: "grafana_query_traces",
params: {
datasourceUid: ctx.tempoDsUid,
traceId: trace.traceId,
queryType: "get",
},
}
: {
tool: "grafana_query",
params: {
datasourceUid: ctx.promDsUid,
expr: "histogram_quantile(0.95, sum by (le, gen_ai_request_model) (rate(gen_ai_client_operation_duration_seconds_bucket[5m])))",
},
},
});
}
// H7: Active alerts suggest ongoing issues
if (ctx.contextSignals?.alertsActive && ctx.contextSignals.alertsActive.length > 0 && !ctx.matchingAlert) {
hypotheses.push({
hypothesis: `${ctx.contextSignals.alertsActive.length} active alert(s) may be related to the investigation`,
evidence: ctx.contextSignals.alertsActive.map((a) => `${a.title} (${a.status} since ${a.since})`).join("; "),
confidence: "low",
testWith: {
tool: "grafana_check_alerts",
params: { action: "list" },
},
});
}
// If no hypotheses generated, provide generic investigation path
if (hypotheses.length === 0) {
hypotheses.push({
hypothesis: "No clear anomaly detected — investigate further with focused queries",
evidence: "All metric signals within normal range, no error traces found",
confidence: "low",
testWith: {
tool: "grafana_explain_metric",
params: {
datasourceUid: ctx.promDsUid,
expr: ctx.focusExpr ?? "openclaw_lens_daily_cost_usd",
period: "24h",
},
},
});
}
return hypotheses;
}