openclaw-grafana-lens
Version:
OpenClaw plugin that gives AI agents full Grafana access — 18 composable tools for PromQL/LogQL/TraceQL queries, dashboard creation, alerting, SRE investigation, security monitoring, data collection pipeline management via Grafana Alloy (29 recipes), and
420 lines (419 loc) • 21.7 kB
JavaScript
/**
* grafana_explain_metric tool
*
* Gathers structured data about a metric — current value, trend over a period,
* min/max/avg statistics, and metadata (type/help/unit). The agent uses this
* enriched context to explain what a metric means and why it changed.
*
* Counter-aware: auto-detects counter metrics (via metadata or _total suffix)
* and wraps the trend query in rate() so the agent sees actual rate of change
* instead of raw monotonically-increasing cumulative values.
*/
import { jsonResult, readStringParam } from "../sdk-compat.js";
import { instanceProperties } from "./instance-param.js";
import { getHealthContext } from "./health-context.js";
import { KNOWN_BREAKDOWNS_MAP } from "../metric-definitions.js";
/**
 * Supported lookback periods, keyed by the value accepted for the tool's
 * `period` parameter.
 *
 * - `seconds`: length of the lookback window.
 * - `step`: resolution handed to the range query (kept as a string because it
 *   is forwarded as-is).
 * - `label`: human-readable period name used in responses.
 */
const PERIOD_CONFIG = {
  "24h": { seconds: 24 * 60 * 60, step: "300", label: "24 hours" },
  "7d": { seconds: 7 * 24 * 60 * 60, step: "3600", label: "7 days" },
  "30d": { seconds: 30 * 24 * 60 * 60, step: "21600", label: "30 days" },
};

/**
 * Matches a bare metric name (letters, digits, `_` and `:`, not starting with
 * a digit) as opposed to a composite PromQL expression.
 */
const PLAIN_METRIC_RE = /^[a-zA-Z_:][a-zA-Z0-9_:]*$/;

/** Range-vector window used whenever a counter is wrapped in rate(). */
const RATE_WINDOW = "5m";
/**
 * Label names that are never offered as drill-down dimensions.
 *
 * These describe where a series comes from (scrape target, service identity,
 * histogram bucket boundary) rather than what it measures, so breaking a
 * metric down by them is not a useful suggestion. Semantic labels such as
 * token_type, model, or channel are what users want to slice by.
 */
const EXCLUDED_LABELS = new Set([
  "__name__",
  "instance",
  "job",
  "le",
  "service_name",
  "service_namespace",
  "service_version",
]);
/**
 * Static label breakdown knowledge for well-known metric families.
 *
 * This is a local alias for KNOWN_BREAKDOWNS_MAP, imported from the shared
 * metric-definitions module. It is indexed by metric name and yields the
 * label names suggested for breaking that metric down.
 *
 * Per the registry's own documentation (not verifiable from this file):
 * labels are ordered by analytical importance (most useful first), and keys
 * use Prometheus names (with the _total suffix for counters) since the tool
 * queries Prometheus.
 */
const KNOWN_BREAKDOWNS = KNOWN_BREAKDOWNS_MAP;
/**
 * Resolve meaningful label names for breaking down a metric.
 *
 * Static knowledge from KNOWN_BREAKDOWNS wins when available (it works even
 * when the metric currently has no data); otherwise the dynamically
 * discovered labels are returned unchanged.
 *
 * Suffix-tolerant: the exact name is tried first, then the name with the
 * Prometheus suffixes (_bucket, _count, _sum, _total) stripped, then that
 * stripped name with `_total` appended (counter convention). This bridges the
 * OTel→Prometheus naming gap — callers may pass either form.
 *
 * @param {string} metricName - Metric name as passed by the caller.
 * @param {string[]} dynamicLabels - Labels discovered from query results; used as the fallback.
 * @returns {string[]} Label names to suggest for drill-down.
 */
export function resolveBreakdowns(metricName, dynamicLabels) {
  // Own-property lookup only: a metric that happens to be called
  // "constructor" or "toString" must not resolve to a member inherited from
  // Object.prototype (which would leak a function into the response).
  const lookup = (name) =>
    Object.prototype.hasOwnProperty.call(KNOWN_BREAKDOWNS, name)
      ? KNOWN_BREAKDOWNS[name]
      : undefined;

  // Exact match first
  const exact = lookup(metricName);
  if (exact) {
    return exact;
  }

  // Strip Prometheus suffixes to find the base metric
  const stripped = metricName
    .replace(/_bucket$/, "")
    .replace(/_count$/, "")
    .replace(/_sum$/, "")
    .replace(/_total$/, "");

  // Try the stripped name, then the stripped name with _total (counter convention)
  return lookup(stripped)
    ?? lookup(`${stripped}_total`)
    ?? dynamicLabels;
}
/**
 * Round a number to 4 significant digits to keep responses compact.
 *
 * @param {number} n - Value to round.
 * @returns {number} The value rounded to 4 significant digits.
 */
const sig4 = (n) => {
  const formatted = n.toPrecision(4);
  return parseFloat(formatted);
};
/**
 * Compute the minimum, maximum, and arithmetic mean of a numeric array in a
 * single pass.
 *
 * Callers are expected to pass a non-empty array: for an empty one, min and
 * max are undefined and avg is NaN.
 *
 * @param {number[]} values - Samples to summarise.
 * @returns {{min: number, max: number, avg: number}} Summary statistics.
 */
function computeStats(values) {
  const summary = values.reduce(
    (acc, sample) => {
      acc.min = sample < acc.min ? sample : acc.min;
      acc.max = sample > acc.max ? sample : acc.max;
      acc.sum += sample;
      return acc;
    },
    { min: values[0], max: values[0], sum: 0 },
  );
  return {
    min: summary.min,
    max: summary.max,
    avg: summary.sum / values.length,
  };
}
/**
 * Classify how `current` compares to `baseline`, using a ±1% dead band around
 * the baseline so tiny fluctuations are reported as "flat".
 *
 * @param {number} current - Value being evaluated.
 * @param {number} baseline - Reference value.
 * @returns {"up"|"down"|"flat"} Direction of the change.
 */
function detectDirection(current, baseline) {
  // Scaling a negative baseline by 1.01 moves it *down*, not up, so the two
  // scaled values are ordered explicitly instead of assuming 1.01 gives the
  // upper edge. For baselines >= 0 this yields exactly the same edges as
  // baseline * 1.01 / baseline * 0.99.
  const scaledOut = baseline * 1.01;
  const scaledIn = baseline * 0.99;
  const upperEdge = Math.max(scaledOut, scaledIn);
  const lowerEdge = Math.min(scaledOut, scaledIn);
  if (current > upperEdge) {
    return "up";
  }
  if (current < lowerEdge) {
    return "down";
  }
  // Also reached when either input is NaN (all comparisons are false).
  return "flat";
}
/**
 * Determine a metric's type.
 *
 * The type declared in the metadata entry is authoritative when present.
 * Otherwise the name suffix is used as a fallback (`_total` → counter,
 * `_bucket` → histogram), which covers metrics that arrive without metadata.
 *
 * @param {string} metricName - Plain metric name.
 * @param {{type?: string}|undefined|null} metadata - Metadata entry for the metric, if any.
 * @returns {string|undefined} The detected type, or undefined when it cannot be determined.
 */
function detectMetricType(metricName, metadata) {
  const declaredType = metadata?.type;
  if (declaredType) {
    return declaredType;
  }
  // Ordered suffix → type table; the first matching suffix wins.
  const suffixTypes = [
    ["_total", "counter"],
    ["_bucket", "histogram"],
  ];
  const matched = suffixTypes.find(([suffix]) => metricName.endsWith(suffix));
  return matched ? matched[1] : undefined;
}
/**
 * Collect the distinct semantic label names present in instant-query results.
 *
 * Every label key of every result's `metric` object is considered; names
 * listed in EXCLUDED_LABELS (infrastructure labels such as __name__, job,
 * instance) are dropped because they are not useful drill-down dimensions.
 *
 * @param {Array<{metric: Object<string, string>}>} results - Instant query result entries.
 * @returns {string[]} Unique label names in ascending (default string sort) order.
 */
export function extractLabelNames(results) {
  const semanticNames = results
    .flatMap((sample) => Object.keys(sample.metric))
    .filter((name) => !EXCLUDED_LABELS.has(name));
  return [...new Set(semanticNames)].sort();
}
/**
 * Build suggested drill-down PromQL queries for an expression.
 *
 * The suggestions are, in order: a top-5 ranking by the first label, one
 * `sum by` breakdown per label, and — when there are at least two labels — a
 * combined breakdown by all of them. Counter metrics are wrapped in rate()
 * over RATE_WINDOW; every other type uses the expression as given.
 *
 * @param {string} expr - PromQL expression or plain metric name.
 * @param {string[]} labels - Label names to break down by.
 * @param {string|undefined} metricType - Detected metric type; only "counter" changes the output.
 * @returns {Array<{query: string, description: string}>} Suggested queries; empty when there are no labels.
 */
export function buildSuggestedQueries(expr, labels, metricType) {
  if (labels.length === 0) {
    return [];
  }

  const target = metricType === "counter"
    ? `rate(${expr}[${RATE_WINDOW}])`
    : expr;
  const primaryLabel = labels[0];

  // Most useful starting point: the heaviest contributors along the first label.
  const topFive = {
    query: `topk(5, sum by (${primaryLabel}) (${target}))`,
    description: `Top 5 by ${primaryLabel}`,
  };

  const perLabel = labels.map((label) => ({
    query: `sum by (${label}) (${target})`,
    description: `Breakdown by ${label}`,
  }));

  // A combined breakdown only adds information when there are 2+ labels.
  const combined = labels.length >= 2
    ? [{
        query: `sum by (${labels.join(", ")}) (${target})`,
        description: `Full breakdown by all labels`,
      }]
    : [];

  return [topFive, ...perLabel, ...combined];
}
/**
 * Create the factory for the `grafana_explain_metric` tool.
 *
 * The returned factory takes a (currently unused) context and produces the
 * tool definition: name, label, description, JSON-schema parameters, and an
 * async `execute` handler.
 *
 * `execute` gathers, for a PromQL expression or plain metric name:
 *  - metadata (plain metric names only) and the current instant value,
 *  - trend and min/max/avg stats over the requested period (counters are
 *    wrapped in rate() so the trend reflects rate of change),
 *  - an optional comparison with the immediately preceding period,
 *  - for 24h queries on plain metrics, an anomaly z-score against a 7-day
 *    baseline and a seasonality comparison (vs 1 day / 7 days ago),
 *  - suggested drill-down queries and breakdown labels.
 * Sections without data are omitted from the response. Individual query
 * failures are tolerated; an error result is returned only when both the
 * instant and the range query fail.
 *
 * @param {object} registry - Instance registry; `registry.get(name)` must return a
 *   client exposing `getMetricMetadata`, `queryPrometheus` and `queryPrometheusRange`.
 * @returns {Function} Factory `(ctx) => toolDefinition`.
 */
export function createExplainMetricToolFactory(registry) {
  return (_ctx) => ({
    name: "grafana_explain_metric",
    label: "Explain Metric",
    description: [
      "Get structured context about a metric: current value, trend (change %, direction), stats (min/max/avg), and metadata (type/help/unit).",
      "WORKFLOW: Use when user asks 'what does this metric mean?', 'why did it spike?', 'is this normal?', or 'show me the trend'.",
      "Returns enriched data for the agent to interpret — the agent provides the narrative.",
      "Counter-aware: auto-detects counter metrics and shows rate of change (not raw cumulative values) for trends.",
      "Response includes `metricType` (counter/gauge/histogram/summary), `trendQuery` (actual PromQL used for trend), `suggestedQueries` (drill-down PromQL by label), and `suggestedBreakdowns` (label names for decomposition — always available for known OpenClaw metrics, even with no data).",
      "Includes anomaly scoring (sigma-based z-score against 7-day baseline) and seasonality comparison (vs 1 day ago, vs 7 days ago) for 24h period queries. Returns `anomaly` (score, severity: normal/mild/significant/critical, baseline) and `seasonality` (vs1dAgo, vs7dAgo with change percent).",
      "Period comparison: set `compareWith: 'previous'` to compare the current period with the immediately preceding one (e.g., this week vs. last week). Returns a `comparison` object with previous period stats and change (absolute, percentage, direction). Eliminates manual multi-query workflows for period-over-period analysis.",
      "Requires a datasourceUid — use grafana_explore_datasources to find it.",
      "Supports any PromQL expression. Metadata only available for plain metric names.",
      "For raw PromQL with custom time parameters, complex multi-metric expressions, or range queries with specific steps, use grafana_query instead.",
    ].join(" "),
    parameters: {
      type: "object",
      properties: {
        ...instanceProperties(registry),
        datasourceUid: {
          type: "string",
          description: "UID of the Prometheus datasource (use grafana_explore_datasources to find it)",
        },
        expr: {
          type: "string",
          description: "PromQL expression or plain metric name (e.g., 'openclaw_lens_daily_cost_usd' or 'openclaw_lens_tokens_total')",
        },
        period: {
          type: "string",
          enum: Object.keys(PERIOD_CONFIG),
          description: "Lookback period for trend and stats (default: '24h')",
        },
        compareWith: {
          type: "string",
          enum: ["previous"],
          description: "Set to 'previous' to compare current period with the same-length window immediately before it (e.g., this week vs. last week). Adds a 'comparison' object to the response.",
        },
      },
      required: ["datasourceUid", "expr"],
    },
    async execute(_toolCallId, params) {
      const client = registry.get(readStringParam(params, "instance"));
      const datasourceUid = readStringParam(params, "datasourceUid", { required: true, label: "Datasource UID" });
      const expr = readStringParam(params, "expr", { required: true, label: "PromQL expression" });
      const period = readStringParam(params, "period") ?? "24h";
      const compareWith = readStringParam(params, "compareWith");
      const periodCfg = PERIOD_CONFIG[period];
      if (!periodCfg) {
        return jsonResult({ error: `Invalid period '${period}' — use '24h', '7d', or '30d'` });
      }
      const isPlainMetric = PLAIN_METRIC_RE.test(expr);
      const nowSec = Math.floor(Date.now() / 1000);
      const startSec = nowSec - periodCfg.seconds;

      // ── Step 1: Fetch metadata + instant in parallel (range query depends on type) ──
      let metadata;
      const [metaSettled, instantResult] = await Promise.allSettled([
        isPlainMetric
          ? client.getMetricMetadata(datasourceUid, { metric: expr })
          : Promise.resolve(undefined),
        client.queryPrometheus(datasourceUid, expr),
      ]);
      if (metaSettled.status === "fulfilled" && metaSettled.value) {
        const entries = metaSettled.value[expr];
        if (entries && entries.length > 0) {
          metadata = entries[0];
        }
      }

      // ── Step 2: Detect metric type and choose appropriate range expression ──
      const metricType = isPlainMetric ? detectMetricType(expr, metadata) : undefined;
      const trendQuery = metricType === "counter" ? `rate(${expr}[${RATE_WINDOW}])` : expr;

      // ── Step 3: Run range query (depends on trendQuery from Step 2) ──
      // If compareWith="previous", also query the previous period in parallel
      const prevStartSec = startSec - periodCfg.seconds;
      const prevEndSec = startSec;
      const rangePromises = [
        client.queryPrometheusRange(datasourceUid, trendQuery, String(startSec), String(nowSec), periodCfg.step),
      ];
      if (compareWith === "previous") {
        rangePromises.push(client.queryPrometheusRange(datasourceUid, trendQuery, String(prevStartSec), String(prevEndSec), periodCfg.step));
      }
      const [rangeResult, prevRangeResult] = await Promise.allSettled(rangePromises);

      // ── Current value (always raw — cumulative total is meaningful for counters) ──
      let current;
      if (instantResult.status === "fulfilled") {
        const first = instantResult.value.data.result[0];
        if (first) {
          current = {
            value: first.value[1],
            timestamp: new Date(first.value[0] * 1000).toISOString(),
          };
        }
      }

      // If both queries failed entirely, return the error
      if (instantResult.status === "rejected" && rangeResult.status === "rejected") {
        const reason = instantResult.reason instanceof Error ? instantResult.reason.message : String(instantResult.reason);
        return jsonResult({ error: `Query failed: ${reason}` });
      }

      // ── Trend & stats from range data ───────────────────────────────
      let trend;
      let stats;
      // Unrounded average of the current period. `stats.avg` is rounded to
      // 4 significant digits for display; reusing that rounded value in later
      // arithmetic injects a rounding error that, divided by a near-zero
      // stddev, turns a perfectly stable series into a "critical" anomaly.
      let currentAvgRaw;
      if (rangeResult.status === "fulfilled") {
        const series = rangeResult.value.data.result[0];
        if (series && series.values.length > 0) {
          const values = series.values.map(([, v]) => parseFloat(v));
          const firstVal = values[0];
          const lastVal = values[values.length - 1];
          const s = computeStats(values);
          currentAvgRaw = s.avg;
          let changePercent = null;
          // Percent change is undefined relative to a zero starting value.
          if (firstVal !== 0) {
            changePercent = parseFloat((((lastVal - firstVal) / firstVal) * 100).toFixed(1));
          }
          trend = {
            changePercent,
            direction: detectDirection(lastVal, firstVal),
            first: String(firstVal),
            last: String(lastVal),
          };
          stats = {
            min: String(sig4(s.min)),
            max: String(sig4(s.max)),
            avg: String(sig4(s.avg)),
            samples: series.values.length,
          };
        }
      }

      // ── Comparison with previous period ────────────────────────────
      let comparison;
      if (compareWith === "previous" && prevRangeResult?.status === "fulfilled") {
        const prevSeries = prevRangeResult.value.data.result[0];
        if (prevSeries && prevSeries.values.length > 0 && stats) {
          const prevValues = prevSeries.values.map(([, v]) => parseFloat(v));
          const ps = computeStats(prevValues);
          // Compare unrounded averages; rounding happens only on output.
          const currentAvg = currentAvgRaw;
          const absolute = currentAvg - ps.avg;
          let percentage = null;
          if (ps.avg !== 0) {
            percentage = parseFloat(((absolute / ps.avg) * 100).toFixed(1));
          }
          comparison = {
            previousPeriod: {
              from: new Date(prevStartSec * 1000).toISOString(),
              to: new Date(prevEndSec * 1000).toISOString(),
              avg: String(sig4(ps.avg)),
              min: String(sig4(ps.min)),
              max: String(sig4(ps.max)),
              samples: prevSeries.values.length,
            },
            change: {
              absolute: String(sig4(absolute)),
              percentage,
              direction: detectDirection(currentAvg, ps.avg),
            },
          };
        }
      }

      // ── Anomaly scoring + seasonality (24h period only, plain metrics) ──
      let anomaly;
      let seasonality;
      if (period === "24h" && isPlainMetric && stats) {
        // Run anomaly + seasonality queries in parallel
        const anomalyExpr = trendQuery; // Use rate() for counters, raw for gauges
        // A range selector ("[7d]") may only follow a vector selector. When the
        // trend expression is a function call (rate(...) for counters) the
        // baseline must be a subquery, otherwise Prometheus rejects the query
        // and anomaly scoring is silently skipped. The subquery resolution
        // matches the step used for the trend range query.
        const baselineRange = anomalyExpr === expr ? "[7d]" : `[7d:${periodCfg.step}s]`;
        const [baselineAvgResult, baselineStddevResult, offset1dResult, offset7dResult] = await Promise.allSettled([
          client.queryPrometheus(datasourceUid, `avg_over_time(${anomalyExpr}${baselineRange})`),
          client.queryPrometheus(datasourceUid, `stddev_over_time(${anomalyExpr}${baselineRange})`),
          client.queryPrometheus(datasourceUid, `${expr} offset 1d`),
          client.queryPrometheus(datasourceUid, `${expr} offset 7d`),
        ]);

        // ── Anomaly z-score ──
        if (baselineAvgResult.status === "fulfilled" && baselineStddevResult.status === "fulfilled"
          && baselineAvgResult.value?.data?.result && baselineStddevResult.value?.data?.result) {
          const avgVal = baselineAvgResult.value.data.result[0]?.value[1];
          const stddevVal = baselineStddevResult.value.data.result[0]?.value[1];
          if (avgVal && stddevVal) {
            const baselineAvg = parseFloat(avgVal);
            const baselineStddev = parseFloat(stddevVal);
            const currentAvg = currentAvgRaw;
            // Guards against division by zero for perfectly constant series.
            const epsilon = 1e-10;
            const zScore = Math.abs((currentAvg - baselineAvg) / (baselineStddev + epsilon));
            const roundedZ = parseFloat(zScore.toFixed(2));
            let severity;
            if (roundedZ >= 3) {
              severity = "critical";
            } else if (roundedZ >= 2) {
              severity = "significant";
            } else if (roundedZ >= 1.5) {
              severity = "mild";
            } else {
              severity = "normal";
            }
            const direction = currentAvg > baselineAvg ? "above" : "below";
            anomaly = {
              score: roundedZ,
              severity,
              baseline: { avg: String(sig4(baselineAvg)), stddev: String(sig4(baselineStddev)), period: "7d" },
              interpretation: `${roundedZ}σ ${direction} 7-day baseline — ${severity} anomaly`,
            };
          }
        }

        // ── Seasonality comparison ──
        const currentVal = instantResult.status === "fulfilled"
          ? parseFloat(instantResult.value.data.result[0]?.value[1] ?? "NaN")
          : NaN;
        if (!isNaN(currentVal)) {
          const vs1d = offset1dResult.status === "fulfilled" && offset1dResult.value?.data?.result
            ? parseFloat(offset1dResult.value.data.result[0]?.value[1] ?? "NaN")
            : NaN;
          const vs7d = offset7dResult.status === "fulfilled" && offset7dResult.value?.data?.result
            ? parseFloat(offset7dResult.value.data.result[0]?.value[1] ?? "NaN")
            : NaN;
          // Percent change of the current value vs. an older one; null when
          // the older value is missing or zero.
          const computeChange = (old) => {
            if (isNaN(old) || old === 0) {
              return null;
            }
            return parseFloat((((currentVal - old) / old) * 100).toFixed(1));
          };
          seasonality = {
            vs1dAgo: { value: isNaN(vs1d) ? "N/A" : String(sig4(vs1d)), changePercent: computeChange(vs1d) },
            vs7dAgo: { value: isNaN(vs7d) ? "N/A" : String(sig4(vs7d)), changePercent: computeChange(vs7d) },
          };
        }
      }

      // ── Extract label names for drill-down suggestions ─────────────
      const dynamicLabels = isPlainMetric && instantResult.status === "fulfilled"
        ? extractLabelNames(instantResult.value.data.result)
        : [];

      // ── Resolve breakdown label hints (static knowledge + dynamic) ──
      const suggestedBreakdowns = isPlainMetric
        ? resolveBreakdowns(expr, dynamicLabels)
        : [];

      // Use static breakdowns as fallback when instant query returns no labels
      // (e.g., metric is stale — no current time series but range data exists)
      const labels = dynamicLabels.length > 0 ? dynamicLabels : suggestedBreakdowns;
      const suggestedQueries = buildSuggestedQueries(expr, labels, metricType);

      // ── Build response (omit sections without data) ─────────────────
      const result = {
        status: "success",
        expr,
        period,
        periodLabel: periodCfg.label,
      };
      if (metricType) {
        result.metricType = metricType;
      }
      if (trendQuery !== expr) {
        result.trendQuery = trendQuery;
      }
      if (current) {
        result.current = current;
        const health = getHealthContext(expr, current.value);
        if (health) {
          result.healthContext = health;
        }
      }
      if (trend) {
        result.trend = trend;
      }
      if (stats) {
        result.stats = stats;
      }
      if (comparison) {
        result.comparison = comparison;
      }
      if (anomaly) {
        result.anomaly = anomaly;
      }
      if (seasonality) {
        result.seasonality = seasonality;
      }
      if (metadata) {
        result.metadata = metadata;
      }
      if (suggestedQueries.length > 0) {
        result.suggestedQueries = suggestedQueries;
      }
      if (suggestedBreakdowns.length > 0) {
        result.suggestedBreakdowns = suggestedBreakdowns;
      }
      if (!current && !trend && !stats) {
        result.note = `No data found for '${expr}' over the last ${periodCfg.label}`;
      }
      return jsonResult(result);
    },
  });
}