openclaw-grafana-lens
Version:
OpenClaw plugin that gives AI agents full Grafana access — 18 composable tools for PromQL/LogQL/TraceQL queries, dashboard creation, alerting, SRE investigation, security monitoring, data collection pipeline management via Grafana Alloy (29 recipes), and
568 lines (567 loc) • 27.3 kB
JavaScript
/**
* grafana_check_alerts tool
*
* Eight actions in one tool:
* - list: Return pending alerts from the webhook store
* - acknowledge: Mark an alert as investigated
* - list_rules: List all configured alert rules from Grafana
* - delete_rule: Delete an alert rule by UID
* - silence / unsilence: Mute / unmute alerts during investigation
* - setup: Create webhook contact point + notification policy route in Grafana
* - analyze: Detect alert fatigue (always-firing and error/nodata alert rules)
*
* The "setup" action is idempotent — if the contact point already exists,
* it returns the existing UID without creating a duplicate.
*/
import { jsonResult, readStringParam } from "../sdk-compat.js";
import { instanceProperties } from "./instance-param.js";
import { getQueryCapability } from "./explore-datasources.js";
// Name of the webhook contact point that the "setup" action looks up and creates in Grafana.
// The same name is used as the receiver of the managed_by=openclaw notification policy route.
const CONTACT_POINT_NAME = "OpenClaw Alert Webhook";
export function createCheckAlertsToolFactory(registry, store) {
return (_ctx) => ({
name: "grafana_check_alerts",
label: "Grafana Alerts",
description: [
"Check, acknowledge, silence, or set up Grafana alert webhooks. Manage alert rules (list/delete).",
"WORKFLOW: Use action 'list' (default) to see pending alerts — includes suggestedInvestigation with ready-to-use query, tool, and datasource for immediate investigation.",
"Use action 'acknowledge' with alertId to mark an alert as investigated.",
"Use action 'list_rules' to see all configured alert rules with live evaluation state (normal/firing/pending/nodata/error), health, and lastEvaluation — one call for complete alert health. Set compact=true for minimal fields (uid, title, state, condition only).",
"Use action 'delete_rule' with ruleUid to remove an alert rule permanently.",
"Use action 'silence' to mute alerts matching specific labels during investigation (prevents repeat notifications).",
"Use action 'unsilence' with silenceId to remove a silence after resolving.",
"Use action 'setup' to create the webhook contact point and notification policy",
"route — required once before alerts can notify the agent.",
"Use action 'analyze' to detect alert fatigue — identifies always-firing, flapping, and high-frequency alert rules with optimization suggestions.",
"Alerts created via grafana_create_alert auto-route to the webhook.",
].join(" "),
parameters: {
type: "object",
properties: {
...instanceProperties(registry),
action: {
type: "string",
enum: ["list", "acknowledge", "list_rules", "delete_rule", "silence", "unsilence", "setup", "analyze"],
description: "Action to perform. Default: 'list'. Use 'analyze' to detect alert fatigue.",
},
alertId: {
type: "string",
description: "Alert ID to acknowledge (required for action 'acknowledge')",
},
matchers: {
type: "array",
items: {
type: "object",
properties: {
name: { type: "string", description: "Label name" },
value: { type: "string", description: "Label value" },
isRegex: { type: "boolean", description: "Whether value is a regex. Default: false" },
},
required: ["name", "value"],
},
description: "Label matchers for silence (from alert's commonLabels). Required for action 'silence'.",
},
duration: {
type: "string",
description: "Silence duration (e.g., '2h', '30m', '1d'). Default: '2h'. Used with action 'silence'.",
},
comment: {
type: "string",
description: "Reason for silencing. Default: 'Silenced by agent during investigation'. Used with action 'silence'.",
},
ruleUid: {
type: "string",
description: "Alert rule UID to delete (required for action 'delete_rule'). Get UIDs from action 'list_rules'.",
},
silenceId: {
type: "string",
description: "Silence ID to remove (required for action 'unsilence')",
},
compact: {
type: "boolean",
description: "Return minimal fields only for list_rules — {uid, title, state, condition}. Drops folder, ruleGroup, health, lastEvaluation, for, labels, annotations, updated. Use in multi-tool chains. Default: false",
},
webhookUrl: {
type: "string",
description: "Webhook URL for Grafana to POST alerts to. Auto-detected from config if omitted. Only used with action 'setup'.",
},
},
},
async execute(_toolCallId, params) {
const client = registry.get(readStringParam(params, "instance"));
const action = readStringParam(params, "action") ?? "list";
switch (action) {
case "list":
return handleList(client);
case "acknowledge":
return handleAcknowledge(params);
case "list_rules":
return handleListRules(client, typeof params.compact === "boolean" ? params.compact : false);
case "delete_rule":
return handleDeleteRule(client, params);
case "silence":
return handleSilence(client, params);
case "unsilence":
return handleUnsilence(client, params);
case "setup":
return handleSetup(client, params);
case "analyze":
return handleAnalyze(client);
default:
return jsonResult({ error: `Unknown action '${action}'. Use: list, acknowledge, list_rules, delete_rule, silence, unsilence, setup, analyze` });
}
},
});
/**
 * Action 'list': report the pending alerts held by the webhook store.
 *
 * Each alert is summarised with at most five of its instances (`truncated: true`
 * marks alerts that had more) and, when one can be derived, a
 * `suggestedInvestigation` hint.
 */
async function handleList(client) {
    const pendingAlerts = store.getPendingAlerts();
    if (pendingAlerts.length === 0) {
        return jsonResult({ status: "success", alerts: [], message: "No pending alerts" });
    }
    const INSTANCE_LIMIT = 5;
    // Rules + datasources are only fetched once we know there is something to enrich.
    const context = await fetchInvestigationContext(client);
    // Keep only the fields an investigator needs from each alert instance.
    const summariseInstance = ({ status, labels, annotations, startsAt, values }) => ({
        status,
        labels,
        annotations,
        startsAt,
        values,
    });
    const alerts = [];
    for (const entry of pendingAlerts) {
        const everyInstance = entry.alerts ?? [];
        const instances = everyInstance.slice(0, INSTANCE_LIMIT).map(summariseInstance);
        const suggestion = buildInvestigationHint(entry, context);
        const summary = {
            id: entry.id,
            status: entry.status,
            title: entry.title,
            message: entry.message,
            receivedAt: new Date(entry.receivedAt).toISOString(),
            commonLabels: entry.commonLabels,
            totalInstances: everyInstance.length,
        };
        // Optional keys are appended in the same position they occupy in the response contract.
        if (everyInstance.length > INSTANCE_LIMIT) {
            summary.truncated = true;
        }
        if (suggestion) {
            summary.suggestedInvestigation = suggestion;
        }
        summary.instances = instances;
        alerts.push(summary);
    }
    return jsonResult({
        status: "success",
        alertCount: pendingAlerts.length,
        alerts,
    });
}
/**
 * Load alert rules and datasources concurrently and index them for hint building.
 *
 * Best-effort: a rejected request contributes an empty list, so the returned
 * maps may be empty but the call itself does not reject because of it.
 *
 * @returns {Promise<{rulesByUid: Map, rulesByName: Map, dsMap: Map}>}
 *   rules keyed by uid, rules keyed by title, datasources keyed by uid.
 */
async function fetchInvestigationContext(client) {
    const outcomes = await Promise.allSettled([
        client.listAlertRules(),
        client.listDatasources(),
    ]);
    const [rules, datasources] = outcomes.map((outcome) => (outcome.status === "fulfilled" ? outcome.value : []));
    // Later entries overwrite earlier ones on duplicate keys.
    const rulesByUid = new Map(rules.map((rule) => [rule.uid, rule]));
    const rulesByName = new Map(rules.map((rule) => [rule.title, rule]));
    const dsMap = new Map(datasources.map((ds) => [ds.uid, ds]));
    return { rulesByUid, rulesByName, dsMap };
}
/**
 * Action 'acknowledge': mark the alert identified by `alertId` as investigated
 * in the webhook store. Reports an error result when the store does not know the id.
 */
function handleAcknowledge(params) {
    const alertId = readStringParam(params, "alertId", { required: true, label: "Alert ID" });
    const wasKnown = store.acknowledgeAlert(alertId);
    return wasKnown
        ? jsonResult({ status: "acknowledged", alertId })
        : jsonResult({ error: `Alert '${alertId}' not found` });
}
/**
 * Action 'list_rules': list configured alert rules merged with their live
 * evaluation state.
 *
 * Rule definitions are mandatory (a failed fetch yields an error result);
 * evaluation state is best-effort and falls back to "unknown" per rule.
 * With `compact` set, each rule is reduced to {uid, title, state, condition}.
 */
async function handleListRules(client, compact) {
    try {
        // Both requests run concurrently; a rejected one is represented as null.
        const [rulesOutcome, statesOutcome] = await Promise.allSettled([
            client.listAlertRules(),
            client.getAlertRuleStates(),
        ]);
        const rules = rulesOutcome.status === "fulfilled" ? rulesOutcome.value : null;
        const states = statesOutcome.status === "fulfilled" ? statesOutcome.value : null;
        if (!rules) {
            return jsonResult({ error: "Failed to list alert rules — could not reach Grafana provisioning API" });
        }
        if (rules.length === 0) {
            return jsonResult({ status: "success", rules: [], message: "No alert rules configured" });
        }
        // Single projector for both output shapes; key order matches the response contract.
        const describeRule = (rule) => {
            const live = states?.get(rule.uid);
            const state = live ? normalizeState(live.state) : "unknown";
            if (compact) {
                return {
                    uid: rule.uid,
                    title: rule.title,
                    state,
                    condition: extractConditionSummary(rule),
                };
            }
            return {
                uid: rule.uid,
                title: rule.title,
                folder: rule.folderUID,
                ruleGroup: rule.ruleGroup,
                state,
                health: live?.health ?? "unknown",
                lastEvaluation: live?.lastEvaluation ?? null,
                for: rule.for,
                labels: rule.labels,
                annotations: rule.annotations,
                condition: extractConditionSummary(rule),
                updated: rule.updated,
            };
        };
        return jsonResult({
            status: "success",
            ruleCount: rules.length,
            rules: rules.map((rule) => describeRule(rule)),
        });
    }
    catch (failure) {
        const reason = failure instanceof Error ? failure.message : String(failure);
        return jsonResult({ error: `Failed to list alert rules: ${reason}` });
    }
}
/**
 * Action 'delete_rule': delete the alert rule identified by `ruleUid`.
 * A failing delete call is reported as an error result rather than rethrown.
 */
async function handleDeleteRule(client, params) {
    const ruleUid = readStringParam(params, "ruleUid", { required: true, label: "Rule UID" });
    try {
        await client.deleteAlertRule(ruleUid);
        const message = `Alert rule '${ruleUid}' deleted. It will no longer evaluate or fire.`;
        return jsonResult({ status: "deleted", ruleUid, message });
    }
    catch (failure) {
        let reason;
        if (failure instanceof Error) {
            reason = failure.message;
        }
        else {
            reason = String(failure);
        }
        return jsonResult({ error: `Failed to delete alert rule: ${reason}` });
    }
}
/**
 * Action 'silence': create a silence for alerts matching the given label matchers.
 *
 * `params.matchers` must be a non-empty array of `{ name, value, isRegex? }`
 * objects. Anything else (missing, not an array, null entries, non-string
 * name/value) is answered with an error result before Grafana is contacted,
 * instead of surfacing as an uncaught TypeError.
 * `duration` defaults to '2h' and `comment` to a generic investigation note.
 */
async function handleSilence(client, params) {
    const rawMatchers = params.matchers;
    if (!Array.isArray(rawMatchers) || rawMatchers.length === 0) {
        return jsonResult({
            error: "silence requires 'matchers' — an array of label matchers from the alert's commonLabels. Example: [{ name: 'alertname', value: 'HighCost' }]",
        });
    }
    const invalidIndex = rawMatchers.findIndex((m) => typeof m?.name !== "string" || m.name.length === 0 || typeof m.value !== "string");
    if (invalidIndex !== -1) {
        return jsonResult({
            error: `silence matcher at index ${invalidIndex} is invalid — each matcher needs a non-empty string 'name' and a string 'value'. Example: [{ name: 'alertname', value: 'HighCost' }]`,
        });
    }
    // Copy only the supported fields so stray properties are not forwarded to Grafana.
    const matchers = rawMatchers.map(({ name, value, isRegex }) => ({
        name,
        value,
        isRegex: isRegex ?? false,
    }));
    const duration = readStringParam(params, "duration") ?? "2h";
    const comment = readStringParam(params, "comment") ?? "Silenced by agent during investigation";
    try {
        const result = await client.createSilence(matchers, duration, comment);
        // Render regex matchers with `=~` so the confirmation reflects how they match.
        const rendered = matchers
            .map((m) => `${m.name}${m.isRegex ? "=~" : "="}${m.value}`)
            .join(", ");
        return jsonResult({
            status: "silenced",
            silenceId: result.silenceID,
            duration,
            matchers,
            message: `Alerts matching ${rendered} silenced for ${duration}. Use action 'unsilence' with silenceId '${result.silenceID}' to remove.`,
        });
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        return jsonResult({ error: `Failed to create silence: ${reason}` });
    }
}
/**
 * Action 'unsilence': delete the silence identified by `silenceId`.
 * A failing delete call is reported as an error result rather than rethrown.
 */
async function handleUnsilence(client, params) {
    const silenceId = readStringParam(params, "silenceId", { required: true, label: "Silence ID" });
    try {
        await client.deleteSilence(silenceId);
        const message = `Silence '${silenceId}' removed. Alerts will resume notifying.`;
        return jsonResult({ status: "unsilenced", silenceId, message });
    }
    catch (failure) {
        let reason;
        if (failure instanceof Error) {
            reason = failure.message;
        }
        else {
            reason = String(failure);
        }
        return jsonResult({ error: `Failed to remove silence: ${reason}` });
    }
}
/**
 * Action 'setup': make sure the webhook contact point and the
 * managed_by=openclaw notification policy route exist.
 *
 * - Contact point missing: it is created (URL from `webhookUrl`, else
 *   resolveWebhookUrl()), the route is added if absent, status is "created".
 * - Contact point present: nothing is duplicated and status is "already_exists".
 *   The route is still checked, so a previous run that created the contact
 *   point but failed before routing it can be repaired by re-running setup.
 *   That check is best-effort: a repaired route adds `routeCreated: true`, a
 *   failed check adds `routeWarning`; otherwise the result is unchanged.
 */
async function handleSetup(client, params) {
    const webhookUrl = readStringParam(params, "webhookUrl");
    // Adds the managed_by=openclaw route when absent. Resolves to true when the policy tree was updated.
    const ensureRoute = async () => {
        const policyTree = await client.getNotificationPolicies();
        const existingRoutes = policyTree.routes ?? [];
        const hasRoute = existingRoutes.some((r) => r.matchers?.some((m) => m.name === "managed_by" && m.value === "openclaw"));
        if (hasRoute) {
            return false;
        }
        await client.updateNotificationPolicies({
            ...policyTree,
            // New array: the tree returned by the client is not mutated.
            routes: [
                ...existingRoutes,
                {
                    receiver: CONTACT_POINT_NAME,
                    matchers: [{ name: "managed_by", type: "=", value: "openclaw" }],
                    continue: false,
                },
            ],
        });
        return true;
    };
    try {
        // Check if contact point already exists
        const existing = await client.listContactPoints();
        const found = existing.find((cp) => cp.name === CONTACT_POINT_NAME);
        if (found) {
            const extras = {};
            try {
                if (await ensureRoute()) {
                    extras.routeCreated = true;
                }
            }
            catch (routeErr) {
                // The contact point itself is fine; report the routing problem without failing the call.
                const routeReason = routeErr instanceof Error ? routeErr.message : String(routeErr);
                extras.routeWarning = `Could not verify notification policy route: ${routeReason}`;
            }
            return jsonResult({
                status: "already_exists",
                contactPointUid: found.uid,
                message: `Webhook contact point '${CONTACT_POINT_NAME}' already exists`,
                ...extras,
            });
        }
        // Determine webhook URL
        const resolvedUrl = webhookUrl ?? resolveWebhookUrl();
        // Create webhook contact point
        const cp = await client.createContactPoint({
            name: CONTACT_POINT_NAME,
            type: "webhook",
            settings: {
                url: resolvedUrl,
                httpMethod: "POST",
            },
            disableResolveMessage: false,
        });
        // Add notification policy route for managed_by=openclaw alerts
        await ensureRoute();
        return jsonResult({
            status: "created",
            contactPointUid: cp.uid,
            webhookUrl: resolvedUrl,
            message: `Webhook contact point created. Alerts with managed_by=openclaw will notify the agent.`,
        });
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        return jsonResult({ error: `Failed to set up alert webhook: ${reason}` });
    }
}
/**
 * Action 'analyze': classify alert rules for fatigue patterns.
 *
 * Per rule, using its evaluation state (best-effort; "unknown" when unavailable):
 * - `flapping`: health is "error", or state is "nodata"/"error".
 * - `alwaysFiring`: state is "firing" and either the last evaluation is at least
 *   24h old, or a pending alert in the webhook store that maps to the rule was
 *   received more than 24h ago.
 * - everything else is counted as `healthy`.
 *
 * A `lastEvaluation` that cannot be parsed as a date is treated as "age unknown"
 * and decided by the webhook store check only.
 */
async function handleAnalyze(client) {
    try {
        const [rulesOutcome, statesOutcome] = await Promise.allSettled([
            client.listAlertRules(),
            client.getAlertRuleStates(),
        ]);
        const rules = rulesOutcome.status === "fulfilled" ? rulesOutcome.value : null;
        const stateResult = statesOutcome.status === "fulfilled" ? statesOutcome.value : null;
        if (!rules) {
            return jsonResult({ error: "Failed to analyze alert rules — could not reach Grafana provisioning API" });
        }
        if (rules.length === 0) {
            return jsonResult({
                status: "success",
                totalRules: 0,
                fatigueReport: { alwaysFiring: [], flapping: [], healthy: 0 },
                overallHealth: "healthy",
                suggestions: ["No alert rules configured. Use grafana_create_alert to set up monitoring."],
            });
        }
        const MS_PER_HOUR = 60 * 60 * 1000;
        const ALWAYS_FIRING_THRESHOLD_MS = 24 * MS_PER_HOUR; // 24h
        const CHRONIC_SUGGESTION = "Consider raising threshold, adding 'for' duration, or silencing if expected";
        // Pending alerts indexed by rule UID. Built lazily, at most once per call,
        // so the store is only read when a firing rule needs it.
        // The first pending alert seen for a rule wins.
        let pendingByRuleUid = null;
        const findPendingAlert = (ruleUid) => {
            if (pendingByRuleUid === null) {
                pendingByRuleUid = new Map();
                for (const alert of store.getPendingAlerts()) {
                    const generatorURL = alert.alerts?.[0]?.generatorURL;
                    if (!generatorURL) {
                        continue;
                    }
                    const uid = extractRuleUidFromGeneratorUrl(generatorURL);
                    if (uid != null && !pendingByRuleUid.has(uid)) {
                        pendingByRuleUid.set(uid, alert);
                    }
                }
            }
            return pendingByRuleUid.get(ruleUid);
        };
        const alwaysFiring = [];
        const flapping = [];
        let healthy = 0;
        for (const rule of rules) {
            const evalState = stateResult?.get(rule.uid);
            const state = evalState ? normalizeState(evalState.state) : "unknown";
            const health = evalState?.health ?? "unknown";
            // Flapping detection first: rules with error health or nodata state suggest instability
            if (health === "error" || state === "nodata" || state === "error") {
                flapping.push({
                    uid: rule.uid,
                    title: rule.title,
                    state,
                    health,
                    suggestion: state === "nodata"
                        ? "Rule produces no data — check if metric exists and query is correct"
                        : "Rule evaluation error — check datasource connectivity and query syntax",
                });
                continue;
            }
            if (state === "firing" && evalState?.lastEvaluation) {
                const evalAge = Date.now() - new Date(evalState.lastEvaluation).getTime();
                // NaN (unparsable timestamp) fails this comparison on purpose and
                // falls through to the store-based check below.
                if (evalAge >= ALWAYS_FIRING_THRESHOLD_MS) {
                    // Last eval was long ago but state is firing — chronic
                    alwaysFiring.push({
                        uid: rule.uid,
                        title: rule.title,
                        firingDuration: `>${Math.round(evalAge / MS_PER_HOUR)}h`,
                        suggestion: CHRONIC_SUGGESTION,
                    });
                    continue;
                }
                // Recently evaluated and firing: use the age of the matching pending
                // alert in the store as evidence of how long it has been firing.
                const matchingAlert = findPendingAlert(rule.uid);
                if (matchingAlert) {
                    const alertAge = Date.now() - matchingAlert.receivedAt;
                    if (alertAge > ALWAYS_FIRING_THRESHOLD_MS) {
                        alwaysFiring.push({
                            uid: rule.uid,
                            title: rule.title,
                            firingDuration: `${Math.round(alertAge / MS_PER_HOUR)}h`,
                            suggestion: CHRONIC_SUGGESTION,
                        });
                        continue;
                    }
                }
            }
            healthy++;
        }
        const overallHealth = alwaysFiring.length > 3 || flapping.length > 3
            ? "severe_fatigue"
            : (alwaysFiring.length > 0 || flapping.length > 0 ? "moderate_fatigue" : "healthy");
        const suggestions = [];
        if (alwaysFiring.length > 0) {
            suggestions.push(`${alwaysFiring.length} rule(s) always firing — review thresholds or add hysteresis with 'for' duration`);
        }
        if (flapping.length > 0) {
            suggestions.push(`${flapping.length} rule(s) in error/nodata state — review query syntax and datasource connectivity`);
        }
        if (suggestions.length === 0) {
            suggestions.push("All alert rules are healthy. No fatigue detected.");
        }
        return jsonResult({
            status: "success",
            totalRules: rules.length,
            fatigueReport: {
                alwaysFiring,
                flapping,
                healthy,
            },
            overallHealth,
            suggestions,
        });
    }
    catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        return jsonResult({ error: `Failed to analyze alert rules: ${reason}` });
    }
}
}
/**
 * Translate a rule evaluation state into the agent-facing vocabulary.
 * "inactive" (a rule that is not firing) becomes "normal"; every other value
 * is passed through untouched.
 */
function normalizeState(s) {
    return s === "inactive" ? "normal" : s;
}
/**
 * Webhook URL used by the 'setup' action when the caller supplies none:
 * the alerts endpoint of the gateway on localhost.
 */
function resolveWebhookUrl() {
    const LOCAL_GATEWAY_ALERTS_URL = "http://localhost:18789/grafana-lens/alerts";
    return LOCAL_GATEWAY_ALERTS_URL;
}
/**
 * Extract the primary query node (refId "A") from an alert rule's data array.
 *
 * @param rule Alert rule; `rule.data` is expected to be an array of query nodes.
 * @returns `{ datasourceUid, expr }` for node "A", or null when the rule has no
 *   usable `data` array, no node "A", or node "A" has no non-empty string
 *   `model.expr`.
 */
function extractPrimaryQuery(rule) {
    const nodes = rule?.data;
    // A rule without a data array has no primary query. Returning null here keeps
    // one malformed rule from aborting a whole listing with a TypeError.
    if (!Array.isArray(nodes)) {
        return null;
    }
    const queryNode = nodes.find((d) => d?.refId === "A");
    if (!queryNode) {
        return null;
    }
    const expr = queryNode.model?.expr;
    if (typeof expr !== "string" || expr.length === 0) {
        return null;
    }
    return { datasourceUid: queryNode.datasourceUid, expr };
}
/**
 * Produce a short, human-readable description of what an alert rule checks:
 * the expression of its primary query when one is available, otherwise the
 * rule's `condition` value.
 */
function extractConditionSummary(rule) {
    const primary = extractPrimaryQuery(rule);
    if (primary === null || primary === undefined) {
        return rule.condition;
    }
    return primary.expr ?? rule.condition;
}
/**
 * Extract the alert rule UID from a Grafana generator URL.
 *
 * Recognised shapes (anything after edit/view, such as a query string, is ignored):
 * - `<base>/alerting/grafana/<ruleUID>/view` — Grafana-managed rules in current Grafana
 * - `<base>/alerting/<ruleUID>/edit` (or `/view`) — older form
 *
 * @param url Generator URL from an alert instance.
 * @returns The rule UID, or null when `url` is not a string or has no such path.
 */
export function extractRuleUidFromGeneratorUrl(url) {
    if (typeof url !== "string") {
        return null;
    }
    // The optional `grafana/` segment is tried first. The regex backtracks to the
    // legacy form, so a rule whose UID is literally "grafana" still resolves.
    const match = url.match(/\/alerting\/(?:grafana\/)?([^/]+)\/(?:edit|view)/);
    return match?.[1] ?? null;
}
/**
 * Build a suggestedInvestigation hint for a pending alert by matching it to its rule.
 *
 * Rule resolution order:
 * 1. Rule UID extracted from generatorURL of the first alert instance (most precise)
 * 2. Rule title taken from the alert title ("[FIRING:N] RuleName", or the whole title)
 * 3. Rule title taken from the `alertname` label (commonLabels first, then the
 *    first instance's labels). Assumes Grafana-managed alerts carry the rule
 *    name in `alertname`.
 *
 * @param alert Pending alert from the webhook store.
 * @param ctx `{ rulesByUid, rulesByName, dsMap }` lookup maps.
 * @returns `{ datasourceUid, condition, tool, queryLanguage, hint }`, or null when
 *   no rule matches, the rule has no primary query, the query targets `__expr__`
 *   or an unknown datasource, or the datasource type has no supported query tool.
 */
function buildInvestigationHint(alert, ctx) {
    // Resolve the alert rule
    let rule;
    // Try generatorURL first (more precise — contains rule UID)
    const firstInstance = alert.alerts?.[0];
    if (firstInstance?.generatorURL) {
        const ruleUid = extractRuleUidFromGeneratorUrl(firstInstance.generatorURL);
        if (ruleUid) {
            rule = ctx.rulesByUid.get(ruleUid);
        }
    }
    // Fall back to title match. Guarded so an alert without a title cannot throw.
    if (!rule && typeof alert.title === "string") {
        // Alert title format is "[FIRING:N] RuleName" — try to extract the rule name
        const titleMatch = alert.title.match(/\]\s*(.+)$/);
        const ruleName = titleMatch?.[1] ?? alert.title;
        rule = ctx.rulesByName.get(ruleName);
    }
    // Last resort: the alertname label. Titles often carry extra suffixes
    // (label values) that defeat the title match above.
    if (!rule) {
        const alertName = alert.commonLabels?.alertname ?? firstInstance?.labels?.alertname;
        if (typeof alertName === "string") {
            rule = ctx.rulesByName.get(alertName);
        }
    }
    if (!rule) {
        return null;
    }
    // Extract the query expression and datasource from the query node
    const primary = extractPrimaryQuery(rule);
    if (!primary) {
        return null;
    }
    const { datasourceUid: dsUid, expr } = primary;
    // Skip internal expression datasources (__expr__)
    if (!dsUid || dsUid === "__expr__") {
        return null;
    }
    // Look up datasource type for tool routing — skip if datasource not found
    const ds = ctx.dsMap.get(dsUid);
    if (!ds) {
        return null;
    }
    const cap = getQueryCapability(ds.type);
    if (!cap.supported) {
        return null;
    }
    // Name the datasource's actual query language in the hint. PromQL remains the
    // wording when the capability does not report a language.
    const language = cap.queryLanguage ?? "PromQL";
    const hint = language === "LogQL"
        ? `Run this LogQL query with ${cap.queryTool} to investigate. Check for error patterns around the alert trigger time.`
        : `Run this ${language} query with ${cap.queryTool} to reproduce the alert condition. If the metric involves errors, also check logs with grafana_query_logs.`;
    return {
        datasourceUid: dsUid,
        condition: expr,
        tool: cap.queryTool,
        queryLanguage: cap.queryLanguage,
        hint,
    };
}