openclaw-grafana-lens
Version:
OpenClaw plugin that gives AI agents full Grafana access — 18 composable tools for PromQL/LogQL/TraceQL queries, dashboard creation, alerting, SRE investigation, security monitoring, data collection pipeline management via Grafana Alloy (29 recipes), and
940 lines • 102 kB
JavaScript
/**
* Lifecycle Telemetry — gen_ai-compliant session-scoped traces
*
* Converts OpenClaw plugin hook events into hierarchical OTel traces
* following the gen_ai semantic conventions (v1.27.0+).
*
* Trace hierarchy (all spans share the same trace_id per session):
* invoke_agent openclaw (root, INTERNAL)
* +-- chat {model} (CLIENT)
* +-- execute_tool {toolName} (INTERNAL, sibling of chat)
* +-- chat {model} (CLIENT, next turn)
* +-- openclaw.compaction (INTERNAL)
* +-- openclaw.subagent.spawn {agentId} (INTERNAL)
* +-- openclaw.agent.end (INTERNAL)
*
* Context propagation: Explicit — no global TracerProvider.
* Uses trace.setSpan(ROOT_CONTEXT, parentSpan) + 3rd arg to startSpan().
*
* Also records gen_ai standard metrics:
* - gen_ai.client.token.usage (histogram)
* - gen_ai.client.operation.duration (histogram, seconds)
*/
import { trace, ROOT_CONTEXT, SpanKind, SpanStatusCode, TraceFlags } from "@opentelemetry/api";
import { SeverityNumber } from "@opentelemetry/api-logs";
import { redactSecrets, flattenLogKeys } from "./redact.js";
// ══════════════════════════════════════════════════════════════════════
// Helpers: extract gen_ai fields from lastAssistant (AssistantMessage)
// Defensive narrowing — lastAssistant comes from pi-ai via hook events
// ══════════════════════════════════════════════════════════════════════
/** Extract the actual response model from lastAssistant (may differ from request model) */
export function extractResponseModel(lastAssistant) {
if (lastAssistant &&
typeof lastAssistant === "object" &&
"model" in lastAssistant &&
typeof lastAssistant.model === "string") {
return lastAssistant.model;
}
return undefined;
}
/** Map pi-ai stopReason to gen_ai finish_reason values */
const STOP_REASON_MAP = {
stop: "stop",
length: "max_tokens",
toolUse: "tool_calls",
error: "error",
aborted: "stop",
};
export function extractFinishReason(lastAssistant) {
if (lastAssistant &&
typeof lastAssistant === "object" &&
"stopReason" in lastAssistant &&
typeof lastAssistant.stopReason === "string") {
return STOP_REASON_MAP[lastAssistant.stopReason];
}
return undefined;
}
// gen_ai tool descriptions — hardcoded for our 14 tools, empty for external tools
const TOOL_DESCRIPTIONS = {
grafana_create_dashboard: "Create dashboard from template or custom JSON spec",
grafana_update_dashboard: "Add, remove, or update panels on an existing dashboard",
grafana_query: "Run PromQL instant/range queries against Prometheus datasources",
grafana_query_logs: "Run LogQL queries against Loki datasources",
grafana_create_alert: "Create Grafana-native alert rules with PromQL conditions",
grafana_share_dashboard: "Render panels as PNG and deliver to messaging channels",
grafana_annotate: "Create or query annotations on dashboards",
grafana_explore_datasources: "Discover datasources configured in Grafana",
grafana_list_metrics: "Discover available metrics from Prometheus datasources",
grafana_search: "Search existing dashboards by title or tag",
grafana_get_dashboard: "Get compact dashboard summary with panels and queries",
grafana_check_alerts: "Check, acknowledge, or set up alert webhook notifications",
grafana_push_metrics: "Push custom data via OTLP for external data observatory",
grafana_explain_metric: "Get metric context: current value, trend, stats, metadata",
grafana_security_check: "Run comprehensive security health check with threat-level assessment",
};
/** Truncate a string to maxLen characters, appending "..." if truncated */
function truncate(s, maxLen) {
return s.length > maxLen ? s.slice(0, maxLen) + "..." : s;
}
/** Truncate for span names: uses "…" (single char) to save space in Tempo waterfall */
function truncateForSpanName(text, maxLen) {
if (text.length <= maxLen)
return text;
return text.slice(0, maxLen - 1) + "\u2026";
}
// Approximate pricing weights (relative cost per million tokens)
// Used for cost attribution when exact per-type costs aren't available
const TOKEN_COST_WEIGHTS = {
input: 15, // $15/MTok
output: 75, // $75/MTok
cacheRead: 1.5, // $1.5/MTok
cacheWrite: 18.75, // $18.75/MTok
};
/** Compute p95 from an array of numeric values */
function computeP95(values) {
if (values.length === 0)
return 0;
const sorted = [...values].sort((a, b) => a - b);
const idx = Math.ceil(sorted.length * 0.95) - 1;
return sorted[Math.max(0, idx)];
}
// Safety: force-close sessions older than 24h to prevent memory leaks
const SESSION_MAX_AGE_MS = 24 * 60 * 60 * 1000;
// Max entries in per-session latency reservoir (for P95 computation)
const LATENCY_RESERVOIR_SIZE = 200;
// Max entries in global rolling latency window (for getAvgLatencyMs gauge)
const LATENCY_WINDOW_SIZE = 100;
// Cost thresholds for SRE severity intelligence (Part 6)
const COST_THRESHOLDS = [1, 5, 10];
export function createLifecycleTelemetry(traces, logs, instruments, opts) {
const { tracer } = traces;
// Resolved config options with defaults
const captureContent = opts?.captureContent !== false;
const contentMaxLen = opts?.contentMaxLength ?? 2000;
const shouldRedact = opts?.redactSecrets !== false;
const costEstimator = opts?.costEstimator;
/** Prepare content for capture: truncate + optionally redact */
function prepareContent(text) {
let result = text;
if (shouldRedact)
result = redactSecrets(result);
return truncate(result, contentMaxLen);
}
// ── Security: prompt injection detection patterns ────────────────────
// Copied from openclaw's external-content.ts (src/security/external-content.ts).
// Local copy avoids runtime dependency on an internal module.
// Detection-only — never blocks; only increments a counter for human review.
const SUSPICIOUS_PATTERNS = [
/ignore\s+(all\s+)?(previous|prior|above)\s+(instructions|prompts|context)/i,
/disregard\s+(all\s+)?(previous|prior|above)/i,
/forget\s+(everything|all|your)\s+(previous|prior|above)/i,
/you\s+are\s+now\s+(a|an|my)\s+/i,
/new\s+instructions?:\s*/i,
/system\s*:\s*you\s+are/i,
/\[SYSTEM\]/i,
/\<\|?(im_start|system|endoftext)\|?\>/i,
/```\s*(system|prompt|injection)/i,
/act\s+as\s+(a|an|if)\s+/i,
/pretend\s+(you\s+are|to\s+be)\s+/i,
/override\s+(your|the|all)\s+(instructions|rules|guidelines)/i,
];
/** Classify tool errors into broad categories for security monitoring. */
function classifyToolError(error) {
if (/ECONNREFUSED|ETIMEDOUT|ENOTFOUND|fetch.*fail/i.test(error))
return "network";
if (/ENOENT|EACCES|path|directory|traversal/i.test(error))
return "filesystem";
if (/timeout|timed?\s*out/i.test(error))
return "timeout";
return "other";
}
// ── Security: unique sessions sliding window (1h) ──────────────────
const uniqueSessionMap = new Map(); // sessionId → latest timestamp
const UNIQUE_SESSION_WINDOW_MS = 60 * 60 * 1000; // 1 hour
function evictStaleSessions() {
const cutoff = Date.now() - UNIQUE_SESSION_WINDOW_MS;
for (const [id, ts] of uniqueSessionMap) {
if (ts < cutoff)
uniqueSessionMap.delete(id);
}
}
function trackUniqueSession(sessionId) {
uniqueSessionMap.set(sessionId, Date.now());
evictStaleSessions();
}
function getUniqueSessionCount() {
return uniqueSessionMap.size;
}
// State maps for span correlation
const activeSessions = new Map();
const activeLlmCalls = new Map();
const sessionKeyToId = new Map();
const activeCompactions = new Map();
const activeToolCalls = new Map();
// Dual-path trace fallback: model.usage → synthetic chat spans when hooks broken
// Grace-period trigger: a single cold-start model.usage event that arrives before the
// first llm_input hook is expected (event ordering, plugin init timing). Only warn
// after FALLBACK_WARN_THRESHOLD orphaned events — past that, hooks are really broken.
let llmHooksActive = false;
let fallbackModeLogged = false;
let fallbackOrphanedCount = 0;
const FALLBACK_WARN_THRESHOLD = 2;
let unsubscribeDiagnostic = null;
const pendingChildren = new Map(); // childSessionKey → parent info
const activeSubagentSpawns = new Map(); // childSessionKey → spawn span + metadata
const parentToChildren = new Map(); // parentSessionId → Set<childSessionId>
const childToParent = new Map(); // childSessionId → parentSessionId
// Sliding window for global rolling average latency (getAvgLatencyMs gauge)
const latencyWindow = [];
// Safety cleanup interval
const cleanupInterval = setInterval(() => {
const now = Date.now();
for (const [id, session] of activeSessions) {
if (now - session.startTime > SESSION_MAX_AGE_MS) {
session.rootSpan.setStatus({ code: SpanStatusCode.ERROR, message: "session timed out (24h safety limit)" });
session.rootSpan.end(now);
activeSessions.delete(id);
// Clean up reverse map
for (const [key, sid] of sessionKeyToId) {
if (sid === id)
sessionKeyToId.delete(key);
}
}
}
// Clean finalized sessions (deferred from finalizeSession to allow late-arriving hooks)
for (const [id, session] of activeSessions) {
if (!session.finalized)
continue;
// Safety: emit deferred summary before deleting (if LLM call never resolved)
if (session.pendingSummary && !session.summaryEmitted) {
const durationMs = session.finalDurationMs ?? (now - session.startTime);
emitSessionSummary(session, durationMs, "final", session.rootSpan);
session.summaryEmitted = true;
}
activeSessions.delete(id);
for (const [key, sid] of sessionKeyToId) {
if (sid === id)
sessionKeyToId.delete(key);
}
// Clean parent-child maps (prevents unbounded growth)
const parentId = childToParent.get(id);
if (parentId) {
parentToChildren.get(parentId)?.delete(id);
if (parentToChildren.get(parentId)?.size === 0)
parentToChildren.delete(parentId);
childToParent.delete(id);
}
const children = parentToChildren.get(id);
if (children) {
for (const childId of children)
childToParent.delete(childId);
parentToChildren.delete(id);
}
}
// Clean stale pending children (subagent spawned but never linked)
for (const [key, info] of pendingChildren) {
if (now - info.spawnTime > SESSION_MAX_AGE_MS) {
pendingChildren.delete(key);
// End orphaned spawn span if still open
const orphanEntry = activeSubagentSpawns.get(key);
if (orphanEntry) {
orphanEntry.span.setStatus({ code: SpanStatusCode.ERROR, message: "subagent spawn timed out (24h safety limit)" });
orphanEntry.span.end(now);
activeSubagentSpawns.delete(key);
}
}
}
// Clean stale tool call spans (before_tool_call without after_tool_call for >5min)
for (const [key, stack] of activeToolCalls) {
const filtered = stack.filter(entry => now - entry.startTime < 5 * 60_000);
// End orphaned spans before removing
for (const entry of stack) {
if (now - entry.startTime >= 5 * 60_000) {
entry.span.setAttribute("openclaw.stale_cleanup", true);
entry.span.setStatus({ code: SpanStatusCode.ERROR, message: "tool call timed out (5min)" });
entry.span.end(now);
}
}
if (filtered.length > 0)
activeToolCalls.set(key, filtered);
else
activeToolCalls.delete(key);
}
}, 60_000);
if (cleanupInterval.unref)
cleanupInterval.unref();
// ── Dual-path trace fallback: model.usage → synthetic chat spans ──
// When openclaw's llm_input/llm_output hooks are broken (e.g., v2026.3.31-4.1),
// model.usage diagnostic events still fire via a separate pipeline.
// Subscribe as an always-on second data source that activates only when hooks are silent.
if (opts?.onDiagnosticEvent) {
unsubscribeDiagnostic = opts.onDiagnosticEvent((evt) => {
if (evt.type !== "model.usage")
return;
// When hooks work, they handle everything — model.usage is dormant
if (llmHooksActive)
return;
// Grace period: tolerate cold-start race where model.usage arrives before the
// first llm_input. Only warn after repeated orphaned events past the threshold.
// Gate the counter so it stops incrementing once the warn has fired — keeps
// the value bounded and stable (matches the ${count} interpolated in the log body).
if (!fallbackModeLogged) {
fallbackOrphanedCount += 1;
if (fallbackOrphanedCount >= FALLBACK_WARN_THRESHOLD) {
fallbackModeLogged = true;
emitLog(SeverityNumber.WARN, "WARN", `LLM hook dispatch appears broken — activating model.usage fallback for trace generation after ${fallbackOrphanedCount} orphaned events. ` +
"Trace fidelity will be reduced. " +
"On openclaw >= 2026.4.24 this most often means the privacy gate is active: " +
"set plugins.entries.openclaw-grafana-lens.hooks.allowConversationAccess=true in ~/.openclaw/openclaw.json " +
"and restart the gateway to enable llm_input/llm_output/agent_end hooks. " +
"If the warning persists after that, file a bug at github.com/awsome-o/grafana-lens/issues with your openclaw version.", {
"event.domain": "openclaw",
"event.name": "trace.fallback_activated",
"openclaw.trace_source": "fallback_model_usage",
"openclaw.fallback.orphaned_count": fallbackOrphanedCount,
});
}
}
const now = Date.now();
const model = evt.model ?? "unknown";
const provider = evt.provider ?? "unknown";
const durationMs = evt.durationMs ?? 0;
const startTime = durationMs > 0 ? now - durationMs : now;
// Resolve parent session for span hierarchy
const session = resolveSessionCtx(evt.sessionId, evt.sessionKey);
const parentCtx = session?.ctx ?? ROOT_CONTEXT;
// Create synthetic chat span (backdated by durationMs for accurate waterfall)
const inTok = evt.usage?.input ?? 0;
const outTok = evt.usage?.output ?? 0;
const span = tracer.startSpan(`chat ${model} (${inTok}\u2192${outTok} tok)`, {
kind: SpanKind.CLIENT,
startTime,
attributes: {
"gen_ai.operation.name": "chat",
"gen_ai.provider.name": provider,
"gen_ai.request.model": model,
"gen_ai.usage.input_tokens": inTok,
"gen_ai.usage.output_tokens": outTok,
"gen_ai.usage.cache_read.input_tokens": evt.usage?.cacheRead ?? 0,
"gen_ai.usage.cache_creation.input_tokens": evt.usage?.cacheWrite ?? 0,
"openclaw.trace_fallback": true,
"openclaw.trace_source": "fallback_model_usage",
...(evt.sessionKey ? { "openclaw.session_key": evt.sessionKey } : {}),
...(evt.sessionId ? { "openclaw.session_id": evt.sessionId } : {}),
},
}, parentCtx);
span.setStatus({ code: SpanStatusCode.OK });
span.end(now);
// Record gen_ai standard metrics (same pattern as onLlmOutput)
const metricAttrs = {
"gen_ai.operation.name": "chat",
"gen_ai.provider.name": provider,
"gen_ai.request.model": model,
};
if (inTok > 0)
instruments.tokenUsage.record(inTok, { ...metricAttrs, "gen_ai.token.type": "input" });
if (outTok > 0)
instruments.tokenUsage.record(outTok, { ...metricAttrs, "gen_ai.token.type": "output" });
if (evt.usage?.cacheRead)
instruments.tokenUsage.record(evt.usage.cacheRead, { ...metricAttrs, "gen_ai.token.type": "cache_read_input" });
if (evt.usage?.cacheWrite)
instruments.tokenUsage.record(evt.usage.cacheWrite, { ...metricAttrs, "gen_ai.token.type": "cache_creation_input" });
if (durationMs > 0) {
instruments.operationDuration.record(durationMs / 1000, metricAttrs);
}
// Accumulate session data (tokens, cost, latency)
if (session && !session.finalized) {
session.totalInputTokens += inTok;
session.totalOutputTokens += outTok;
session.totalCacheReadTokens += evt.usage?.cacheRead ?? 0;
session.totalCacheWriteTokens += evt.usage?.cacheWrite ?? 0;
session.messageCountAssistant++;
if (model !== "unknown")
session.primaryModel = model;
if (provider !== "unknown")
session.primaryProvider = provider;
// Cost: prefer evt.costUsd (most authoritative), else costEstimator
let costUsd;
if (evt.costUsd != null && evt.costUsd > 0) {
costUsd = evt.costUsd;
}
else if (costEstimator) {
costUsd = costEstimator(provider, model, evt.usage);
}
if (costUsd && costUsd > 0) {
const prevCost = session.totalCostUsd;
session.totalCostUsd += costUsd;
// SRE cost threshold alerts (same logic as onLlmOutput)
for (const threshold of COST_THRESHOLDS) {
if (prevCost < threshold && session.totalCostUsd >= threshold && !session.costThresholdsLogged.has(threshold)) {
session.costThresholdsLogged.add(threshold);
const sev = threshold >= 10 ? SeverityNumber.ERROR
: threshold >= 5 ? SeverityNumber.WARN
: SeverityNumber.INFO;
const sevText = threshold >= 10 ? "ERROR" : threshold >= 5 ? "WARN" : "INFO";
const suffix = threshold >= 10 ? " \u2014 investigate" : "";
emitLog(sev, sevText, `Session cost crossed $${threshold.toFixed(2)}${suffix}`, {
"event.domain": "openclaw",
"event.name": "cost.threshold",
"openclaw.session_id": session.sessionId,
"openclaw.session_key": session.sessionKey,
"openclaw.cost_usd": session.totalCostUsd,
"openclaw.threshold_usd": threshold,
"openclaw.trace_source": "fallback_model_usage",
}, session.rootSpan);
}
}
}
// Latency tracking
if (durationMs > 0) {
session.latencies.push(durationMs);
if (session.latencies.length > LATENCY_RESERVOIR_SIZE)
session.latencies.shift();
session.latencySum += durationMs;
session.latencyCount++;
session.latencyMin = Math.min(session.latencyMin, durationMs);
session.latencyMax = Math.max(session.latencyMax, durationMs);
latencyWindow.push(durationMs);
if (latencyWindow.length > LATENCY_WINDOW_SIZE)
latencyWindow.shift();
}
}
// Increment fallback counter metric
instruments.traceFallbackSpans.add(1, { model, provider });
});
}
// ── Helper: build child summary attributes for session summary logs ──
function childSummaryAttrs(sessionId) {
const childIds = parentToChildren.get(sessionId);
if (!childIds?.size)
return {};
return {
"openclaw.child_session_ids": [...childIds].join(","),
"openclaw.child_count": childIds.size,
"openclaw.has_children": true,
};
}
// ── Helper: emit correlated log record ──────────────────────────────
function emitLog(severity, severityText, body, attributes, span) {
// Component label for Loki filtering: {service_name="openclaw"} | component="lifecycle"
attributes["component"] = "lifecycle";
if (span) {
// Keep string attrs for LogQL filtering: `| trace_id = "abc"`
attributes["trace_id"] = span.spanContext().traceId;
attributes["span_id"] = span.spanContext().spanId;
}
// Flatten dotted keys → underscores for Loki structured metadata compatibility
const flat = flattenLogKeys(attributes);
logs.logger.emit({
severityNumber: severity,
severityText,
body: shouldRedact ? redactSecrets(body) : body,
attributes: flat,
// Also pass OTel Context → SDK populates proto-level LogRecord.TraceId/SpanId
// This is the canonical OTLP way; Loki stores it in structured metadata
...(span ? { context: trace.setSpan(ROOT_CONTEXT, span) } : {}),
});
}
// ── Helper: resolve session context by sessionId or sessionKey ──────
function resolveSessionCtx(sessionId, sessionKey) {
if (sessionId) {
const direct = activeSessions.get(sessionId);
if (direct)
return direct;
}
if (sessionKey) {
const id = sessionKeyToId.get(sessionKey);
if (id)
return activeSessions.get(id);
// Fallback: sessionKey might be a sessionId itself
const direct = activeSessions.get(sessionKey);
if (direct)
return direct;
}
return undefined;
}
// ── Helper: resolve parent context for a span ───────────────────────
function resolveParentCtx(sessionId, sessionKey) {
const session = resolveSessionCtx(sessionId, sessionKey);
return session?.ctx ?? ROOT_CONTEXT;
}
// ── Helper: resolve session IDs for consistent log attributes ──────
function resolveSessionIds(sessionId, sessionKey) {
const session = resolveSessionCtx(sessionId, sessionKey);
return {
"openclaw.session_id": session?.sessionId ?? sessionId ?? "",
"openclaw.session_key": session?.sessionKey ?? sessionKey ?? "",
};
}
// ── Helper: resolve child session ID from a session key ──────────────
function resolveChildSessionId(childSessionKey) {
const childSessionId = sessionKeyToId.get(childSessionKey);
if (childSessionId)
return childSessionId;
// Check if the sessionKey is itself a sessionId
if (activeSessions.has(childSessionKey))
return childSessionKey;
return undefined;
}
// ── Fallback: pick the best active session when sessionKey is missing ──
// This handles the case where openclaw's hook system does NOT pass
// sessionKey/agentId in the tool call hook context for plugin-registered
// tools (confirmed in pi-tool-definition-adapter.ts).
// If 1 active session → return it (CLI / single-user case).
// If multiple → pick the one with the most recent LLM activity.
function resolveAnyActiveSession() {
if (activeSessions.size === 0)
return undefined;
if (activeSessions.size === 1)
return activeSessions.values().next().value;
// Multiple sessions: pick the one most likely to be executing tools right now.
// Heuristic: session with the most accumulated LLM latency (= most active).
let best;
let bestLatencySum = -1;
for (const session of activeSessions.values()) {
if (session.latencySum > bestLatencySum) {
bestLatencySum = session.latencySum;
best = session;
}
}
return best;
}
function computeSessionStats(session) {
const latencyCount = session.latencyCount;
const avgLatencyMs = latencyCount > 0 ? session.latencySum / latencyCount : 0;
const p95LatencyMs = computeP95(session.latencies);
const minLatencyMs = latencyCount > 0 ? session.latencyMin : 0;
const maxLatencyMs = latencyCount > 0 ? session.latencyMax : 0;
const totalToolCalls = [...session.toolCounts.values()].reduce((a, b) => a + b, 0);
const uniqueToolCount = session.toolCounts.size;
const topTools = [...session.toolCounts.entries()]
.sort((a, b) => b[1] - a[1]).slice(0, 5).map(([name]) => name).join(",");
const totalTokens = session.totalInputTokens + session.totalOutputTokens +
session.totalCacheReadTokens + session.totalCacheWriteTokens;
const totalMessages = session.messageCountUser + session.messageCountAssistant +
session.messageCountToolCalls + session.messageCountToolResults +
session.messageCountErrors;
const activeDurationMs = session.latencySum + session.totalToolDurationMs;
const W = TOKEN_COST_WEIGHTS;
const weightedSum = (session.totalInputTokens * W.input) +
(session.totalOutputTokens * W.output) +
(session.totalCacheReadTokens * W.cacheRead) +
(session.totalCacheWriteTokens * W.cacheWrite);
const costInput = weightedSum > 0
? (session.totalInputTokens * W.input / weightedSum) * session.totalCostUsd : 0;
const costOutput = weightedSum > 0
? (session.totalOutputTokens * W.output / weightedSum) * session.totalCostUsd : 0;
const costCacheRead = weightedSum > 0
? (session.totalCacheReadTokens * W.cacheRead / weightedSum) * session.totalCostUsd : 0;
const costCacheWrite = weightedSum > 0
? (session.totalCacheWriteTokens * W.cacheWrite / weightedSum) * session.totalCostUsd : 0;
const cacheInputTotal = session.totalInputTokens + session.totalCacheReadTokens;
const cacheHitRatio = cacheInputTotal > 0
? session.totalCacheReadTokens / cacheInputTotal : 0;
const cacheSavingsUsd = session.totalCacheReadTokens *
(W.input - W.cacheRead) / 1_000_000;
return {
latencyCount, avgLatencyMs, p95LatencyMs, minLatencyMs, maxLatencyMs,
totalToolCalls, uniqueToolCount, topTools, totalTokens, totalMessages,
activeDurationMs, costInput, costOutput, costCacheRead, costCacheWrite,
cacheHitRatio, cacheSavingsUsd,
};
}
// ── Helper: emit session usage summary log (reusable for interim + final) ──
function emitSessionSummary(session, durationMs, summaryType, span) {
const s = computeSessionStats(session);
const costStr = session.totalCostUsd > 0 ? `$${session.totalCostUsd.toFixed(2)}` : "$0";
const typeTag = summaryType === "interim" ? " (interim)" : "";
const summaryBody = `Session ${session.sessionId}${typeTag} | ${durationMs}ms (active: ${Math.round(s.activeDurationMs)}ms) | ${s.totalMessages} msgs (${session.messageCountUser} user, ${session.messageCountAssistant} assistant, ${session.messageCountToolCalls} tool) | ${costStr} | ${s.totalTokens.toLocaleString()}tok`;
emitLog(SeverityNumber.INFO, "INFO", summaryBody, {
"event.domain": "openclaw",
"event.name": "usage.session_summary",
"openclaw.summary.type": summaryType,
// Session identity
"openclaw.session_id": session.sessionId,
"openclaw.session_key": session.sessionKey,
"openclaw.agent_id": session.agentId,
// Duration
"openclaw.duration_ms": durationMs,
"openclaw.active_duration_ms": Math.round(s.activeDurationMs),
// Message type breakdown
"openclaw.messages.total": s.totalMessages,
"openclaw.messages.user": session.messageCountUser,
"openclaw.messages.assistant": session.messageCountAssistant,
"openclaw.messages.tool_calls": session.messageCountToolCalls,
"openclaw.messages.tool_results": session.messageCountToolResults,
"openclaw.messages.errors": session.messageCountErrors,
// Tool usage
"openclaw.tools.total_calls": s.totalToolCalls,
"openclaw.tools.unique_count": s.uniqueToolCount,
"openclaw.tools.top": s.topTools,
// Latency distribution
"openclaw.latency.count": s.latencyCount,
"openclaw.latency.avg_ms": Math.round(s.avgLatencyMs),
"openclaw.latency.p95_ms": Math.round(s.p95LatencyMs),
"openclaw.latency.min_ms": Math.round(s.minLatencyMs),
"openclaw.latency.max_ms": Math.round(s.maxLatencyMs),
// Token totals
"openclaw.tokens.input": session.totalInputTokens,
"openclaw.tokens.output": session.totalOutputTokens,
"openclaw.tokens.cache_read": session.totalCacheReadTokens,
"openclaw.tokens.cache_write": session.totalCacheWriteTokens,
"openclaw.tokens.total": s.totalTokens,
// Cost
"openclaw.cost.total": Number(session.totalCostUsd.toFixed(4)),
"openclaw.cost.input": Number(s.costInput.toFixed(4)),
"openclaw.cost.output": Number(s.costOutput.toFixed(4)),
"openclaw.cost.cache_read": Number(s.costCacheRead.toFixed(4)),
"openclaw.cost.cache_write": Number(s.costCacheWrite.toFixed(4)),
// Cache efficiency
"openclaw.cache.hit_ratio": Number(s.cacheHitRatio.toFixed(4)),
"openclaw.cache.savings_usd": Number(s.cacheSavingsUsd.toFixed(4)),
// Model + channel
"gen_ai.provider.name": session.primaryProvider,
"gen_ai.request.model": session.primaryModel,
...(session.channel ? { "openclaw.channel": session.channel } : {}),
// Subagent hierarchy attributes
...(session.isSubagent && session.parentSessionId ? {
"openclaw.is_subagent": true,
"openclaw.parent_session_id": session.parentSessionId,
} : {}),
...childSummaryAttrs(session.sessionId),
}, span ?? session.rootSpan);
}
// ── Helper: finalize a session (close root span, emit FINAL, cleanup) ──
// Idempotent: checks session.finalized flag. Called by onAgentEnd (primary)
// and onSessionEnd (fallback).
function finalizeSession(session, durationMs, errorMsg, deferSummary = false) {
if (session.finalized)
return;
session.finalized = true;
const now = Date.now();
const s = computeSessionStats(session);
// ── Enrich session root span with summary ────────────────────
{
const costTag = session.totalCostUsd > 0 ? ` $${session.totalCostUsd.toFixed(2)}` : "";
const toolTag = s.totalToolCalls > 0 ? ` ${s.totalToolCalls} tools` : "";
const modelTag = session.primaryModel ? ` [${session.primaryModel}]` : "";
session.rootSpan.updateName(`invoke_agent openclaw [${session.sessionId}]${modelTag} ${s.totalMessages} msgs${toolTag}${costTag}`);
}
// ── Enrich session root span with 20+ attributes ────────────
session.rootSpan.setAttributes({
"openclaw.session.duration_ms": durationMs,
"openclaw.session.active_duration_ms": Math.round(s.activeDurationMs),
"openclaw.session.cost_usd": session.totalCostUsd,
"openclaw.session.total_input_tokens": session.totalInputTokens,
"openclaw.session.total_output_tokens": session.totalOutputTokens,
"openclaw.session.total_cache_read_tokens": session.totalCacheReadTokens,
"openclaw.session.total_cache_write_tokens": session.totalCacheWriteTokens,
"openclaw.session.messages.user": session.messageCountUser,
"openclaw.session.messages.assistant": session.messageCountAssistant,
"openclaw.session.messages.tool_calls": session.messageCountToolCalls,
"openclaw.session.messages.tool_results": session.messageCountToolResults,
"openclaw.session.messages.errors": session.messageCountErrors,
"openclaw.session.latency.avg_ms": Math.round(s.avgLatencyMs),
"openclaw.session.latency.p95_ms": Math.round(s.p95LatencyMs),
"openclaw.session.latency.min_ms": Math.round(s.minLatencyMs),
"openclaw.session.latency.max_ms": Math.round(s.maxLatencyMs),
"openclaw.session.tools.unique_count": s.uniqueToolCount,
"openclaw.session.tools.total_calls": s.totalToolCalls,
"openclaw.session.tools.top": s.topTools,
"openclaw.session.cost.input": Number(s.costInput.toFixed(4)),
"openclaw.session.cost.output": Number(s.costOutput.toFixed(4)),
"openclaw.session.cost.cache_read": Number(s.costCacheRead.toFixed(4)),
"openclaw.session.cost.cache_write": Number(s.costCacheWrite.toFixed(4)),
"openclaw.session.cache_hit_ratio": Number(s.cacheHitRatio.toFixed(4)),
"openclaw.session.cache_savings_usd": Number(s.cacheSavingsUsd.toFixed(4)),
"gen_ai.agent.name": "openclaw",
"gen_ai.agent.id": session.agentId,
"gen_ai.conversation.id": session.sessionId,
"gen_ai.provider.name": session.primaryProvider,
"gen_ai.request.model": session.primaryModel,
});
if (errorMsg) {
session.rootSpan.setStatus({ code: SpanStatusCode.ERROR, message: errorMsg });
}
else {
session.rootSpan.setStatus({ code: SpanStatusCode.OK });
}
session.rootSpan.end(now);
// Record session duration + completion outcome metrics
instruments.sessionDurationMs.record(durationMs);
instruments.sessionsCompleted.add(1, { outcome: errorMsg ? "error" : "success" });
// Emit or defer final session usage summary
if (deferSummary) {
// LLM calls still in-flight — defer summary until last one resolves
session.pendingSummary = true;
session.finalDurationMs = durationMs;
}
else {
// No pending LLM calls — emit immediately
emitSessionSummary(session, durationMs, "final", session.rootSpan);
session.summaryEmitted = true;
}
// NOTE: Do NOT delete from activeSessions/sessionKeyToId here.
// Late-arriving hooks (e.g. llm_output after agent_end) still need to
// resolve the session to accumulate tokens. The 60s cleanup timer
// handles map cleanup for finalized sessions.
}
return {
// ── session_start → root span ───────────────────────────────────
onSessionStart(event, ctx) {
// Guard: if session already exists, skip (prevents orphaned root spans in Tempo)
if (activeSessions.has(event.sessionId))
return;
const now = Date.now();
const rootSpan = tracer.startSpan(`invoke_agent openclaw [${event.sessionId}]`, {
kind: SpanKind.INTERNAL,
startTime: now,
attributes: {
"gen_ai.operation.name": "invoke_agent",
"gen_ai.provider.name": "openclaw",
"gen_ai.agent.name": "openclaw",
"gen_ai.agent.id": "grafana-lens",
"gen_ai.output.type": "text",
"gen_ai.conversation.id": event.sessionId,
...(opts?.agentVersion ? { "gen_ai.agent.version": opts.agentVersion } : {}),
...(event.resumedFrom ? { "openclaw.session.resumed_from": event.resumedFrom } : {}),
},
});
const sessionCtx = trace.setSpan(ROOT_CONTEXT, rootSpan);
activeSessions.set(event.sessionId, {
rootSpan,
ctx: sessionCtx,
sessionId: event.sessionId,
startTime: now,
totalCostUsd: 0,
totalInputTokens: 0,
totalOutputTokens: 0,
totalCacheReadTokens: 0,
totalCacheWriteTokens: 0,
messageCountUser: 0,
messageCountAssistant: 0,
messageCountToolCalls: 0,
messageCountToolResults: 0,
messageCountErrors: 0,
toolCounts: new Map(),
toolErrorCounts: new Map(),
totalToolDurationMs: 0,
latencies: [],
latencySum: 0,
latencyMin: Infinity,
latencyMax: 0,
latencyCount: 0,
primaryModel: "",
primaryProvider: "",
channel: "",
sessionKey: "",
agentId: ctx.agentId ?? "",
costThresholdsLogged: new Set(),
firstMessageCaptured: false,
finalized: false,
isSubagent: false,
parentLinked: false,
});
// Map agentId as a sessionKey alias if available
if (ctx.agentId) {
sessionKeyToId.set(ctx.agentId, event.sessionId);
}
instruments.sessionsStartedTotal.add(1, {
type: event.resumedFrom ? "resumed" : "new",
});
// Security: track unique sessions for enumeration detection
trackUniqueSession(event.sessionId);
const resumedTag = event.resumedFrom ? " (resumed)" : "";
const agentTag = ctx.agentId ? ` [agent:${ctx.agentId}]` : "";
emitLog(SeverityNumber.INFO, "INFO", `Session started ${event.sessionId}${resumedTag}${agentTag}`, {
"event.domain": "openclaw",
"event.name": "session.start",
"openclaw.session_id": event.sessionId,
...(event.resumedFrom ? { "openclaw.resumed_from": event.resumedFrom } : {}),
}, rootSpan);
},
// ── session_end → close root span ─────────────────────────────────
onSessionEnd(event) {
const session = activeSessions.get(event.sessionId);
if (!session)
return; // graceful: end without start
if (session.finalized) {
// Safety: emit deferred summary if LLM call never resolved
if (session.pendingSummary && !session.summaryEmitted) {
const durationMs = session.finalDurationMs ?? (Date.now() - session.startTime);
emitSessionSummary(session, durationMs, "final", session.rootSpan);
session.summaryEmitted = true;
}
return;
}
const durationMs = event.durationMs ?? (Date.now() - session.startTime);
finalizeSession(session, durationMs);
},
// ── llm_input → start LLM call span ──────────────────────────────
onLlmInput(event, ctx) {
// Dual-path: latch hooks as active — disables model.usage fallback.
// Reset orphan counter and logged flag so a future hook outage is detected fresh.
if (!llmHooksActive) {
llmHooksActive = true;
const wasFallbackActive = fallbackModeLogged;
fallbackOrphanedCount = 0;
fallbackModeLogged = false;
if (wasFallbackActive) {
emitLog(SeverityNumber.INFO, "INFO", "LLM hooks restored — deactivating model.usage fallback", {
"event.domain": "openclaw",
"event.name": "trace.fallback_deactivated",
});
}
}
// Lazy session creation: if session_start was missed (fires before service init),
// create a synthetic root span so all subsequent spans have correct parenting.
let session = resolveSessionCtx(event.sessionId, ctx.sessionKey);
if (!session && event.sessionId) {
const synthNow = Date.now();
const rootSpan = tracer.startSpan(`invoke_agent openclaw [${event.sessionId}]`, {
kind: SpanKind.INTERNAL,
startTime: synthNow,
attributes: {
"gen_ai.operation.name": "invoke_agent",
"gen_ai.provider.name": "openclaw",
"gen_ai.agent.name": "openclaw",
"gen_ai.agent.id": "grafana-lens",
"gen_ai.output.type": "text",
"gen_ai.conversation.id": event.sessionId,
"openclaw.session.synthetic": true,
...(opts?.agentVersion ? { "gen_ai.agent.version": opts.agentVersion } : {}),
},
});
const sessionCtx = trace.setSpan(ROOT_CONTEXT, rootSpan);
activeSessions.set(event.sessionId, {
rootSpan,
ctx: sessionCtx,
sessionId: event.sessionId,
startTime: synthNow,
totalCostUsd: 0,
totalInputTokens: 0,
totalOutputTokens: 0,
totalCacheReadTokens: 0,
totalCacheWriteTokens: 0,
messageCountUser: 0,
messageCountAssistant: 0,
messageCountToolCalls: 0,
messageCountToolResults: 0,
messageCountErrors: 0,
toolCounts: new Map(),
toolErrorCounts: new Map(),
totalToolDurationMs: 0,
latencies: [],
latencySum: 0,
latencyMin: Infinity,
latencyMax: 0,
latencyCount: 0,
primaryModel: "",
primaryProvider: "",
channel: "",
sessionKey: ctx.sessionKey ?? "",
agentId: "",
costThresholdsLogged: new Set(),
firstMessageCaptured: false,
finalized: false,
isSubagent: false,
parentLinked: false,
});
if (ctx.sessionKey)
sessionKeyToId.set(ctx.sessionKey, event.sessionId);
session = activeSessions.get(event.sessionId);
instruments.sessionsStartedTotal.add(1, { type: "synthetic" });
trackUniqueSession(event.sessionId);
emitLog(SeverityNumber.INFO, "INFO", `Session started ${event.sessionId} (synthetic)`, {
"event.domain": "openclaw",
"event.name": "session.start",
"openclaw.session_id": event.sessionId,
"openclaw.session_key": ctx.sessionKey ?? "",
"openclaw.synthetic": true,
}, rootSpan);
}
const parentCtx = session?.ctx ?? ROOT_CONTEXT;
const now = Date.now();
const llmSpan = tracer.startSpan(`chat ${event.model}`, {
kind: SpanKind.CLIENT,
startTime: now,
attributes: {
"gen_ai.operation.name": "chat",
"gen_ai.provider.name": event.provider,
"gen_ai.request.model": event.model,
"gen_ai.conversation.id": event.sessionId,
"openclaw.run_id": event.runId,
"openclaw.session_key": ctx.sessionKey ?? "",
"openclaw.history_length": event.historyMessages.length,
"openclaw.images_count": event.imagesCount,
},
}, parentCtx);
const llmCtx = trace.setSpan(ROOT_CONTEXT, llmSpan);
activeLlmCalls.set(event.runId, {
span: llmSpan,
ctx: llmCtx,
sessionKey: ctx.sessionKey ?? "",
startTime: now,
});
// Map sessionKey → sessionId for tool call parenting
if (ctx.sessionKey && event.sessionId) {
sessionKeyToId.set(ctx.sessionKey, event.sessionId);
}
// ── Deferred subagent linking ──────────────────────────────────
// First hook where ctx.sessionKey is available for a child agent.
// Match against pendingChildren to establish parent↔child correlation.
if (ctx.sessionKey && session && !session.parentLinked) {
const parentInfo = pendingChildren.get(ctx.sessionKey);
if (parentInfo) {
session.parentLinked = true;
session.parentSessionId = parentInfo.parentSessionId;
session.parentTraceId = parentInfo.parentTraceId;
session.isSubagent = true;
// Enrich child root span with parent info (retroactive)
session.rootSpan.setAttributes({
"gen_ai.conversation.parent_id": parentInfo.parentSessionId,
"openclaw.parent_session_id": parentInfo.parentSessionId,
"openclaw.parent_session_key": parentInfo.parentSessionKey,
"openclaw.parent_trace_id": parentInfo.parentTraceId,
"openclaw.is_subagent": true,
"openclaw.subagent.agent_id": parentInfo.agentId,
"openclaw.subagent.label": parentInfo.label,
"openclaw.subagent.mode": parentInfo.mode,
});
// Span link: child root → parent spawn span (cross-trace)
session.rootSpan.addLink?.({
context: {
traceId: parentInfo.parentTraceId,
spanId: parentInfo.parentSpanId,
traceFlags: TraceFlags.SAMPLED,
},
attributes: { "openclaw.link.type": "parent_agent" },
});
// Span link: parent spawn span → child root (bidirectional)
const spawnEntry = activeSubagentSpawns.get(ctx.sessionKey);
if (spawnEntry) {
spawnEntry.span.addLink?.({
context: session.rootSpan.spanContext(),
attributes: {
"openclaw.link.type": "child_agent",
"openclaw.child_session_id": session.sessionId,
},
});
spawnEntry.span.setAttribute("openclaw.subagent.child_trace_id", session.rootSpan.spanContext().traceId);
spawnEntry.span.setAttribute("openclaw.subagent.child_session_id", session.sessionId);
}
// Update relationship maps
const childSet = parentToChildren.get(parentInfo.parentSessionId) ?? new Set();
childSet.add(session.sessionId);
parentToChildren.set(parentInfo.parentSessionId, childSet);
childToParent.set(session.sessionId, parentInfo.parentSessionId);
pendingChildren.delete(ctx.sessionKey);
emitLog(SeverityNumber.INFO, "INFO", `Subagent linked: ${session.sessionId} → parent ${parentInfo.parentSessionId}`, {
"event.domain": "openclaw",
"event.name": "subagent.linked",
"openclaw.session_id": session.sessionId,
"openclaw.parent_session_id": parentInfo.parentSessionId,
"openclaw.parent_trace_id": parentInfo.parentTraceId,
"openclaw.subagent.agent_id": parentInfo.agentId,
}, session.rootSpan);
}
}
// Count as user message (each LLM input follows a user turn)
const inputSession = r