UNPKG

openclaw-grafana-lens

Version:

OpenClaw plugin that gives AI agents full Grafana access — 18 composable tools for PromQL/LogQL/TraceQL queries, dashboard creation, alerting, SRE investigation, security monitoring, data collection pipeline management via Grafana Alloy (29 recipes), and

940 lines 102 kB
/** * Lifecycle Telemetry — gen_ai-compliant session-scoped traces * * Converts OpenClaw plugin hook events into hierarchical OTel traces * following the gen_ai semantic conventions (v1.27.0+). * * Trace hierarchy (all spans share the same trace_id per session): * invoke_agent openclaw (root, INTERNAL) * +-- chat {model} (CLIENT) * +-- execute_tool {toolName} (INTERNAL, sibling of chat) * +-- chat {model} (CLIENT, next turn) * +-- openclaw.compaction (INTERNAL) * +-- openclaw.subagent.spawn {agentId} (INTERNAL) * +-- openclaw.agent.end (INTERNAL) * * Context propagation: Explicit — no global TracerProvider. * Uses trace.setSpan(ROOT_CONTEXT, parentSpan) + 3rd arg to startSpan(). * * Also records gen_ai standard metrics: * - gen_ai.client.token.usage (histogram) * - gen_ai.client.operation.duration (histogram, seconds) */ import { trace, ROOT_CONTEXT, SpanKind, SpanStatusCode, TraceFlags } from "@opentelemetry/api"; import { SeverityNumber } from "@opentelemetry/api-logs"; import { redactSecrets, flattenLogKeys } from "./redact.js"; // ══════════════════════════════════════════════════════════════════════ // Helpers: extract gen_ai fields from lastAssistant (AssistantMessage) // Defensive narrowing — lastAssistant comes from pi-ai via hook events // ══════════════════════════════════════════════════════════════════════ /** Extract the actual response model from lastAssistant (may differ from request model) */ export function extractResponseModel(lastAssistant) { if (lastAssistant && typeof lastAssistant === "object" && "model" in lastAssistant && typeof lastAssistant.model === "string") { return lastAssistant.model; } return undefined; } /** Map pi-ai stopReason to gen_ai finish_reason values */ const STOP_REASON_MAP = { stop: "stop", length: "max_tokens", toolUse: "tool_calls", error: "error", aborted: "stop", }; export function extractFinishReason(lastAssistant) { if (lastAssistant && typeof lastAssistant === "object" && "stopReason" in lastAssistant && typeof 
lastAssistant.stopReason === "string") { return STOP_REASON_MAP[lastAssistant.stopReason]; } return undefined; } // gen_ai tool descriptions — hardcoded for our 14 tools, empty for external tools const TOOL_DESCRIPTIONS = { grafana_create_dashboard: "Create dashboard from template or custom JSON spec", grafana_update_dashboard: "Add, remove, or update panels on an existing dashboard", grafana_query: "Run PromQL instant/range queries against Prometheus datasources", grafana_query_logs: "Run LogQL queries against Loki datasources", grafana_create_alert: "Create Grafana-native alert rules with PromQL conditions", grafana_share_dashboard: "Render panels as PNG and deliver to messaging channels", grafana_annotate: "Create or query annotations on dashboards", grafana_explore_datasources: "Discover datasources configured in Grafana", grafana_list_metrics: "Discover available metrics from Prometheus datasources", grafana_search: "Search existing dashboards by title or tag", grafana_get_dashboard: "Get compact dashboard summary with panels and queries", grafana_check_alerts: "Check, acknowledge, or set up alert webhook notifications", grafana_push_metrics: "Push custom data via OTLP for external data observatory", grafana_explain_metric: "Get metric context: current value, trend, stats, metadata", grafana_security_check: "Run comprehensive security health check with threat-level assessment", }; /** Truncate a string to maxLen characters, appending "..." if truncated */ function truncate(s, maxLen) { return s.length > maxLen ? s.slice(0, maxLen) + "..." 
: s; } /** Truncate for span names: uses "…" (single char) to save space in Tempo waterfall */ function truncateForSpanName(text, maxLen) { if (text.length <= maxLen) return text; return text.slice(0, maxLen - 1) + "\u2026"; } // Approximate pricing weights (relative cost per million tokens) // Used for cost attribution when exact per-type costs aren't available const TOKEN_COST_WEIGHTS = { input: 15, // $15/MTok output: 75, // $75/MTok cacheRead: 1.5, // $1.5/MTok cacheWrite: 18.75, // $18.75/MTok }; /** Compute p95 from an array of numeric values */ function computeP95(values) { if (values.length === 0) return 0; const sorted = [...values].sort((a, b) => a - b); const idx = Math.ceil(sorted.length * 0.95) - 1; return sorted[Math.max(0, idx)]; } // Safety: force-close sessions older than 24h to prevent memory leaks const SESSION_MAX_AGE_MS = 24 * 60 * 60 * 1000; // Max entries in per-session latency reservoir (for P95 computation) const LATENCY_RESERVOIR_SIZE = 200; // Max entries in global rolling latency window (for getAvgLatencyMs gauge) const LATENCY_WINDOW_SIZE = 100; // Cost thresholds for SRE severity intelligence (Part 6) const COST_THRESHOLDS = [1, 5, 10]; export function createLifecycleTelemetry(traces, logs, instruments, opts) { const { tracer } = traces; // Resolved config options with defaults const captureContent = opts?.captureContent !== false; const contentMaxLen = opts?.contentMaxLength ?? 2000; const shouldRedact = opts?.redactSecrets !== false; const costEstimator = opts?.costEstimator; /** Prepare content for capture: truncate + optionally redact */ function prepareContent(text) { let result = text; if (shouldRedact) result = redactSecrets(result); return truncate(result, contentMaxLen); } // ── Security: prompt injection detection patterns ──────────────────── // Copied from openclaw's external-content.ts (src/security/external-content.ts). // Local copy avoids runtime dependency on an internal module. 
// Detection-only — never blocks; only increments a counter for human review.
    const SUSPICIOUS_PATTERNS = [
        /ignore\s+(all\s+)?(previous|prior|above)\s+(instructions|prompts|context)/i,
        /disregard\s+(all\s+)?(previous|prior|above)/i,
        /forget\s+(everything|all|your)\s+(previous|prior|above)/i,
        /you\s+are\s+now\s+(a|an|my)\s+/i,
        /new\s+instructions?:\s*/i,
        /system\s*:\s*you\s+are/i,
        /\[SYSTEM\]/i,
        /\<\|?(im_start|system|endoftext)\|?\>/i,
        /```\s*(system|prompt|injection)/i,
        /act\s+as\s+(a|an|if)\s+/i,
        /pretend\s+(you\s+are|to\s+be)\s+/i,
        /override\s+(your|the|all)\s+(instructions|rules|guidelines)/i,
    ];
    /**
     * Bucket a tool error message into a coarse category for security monitoring.
     * Rules are tried top to bottom and the first match wins, so a message that
     * matches both the network and the timeout rule is reported as "network".
     */
    function classifyToolError(error) {
        const rules = [
            ["network", /ECONNREFUSED|ETIMEDOUT|ENOTFOUND|fetch.*fail/i],
            ["filesystem", /ENOENT|EACCES|path|directory|traversal/i],
            ["timeout", /timeout|timed?\s*out/i],
        ];
        const hit = rules.find(([, pattern]) => pattern.test(error));
        return hit ? hit[0] : "other";
    }
    // ── Security: unique sessions sliding window (1h) ──────────────────
    const uniqueSessionMap = new Map(); // sessionId → latest timestamp
    const UNIQUE_SESSION_WINDOW_MS = 60 * 60 * 1000; // 1 hour
    /** Remove every session whose latest timestamp is older than the window. */
    function evictStaleSessions() {
        const oldestAllowed = Date.now() - UNIQUE_SESSION_WINDOW_MS;
        uniqueSessionMap.forEach((lastSeen, trackedId) => {
            if (lastSeen < oldestAllowed) {
                uniqueSessionMap.delete(trackedId);
            }
        });
    }
    /** Stamp the session with the current time, then prune expired entries. */
    function trackUniqueSession(sessionId) {
        uniqueSessionMap.set(sessionId, Date.now());
        evictStaleSessions();
    }
    /** Number of sessions currently held in the sliding-window map. */
    function getUniqueSessionCount() {
        return uniqueSessionMap.size;
    }
    // State maps for span correlation
    const activeSessions = new Map();
    const activeLlmCalls = new Map();
    const sessionKeyToId = new Map();
    const activeCompactions = new Map();
    const activeToolCalls = new Map();
    // Dual-path trace fallback: model.usage → synthetic chat spans when hooks broken
    // Grace-period trigger: a single cold-start model.usage event that arrives before the
    // first llm_input hook is expected (event ordering, plugin init timing).
// Only warn after FALLBACK_WARN_THRESHOLD orphaned events — past that, hooks are really broken.
    let llmHooksActive = false;
    let fallbackModeLogged = false;
    let fallbackOrphanedCount = 0;
    const FALLBACK_WARN_THRESHOLD = 2;
    let unsubscribeDiagnostic = null;
    const pendingChildren = new Map(); // childSessionKey → parent info
    const activeSubagentSpawns = new Map(); // childSessionKey → spawn span + metadata
    const parentToChildren = new Map(); // parentSessionId → Set<childSessionId>
    const childToParent = new Map(); // childSessionId → parentSessionId
    // Sliding window for global rolling average latency (getAvgLatencyMs gauge)
    const latencyWindow = [];
    // Safety cleanup interval: runs every 60s and prunes, in order,
    //   1. non-finalized sessions older than SESSION_MAX_AGE_MS (force-closed with ERROR),
    //   2. finalized sessions (after emitting any still-deferred final summary),
    //   3. pending subagent spawns older than SESSION_MAX_AGE_MS,
    //   4. tool-call spans open for 5 minutes or more.
    const cleanupInterval = setInterval(() => {
        const now = Date.now();
        // Remove every lookup entry that refers to a session: the session itself,
        // its sessionKey aliases, and its parent/child links. Shared by both
        // session loops so the timed-out path cannot leak parent/child entries.
        const forgetSession = (id) => {
            activeSessions.delete(id);
            // Clean up reverse map
            for (const [key, sid] of sessionKeyToId) {
                if (sid === id)
                    sessionKeyToId.delete(key);
            }
            // Clean parent-child maps (prevents unbounded growth)
            const parentId = childToParent.get(id);
            if (parentId) {
                parentToChildren.get(parentId)?.delete(id);
                if (parentToChildren.get(parentId)?.size === 0)
                    parentToChildren.delete(parentId);
                childToParent.delete(id);
            }
            const children = parentToChildren.get(id);
            if (children) {
                for (const childId of children)
                    childToParent.delete(childId);
                parentToChildren.delete(id);
            }
        };
        // Force-close sessions that exceeded the 24h safety limit
        for (const [id, session] of activeSessions) {
            // Finalized sessions already ended their root span; leave them to the
            // next loop so a deferred final summary is still emitted before removal.
            if (session.finalized)
                continue;
            if (now - session.startTime > SESSION_MAX_AGE_MS) {
                session.rootSpan.setStatus({ code: SpanStatusCode.ERROR, message: "session timed out (24h safety limit)" });
                session.rootSpan.end(now);
                forgetSession(id);
            }
        }
        // Clean finalized sessions (deferred from finalizeSession to allow late-arriving hooks)
        for (const [id, session] of activeSessions) {
            if (!session.finalized)
                continue;
            // Safety: emit deferred summary before deleting (if LLM call never resolved)
            if (session.pendingSummary && !session.summaryEmitted) {
                const durationMs = session.finalDurationMs ?? (now - session.startTime);
                emitSessionSummary(session, durationMs, "final", session.rootSpan);
                session.summaryEmitted = true;
            }
            forgetSession(id);
        }
        // Clean stale pending children (subagent spawned but never linked)
        for (const [key, info] of pendingChildren) {
            if (now - info.spawnTime > SESSION_MAX_AGE_MS) {
                pendingChildren.delete(key);
                // End orphaned spawn span if still open
                const orphanEntry = activeSubagentSpawns.get(key);
                if (orphanEntry) {
                    orphanEntry.span.setStatus({ code: SpanStatusCode.ERROR, message: "subagent spawn timed out (24h safety limit)" });
                    orphanEntry.span.end(now);
                    activeSubagentSpawns.delete(key);
                }
            }
        }
        // Clean stale tool call spans (before_tool_call without after_tool_call for >5min)
        for (const [key, stack] of activeToolCalls) {
            const filtered = stack.filter(entry => now - entry.startTime < 5 * 60_000);
            // End orphaned spans before removing
            for (const entry of stack) {
                if (now - entry.startTime >= 5 * 60_000) {
                    entry.span.setAttribute("openclaw.stale_cleanup", true);
                    entry.span.setStatus({ code: SpanStatusCode.ERROR, message: "tool call timed out (5min)" });
                    entry.span.end(now);
                }
            }
            if (filtered.length > 0)
                activeToolCalls.set(key, filtered);
            else
                activeToolCalls.delete(key);
        }
    }, 60_000);
    // Where the timer handle exposes unref(), call it
    if (cleanupInterval.unref)
        cleanupInterval.unref();
    // ── Dual-path trace fallback: model.usage → synthetic chat spans ──
    // When openclaw's llm_input/llm_output hooks are broken (e.g., v2026.3.31-4.1),
    // model.usage diagnostic events still fire
// via a separate pipeline.
    // Subscribe as an always-on second data source that activates only when hooks are silent.
    //
    // For each "model.usage" event received while llmHooksActive is false, the handler:
    //   1. counts orphaned events and emits one WARN log once FALLBACK_WARN_THRESHOLD is reached,
    //   2. creates and immediately ends a synthetic "chat" span, backdated by evt.durationMs,
    //   3. records token-usage and operation-duration metrics,
    //   4. folds tokens, cost and latency into the resolved session (if any, and not finalized),
    //   5. increments the traceFallbackSpans counter.
    // The value returned by opts.onDiagnosticEvent is kept in unsubscribeDiagnostic.
    if (opts?.onDiagnosticEvent) {
        unsubscribeDiagnostic = opts.onDiagnosticEvent((evt) => {
            if (evt.type !== "model.usage")
                return;
            // When hooks work, they handle everything — model.usage is dormant
            if (llmHooksActive)
                return;
            // Grace period: tolerate cold-start race where model.usage arrives before the
            // first llm_input. Only warn after repeated orphaned events past the threshold.
            // Gate the counter so it stops incrementing once the warn has fired — keeps
            // the value bounded and stable (matches the ${count} interpolated in the log body).
            if (!fallbackModeLogged) {
                fallbackOrphanedCount += 1;
                if (fallbackOrphanedCount >= FALLBACK_WARN_THRESHOLD) {
                    fallbackModeLogged = true;
                    emitLog(SeverityNumber.WARN, "WARN", `LLM hook dispatch appears broken — activating model.usage fallback for trace generation after ${fallbackOrphanedCount} orphaned events. ` +
                        "Trace fidelity will be reduced. " +
                        "On openclaw >= 2026.4.24 this most often means the privacy gate is active: " +
                        "set plugins.entries.openclaw-grafana-lens.hooks.allowConversationAccess=true in ~/.openclaw/openclaw.json " +
                        "and restart the gateway to enable llm_input/llm_output/agent_end hooks. " +
                        "If the warning persists after that, file a bug at github.com/awsome-o/grafana-lens/issues with your openclaw version.", {
                        "event.domain": "openclaw",
                        "event.name": "trace.fallback_activated",
                        "openclaw.trace_source": "fallback_model_usage",
                        "openclaw.fallback.orphaned_count": fallbackOrphanedCount,
                    });
                }
            }
            const now = Date.now();
            // Missing fields fall back to "unknown" / 0 so span and metric attributes are always set
            const model = evt.model ?? "unknown";
            const provider = evt.provider ?? "unknown";
            const durationMs = evt.durationMs ?? 0;
            // With no positive duration the span starts and ends at the same instant
            const startTime = durationMs > 0 ? now - durationMs : now;
            // Resolve parent session for span hierarchy
            const session = resolveSessionCtx(evt.sessionId, evt.sessionKey);
            // No matching session → the synthetic span is parented to ROOT_CONTEXT
            const parentCtx = session?.ctx ?? ROOT_CONTEXT;
            // Create synthetic chat span (backdated by durationMs for accurate waterfall)
            const inTok = evt.usage?.input ?? 0;
            const outTok = evt.usage?.output ?? 0;
            const span = tracer.startSpan(`chat ${model} (${inTok}\u2192${outTok} tok)`, {
                kind: SpanKind.CLIENT,
                startTime,
                attributes: {
                    "gen_ai.operation.name": "chat",
                    "gen_ai.provider.name": provider,
                    "gen_ai.request.model": model,
                    "gen_ai.usage.input_tokens": inTok,
                    "gen_ai.usage.output_tokens": outTok,
                    "gen_ai.usage.cache_read.input_tokens": evt.usage?.cacheRead ?? 0,
                    "gen_ai.usage.cache_creation.input_tokens": evt.usage?.cacheWrite ?? 0,
                    "openclaw.trace_fallback": true,
                    "openclaw.trace_source": "fallback_model_usage",
                    ...(evt.sessionKey ? { "openclaw.session_key": evt.sessionKey } : {}),
                    ...(evt.sessionId ? { "openclaw.session_id": evt.sessionId } : {}),
                },
            }, parentCtx);
            // The span is always closed with OK status; this handler never sets an error status on it
            span.setStatus({ code: SpanStatusCode.OK });
            span.end(now);
            // Record gen_ai standard metrics (same pattern as onLlmOutput)
            const metricAttrs = {
                "gen_ai.operation.name": "chat",
                "gen_ai.provider.name": provider,
                "gen_ai.request.model": model,
            };
            // Zero / missing token counts are skipped rather than recorded as 0
            if (inTok > 0)
                instruments.tokenUsage.record(inTok, { ...metricAttrs, "gen_ai.token.type": "input" });
            if (outTok > 0)
                instruments.tokenUsage.record(outTok, { ...metricAttrs, "gen_ai.token.type": "output" });
            if (evt.usage?.cacheRead)
                instruments.tokenUsage.record(evt.usage.cacheRead, { ...metricAttrs, "gen_ai.token.type": "cache_read_input" });
            if (evt.usage?.cacheWrite)
                instruments.tokenUsage.record(evt.usage.cacheWrite, { ...metricAttrs, "gen_ai.token.type": "cache_creation_input" });
            if (durationMs > 0) {
                // Histogram is recorded in seconds; durationMs is milliseconds
                instruments.operationDuration.record(durationMs / 1000, metricAttrs);
            }
            // Accumulate session data (tokens, cost, latency)
            if (session && !session.finalized) {
                session.totalInputTokens += inTok;
                session.totalOutputTokens += outTok;
                session.totalCacheReadTokens += evt.usage?.cacheRead ?? 0;
                session.totalCacheWriteTokens += evt.usage?.cacheWrite ?? 0;
                session.messageCountAssistant++;
                // Only overwrite the session's model/provider with real values
                if (model !== "unknown")
                    session.primaryModel = model;
                if (provider !== "unknown")
                    session.primaryProvider = provider;
                // Cost: prefer evt.costUsd (most authoritative), else costEstimator
                let costUsd;
                if (evt.costUsd != null && evt.costUsd > 0) {
                    costUsd = evt.costUsd;
                }
                else if (costEstimator) {
                    // NOTE(review): evt.usage may be undefined here — confirm costEstimator tolerates that
                    costUsd = costEstimator(provider, model, evt.usage);
                }
                if (costUsd && costUsd > 0) {
                    const prevCost = session.totalCostUsd;
                    session.totalCostUsd += costUsd;
                    // SRE cost threshold alerts (same logic as onLlmOutput)
                    for (const threshold of COST_THRESHOLDS) {
                        // Fire once per threshold: only on the update that crosses it, and only if not already logged
                        if (prevCost < threshold && session.totalCostUsd >= threshold && !session.costThresholdsLogged.has(threshold)) {
                            session.costThresholdsLogged.add(threshold);
                            // Severity scales with the threshold: >=10 ERROR, >=5 WARN, otherwise INFO
                            const sev = threshold >= 10 ? SeverityNumber.ERROR : threshold >= 5 ? SeverityNumber.WARN : SeverityNumber.INFO;
                            const sevText = threshold >= 10 ? "ERROR" : threshold >= 5 ? "WARN" : "INFO";
                            const suffix = threshold >= 10 ? " \u2014 investigate" : "";
                            emitLog(sev, sevText, `Session cost crossed $${threshold.toFixed(2)}${suffix}`, {
                                "event.domain": "openclaw",
                                "event.name": "cost.threshold",
                                "openclaw.session_id": session.sessionId,
                                "openclaw.session_key": session.sessionKey,
                                "openclaw.cost_usd": session.totalCostUsd,
                                "openclaw.threshold_usd": threshold,
                                "openclaw.trace_source": "fallback_model_usage",
                            }, session.rootSpan);
                        }
                    }
                }
                // Latency tracking
                if (durationMs > 0) {
                    // Per-session reservoir is capped at LATENCY_RESERVOIR_SIZE; oldest sample is dropped first
                    session.latencies.push(durationMs);
                    if (session.latencies.length > LATENCY_RESERVOIR_SIZE)
                        session.latencies.shift();
                    session.latencySum += durationMs;
                    session.latencyCount++;
                    session.latencyMin = Math.min(session.latencyMin, durationMs);
                    session.latencyMax = Math.max(session.latencyMax, durationMs);
                    // Global rolling window is capped at LATENCY_WINDOW_SIZE
                    latencyWindow.push(durationMs);
                    if (latencyWindow.length > LATENCY_WINDOW_SIZE)
                        latencyWindow.shift();
                }
            }
            // Increment fallback counter metric
            instruments.traceFallbackSpans.add(1, { model, provider });
        });
    }
    // ── Helper: build child summary attributes for session
summary logs ── function childSummaryAttrs(sessionId) { const childIds = parentToChildren.get(sessionId); if (!childIds?.size) return {}; return { "openclaw.child_session_ids": [...childIds].join(","), "openclaw.child_count": childIds.size, "openclaw.has_children": true, }; } // ── Helper: emit correlated log record ────────────────────────────── function emitLog(severity, severityText, body, attributes, span) { // Component label for Loki filtering: {service_name="openclaw"} | component="lifecycle" attributes["component"] = "lifecycle"; if (span) { // Keep string attrs for LogQL filtering: `| trace_id = "abc"` attributes["trace_id"] = span.spanContext().traceId; attributes["span_id"] = span.spanContext().spanId; } // Flatten dotted keys → underscores for Loki structured metadata compatibility const flat = flattenLogKeys(attributes); logs.logger.emit({ severityNumber: severity, severityText, body: shouldRedact ? redactSecrets(body) : body, attributes: flat, // Also pass OTel Context → SDK populates proto-level LogRecord.TraceId/SpanId // This is the canonical OTLP way; Loki stores it in structured metadata ...(span ? { context: trace.setSpan(ROOT_CONTEXT, span) } : {}), }); } // ── Helper: resolve session context by sessionId or sessionKey ────── function resolveSessionCtx(sessionId, sessionKey) { if (sessionId) { const direct = activeSessions.get(sessionId); if (direct) return direct; } if (sessionKey) { const id = sessionKeyToId.get(sessionKey); if (id) return activeSessions.get(id); // Fallback: sessionKey might be a sessionId itself const direct = activeSessions.get(sessionKey); if (direct) return direct; } return undefined; } // ── Helper: resolve parent context for a span ─────────────────────── function resolveParentCtx(sessionId, sessionKey) { const session = resolveSessionCtx(sessionId, sessionKey); return session?.ctx ?? 
ROOT_CONTEXT; } // ── Helper: resolve session IDs for consistent log attributes ────── function resolveSessionIds(sessionId, sessionKey) { const session = resolveSessionCtx(sessionId, sessionKey); return { "openclaw.session_id": session?.sessionId ?? sessionId ?? "", "openclaw.session_key": session?.sessionKey ?? sessionKey ?? "", }; } // ── Helper: resolve child session ID from a session key ────────────── function resolveChildSessionId(childSessionKey) { const childSessionId = sessionKeyToId.get(childSessionKey); if (childSessionId) return childSessionId; // Check if the sessionKey is itself a sessionId if (activeSessions.has(childSessionKey)) return childSessionKey; return undefined; } // ── Fallback: pick the best active session when sessionKey is missing ── // This handles the case where openclaw's hook system does NOT pass // sessionKey/agentId in the tool call hook context for plugin-registered // tools (confirmed in pi-tool-definition-adapter.ts). // If 1 active session → return it (CLI / single-user case). // If multiple → pick the one with the most recent LLM activity. function resolveAnyActiveSession() { if (activeSessions.size === 0) return undefined; if (activeSessions.size === 1) return activeSessions.values().next().value; // Multiple sessions: pick the one most likely to be executing tools right now. // Heuristic: session with the most accumulated LLM latency (= most active). let best; let bestLatencySum = -1; for (const session of activeSessions.values()) { if (session.latencySum > bestLatencySum) { bestLatencySum = session.latencySum; best = session; } } return best; } function computeSessionStats(session) { const latencyCount = session.latencyCount; const avgLatencyMs = latencyCount > 0 ? session.latencySum / latencyCount : 0; const p95LatencyMs = computeP95(session.latencies); const minLatencyMs = latencyCount > 0 ? session.latencyMin : 0; const maxLatencyMs = latencyCount > 0 ? 
session.latencyMax : 0; const totalToolCalls = [...session.toolCounts.values()].reduce((a, b) => a + b, 0); const uniqueToolCount = session.toolCounts.size; const topTools = [...session.toolCounts.entries()] .sort((a, b) => b[1] - a[1]).slice(0, 5).map(([name]) => name).join(","); const totalTokens = session.totalInputTokens + session.totalOutputTokens + session.totalCacheReadTokens + session.totalCacheWriteTokens; const totalMessages = session.messageCountUser + session.messageCountAssistant + session.messageCountToolCalls + session.messageCountToolResults + session.messageCountErrors; const activeDurationMs = session.latencySum + session.totalToolDurationMs; const W = TOKEN_COST_WEIGHTS; const weightedSum = (session.totalInputTokens * W.input) + (session.totalOutputTokens * W.output) + (session.totalCacheReadTokens * W.cacheRead) + (session.totalCacheWriteTokens * W.cacheWrite); const costInput = weightedSum > 0 ? (session.totalInputTokens * W.input / weightedSum) * session.totalCostUsd : 0; const costOutput = weightedSum > 0 ? (session.totalOutputTokens * W.output / weightedSum) * session.totalCostUsd : 0; const costCacheRead = weightedSum > 0 ? (session.totalCacheReadTokens * W.cacheRead / weightedSum) * session.totalCostUsd : 0; const costCacheWrite = weightedSum > 0 ? (session.totalCacheWriteTokens * W.cacheWrite / weightedSum) * session.totalCostUsd : 0; const cacheInputTotal = session.totalInputTokens + session.totalCacheReadTokens; const cacheHitRatio = cacheInputTotal > 0 ? 
session.totalCacheReadTokens / cacheInputTotal : 0; const cacheSavingsUsd = session.totalCacheReadTokens * (W.input - W.cacheRead) / 1_000_000; return { latencyCount, avgLatencyMs, p95LatencyMs, minLatencyMs, maxLatencyMs, totalToolCalls, uniqueToolCount, topTools, totalTokens, totalMessages, activeDurationMs, costInput, costOutput, costCacheRead, costCacheWrite, cacheHitRatio, cacheSavingsUsd, }; } // ── Helper: emit session usage summary log (reusable for interim + final) ── function emitSessionSummary(session, durationMs, summaryType, span) { const s = computeSessionStats(session); const costStr = session.totalCostUsd > 0 ? `$${session.totalCostUsd.toFixed(2)}` : "$0"; const typeTag = summaryType === "interim" ? " (interim)" : ""; const summaryBody = `Session ${session.sessionId}${typeTag} | ${durationMs}ms (active: ${Math.round(s.activeDurationMs)}ms) | ${s.totalMessages} msgs (${session.messageCountUser} user, ${session.messageCountAssistant} assistant, ${session.messageCountToolCalls} tool) | ${costStr} | ${s.totalTokens.toLocaleString()}tok`; emitLog(SeverityNumber.INFO, "INFO", summaryBody, { "event.domain": "openclaw", "event.name": "usage.session_summary", "openclaw.summary.type": summaryType, // Session identity "openclaw.session_id": session.sessionId, "openclaw.session_key": session.sessionKey, "openclaw.agent_id": session.agentId, // Duration "openclaw.duration_ms": durationMs, "openclaw.active_duration_ms": Math.round(s.activeDurationMs), // Message type breakdown "openclaw.messages.total": s.totalMessages, "openclaw.messages.user": session.messageCountUser, "openclaw.messages.assistant": session.messageCountAssistant, "openclaw.messages.tool_calls": session.messageCountToolCalls, "openclaw.messages.tool_results": session.messageCountToolResults, "openclaw.messages.errors": session.messageCountErrors, // Tool usage "openclaw.tools.total_calls": s.totalToolCalls, "openclaw.tools.unique_count": s.uniqueToolCount, "openclaw.tools.top": s.topTools, // 
Latency distribution "openclaw.latency.count": s.latencyCount, "openclaw.latency.avg_ms": Math.round(s.avgLatencyMs), "openclaw.latency.p95_ms": Math.round(s.p95LatencyMs), "openclaw.latency.min_ms": Math.round(s.minLatencyMs), "openclaw.latency.max_ms": Math.round(s.maxLatencyMs), // Token totals "openclaw.tokens.input": session.totalInputTokens, "openclaw.tokens.output": session.totalOutputTokens, "openclaw.tokens.cache_read": session.totalCacheReadTokens, "openclaw.tokens.cache_write": session.totalCacheWriteTokens, "openclaw.tokens.total": s.totalTokens, // Cost "openclaw.cost.total": Number(session.totalCostUsd.toFixed(4)), "openclaw.cost.input": Number(s.costInput.toFixed(4)), "openclaw.cost.output": Number(s.costOutput.toFixed(4)), "openclaw.cost.cache_read": Number(s.costCacheRead.toFixed(4)), "openclaw.cost.cache_write": Number(s.costCacheWrite.toFixed(4)), // Cache efficiency "openclaw.cache.hit_ratio": Number(s.cacheHitRatio.toFixed(4)), "openclaw.cache.savings_usd": Number(s.cacheSavingsUsd.toFixed(4)), // Model + channel "gen_ai.provider.name": session.primaryProvider, "gen_ai.request.model": session.primaryModel, ...(session.channel ? { "openclaw.channel": session.channel } : {}), // Subagent hierarchy attributes ...(session.isSubagent && session.parentSessionId ? { "openclaw.is_subagent": true, "openclaw.parent_session_id": session.parentSessionId, } : {}), ...childSummaryAttrs(session.sessionId), }, span ?? session.rootSpan); } // ── Helper: finalize a session (close root span, emit FINAL, cleanup) ── // Idempotent: checks session.finalized flag. Called by onAgentEnd (primary) // and onSessionEnd (fallback). function finalizeSession(session, durationMs, errorMsg, deferSummary = false) { if (session.finalized) return; session.finalized = true; const now = Date.now(); const s = computeSessionStats(session); // ── Enrich session root span with summary ──────────────────── { const costTag = session.totalCostUsd > 0 ? 
` $${session.totalCostUsd.toFixed(2)}` : ""; const toolTag = s.totalToolCalls > 0 ? ` ${s.totalToolCalls} tools` : ""; const modelTag = session.primaryModel ? ` [${session.primaryModel}]` : ""; session.rootSpan.updateName(`invoke_agent openclaw [${session.sessionId}]${modelTag} ${s.totalMessages} msgs${toolTag}${costTag}`); } // ── Enrich session root span with 20+ attributes ──────────── session.rootSpan.setAttributes({ "openclaw.session.duration_ms": durationMs, "openclaw.session.active_duration_ms": Math.round(s.activeDurationMs), "openclaw.session.cost_usd": session.totalCostUsd, "openclaw.session.total_input_tokens": session.totalInputTokens, "openclaw.session.total_output_tokens": session.totalOutputTokens, "openclaw.session.total_cache_read_tokens": session.totalCacheReadTokens, "openclaw.session.total_cache_write_tokens": session.totalCacheWriteTokens, "openclaw.session.messages.user": session.messageCountUser, "openclaw.session.messages.assistant": session.messageCountAssistant, "openclaw.session.messages.tool_calls": session.messageCountToolCalls, "openclaw.session.messages.tool_results": session.messageCountToolResults, "openclaw.session.messages.errors": session.messageCountErrors, "openclaw.session.latency.avg_ms": Math.round(s.avgLatencyMs), "openclaw.session.latency.p95_ms": Math.round(s.p95LatencyMs), "openclaw.session.latency.min_ms": Math.round(s.minLatencyMs), "openclaw.session.latency.max_ms": Math.round(s.maxLatencyMs), "openclaw.session.tools.unique_count": s.uniqueToolCount, "openclaw.session.tools.total_calls": s.totalToolCalls, "openclaw.session.tools.top": s.topTools, "openclaw.session.cost.input": Number(s.costInput.toFixed(4)), "openclaw.session.cost.output": Number(s.costOutput.toFixed(4)), "openclaw.session.cost.cache_read": Number(s.costCacheRead.toFixed(4)), "openclaw.session.cost.cache_write": Number(s.costCacheWrite.toFixed(4)), "openclaw.session.cache_hit_ratio": Number(s.cacheHitRatio.toFixed(4)), 
"openclaw.session.cache_savings_usd": Number(s.cacheSavingsUsd.toFixed(4)), "gen_ai.agent.name": "openclaw", "gen_ai.agent.id": session.agentId, "gen_ai.conversation.id": session.sessionId, "gen_ai.provider.name": session.primaryProvider, "gen_ai.request.model": session.primaryModel, }); if (errorMsg) { session.rootSpan.setStatus({ code: SpanStatusCode.ERROR, message: errorMsg }); } else { session.rootSpan.setStatus({ code: SpanStatusCode.OK }); } session.rootSpan.end(now); // Record session duration + completion outcome metrics instruments.sessionDurationMs.record(durationMs); instruments.sessionsCompleted.add(1, { outcome: errorMsg ? "error" : "success" }); // Emit or defer final session usage summary if (deferSummary) { // LLM calls still in-flight — defer summary until last one resolves session.pendingSummary = true; session.finalDurationMs = durationMs; } else { // No pending LLM calls — emit immediately emitSessionSummary(session, durationMs, "final", session.rootSpan); session.summaryEmitted = true; } // NOTE: Do NOT delete from activeSessions/sessionKeyToId here. // Late-arriving hooks (e.g. llm_output after agent_end) still need to // resolve the session to accumulate tokens. The 60s cleanup timer // handles map cleanup for finalized sessions. } return { // ── session_start → root span ─────────────────────────────────── onSessionStart(event, ctx) { // Guard: if session already exists, skip (prevents orphaned root spans in Tempo) if (activeSessions.has(event.sessionId)) return; const now = Date.now(); const rootSpan = tracer.startSpan(`invoke_agent openclaw [${event.sessionId}]`, { kind: SpanKind.INTERNAL, startTime: now, attributes: { "gen_ai.operation.name": "invoke_agent", "gen_ai.provider.name": "openclaw", "gen_ai.agent.name": "openclaw", "gen_ai.agent.id": "grafana-lens", "gen_ai.output.type": "text", "gen_ai.conversation.id": event.sessionId, ...(opts?.agentVersion ? { "gen_ai.agent.version": opts.agentVersion } : {}), ...(event.resumedFrom ? 
{ "openclaw.session.resumed_from": event.resumedFrom } : {}), }, }); const sessionCtx = trace.setSpan(ROOT_CONTEXT, rootSpan); activeSessions.set(event.sessionId, { rootSpan, ctx: sessionCtx, sessionId: event.sessionId, startTime: now, totalCostUsd: 0, totalInputTokens: 0, totalOutputTokens: 0, totalCacheReadTokens: 0, totalCacheWriteTokens: 0, messageCountUser: 0, messageCountAssistant: 0, messageCountToolCalls: 0, messageCountToolResults: 0, messageCountErrors: 0, toolCounts: new Map(), toolErrorCounts: new Map(), totalToolDurationMs: 0, latencies: [], latencySum: 0, latencyMin: Infinity, latencyMax: 0, latencyCount: 0, primaryModel: "", primaryProvider: "", channel: "", sessionKey: "", agentId: ctx.agentId ?? "", costThresholdsLogged: new Set(), firstMessageCaptured: false, finalized: false, isSubagent: false, parentLinked: false, }); // Map agentId as a sessionKey alias if available if (ctx.agentId) { sessionKeyToId.set(ctx.agentId, event.sessionId); } instruments.sessionsStartedTotal.add(1, { type: event.resumedFrom ? "resumed" : "new", }); // Security: track unique sessions for enumeration detection trackUniqueSession(event.sessionId); const resumedTag = event.resumedFrom ? " (resumed)" : ""; const agentTag = ctx.agentId ? ` [agent:${ctx.agentId}]` : ""; emitLog(SeverityNumber.INFO, "INFO", `Session started ${event.sessionId}${resumedTag}${agentTag}`, { "event.domain": "openclaw", "event.name": "session.start", "openclaw.session_id": event.sessionId, ...(event.resumedFrom ? { "openclaw.resumed_from": event.resumedFrom } : {}), }, rootSpan); }, // ── session_end → close root span ───────────────────────────────── onSessionEnd(event) { const session = activeSessions.get(event.sessionId); if (!session) return; // graceful: end without start if (session.finalized) { // Safety: emit deferred summary if LLM call never resolved if (session.pendingSummary && !session.summaryEmitted) { const durationMs = session.finalDurationMs ?? 
(Date.now() - session.startTime); emitSessionSummary(session, durationMs, "final", session.rootSpan); session.summaryEmitted = true; } return; } const durationMs = event.durationMs ?? (Date.now() - session.startTime); finalizeSession(session, durationMs); }, // ── llm_input → start LLM call span ────────────────────────────── onLlmInput(event, ctx) { // Dual-path: latch hooks as active — disables model.usage fallback. // Reset orphan counter and logged flag so a future hook outage is detected fresh. if (!llmHooksActive) { llmHooksActive = true; const wasFallbackActive = fallbackModeLogged; fallbackOrphanedCount = 0; fallbackModeLogged = false; if (wasFallbackActive) { emitLog(SeverityNumber.INFO, "INFO", "LLM hooks restored — deactivating model.usage fallback", { "event.domain": "openclaw", "event.name": "trace.fallback_deactivated", }); } } // Lazy session creation: if session_start was missed (fires before service init), // create a synthetic root span so all subsequent spans have correct parenting. let session = resolveSessionCtx(event.sessionId, ctx.sessionKey); if (!session && event.sessionId) { const synthNow = Date.now(); const rootSpan = tracer.startSpan(`invoke_agent openclaw [${event.sessionId}]`, { kind: SpanKind.INTERNAL, startTime: synthNow, attributes: { "gen_ai.operation.name": "invoke_agent", "gen_ai.provider.name": "openclaw", "gen_ai.agent.name": "openclaw", "gen_ai.agent.id": "grafana-lens", "gen_ai.output.type": "text", "gen_ai.conversation.id": event.sessionId, "openclaw.session.synthetic": true, ...(opts?.agentVersion ? 
{ "gen_ai.agent.version": opts.agentVersion } : {}), }, }); const sessionCtx = trace.setSpan(ROOT_CONTEXT, rootSpan); activeSessions.set(event.sessionId, { rootSpan, ctx: sessionCtx, sessionId: event.sessionId, startTime: synthNow, totalCostUsd: 0, totalInputTokens: 0, totalOutputTokens: 0, totalCacheReadTokens: 0, totalCacheWriteTokens: 0, messageCountUser: 0, messageCountAssistant: 0, messageCountToolCalls: 0, messageCountToolResults: 0, messageCountErrors: 0, toolCounts: new Map(), toolErrorCounts: new Map(), totalToolDurationMs: 0, latencies: [], latencySum: 0, latencyMin: Infinity, latencyMax: 0, latencyCount: 0, primaryModel: "", primaryProvider: "", channel: "", sessionKey: ctx.sessionKey ?? "", agentId: "", costThresholdsLogged: new Set(), firstMessageCaptured: false, finalized: false, isSubagent: false, parentLinked: false, }); if (ctx.sessionKey) sessionKeyToId.set(ctx.sessionKey, event.sessionId); session = activeSessions.get(event.sessionId); instruments.sessionsStartedTotal.add(1, { type: "synthetic" }); trackUniqueSession(event.sessionId); emitLog(SeverityNumber.INFO, "INFO", `Session started ${event.sessionId} (synthetic)`, { "event.domain": "openclaw", "event.name": "session.start", "openclaw.session_id": event.sessionId, "openclaw.session_key": ctx.sessionKey ?? "", "openclaw.synthetic": true, }, rootSpan); } const parentCtx = session?.ctx ?? ROOT_CONTEXT; const now = Date.now(); const llmSpan = tracer.startSpan(`chat ${event.model}`, { kind: SpanKind.CLIENT, startTime: now, attributes: { "gen_ai.operation.name": "chat", "gen_ai.provider.name": event.provider, "gen_ai.request.model": event.model, "gen_ai.conversation.id": event.sessionId, "openclaw.run_id": event.runId, "openclaw.session_key": ctx.sessionKey ?? 
"", "openclaw.history_length": event.historyMessages.length, "openclaw.images_count": event.imagesCount, }, }, parentCtx); const llmCtx = trace.setSpan(ROOT_CONTEXT, llmSpan); activeLlmCalls.set(event.runId, { span: llmSpan, ctx: llmCtx, sessionKey: ctx.sessionKey ?? "", startTime: now, }); // Map sessionKey → sessionId for tool call parenting if (ctx.sessionKey && event.sessionId) { sessionKeyToId.set(ctx.sessionKey, event.sessionId); } // ── Deferred subagent linking ────────────────────────────────── // First hook where ctx.sessionKey is available for a child agent. // Match against pendingChildren to establish parent↔child correlation. if (ctx.sessionKey && session && !session.parentLinked) { const parentInfo = pendingChildren.get(ctx.sessionKey); if (parentInfo) { session.parentLinked = true; session.parentSessionId = parentInfo.parentSessionId; session.parentTraceId = parentInfo.parentTraceId; session.isSubagent = true; // Enrich child root span with parent info (retroactive) session.rootSpan.setAttributes({ "gen_ai.conversation.parent_id": parentInfo.parentSessionId, "openclaw.parent_session_id": parentInfo.parentSessionId, "openclaw.parent_session_key": parentInfo.parentSessionKey, "openclaw.parent_trace_id": parentInfo.parentTraceId, "openclaw.is_subagent": true, "openclaw.subagent.agent_id": parentInfo.agentId, "openclaw.subagent.label": parentInfo.label, "openclaw.subagent.mode": parentInfo.mode, }); // Span link: child root → parent spawn span (cross-trace) session.rootSpan.addLink?.({ context: { traceId: parentInfo.parentTraceId, spanId: parentInfo.parentSpanId, traceFlags: TraceFlags.SAMPLED, }, attributes: { "openclaw.link.type": "parent_agent" }, }); // Span link: parent spawn span → child root (bidirectional) const spawnEntry = activeSubagentSpawns.get(ctx.sessionKey); if (spawnEntry) { spawnEntry.span.addLink?.({ context: session.rootSpan.spanContext(), attributes: { "openclaw.link.type": "child_agent", "openclaw.child_session_id": 
session.sessionId, }, }); spawnEntry.span.setAttribute("openclaw.subagent.child_trace_id", session.rootSpan.spanContext().traceId); spawnEntry.span.setAttribute("openclaw.subagent.child_session_id", session.sessionId); } // Update relationship maps const childSet = parentToChildren.get(parentInfo.parentSessionId) ?? new Set(); childSet.add(session.sessionId); parentToChildren.set(parentInfo.parentSessionId, childSet); childToParent.set(session.sessionId, parentInfo.parentSessionId); pendingChildren.delete(ctx.sessionKey); emitLog(SeverityNumber.INFO, "INFO", `Subagent linked: ${session.sessionId} → parent ${parentInfo.parentSessionId}`, { "event.domain": "openclaw", "event.name": "subagent.linked", "openclaw.session_id": session.sessionId, "openclaw.parent_session_id": parentInfo.parentSessionId, "openclaw.parent_trace_id": parentInfo.parentTraceId, "openclaw.subagent.agent_id": parentInfo.agentId, }, session.rootSpan); } } // Count as user message (each LLM input follows a user turn) const inputSession = r