
@dooor-ai/toolkit

Guards, Evals & Observability for AI applications - works seamlessly with LangChain/LangGraph

838 lines (733 loc) • 29 kB
import { BaseCallbackHandler } from "@langchain/core/callbacks/base";
import { Guard } from "../guards/base";
import { Eval } from "../evals/base";
import { ObservabilityCollector } from "../observability/logger";
import { GuardBlockedException, TraceData } from "./types";
import { v4 as uuidv4 } from "uuid";

interface DOOORCallbackConfig {
  guards?: Guard[];
  evals?: Eval[];
  outputGuards?: Guard[];
  observability?: ObservabilityCollector;
  evalMode?: "async" | "sync" | "sample";
  evalSampleRate?: number;
  guardFailureMode?: "throw" | "return_error" | "log_only";
  modelName?: string;
}

interface ToolCallEntry {
  trace_id?: string;
  tool_call_id?: string;
  tool_name: string;
  input: any;
  output: any;
  latency_ms: number;
  started_at: string;
  ended_at: string;
  error?: string;
}

/**
 * Callback handler that implements guards, evals, and observability
 */
export class DOOORCallbackHandler extends BaseCallbackHandler {
  name = "DOOORCallbackHandler";

  private guards: Guard[];
  private evals: Eval[];
  private outputGuards: Guard[];
  private observability?: ObservabilityCollector;
  private evalMode: "async" | "sync" | "sample";
  private evalSampleRate: number;
  private guardFailureMode: "throw" | "return_error" | "log_only";
  private defaultModelName: string;
  private currentModel?: string;

  // Current trace state
  private currentTraceId?: string;
  private currentInput?: string;
  private startTime?: number;
  private guardsResults: Array<{ name: string; result: any; latency: number }> = [];
  private toolCalls: ToolCallEntry[] = [];
  private pendingToolCalls: ToolCallEntry[] = [];
  private currentToolStart?: { name: string; input: any; startTime: number; traceId?: string };
  private toolCallsByTrace: Map<string, ToolCallEntry[]> = new Map();

  // Session tracking for grouping related traces
  private currentSessionId?: string;
  private traceSequence: number = 0;

  constructor(config: DOOORCallbackConfig) {
    super();
    console.log("[DOOORCallbackHandler] Constructor called with config:", {
      guardsCount: config.guards?.length || 0,
      evalsCount: config.evals?.length || 0,
      outputGuardsCount: config.outputGuards?.length || 0,
      hasObservability: !!config.observability,
      evalMode: config.evalMode,
      evalSampleRate: config.evalSampleRate,
      guardFailureMode: config.guardFailureMode,
    });
    this.guards = config.guards ?? [];
    this.evals = config.evals ?? [];
    this.outputGuards = config.outputGuards ?? [];
    this.observability = config.observability;
    this.evalMode = config.evalMode ?? "async";
    this.evalSampleRate = config.evalSampleRate ?? 1.0;
    this.guardFailureMode = config.guardFailureMode ?? "throw";
    this.defaultModelName = config.modelName ?? "unknown";
    console.log("[DOOORCallbackHandler] Constructor completed, handler ready");
  }

  /**
   * Called before LLM starts (LangChain lifecycle hook)
   */
  async handleLLMStart(
    llm: { name?: string },
    prompts: string[],
    runId: string,
    _parentRunId?: string,
    _extraParams?: Record<string, any>,
    _tags?: string[],
    _metadata?: Record<string, any>,
    _runName?: string
  ): Promise<void> {
    await this.processLLMStart(llm, prompts, runId);
  }

  /**
   * Backwards compatibility with LangChain < 0.3 which calls onLLMStart
   */
  async onLLMStart(
    llm: { name?: string },
    prompts: string[],
    runId: string
  ): Promise<void> {
    await this.processLLMStart(llm, prompts, runId);
  }

  private async processLLMStart(
    llm: { name?: string },
    prompts: string[],
    runId: string
  ): Promise<void> {
    const llmName = llm?.name || this.defaultModelName;
    console.log("[DOOORCallbackHandler] handleLLMStart called", {
      llmName,
      promptsCount: prompts.length,
      runId,
    });
    this.currentTraceId = uuidv4();
    this.currentModel = llmName;
    this.currentInput = prompts[0] || "";
    this.startTime = Date.now();
    this.guardsResults = [];

    // Session tracking: detect if this is a new conversation or continuation
    const isNewConversation =
      !this.currentInput.includes("AI:") && !this.currentInput.includes("Tool:");
    if (isNewConversation || !this.currentSessionId) {
      // New conversation - generate new session_id
      this.currentSessionId = uuidv4();
      this.traceSequence = 0;
      console.log("[DOOORCallbackHandler] 🆕 New session started:", this.currentSessionId);
    }
    // Note: traceSequence will be incremented when trace is actually saved (in processLLMEnd)
    console.log(
      "[DOOORCallbackHandler] 📊 Session:",
      this.currentSessionId,
      "| Current sequence:",
      this.traceSequence
    );

    // Extract tool output from input (for LangGraph agents)
    await this.extractToolOutputFromInput(this.currentInput);

    // Run guards
    for (const guard of this.guards) {
      if (!guard.isEnabled()) continue;
      const guardStartTime = Date.now();
      try {
        const result = await Promise.resolve(guard.validate(this.currentInput));
        const guardLatency = Date.now() - guardStartTime;
        this.guardsResults.push({
          name: guard.name,
          result,
          latency: guardLatency,
        });

        // Handle guard failure
        if (!result.passed && guard.shouldBlock()) {
          const error = new GuardBlockedException(
            guard.name,
            result.reason ?? "Guard blocked request",
            result.severity ?? "medium",
            result.metadata
          );
          if (this.guardFailureMode === "throw") {
            // Log and throw
            this.observability?.logError(error, {
              traceId: this.currentTraceId,
              input: this.currentInput,
              guardName: guard.name,
            });
            throw error;
          } else if (this.guardFailureMode === "log_only") {
            // Just log, don't block
            this.observability?.logError(error, {
              traceId: this.currentTraceId,
              input: this.currentInput,
              guardName: guard.name,
              mode: "log_only",
            });
          }
          // return_error mode is handled by the wrapper
        }
      } catch (error) {
        if (error instanceof GuardBlockedException) {
          throw error;
        }
        // Guard itself failed - log but don't block
        console.error(`Guard ${guard.name} failed:`, error);
      }
    }
  }

  /**
   * Called after LLM completes (LangChain lifecycle hook)
   */
  async handleLLMEnd(
    output: any,
    runId: string,
    _parentRunId?: string,
    _tags?: string[],
    _extraParams?: Record<string, any>
  ): Promise<void> {
    await this.processLLMEnd(output, runId);
  }

  /**
   * Backwards compatibility with LangChain < 0.3 which calls onLLMEnd
   */
  async onLLMEnd(output: any, runId: string): Promise<void> {
    await this.processLLMEnd(output, runId);
  }

  private async processLLMEnd(output: any, runId: string): Promise<void> {
    console.log("[DOOORCallbackHandler] handleLLMEnd called", {
      hasTraceId: !!this.currentTraceId,
      hasInput: !!this.currentInput,
      hasStartTime: !!this.startTime,
      runId,
    });
    if (!this.currentTraceId || !this.currentInput || !this.startTime) {
      console.log("[DOOORCallbackHandler] Skipping - missing trace data");
      return;
    }

    const totalLatency = Date.now() - this.startTime;
    console.log("[DOOORCallbackHandler] Processing trace, latency:", totalLatency);
    console.log("[DOOORCallbackHandler] Raw output object keys:", Object.keys(output));
    console.log("[DOOORCallbackHandler] Output structure:", JSON.stringify(output, null, 2));

    const outputText = this.extractOutputText(output);
    console.log("[DOOORCallbackHandler] Extracted output text length:", outputText.length);

    const tokens = this.extractTokens(output);
    console.log("[DOOORCallbackHandler] Extracted tokens:", tokens);

    const modelName = this.currentModel || this.defaultModelName;
    console.log("[DOOORCallbackHandler] Model name:", modelName);

    // Extract tool calls from LLM output (LangGraph doesn't call tool hooks)
    this.extractToolCallsFromOutput(output);

    // Run output guards
    for (const guard of this.outputGuards) {
      if (!guard.isEnabled()) continue;
      const guardStartTime = Date.now();
      try {
        const result = await Promise.resolve(guard.validate(outputText));
        const guardLatency = Date.now() - guardStartTime;
        this.guardsResults.push({
          name: guard.name + " (output)",
          result,
          latency: guardLatency,
        });
        if (!result.passed && guard.shouldBlock()) {
          console.warn(`Output guard ${guard.name} detected issue:`, result.reason);
          // For output guards, we typically just log (can't undo the LLM call)
        }
      } catch (error) {
        console.error(`Output guard ${guard.name} failed:`, error);
      }
    }

    // Prepare trace data
    console.log(
      "[DOOORCallbackHandler] 🔍 Preparing trace, tool calls captured:",
      this.toolCalls.length
    );
    if (this.toolCalls.length > 0) {
      console.log(
        "[DOOORCallbackHandler] Tool calls data:",
        JSON.stringify(this.toolCalls, null, 2)
      );
    }

    // Increment trace sequence when actually saving the trace (not just starting)
    this.traceSequence++;
    console.log(
      "[DOOORCallbackHandler] 📊 Saving trace #" + this.traceSequence + " for session:",
      this.currentSessionId
    );

    // Determine trace type based on context
    const hasToolCallsInOutput = this.toolCalls.length > 0;
    const hasHistoryInInput =
      this.currentInput?.includes("AI:") || this.currentInput?.includes("Tool:");
    const traceType: "llm_decision" | "llm_response" = hasToolCallsInOutput
      ? "llm_decision"
      : hasHistoryInInput
        ? "llm_response"
        : "llm_response"; // Default to response if unclear

    const trace: TraceData = {
      traceId: this.currentTraceId!,
      input: this.currentInput!,
      output: outputText,
      model: modelName,
      latency: totalLatency,
      tokens: tokens,
      // cost removed - will be calculated server-side based on tokens and pricing table
      timestamp: new Date(),
      guards: this.guardsResults,
      toolCalls: this.toolCalls.length > 0 ? this.toolCalls : undefined,
      sessionId: this.currentSessionId,
      traceSequence: this.traceSequence,
      traceType: traceType,
    };
    console.log("[DOOORCallbackHandler] Trace object created:", {
      traceId: trace.traceId,
      model: trace.model,
      hasTokens: !!trace.tokens,
      tokens: trace.tokens,
      hasCost: trace.cost !== undefined,
      cost: trace.cost,
    });

    // Log trace FIRST (must complete before evals can reference it via foreign key)
    console.log("[DOOORCallbackHandler] Logging trace to observability...");
    await this.observability?.logTrace(trace);
    console.log("[DOOORCallbackHandler] Trace logged");

    this.observability?.logMetric("dooor.llm.latency", totalLatency, {
      model: trace.model,
    });

    // Run evals AFTER trace is saved (to avoid foreign key violations)
    const shouldRunEvals = this.shouldRunEvals();
    if (shouldRunEvals && this.evals.length > 0) {
      if (this.evalMode === "async") {
        // Run async without blocking (trace already saved, so foreign key is satisfied)
        this.runEvalsAsync(
          this.currentInput,
          outputText,
          { latency: totalLatency },
          trace.traceId
        );
      } else {
        // Run sync (blocking)
        const evalResults = await this.runEvalsSync(this.currentInput, outputText, {
          latency: totalLatency,
        });
        if (evalResults.length > 0) {
          if (this.observability) {
            await this.observability.updateTrace(trace.traceId, { evals: evalResults });
          }
        }
      }
    }

    // Reset state
    this.currentTraceId = undefined;
    this.currentInput = undefined;
    this.startTime = undefined;
    this.guardsResults = [];
    this.toolCalls = [];
    this.currentToolStart = undefined;
  }

  /**
   * Extract tool calls from LLM output (for LangGraph agents where tool hooks aren't called)
   */
  private extractToolCallsFromOutput(output: any): void {
    try {
      // Navigate through the output structure to find tool_calls
      const generations = output?.generations?.[0];
      if (!generations || !Array.isArray(generations)) {
        return;
      }
      for (const generation of generations) {
        const message = generation?.message;
        const toolCalls = message?.kwargs?.tool_calls || message?.tool_calls;
        if (toolCalls && Array.isArray(toolCalls) && toolCalls.length > 0) {
          console.log(
            "[DOOORCallbackHandler] 🔍 Found tool_calls in LLM output:",
            toolCalls.length
          );
          for (const toolCall of toolCalls) {
            const toolName = toolCall.name || "unknown_tool";
            const toolArgs = toolCall.args || {};
            const toolId = toolCall.id || "unknown_id";
            console.log(
              "[DOOORCallbackHandler] ⚙️ Tool call detected:",
              toolName,
              "args:",
              toolArgs
            );
            // Add to tool calls array (we don't have output yet, will be in next LLM call)
            const entry: ToolCallEntry = {
              trace_id: this.currentTraceId,
              tool_call_id: toolId,
              tool_name: toolName,
              input: toolArgs,
              output: null, // Will be populated if we can extract it
              latency_ms: 0, // Unknown from output
              started_at: new Date().toISOString(),
              ended_at: new Date().toISOString(),
            };
            this.toolCalls.push(entry);
            this.pendingToolCalls.push(entry);
            if (entry.trace_id) {
              const existing = this.toolCallsByTrace.get(entry.trace_id) ?? [];
              existing.push(entry);
              this.toolCallsByTrace.set(entry.trace_id, existing);
            }
          }
        }
      }
    } catch (error) {
      console.error("[DOOORCallbackHandler] Error extracting tool calls from output:", error);
    }
  }

  /**
   * Extract tool output from input (for LangGraph agents where tool result is in next LLM input)
   */
  private async extractToolOutputFromInput(inputText: string): Promise<void> {
    try {
      console.log("[DOOORCallbackHandler] 🔍 Attempting to extract tool output from input");
      console.log("[DOOORCallbackHandler] Input text sample:", inputText.substring(0, 500));
      console.log(
        "[DOOORCallbackHandler] Pending tool calls count:",
        this.pendingToolCalls.length
      );

      // Look for "Tool: <name>, <output>" pattern in input
      // Made more flexible to handle newlines and various JSON formats
      const toolPattern = /Tool:\s*(\w+)\s*,\s*(\{[^}]*\}|\{[\s\S]*?\})/g;
      const matches = Array.from(inputText.matchAll(toolPattern));
      console.log("[DOOORCallbackHandler] Regex matches found:", matches.length);

      if (matches.length > 0 && this.pendingToolCalls.length > 0) {
        console.log("[DOOORCallbackHandler] 🔍 Found tool outputs in input:", matches.length);
        for (const match of matches) {
          const toolName = match[1];
          const toolOutputStr = match[2];
          console.log(
            "[DOOORCallbackHandler] Processing match - Tool:",
            toolName,
            "Output:",
            toolOutputStr.substring(0, 100)
          );

          // Find the corresponding pending tool call (last one with matching name and no output)
          let toolCallIndex = -1;
          for (let i = this.pendingToolCalls.length - 1; i >= 0; i--) {
            const pending = this.pendingToolCalls[i];
            if (pending.tool_name === toolName && pending.output === null) {
              toolCallIndex = i;
              break;
            }
          }

          if (toolCallIndex !== -1) {
            const toolCall = this.pendingToolCalls[toolCallIndex];
            try {
              toolCall.output = JSON.parse(toolOutputStr);
              toolCall.ended_at = new Date().toISOString();
              // Calculate latency if we can
              const started = new Date(toolCall.started_at).getTime();
              const ended = new Date(toolCall.ended_at).getTime();
              toolCall.latency_ms = ended - started;
              console.log(
                "[DOOORCallbackHandler] ✅ Populated output for tool:",
                toolName,
                "latency:",
                toolCall.latency_ms,
                "ms"
              );
            } catch (e) {
              console.log("[DOOORCallbackHandler] Failed to parse JSON, using raw string:", e);
              toolCall.output = toolOutputStr;
              toolCall.ended_at = new Date().toISOString();
            }

            // Try to update the original trace
            await this.persistToolCallUpdate(toolCall);

            // Also include in current trace as fallback (if we're processing a new trace)
            if (this.currentTraceId && this.currentTraceId !== toolCall.trace_id) {
              // This is a new trace, include the updated tool call
              const updatedToolCall = { ...toolCall };
              updatedToolCall.trace_id = this.currentTraceId;
              this.toolCalls.push(updatedToolCall);
              console.log(
                "[DOOORCallbackHandler] 📝 Added updated tool call to current trace as fallback"
              );
            }

            this.pendingToolCalls.splice(toolCallIndex, 1);
          } else {
            console.log("[DOOORCallbackHandler] ⚠️ No matching tool call found for:", toolName);
          }
        }
      } else {
        if (this.pendingToolCalls.length === 0) {
          console.log("[DOOORCallbackHandler] ℹ️ No tool calls to populate");
        } else {
          console.log("[DOOORCallbackHandler] ⚠️ Tool pattern not found in input");
        }
      }
    } catch (error) {
      console.error("[DOOORCallbackHandler] Error extracting tool output from input:", error);
    }
  }

  /**
   * Called when a tool starts execution (LangChain lifecycle hook)
   */
  async handleToolStart(
    tool: { name?: string; id?: string[] },
    input: string,
    runId: string
  ): Promise<void> {
    const toolName = tool?.name || "unknown_tool";
    console.log("[DOOORCallbackHandler] ⚙️ Tool started:", toolName, "with input:", input);
    console.log("[DOOORCallbackHandler] Tool object:", JSON.stringify(tool));
    console.log("[DOOORCallbackHandler] RunId:", runId);
    this.currentToolStart = {
      name: toolName,
      input: input,
      startTime: Date.now(),
      traceId: this.currentTraceId,
    };
  }

  /**
   * Backwards compatibility - some versions call onToolStart
   */
  async onToolStart(
    tool: { name?: string; id?: string[] },
    input: string,
    runId: string
  ): Promise<void> {
    await this.handleToolStart(tool, input, runId);
  }

  /**
   * Called when a tool finishes execution (LangChain lifecycle hook)
   */
  async handleToolEnd(output: string, runId: string): Promise<void> {
    console.log("[DOOORCallbackHandler] ⚙️ Tool ended, output:", output);
    console.log("[DOOORCallbackHandler] Current tool start state:", this.currentToolStart);
    if (!this.currentToolStart) {
      console.log("[DOOORCallbackHandler] ⚠️ Tool ended but no start recorded - this is a bug!");
      return;
    }
    const latency = Date.now() - this.currentToolStart.startTime;
    const endTime = new Date().toISOString();
    const startTime = new Date(this.currentToolStart.startTime).toISOString();
    console.log(
      "[DOOORCallbackHandler] ✅ Tool completed:",
      this.currentToolStart.name,
      "latency:",
      latency
    );
    const entry: ToolCallEntry = {
      trace_id: this.currentToolStart.traceId,
      tool_name: this.currentToolStart.name,
      input: this.currentToolStart.input,
      output: output,
      latency_ms: latency,
      started_at: startTime,
      ended_at: endTime,
    };
    this.toolCalls.push(entry);
    if (entry.trace_id) {
      const existing = this.toolCallsByTrace.get(entry.trace_id) ?? [];
      existing.push(entry);
      this.toolCallsByTrace.set(entry.trace_id, existing);
    }
    console.log("[DOOORCallbackHandler] 📊 Total tool calls captured:", this.toolCalls.length);
    this.currentToolStart = undefined;
  }

  /**
   * Backwards compatibility - some versions call onToolEnd
   */
  async onToolEnd(output: string, runId: string): Promise<void> {
    await this.handleToolEnd(output, runId);
  }

  /**
   * Called when a tool errors (LangChain lifecycle hook)
   */
  async handleToolError(error: Error, runId: string): Promise<void> {
    if (!this.currentToolStart) {
      console.log("[DOOORCallbackHandler] Tool errored but no start recorded");
      return;
    }
    const latency = Date.now() - this.currentToolStart.startTime;
    const endTime = new Date().toISOString();
    const startTime = new Date(this.currentToolStart.startTime).toISOString();
    console.log(
      "[DOOORCallbackHandler] Tool errored:",
      this.currentToolStart.name,
      error.message
    );
    const entry: ToolCallEntry = {
      trace_id: this.currentToolStart.traceId,
      tool_name: this.currentToolStart.name,
      input: this.currentToolStart.input,
      output: null,
      latency_ms: latency,
      started_at: startTime,
      ended_at: endTime,
      error: error.message,
    };
    this.toolCalls.push(entry);
    if (entry.trace_id) {
      const existing = this.toolCallsByTrace.get(entry.trace_id) ?? [];
      existing.push(entry);
      this.toolCallsByTrace.set(entry.trace_id, existing);
    }
    this.currentToolStart = undefined;
  }

  /**
   * Backwards compatibility - some versions call onToolError
   */
  async onToolError(error: Error, runId: string): Promise<void> {
    await this.handleToolError(error, runId);
  }

  /**
   * Persist tool call updates to observability backend
   */
  private async persistToolCallUpdate(toolCall: ToolCallEntry): Promise<void> {
    if (!toolCall.trace_id || !this.observability) {
      return;
    }
    try {
      const toolCallsForTrace = this.toolCallsByTrace.get(toolCall.trace_id);
      await this.observability.updateTrace(toolCall.trace_id, {
        toolCalls: toolCallsForTrace ?? [toolCall],
      });
      const stillPending = this.pendingToolCalls.some(
        (pending) => pending.trace_id === toolCall.trace_id
      );
      if (!stillPending && toolCall.trace_id) {
        this.toolCallsByTrace.delete(toolCall.trace_id);
      }
    } catch (error) {
      console.error("[DOOORCallbackHandler] Failed to persist tool call update:", error);
    }
  }

  /**
   * Called on LLM error (LangChain lifecycle hook)
   */
  async handleLLMError(
    error: Error,
    runId: string,
    _parentRunId?: string,
    _tags?: string[],
    _extraParams?: Record<string, any>
  ): Promise<void> {
    this.observability?.logError(error, {
      traceId: this.currentTraceId,
      input: this.currentInput,
    });
  }

  /**
   * Backwards compatibility with LangChain < 0.3 which calls onLLMError
   */
  async onLLMError(error: Error, runId: string): Promise<void> {
    await this.handleLLMError(error, runId);
  }

  /**
   * Run evals asynchronously (non-blocking)
   */
  private async runEvalsAsync(
    input: string,
    output: string,
    metadata: Record<string, any>,
    traceId: string
  ): Promise<void> {
    // Run in background without blocking
    setImmediate(async () => {
      try {
        console.log("[DOOORCallbackHandler] Starting async evals for trace:", traceId);
        const evalResults = await this.runEvalsSync(input, output, metadata);
        if (evalResults.length > 0) {
          console.log(
            "[DOOORCallbackHandler] Evals completed, updating trace with results:",
            evalResults.length
          );
          if (this.observability) {
            await this.observability.updateTrace(traceId, { evals: evalResults });
          }
        }
      } catch (error) {
        console.error("Error running async evals:", error);
      }
    });
  }

  /**
   * Run evals synchronously (blocking)
   */
  private async runEvalsSync(
    input: string,
    output: string,
    metadata: Record<string, any>
  ): Promise<Array<{ name: string; result: any; latency: number }>> {
    const evalResults: Array<{ name: string; result: any; latency: number }> = [];
    for (const evalInstance of this.evals) {
      if (!evalInstance.isEnabled()) continue;
      const evalStartTime = Date.now();
      try {
        const result = await Promise.resolve(evalInstance.evaluate(input, output, metadata));
        const evalLatency = Date.now() - evalStartTime;
        evalResults.push({
          name: evalInstance.name,
          result,
          latency: evalLatency,
        });
        // Log metric
        this.observability?.logMetric(`dooor.eval.${evalInstance.name}.score`, result.score, {
          passed: result.passed ? "true" : "false",
        });
      } catch (error) {
        console.error(`Eval ${evalInstance.name} failed:`, error);
      }
    }
    return evalResults;
  }

  /**
   * Determine if evals should run (sampling logic)
   */
  private shouldRunEvals(): boolean {
    if (this.evalMode === "sample") {
      return Math.random() < this.evalSampleRate;
    }
    return true;
  }

  /**
   * Extract text output from LangChain response
   */
  private extractOutputText(output: any): string {
    if (typeof output === "string") {
      return output;
    }
    if (output?.generations?.[0]?.[0]?.text) {
      return output.generations[0][0].text;
    }
    if (output?.text) {
      return output.text;
    }
    if (output?.content) {
      return output.content;
    }
    return JSON.stringify(output);
  }

  /**
   * Extract token usage from LangChain response
   */
  private extractTokens(
    output: any
  ): { prompt: number; completion: number; total: number } | undefined {
    console.log("[DOOORCallbackHandler] extractTokens - checking multiple locations...");
    const normalize = (usage: any, source: string) => {
      console.log(`[DOOORCallbackHandler] Trying to normalize tokens from: ${source}`, usage);
      if (!usage) return undefined;
      const prompt =
        usage.prompt_tokens ?? usage.promptTokens ?? usage.input_tokens ?? usage.inputTokens;
      const completion =
        usage.completion_tokens ??
        usage.completionTokens ??
        usage.output_tokens ??
        usage.outputTokens;
      const total =
        usage.total_tokens ?? usage.totalTokens ?? (prompt ?? 0) + (completion ?? 0);
      if (prompt == null && completion == null && total == null) {
        console.log(`[DOOORCallbackHandler] ${source} - no tokens found`);
        return undefined;
      }
      const result = {
        prompt: prompt ?? (total ?? 0) - (completion ?? 0),
        completion: completion ?? (total ?? 0) - (prompt ?? 0),
        total: total ?? (prompt ?? 0) + (completion ?? 0),
      };
      console.log(`[DOOORCallbackHandler] ${source} - normalized tokens:`, result);
      return result;
    };

    const llmUsage = output?.llmOutput?.tokenUsage;
    const normalizedLLMUsage = normalize(llmUsage, "llmOutput.tokenUsage");
    if (normalizedLLMUsage) {
      console.log("[DOOORCallbackHandler] ✅ Tokens extracted from llmOutput.tokenUsage");
      return normalizedLLMUsage;
    }

    const directUsage = normalize(
      output?.usage_metadata || output?.usageMetadata,
      "usage_metadata"
    );
    if (directUsage) {
      console.log("[DOOORCallbackHandler] ✅ Tokens extracted from usage_metadata");
      return directUsage;
    }

    const generationMessage = output?.generations?.[0]?.[0]?.message;
    const generationUsage = normalize(
      generationMessage?.usage_metadata || generationMessage?.usageMetadata,
      "generations[0][0].message.usage_metadata"
    );
    if (generationUsage) {
      console.log(
        "[DOOORCallbackHandler] ✅ Tokens extracted from generation message usage_metadata"
      );
      return generationUsage;
    }

    const responseUsage = normalize(
      output?.response_metadata?.tokenUsage ||
        output?.response_metadata?.usageMetadata ||
        generationMessage?.response_metadata?.tokenUsage ||
        generationMessage?.response_metadata?.usageMetadata,
      "response_metadata.tokenUsage"
    );
    if (responseUsage) {
      console.log("[DOOORCallbackHandler] ✅ Tokens extracted from response_metadata");
      return responseUsage;
    }

    const genericUsage = normalize(output?.usage, "usage");
    if (genericUsage) {
      console.log("[DOOORCallbackHandler] ✅ Tokens extracted from generic usage field");
      return genericUsage;
    }

    console.log("[DOOORCallbackHandler] ❌ No tokens found in any location!");
    return undefined;
  }
}
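
Usage sketch (not part of the file above): the handler attaches to any LangChain model via `callbacks`. This is a minimal, hedged example — it assumes the package re-exports `DOOORCallbackHandler` from its root and pairs it with `ChatOpenAI` from `@langchain/openai`; the concrete `Guard`, `Eval`, and `ObservabilityCollector` constructors shipped by `@dooor-ai/toolkit` are not shown in this file and may differ.

import { ChatOpenAI } from "@langchain/openai";
import { DOOORCallbackHandler } from "@dooor-ai/toolkit"; // assumed root export

const handler = new DOOORCallbackHandler({
  guards: [],                // input guards, validated in processLLMStart
  outputGuards: [],          // output guards, log-only in processLLMEnd
  evalMode: "sample",        // run evals on a random subset of traces
  evalSampleRate: 0.25,      // Math.random() < 0.25, i.e. roughly 25% of traces
  guardFailureMode: "throw", // a blocking guard raises GuardBlockedException
  modelName: "gpt-4o-mini",  // fallback when the LLM does not report a name
});

const model = new ChatOpenAI({ model: "gpt-4o-mini", callbacks: [handler] });
const response = await model.invoke("Hello!");
// handleLLMStart runs the guards; handleLLMEnd logs the trace and schedules evals.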