jay-code
Streamlined AI CLI orchestration engine with mathematical rigor and enterprise-grade reliability
/**
* LLM-specific hooks for agentic-flow integration
*
* Provides pre/post operation hooks for all LLM calls with
* memory persistence and performance optimization.
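*
* Registers four hooks: pre-call caching and request optimization, post-call
* metrics and response caching, error handling with provider fallback, and
* retry backoff. registerLLMHooks() wires them into the shared agenticHookManager.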
*/
import { agenticHookManager } from './hook-manager.js';
import type {
AgenticHookContext,
HookHandlerResult,
LLMHookPayload,
LLMMetrics,
Pattern,
SideEffect,
} from './types.js';
// ===== Pre-LLM Call Hook =====
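/**
 * Runs before every LLM call. On a cache hit the call is short-circuited and
 * the cached response is returned; otherwise the request is optimized for the
 * provider and pre-call metrics plus the request itself are recorded.
 */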
export const preLLMCallHook = {
id: 'agentic-pre-llm-call',
type: 'pre-llm-call' as const,
priority: 100,
handler: async (
payload: LLMHookPayload,
context: AgenticHookContext
): Promise<HookHandlerResult> => {
const { provider, model, operation, request } = payload;
// Check memory for similar requests
const cacheKey = generateCacheKey(provider, model, request);
const cached = await checkMemoryCache(cacheKey, context);
if (cached) {
return {
continue: false, // Skip LLM call
modified: true,
payload: {
...payload,
response: cached.response,
metrics: {
...cached.metrics,
cacheHit: true,
},
},
sideEffects: [
{
type: 'metric',
action: 'increment',
data: { name: 'llm.cache.hits' },
},
],
};
}
// Load provider-specific optimizations
const optimizations = await loadProviderOptimizations(provider, context);
// Apply request optimizations
const optimizedRequest = applyRequestOptimizations(
request,
optimizations,
context
);
// Track pre-call metrics
const sideEffects: SideEffect[] = [
{
type: 'metric',
action: 'increment',
data: { name: `llm.calls.${provider}.${model}` },
},
{
type: 'memory',
action: 'store',
data: {
key: `llm:request:${context.correlationId}`,
value: {
provider,
model,
operation,
request: optimizedRequest,
timestamp: Date.now(),
},
ttl: 3600, // 1 hour
},
},
];
return {
continue: true,
modified: true,
payload: {
...payload,
request: optimizedRequest,
},
sideEffects,
};
},
};
// ===== Post-LLM Call Hook =====
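/**
 * Runs after every successful LLM call. Caches the response, extracts patterns
 * for neural training, records latency/token/cost metrics, emits a warning on
 * high latency, and updates the provider health score.
 */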
export const postLLMCallHook = {
id: 'agentic-post-llm-call',
type: 'post-llm-call' as const,
priority: 100,
handler: async (
payload: LLMHookPayload,
context: AgenticHookContext
): Promise<HookHandlerResult> => {
const { provider, model, operation, request, response, metrics } = payload;
if (!response || !metrics) {
return { continue: true };
}
const sideEffects: SideEffect[] = [];
// Store response in memory for caching
const cacheKey = generateCacheKey(provider, model, request);
sideEffects.push({
type: 'memory',
action: 'store',
data: {
key: `llm:cache:${cacheKey}`,
value: {
response,
metrics,
timestamp: Date.now(),
},
ttl: determineCacheTTL(operation, response),
},
});
// Extract patterns for neural training
const patterns = extractResponsePatterns(request, response, metrics);
if (patterns.length > 0) {
sideEffects.push({
type: 'neural',
action: 'train',
data: {
patterns,
modelId: `llm-optimizer-${provider}`,
},
});
}
// Update performance metrics
sideEffects.push(
{
type: 'metric',
action: 'update',
data: {
name: `llm.latency.${provider}.${model}`,
value: metrics.latency,
},
},
{
type: 'metric',
action: 'update',
data: {
name: `llm.tokens.${provider}.${model}`,
value: response.usage?.totalTokens ?? 0,
},
},
{
type: 'metric',
action: 'update',
data: {
name: `llm.cost.${provider}.${model}`,
value: metrics.costEstimate,
},
}
);
// Check for performance issues
if (metrics.latency > getLatencyThreshold(provider, model)) {
sideEffects.push({
type: 'notification',
action: 'send',
data: {
level: 'warning',
message: `High latency detected for ${provider}/${model}: ${metrics.latency}ms`,
},
});
}
// Store provider health score
await updateProviderHealth(provider, metrics.providerHealth, context);
return {
continue: true,
sideEffects,
};
},
};
// ===== LLM Error Hook =====
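/**
 * Runs when an LLM call fails. Logs the error, increments error metrics, and,
 * if a healthy fallback provider is available, swaps the provider/model and
 * clears the error so the call can be retried instead of propagating.
 */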
export const llmErrorHook = {
id: 'agentic-llm-error',
type: 'llm-error' as const,
priority: 100,
handler: async (
payload: LLMHookPayload,
context: AgenticHookContext
): Promise<HookHandlerResult> => {
const { provider, model, error } = payload;
if (!error) {
return { continue: true };
}
const sideEffects: SideEffect[] = [];
// Log error details
sideEffects.push({
type: 'log',
action: 'write',
data: {
level: 'error',
message: `LLM error from ${provider}/${model}`,
data: {
error: error.message,
stack: error.stack,
request: payload.request,
},
},
});
// Update error metrics
sideEffects.push({
type: 'metric',
action: 'increment',
data: { name: `llm.errors.${provider}.${model}` },
});
// Check if we should fallback
const fallbackProvider = await selectFallbackProvider(
provider,
model,
error,
context
);
if (fallbackProvider) {
return {
continue: false, // Don't propagate error
modified: true,
payload: {
...payload,
provider: fallbackProvider.provider,
model: fallbackProvider.model,
error: undefined, // Clear error for retry
},
sideEffects: [
...sideEffects,
{
type: 'notification',
action: 'send',
data: {
level: 'info',
message: `Falling back from ${provider}/${model} to ${fallbackProvider.provider}/${fallbackProvider.model}`,
},
},
],
};
}
return {
continue: true,
sideEffects,
};
},
};
// ===== LLM Retry Hook =====
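/**
 * Runs before an LLM call is retried. Adjusts the request parameters for the
 * attempt, waits with exponential backoff, and increments the retry counter
 * carried in the metrics.
 */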
export const llmRetryHook = {
id: 'agentic-llm-retry',
type: 'llm-retry' as const,
priority: 90,
handler: async (
payload: LLMHookPayload,
context: AgenticHookContext
): Promise<HookHandlerResult> => {
const { provider, model, metrics } = payload;
const retryCount = metrics?.retryCount || 0;
// Adjust request parameters for retry
const adjustedRequest = adjustRequestForRetry(
payload.request,
retryCount
);
const sideEffects: SideEffect[] = [
{
type: 'metric',
action: 'increment',
data: { name: `llm.retries.${provider}.${model}` },
},
];
// Apply exponential backoff
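// Delay doubles per attempt (1s, 2s, 4s, ...) and is capped at 10 seconds.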
const backoffMs = Math.min(1000 * Math.pow(2, retryCount), 10000);
await new Promise(resolve => setTimeout(resolve, backoffMs));
return {
continue: true,
modified: true,
payload: {
...payload,
request: adjustedRequest,
metrics: {
...metrics,
retryCount: retryCount + 1,
},
},
sideEffects,
};
},
};
// ===== Helper Functions =====
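/**
 * Builds a deterministic cache key from the provider, model, the first 100
 * characters of each message, temperature, and maxTokens, encoded as
 * base64-encoded JSON.
 */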
function generateCacheKey(
provider: string,
model: string,
request: LLMHookPayload['request']
): string {
const normalized = {
provider,
model,
messages: request.messages?.map(m => ({
role: m.role,
content: m.content.substring(0, 100), // First 100 chars
})),
temperature: request.temperature,
maxTokens: request.maxTokens,
};
return Buffer.from(JSON.stringify(normalized)).toString('base64');
}
async function checkMemoryCache(
cacheKey: string,
context: AgenticHookContext
): Promise<any | null> {
// Implementation would integrate with memory service
// This is a placeholder
return null;
}
async function loadProviderOptimizations(
provider: string,
context: AgenticHookContext
): Promise<any> {
// Load provider-specific optimizations from memory
// This is a placeholder
return {
maxRetries: 3,
timeout: 30000,
rateLimit: 100,
};
}
function applyRequestOptimizations(
request: LLMHookPayload['request'],
optimizations: any,
context: AgenticHookContext
): LLMHookPayload['request'] {
// Apply various optimizations
const optimized = { ...request };
// Optimize token usage
if (optimized.maxTokens && optimized.maxTokens > 4000) {
optimized.maxTokens = 4000; // Cap at reasonable limit
}
// Optimize temperature for consistency
if (optimized.temperature === undefined) {
optimized.temperature = 0.7;
}
// Add stop sequences if missing
if (!optimized.stopSequences && optimized.messages) {
optimized.stopSequences = ['\n\nHuman:', '\n\nAssistant:'];
}
return optimized;
}
function determineCacheTTL(
operation: string,
response: LLMHookPayload['response']
): number {
// Determine cache TTL based on operation and response
switch (operation) {
case 'embedding':
return 86400; // 24 hours for embeddings
case 'completion':
// Shorter TTL for completions
return response?.usage?.totalTokens && response.usage.totalTokens > 1000
? 1800 // 30 minutes for long responses
: 3600; // 1 hour for short responses
default:
return 3600; // 1 hour default
}
}
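/**
 * Derives training patterns from a completed call: an optimization pattern
 * when latency exceeds 1s, and a success pattern when the response finished
 * cleanly with a 'stop' reason.
 */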
function extractResponsePatterns(
request: LLMHookPayload['request'],
response: LLMHookPayload['response'],
metrics: LLMMetrics
): Pattern[] {
const patterns: Pattern[] = [];
// Extract performance patterns
if (metrics.latency > 1000) {
patterns.push({
id: `perf_${Date.now()}`,
type: 'optimization',
confidence: 0.8,
occurrences: 1,
context: {
provider: metrics.providerHealth < 0.8 ? 'unhealthy' : 'healthy',
requestSize: JSON.stringify(request).length,
responseTokens: response?.usage?.totalTokens || 0,
latency: metrics.latency,
},
});
}
// Extract success patterns
if (response?.choices?.[0]?.finishReason === 'stop') {
patterns.push({
id: `success_${Date.now()}`,
type: 'success',
confidence: 0.9,
occurrences: 1,
context: {
temperature: request.temperature,
maxTokens: request.maxTokens,
actualTokens: response.usage?.totalTokens || 0,
},
});
}
return patterns;
}
function getLatencyThreshold(provider: string, model: string): number {
// Provider/model specific thresholds
const thresholds: Record<string, number> = {
'openai:gpt-4': 5000,
'openai:gpt-3.5-turbo': 2000,
'anthropic:claude-3': 4000,
'anthropic:claude-instant': 1500,
};
return thresholds[`${provider}:${model}`] || 3000;
}
async function updateProviderHealth(
provider: string,
health: number,
context: AgenticHookContext
): Promise<void> {
// Update provider health in memory
const healthKey = `provider:health:${provider}`;
const currentHealth = await context.memory.cache.get(healthKey) || [];
currentHealth.push({
timestamp: Date.now(),
health,
});
// Keep last 100 health checks
if (currentHealth.length > 100) {
currentHealth.shift();
}
await context.memory.cache.set(healthKey, currentHealth);
}
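/**
 * Picks a fallback provider/model for the failed one, preferring the first
 * candidate whose recent average health score exceeds 0.7.
 */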
async function selectFallbackProvider(
provider: string,
model: string,
error: Error,
context: AgenticHookContext
): Promise<{ provider: string; model: string } | null> {
// Implement intelligent fallback selection
const fallbacks: Record<string, { provider: string; model: string }[]> = {
'openai': [
{ provider: 'anthropic', model: 'claude-3' },
{ provider: 'cohere', model: 'command' },
],
'anthropic': [
{ provider: 'openai', model: 'gpt-4' },
{ provider: 'cohere', model: 'command' },
],
};
const candidates = fallbacks[provider] || [];
// Select based on health scores
for (const candidate of candidates) {
const healthKey = `provider:health:${candidate.provider}`;
const healthData = await context.memory.cache.get(healthKey) || [];
if (healthData.length > 0) {
const avgHealth = healthData.reduce((sum: number, h: any) =>
sum + h.health, 0
) / healthData.length;
if (avgHealth > 0.7) {
return candidate;
}
}
}
return null;
}
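/**
 * Tunes the request for a retry: temperature is increased by 0.1 per attempt
 * (capped at 1.0) and maxTokens is scaled by 0.9 per attempt.
 */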
function adjustRequestForRetry(
request: LLMHookPayload['request'],
retryCount: number
): LLMHookPayload['request'] {
const adjusted = { ...request };
// Increase temperature slightly for variety
if (adjusted.temperature !== undefined) {
adjusted.temperature = Math.min(
adjusted.temperature + (0.1 * retryCount),
1.0
);
}
// Reduce max tokens to improve success rate
if (adjusted.maxTokens !== undefined) {
adjusted.maxTokens = Math.floor(
adjusted.maxTokens * Math.pow(0.9, retryCount)
);
}
return adjusted;
}
// ===== Register Hooks =====
export function registerLLMHooks(): void {
agenticHookManager.register(preLLMCallHook);
agenticHookManager.register(postLLMCallHook);
agenticHookManager.register(llmErrorHook);
agenticHookManager.register(llmRetryHook);
}
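// Example usage (a minimal sketch; the import path is an assumption and will
// depend on where this module lives in the consuming project):
//
//   import { registerLLMHooks } from './llm-hooks.js';
//
//   // Wire the pre/post/error/retry hooks into the shared hook manager once
//   // during application start-up, before any LLM calls are issued.
//   registerLLMHooks();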