UNPKG

@stackmemoryai/stackmemory

Version:

Project-scoped memory for AI coding tools. Durable context across sessions with MCP integration, frames, smart retrieval, Claude Code skills, and automatic hooks.

312 lines (254 loc) 8.25 kB
/**
 * GEPA Reflection Engine
 *
 * Analyzes evaluation results to generate insights for next mutation cycle.
 * This is the key differentiator from random mutations.
 *
 * Invoke with `node reflect.js <analyze|reflect>`.
 */

import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import { execSync } from 'child_process';

const __dirname = path.dirname(fileURLToPath(import.meta.url));
const GEPA_DIR = path.join(__dirname, '..');
const RESULTS_DIR = path.join(GEPA_DIR, 'results');
const GENERATIONS_DIR = path.join(GEPA_DIR, 'generations');

/**
 * Read every `*.json` file in a directory and return the parsed session objects.
 *
 * Files that cannot be read or parsed, or whose top-level JSON value is not an
 * object, are skipped with a warning on stderr so that one bad file does not
 * abort the whole analysis.
 *
 * @param {string} sessionsDir - Directory containing session JSON files.
 * @returns {object[]} Parsed session objects, in directory-listing order.
 */
function loadSessions(sessionsDir) {
  const sessions = [];
  for (const file of fs.readdirSync(sessionsDir)) {
    if (!file.endsWith('.json')) continue;
    try {
      const parsed = JSON.parse(
        fs.readFileSync(path.join(sessionsDir, file), 'utf8')
      );
      const isRecord =
        parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
      if (isRecord) {
        sessions.push(parsed);
      } else {
        console.warn(`Skipping session file without a JSON object: ${file}`);
      }
    } catch (err) {
      console.warn(`Skipping unreadable session file ${file}: ${err.message}`);
    }
  }
  return sessions;
}

/**
 * Analyze session data for patterns.
 *
 * Reads `<RESULTS_DIR>/sessions/*.json`. When that directory does not exist the
 * same result shape is returned with empty data, so callers can always rely on
 * the four keys being present.
 *
 * @returns {{commonErrors: Array<{error: string, count: number}>,
 *            toolUsage: object, feedbackPatterns: object,
 *            performanceByVariant: object}}
 */
function analyzeSessionPatterns() {
  const sessionsDir = path.join(RESULTS_DIR, 'sessions');
  const sessions = fs.existsSync(sessionsDir) ? loadSessions(sessionsDir) : [];

  return {
    // Error patterns
    commonErrors: extractCommonErrors(sessions),
    // Tool usage patterns
    toolUsage: extractToolPatterns(sessions),
    // Feedback patterns
    feedbackPatterns: extractFeedbackPatterns(sessions),
    // Performance patterns
    performanceByVariant: extractPerformanceByVariant(sessions),
  };
}

/**
 * Count normalized error messages across sessions.
 *
 * @param {object[]} sessions - Sessions; each may carry an `errors` array of `{ error }`.
 * @returns {Array<{error: string, count: number}>} Up to 10 entries, most frequent first.
 */
function extractCommonErrors(sessions) {
  const errorCounts = {};

  sessions.forEach((s) => {
    s.errors?.forEach((e) => {
      const normalized = normalizeError(e?.error);
      errorCounts[normalized] = (errorCounts[normalized] || 0) + 1;
    });
  });

  return Object.entries(errorCounts)
    .sort((a, b) => b[1] - a[1])
    .slice(0, 10)
    .map(([error, count]) => ({ error, count }));
}

/**
 * Normalize an error message so that similar errors group together.
 *
 * Digit runs become `N`, quoted substrings become `"..."`, anything that looks
 * like a slash-prefixed path becomes `/path`, and the result is capped at 100
 * characters. Missing or non-string input is coerced to a string instead of
 * throwing.
 *
 * @param {*} error - Raw error message.
 * @returns {string} Normalized message.
 */
function normalizeError(error) {
  return String(error ?? '')
    .replace(/\d+/g, 'N')
    .replace(/['"`][^'"`]+['"`]/g, '"..."')
    .replace(/\/[^\s]+/g, '/path')
    .slice(0, 100);
}

/**
 * Aggregate per-tool call statistics.
 *
 * @param {object[]} sessions - Sessions; each may carry a `toolCalls` array of
 *   `{ tool, success, duration }`.
 * @returns {Object<string, {count: number, success: number, avgDuration: number, successRate: number}>}
 */
function extractToolPatterns(sessions) {
  const toolStats = {};

  sessions.forEach((s) => {
    s.toolCalls?.forEach((t) => {
      if (!toolStats[t.tool]) {
        toolStats[t.tool] = { count: 0, success: 0, avgDuration: 0 };
      }
      const stats = toolStats[t.tool];
      stats.count++;
      if (t.success) stats.success++;
      // Holds the running total here; divided by count below.
      stats.avgDuration += t.duration || 0;
    });
  });

  // Calculate averages
  Object.values(toolStats).forEach((s) => {
    s.avgDuration = s.count > 0 ? s.avgDuration / s.count : 0;
    s.successRate = s.count > 0 ? s.success / s.count : 0;
  });

  return toolStats;
}

/**
 * Tally user feedback events by type.
 *
 * @param {object[]} sessions - Sessions; each may carry a `userFeedback` array of `{ type }`.
 * @returns {{positive: number, negative: number, corrections: number, retries: number}}
 */
function extractFeedbackPatterns(sessions) {
  const feedback = { positive: 0, negative: 0, corrections: 0, retries: 0 };

  sessions.forEach((s) => {
    s.userFeedback?.forEach((f) => {
      if (f.type === 'thumbs_up') feedback.positive++;
      if (f.type === 'thumbs_down') feedback.negative++;
      if (f.type === 'correction') feedback.corrections++;
      if (f.type === 'retry') feedback.retries++;
    });
  });

  return feedback;
}

/**
 * Aggregate session metrics per prompt variant.
 *
 * @param {object[]} sessions - Sessions with `variant`, optional `duration` and
 *   optional `metrics.{errorCount, successfulToolCalls}`.
 * @returns {Object<string, {sessions: number, totalErrors: number, totalSuccess: number,
 *   avgDuration: number, errorRate: number, successRate: number}>}
 */
function extractPerformanceByVariant(sessions) {
  const variants = {};

  sessions.forEach((s) => {
    if (!variants[s.variant]) {
      variants[s.variant] = {
        sessions: 0,
        totalErrors: 0,
        totalSuccess: 0,
        avgDuration: 0,
      };
    }
    const v = variants[s.variant];
    v.sessions++;
    v.totalErrors += s.metrics?.errorCount || 0;
    v.totalSuccess += s.metrics?.successfulToolCalls || 0;
    // Holds the running total here; divided by session count below.
    v.avgDuration += s.duration || 0;
  });

  // Normalize
  Object.values(variants).forEach((v) => {
    v.avgDuration = v.sessions > 0 ? v.avgDuration / v.sessions : 0;
    v.errorRate = v.sessions > 0 ? v.totalErrors / v.sessions : 0;
    // 0 / 0 is NaN when nothing was recorded; `|| 0` maps that to 0.
    v.successRate = v.totalSuccess / (v.totalSuccess + v.totalErrors) || 0;
  });

  return variants;
}

/**
 * Generate reflection insights.
 *
 * Builds a prompt from the session patterns and the current best prompt, sends
 * it to the `claude` CLI, extracts the JSON object from the response and saves
 * it (together with the patterns) to `<RESULTS_DIR>/reflection-<timestamp>.json`.
 *
 * @returns {Promise<object|null>} Parsed insights, or `null` when the CLI call
 *   fails or its output contains no parsable JSON object.
 * @throws If `state.json` or the current best prompt file cannot be read.
 */
async function generateReflection() {
  const patterns = analyzeSessionPatterns();
  const state = JSON.parse(
    fs.readFileSync(path.join(GEPA_DIR, 'state.json'), 'utf8')
  );

  // Load current best prompt for context
  const currentPrompt = fs.readFileSync(
    path.join(
      GENERATIONS_DIR,
      `gen-${String(state.currentGeneration).padStart(3, '0')}`,
      `${state.bestVariant}.md`
    ),
    'utf8'
  );

  const reflectionPrompt = `Analyze these AI agent performance patterns and generate specific improvement recommendations.

CURRENT SYSTEM PROMPT (excerpt):
\`\`\`markdown
${currentPrompt.slice(0, 3000)}...
\`\`\`

PERFORMANCE PATTERNS:

1. COMMON ERRORS (${patterns.commonErrors.length} types):
${patterns.commonErrors.map((e) => `  - "${e.error}" (${e.count}x)`).join('\n')}

2. TOOL USAGE:
${Object.entries(patterns.toolUsage)
  .map(
    ([tool, s]) =>
      `  - ${tool}: ${s.count} calls, ${(s.successRate * 100).toFixed(0)}% success, ${s.avgDuration.toFixed(0)}ms avg`
  )
  .join('\n')}

3. USER FEEDBACK:
  - Positive: ${patterns.feedbackPatterns.positive}
  - Negative: ${patterns.feedbackPatterns.negative}
  - Corrections needed: ${patterns.feedbackPatterns.corrections}
  - Retries: ${patterns.feedbackPatterns.retries}

4. VARIANT PERFORMANCE:
${Object.entries(patterns.performanceByVariant)
  .map(
    ([v, s]) =>
      `  - ${v}: ${s.sessions} sessions, ${s.errorRate.toFixed(1)} errors/session, ${(s.successRate * 100).toFixed(0)}% success`
  )
  .join('\n')}

Based on this data, provide:

1. TOP 3 FAILURE MODES - What's causing the most errors?
2. MISSING INSTRUCTIONS - What rules should be added to prevent errors?
3. UNCLEAR INSTRUCTIONS - What existing rules are being misinterpreted?
4. PRIORITY MUTATIONS - What specific changes would have highest impact?

Format as JSON:
{
  "failureModes": ["...", "...", "..."],
  "missingInstructions": ["...", "...", "..."],
  "unclearInstructions": ["...", "...", "..."],
  "priorityMutations": [
    {"type": "add|modify|remove", "section": "...", "change": "...", "rationale": "..."},
    ...
  ]
}`;

  try {
    // Feed the prompt on stdin rather than interpolating it into a shell
    // command: it contains backticks (the markdown fence) and arbitrary prompt
    // text that a shell would otherwise expand or execute.
    const result = execSync('claude --print', {
      input: reflectionPrompt,
      encoding: 'utf8',
      maxBuffer: 10 * 1024 * 1024,
    });

    // Parse JSON from response
    const jsonMatch = result.match(/\{[\s\S]*\}/);
    if (!jsonMatch) {
      console.error('Reflection failed: no JSON object found in response');
      return null;
    }

    const insights = JSON.parse(jsonMatch[0]);

    // Save reflection
    const reflectionPath = path.join(
      RESULTS_DIR,
      `reflection-${Date.now()}.json`
    );
    fs.writeFileSync(
      reflectionPath,
      JSON.stringify(
        {
          timestamp: new Date().toISOString(),
          generation: state.currentGeneration,
          patterns,
          insights,
        },
        null,
        2
      )
    );

    return insights;
  } catch (e) {
    console.error('Reflection failed:', e.message);
  }

  return null;
}

/**
 * Generate focused mutation prompts based on reflection.
 *
 * The insights come from model output, so any of the expected arrays may be
 * missing; absent sections are rendered as empty lists instead of throwing.
 *
 * @param {object|null} insights - Result of `generateReflection()`.
 * @returns {string|null} Mutation guide text, or `null` when `insights` is falsy.
 */
function generateMutationGuide(insights) {
  if (!insights) return null;

  const asList = (value) => (Array.isArray(value) ? value : []);
  const failureModes = asList(insights.failureModes);
  const priorityMutations = asList(insights.priorityMutations);
  const missingInstructions = asList(insights.missingInstructions);
  const unclearInstructions = asList(insights.unclearInstructions);

  return `
REFLECTION-GUIDED MUTATIONS:

Based on ${failureModes.length} identified failure modes:
${failureModes.map((f, i) => `${i + 1}. ${f}`).join('\n')}

PRIORITY CHANGES:
${priorityMutations
  .map(
    (m) =>
      `- [${String(m?.type ?? 'unknown').toUpperCase()}] ${m?.section}: ${m?.change}\n  Rationale: ${m?.rationale}`
  )
  .join('\n\n')}

MISSING INSTRUCTIONS TO ADD:
${missingInstructions.map((i) => `- ${i}`).join('\n')}

UNCLEAR INSTRUCTIONS TO CLARIFY:
${unclearInstructions.map((i) => `- ${i}`).join('\n')}
`;
}

// CLI
// NOTE: this dispatch runs whenever the module is loaded, including on import.
const command = process.argv[2];

switch (command) {
  case 'analyze': {
    const patterns = analyzeSessionPatterns();
    console.log(JSON.stringify(patterns, null, 2));
    break;
  }

  case 'reflect':
    generateReflection()
      .then((insights) => {
        if (insights) {
          console.log('\nReflection Insights:');
          console.log(JSON.stringify(insights, null, 2));
          console.log('\nMutation Guide:');
          console.log(generateMutationGuide(insights));
        }
      })
      .catch((err) => {
        // generateReflection rejects when state.json or the prompt file is missing.
        console.error('Reflection failed:', err.message);
        process.exitCode = 1;
      });
    break;

  default:
    console.log(`
GEPA Reflection Engine

Usage:
  node reflect.js analyze    Analyze session patterns
  node reflect.js reflect    Generate reflection insights
`);
}

export { analyzeSessionPatterns, generateReflection, generateMutationGuide };