UNPKG

erosolar-cli

Version:

Unified AI agent framework for the command line - Multi-provider support with schema-driven tools, code intelligence, and transparent reasoning

509 lines (493 loc) 18.7 kB
/**
 * AI Response Verification System - Isolated Runtime Only
 *
 * Verifies assistant claims by spawning fresh CLI instances and running
 * actual runtime tests. All verification happens in isolation.
 *
 * @license MIT
 */
import { exec, spawn } from 'node:child_process';
import { promisify } from 'node:util';
import * as fs from 'node:fs/promises';
import * as path from 'node:path';

const execAsync = promisify(exec);

// ============================================================================
// ISOLATED RUNTIME - Core Functions
// ============================================================================

/**
 * Spawns a fresh isolated erosolar-cli instance for testing.
 *
 * @param {string} cwd - Project root expected to contain dist/bin/erosolar.js.
 * @param {number} [timeout=60000] - Milliseconds before the child is SIGTERM'd.
 * @returns {Promise<{process: import('node:child_process').ChildProcess,
 *   stdin: import('node:stream').Writable, output: string, errors: string,
 *   exitPromise: Promise<number|null>}>} Live session object; `output` and
 *   `errors` keep growing as the child emits data.
 * @throws {Error} If the built CLI binary does not exist (run build first).
 */
async function spawnIsolatedCLI(cwd, timeout = 60000) {
  const cliPath = path.join(cwd, 'dist/bin/erosolar.js');

  // Verify CLI exists before spawning so we fail with a clear message.
  try {
    await fs.access(cliPath);
  } catch {
    throw new Error(`CLI not found at ${cliPath}. Run build first.`);
  }

  let exitResolve;
  const exitPromise = new Promise((resolve) => {
    exitResolve = resolve;
  });

  const child = spawn('node', [cliPath, '--plain'], {
    cwd,
    env: { ...process.env, EROSOLAR_TEST_MODE: '1', NO_COLOR: '1' },
    stdio: ['pipe', 'pipe', 'pipe'],
  });

  // BUG FIX: accumulate output on the returned session object itself.
  // The previous version snapshotted local `output`/`errors` strings into the
  // return value; the data handlers appended to the locals, so callers polling
  // `cli.output` (sendCommand, the startup wait) never saw any new data.
  const cli = {
    process: child,
    stdin: child.stdin,
    output: '',
    errors: '',
    exitPromise,
  };

  child.stdout.on('data', (data) => {
    cli.output += data.toString();
  });
  child.stderr.on('data', (data) => {
    cli.errors += data.toString();
  });
  child.on('close', (code) => {
    exitResolve(code);
  });
  child.on('error', (err) => {
    cli.errors += err.message;
    exitResolve(1);
  });

  // Hard kill if the child outlives the allowed window.
  const timeoutId = setTimeout(() => {
    child.kill('SIGTERM');
    cli.errors += `\nTimeout after ${timeout}ms`;
  }, timeout);
  child.on('close', () => clearTimeout(timeoutId));

  // Wait for startup: a recognizable banner/prompt, enough output, or 2 s —
  // whichever comes first. Both timers are cleared so nothing lingers.
  await new Promise((resolve) => {
    const finish = () => {
      clearInterval(poll);
      clearTimeout(fallback);
      resolve();
    };
    const poll = setInterval(() => {
      if (cli.output.includes('erosolar') || cli.output.includes('>') || cli.output.length > 100) {
        finish();
      }
    }, 100);
    const fallback = setTimeout(finish, 2000);
  });

  return cli;
}

/**
 * Sends a command to the spawned CLI and waits for the response to settle.
 *
 * "Settled" means: some output arrived after the write, and no further output
 * appeared during one 200 ms poll interval. Gives up after `waitMs`.
 *
 * @param {ReturnType<typeof spawnIsolatedCLI> extends Promise<infer T> ? T : never} cli
 *   Session from spawnIsolatedCLI.
 * @param {string} command - Line to write to the CLI's stdin.
 * @param {number} [waitMs=5000] - Maximum time to wait for output.
 * @returns {Promise<string>} Output produced since the command was sent.
 */
async function sendCommand(cli, command, waitMs = 5000) {
  const outputBefore = cli.output.length;
  cli.stdin.write(`${command}\n`);
  await new Promise((resolve) => {
    let lastLength = cli.output.length;
    const checkInterval = setInterval(() => {
      if (cli.output.length > lastLength) {
        // Still streaming; remember the new high-water mark and keep waiting.
        lastLength = cli.output.length;
      } else if (cli.output.length > outputBefore) {
        // Output arrived and has stopped growing — consider it settled.
        clearInterval(checkInterval);
        resolve();
      }
    }, 200);
    setTimeout(() => {
      clearInterval(checkInterval);
      resolve();
    }, waitMs);
  });
  return cli.output.slice(outputBefore);
}

/**
 * Run a shell command for verification (file checks, etc.).
 *
 * Destructive-looking commands are refused up front; this is a best-effort
 * denylist, not a sandbox.
 *
 * @param {string} cmd - Shell command to execute.
 * @param {string} cwd - Working directory for the command.
 * @returns {Promise<{ok: boolean, out: string}>} Combined stdout+stderr on
 *   success, or the block/failure reason.
 */
async function runShellVerification(cmd, cwd) {
  // Safety check - block dangerous commands
  const dangerous = [
    /\brm\s/i,
    /rmdir/i,
    /sudo/i,
    /chmod\s*7/i,
    /eval\s*\(/i,
    /DROP\s+TABLE/i,
    /DELETE\s+FROM/i,
  ];
  for (const p of dangerous) {
    if (p.test(cmd)) {
      return { ok: false, out: `Blocked dangerous command: ${p.source}` };
    }
  }
  try {
    const { stdout, stderr } = await execAsync(cmd, { cwd, timeout: 30000 });
    return { ok: true, out: stdout + stderr };
  } catch (e) {
    return { ok: false, out: e instanceof Error ? e.message : 'Command failed' };
  }
}

/**
 * Runs an isolated runtime test.
 *
 * Optionally rebuilds the project, runs read-only shell checks, drives a fresh
 * CLI instance, matches expected output patterns, and (if a verifier is given)
 * asks an LLM to judge the observed behavior.
 *
 * @param {object} test - Generated test: { id, description, shellCommands?,
 *   commands?, expectedOutputs?, expectedBehavior?, requiresBuild?, timeout? }.
 * @param {string} cwd - Project working directory.
 * @param {(prompt: string) => Promise<string>} [llmVerifier] - Optional LLM callback.
 * @returns {Promise<object>} Result with success flag, captured output/errors,
 *   exit code, duration, and matched/unmatched pattern lists.
 */
export async function runIsolatedTest(test, cwd, llmVerifier) {
  const startTime = Date.now();
  const result = {
    test,
    success: false,
    output: '',
    errors: '',
    exitCode: null,
    duration: 0,
    matchedPatterns: [],
    unmatchedPatterns: [],
  };

  try {
    // Rebuild if required
    if (test.requiresBuild) {
      try {
        await execAsync('npm run build', { cwd, timeout: 120000 });
      } catch (buildErr) {
        result.errors = `Build failed: ${buildErr instanceof Error ? buildErr.message : 'unknown'}`;
        result.duration = Date.now() - startTime;
        return result;
      }
    }

    // Run shell commands first if any (file checks, etc.)
    if (test.shellCommands && test.shellCommands.length > 0) {
      for (const cmd of test.shellCommands) {
        const shellResult = await runShellVerification(cmd, cwd);
        result.output += `$ ${cmd}\n${shellResult.out}\n`;
        if (!shellResult.ok) {
          result.errors += `${shellResult.out}\n`;
        }
      }
    }

    // Run CLI commands if any
    if (test.commands && test.commands.length > 0) {
      const cli = await spawnIsolatedCLI(cwd, test.timeout || 60000);
      for (const cmd of test.commands) {
        const cmdOutput = await sendCommand(cli, cmd);
        result.output += `> ${cmd}\n${cmdOutput}\n`;
      }
      cli.stdin.write('/quit\n');
      await new Promise((resolve) => setTimeout(resolve, 500));
      cli.process.kill('SIGTERM');
      result.exitCode = await cli.exitPromise;
      result.errors += cli.errors;
    }

    // Check expected output patterns (literal match OR case-insensitive regex).
    if (test.expectedOutputs) {
      for (const pattern of test.expectedOutputs) {
        let regexMatched = false;
        try {
          regexMatched = new RegExp(pattern, 'i').test(result.output);
        } catch {
          // FIX: LLM-supplied patterns may not be valid regexes; previously
          // `new RegExp(pattern)` threw and aborted the whole test. Fall back
          // to the literal includes() check only.
        }
        if (result.output.includes(pattern) || regexMatched) {
          result.matchedPatterns.push(pattern);
        } else {
          result.unmatchedPatterns.push(pattern);
        }
      }
    }

    // LLM assessment of behavior
    if (test.expectedBehavior && llmVerifier) {
      const assessPrompt = `Assess if this output demonstrates the expected behavior.

EXPECTED: ${test.expectedBehavior}

OUTPUT:
---
${result.output.slice(0, 4000)}
---

Return JSON: {"matches": true/false, "confidence": 0-100, "reasoning": "explanation"}`;
      try {
        const assessment = await llmVerifier(assessPrompt);
        const match = assessment.match(/\{[\s\S]*\}/);
        if (match) {
          const parsed = JSON.parse(match[0]);
          result.llmAssessment = `${parsed.matches ? '✅' : '❌'} [${parsed.confidence}%] ${parsed.reasoning}`;
          // Require both a positive verdict and >= 70% confidence to count.
          if (!parsed.matches || parsed.confidence < 70) {
            result.unmatchedPatterns.push(`behavior: ${test.expectedBehavior}`);
          } else {
            result.matchedPatterns.push(`behavior: ${test.expectedBehavior}`);
          }
        }
      } catch {
        result.llmAssessment = 'LLM assessment failed';
      }
    }

    // Determine success: nothing unmatched, and either something matched or
    // there was nothing to match in the first place.
    result.success =
      result.unmatchedPatterns.length === 0 &&
      (result.matchedPatterns.length > 0 ||
        (!test.expectedOutputs?.length && !test.expectedBehavior));
  } catch (err) {
    result.errors = err instanceof Error ? err.message : 'Unknown error';
  }

  result.duration = Date.now() - startTime;
  return result;
}

// ============================================================================
// CLAIM EXTRACTION - LLM extracts claims from responses
// ============================================================================

const EXTRACT_CLAIMS_PROMPT = `Extract ALL verifiable claims from this AI assistant response.

RESPONSE:
---
{RESPONSE}
---

CONTEXT: {CONTEXT}
WORKING_DIR: {WORKING_DIR}

For each claim, determine:
1. What specific assertion is being made
2. Category: file_op (created/modified/deleted files), code (compiles/tests pass), command (executed successfully), state (something changed), behavior (feature works), fact (verifiable truth)
3. How it can be verified (shell command, file check, CLI test, etc.)
4. Priority: critical (must verify), high (should verify), medium (nice to verify), low (optional)

Return JSON array:
[{
  "id": "c1",
  "statement": "the specific claim",
  "category": "file_op|code|command|state|behavior|fact",
  "verifiable": true,
  "priority": "critical|high|medium|low",
  "context": {"path": "/path/if/relevant", "command": "if relevant"}
}]

Output ONLY valid JSON array.`;

/**
 * Extract claims from assistant response using LLM.
 *
 * @param {string} response - Assistant response text (truncated to 8000 chars).
 * @param {object} ctx - Verification context: { llmVerifier?, conversationHistory?, workingDirectory }.
 * @returns {Promise<object[]>} Parsed claim objects, or [] if no verifier /
 *   extraction failed.
 */
async function extractClaims(response, ctx) {
  if (!ctx.llmVerifier) return [];
  try {
    const prompt = EXTRACT_CLAIMS_PROMPT
      .replace('{RESPONSE}', response.slice(0, 8000))
      .replace('{CONTEXT}', ctx.conversationHistory?.slice(-3).join('\n') || '')
      .replace('{WORKING_DIR}', ctx.workingDirectory);
    const result = await ctx.llmVerifier(prompt);
    const match = result.match(/\[[\s\S]*\]/);
    if (match) {
      return JSON.parse(match[0]);
    }
  } catch {
    // Fall through
  }
  return [];
}

// ============================================================================
// TEST GENERATION - LLM generates isolated tests for claims
// ============================================================================

const GENERATE_TESTS_PROMPT = `Generate isolated runtime tests for these claims.

CLAIMS: {CLAIMS}
WORKING_DIR: {WORKING_DIR}
PLATFORM: {PLATFORM}

For each claim, generate a test that verifies it using:
- Shell commands (for file checks, git status, etc.)
- CLI commands (for testing CLI behavior in fresh instance)
- Expected output patterns

Return JSON array:
[{
  "id": "test-1",
  "description": "what we're testing",
  "shellCommands": ["ls -la path", "cat file"],
  "commands": ["/help", "some input"],
  "expectedOutputs": ["pattern1", "pattern2"],
  "expectedBehavior": "description for LLM assessment",
  "requiresBuild": false,
  "timeout": 30000
}]

Use READ-ONLY commands only. No destructive operations.

Output ONLY valid JSON array.`;

/**
 * Generate isolated tests for claims.
 *
 * Prefers LLM-generated tests; falls back to simple category-based checks for
 * critical/high-priority verifiable claims when the LLM is unavailable or its
 * output cannot be parsed.
 *
 * @param {object[]} claims - Claims from extractClaims (at most 10 are sent to the LLM).
 * @param {object} ctx - Verification context: { llmVerifier?, workingDirectory }.
 * @returns {Promise<object[]>} Test definitions consumable by runIsolatedTest.
 */
async function generateTests(claims, ctx) {
  if (!ctx.llmVerifier || claims.length === 0) return [];
  try {
    const prompt = GENERATE_TESTS_PROMPT
      .replace('{CLAIMS}', JSON.stringify(claims.slice(0, 10)))
      .replace('{WORKING_DIR}', ctx.workingDirectory)
      .replace('{PLATFORM}', process.platform);
    const result = await ctx.llmVerifier(prompt);
    const match = result.match(/\[[\s\S]*\]/);
    if (match) {
      return JSON.parse(match[0]);
    }
  } catch {
    // Fall through to basic tests
  }

  // Fallback: generate basic tests
  return claims
    .filter((c) => c.verifiable && (c.priority === 'critical' || c.priority === 'high'))
    .map((c, i) => {
      const test = {
        id: `test-${i}`,
        description: c.statement,
        commands: [],
        shellCommands: [],
        expectedBehavior: c.statement,
        timeout: 30000,
      };
      // Add basic verification based on category.
      // FIX: `c.context` may be absent in LLM output; use optional chaining
      // instead of `c.context['path']`, which threw on undefined.
      if (c.category === 'file_op' && c.context?.path) {
        test.shellCommands = [`test -f "${c.context.path}" && echo "EXISTS" || echo "NOT_FOUND"`];
        test.expectedOutputs = ['EXISTS'];
      } else if (c.category === 'code') {
        test.shellCommands = ['npm run build 2>&1 | tail -5'];
      } else if (c.category === 'behavior') {
        test.commands = ['/help'];
      }
      return test;
    });
}

// ============================================================================
// MAIN VERIFICATION API
// ============================================================================

/**
 * Builds the neutral "could not verify" report used for every fallback path
 * (missing verifier script, timeout, child error, bad JSON).
 *
 * @param {string} responseId - Id to stamp on the report.
 * @param {string} timestamp - ISO timestamp to stamp on the report.
 * @returns {object} Empty report with a neutral 50/100 trust score.
 */
function unverifiedReport(responseId, timestamp) {
  return {
    responseId,
    timestamp,
    claims: [],
    results: [],
    summary: { total: 0, verified: 0, failed: 0, inconclusive: 0 },
    overallVerdict: 'unverified',
    trustScore: 50,
  };
}

/**
 * Verify an assistant response using a completely isolated process.
 *
 * This spawns a separate Node.js process to run all verification:
 * - Separate memory space from main CLI
 * - Separate event loop
 * - Independent error handling
 * - No shared state
 *
 * This ensures verification cannot interfere with the main process and vice versa.
 *
 * @param {string} response - Assistant response to verify.
 * @param {object} ctx - Verification context: { workingDirectory, conversationHistory?, provider?, model? }.
 * @param {string} [responseId] - Optional id; defaults to a timestamp-based id.
 * @returns {Promise<object>} Verification report (never rejects; falls back to
 *   an "unverified" report on any failure).
 */
export async function verifyResponse(response, ctx, responseId) {
  const timestamp = new Date().toISOString();
  const id = responseId || `verify-${Date.now()}`;

  // Find the isolated verifier script
  const verifierPath = path.join(ctx.workingDirectory, 'dist/core/isolatedVerifier.js');
  try {
    await fs.access(verifierPath);
  } catch {
    // Fallback: return unverified if script not found
    return unverifiedReport(id, timestamp);
  }

  // Build request for isolated process
  const request = {
    type: 'verify',
    response,
    workingDirectory: ctx.workingDirectory,
    conversationHistory: ctx.conversationHistory || [],
    provider: ctx.provider,
    model: ctx.model,
  };

  return new Promise((resolve) => {
    // Spawn completely isolated Node.js process
    const child = spawn('node', [verifierPath], {
      cwd: ctx.workingDirectory,
      env: {
        ...process.env,
        EROSOLAR_ISOLATED_VERIFIER: '1',
        NODE_OPTIONS: '--max-old-space-size=512', // Limit memory for safety
      },
      stdio: ['pipe', 'pipe', 'pipe'],
    });

    let stdout = '';
    let stderr = '';
    child.stdout.on('data', (data) => {
      stdout += data.toString();
    });
    child.stderr.on('data', (data) => {
      stderr += data.toString();
    });

    // Set timeout for verification (2 minutes max)
    const timeout = setTimeout(() => {
      child.kill('SIGTERM');
      resolve(unverifiedReport(id, timestamp));
    }, 120000);

    child.on('close', (_code) => {
      clearTimeout(timeout);
      try {
        // Parse result from isolated process
        const result = JSON.parse(stdout);
        if (result.error) {
          // Process returned error
          resolve(unverifiedReport(id, timestamp));
        } else {
          resolve(result);
        }
      } catch {
        // JSON parse failed
        resolve(unverifiedReport(id, timestamp));
      }
    });

    child.on('error', () => {
      clearTimeout(timeout);
      resolve(unverifiedReport(id, timestamp));
    });

    // Send request to isolated process via stdin
    child.stdin.write(JSON.stringify(request));
    child.stdin.end();
  });
}

/**
 * Format verification report for display.
 *
 * @param {object} report - Report from verifyResponse (trustScore 0-100,
 *   overallVerdict, summary, results[]).
 * @returns {string} Multi-line, emoji-decorated summary (first 8 results).
 */
export function formatVerificationReport(report) {
  // FIX: clamp to [0, 10] so an out-of-range trustScore cannot produce a
  // negative repeat() count (which throws RangeError).
  const filled = Math.max(0, Math.min(10, Math.round(report.trustScore / 10)));
  const bar = '█'.repeat(filled) + '░'.repeat(10 - filled);
  const icon = report.trustScore >= 80 ? '✅' : report.trustScore >= 50 ? '⚠️' : '❌';

  let out = `╔════════════════════════════════════════════════════════════╗
║           ISOLATED RUNTIME VERIFICATION REPORT             ║
╚════════════════════════════════════════════════════════════╝
`;
  out += `Trust: ${icon} ${report.trustScore}/100 [${bar}]
Verdict: ${report.overallVerdict.toUpperCase()}
Claims: ${report.summary.total} | ✅ ${report.summary.verified} | ❌ ${report.summary.failed} | ❓ ${report.summary.inconclusive}
`;
  out += `🔬 ISOLATED RUNTIME TESTS:\n`;
  for (const r of report.results.slice(0, 8)) {
    const statusIcon = r.verified ? '✅' : r.confidence === 'high' ? '❌' : '❓';
    out += `  ${statusIcon} [${r.confidence}] ${r.claim.statement.slice(0, 50)}...\n`;
    if (r.reasoning) {
      out += `     └─ ${r.reasoning.slice(0, 60)}\n`;
    }
  }
  if (report.results.length > 8) {
    out += `  ... +${report.results.length - 8} more\n`;
  }
  return out;
}

/**
 * Quick verification - verify only critical/high priority claims.
 *
 * Extracts claims, keeps at most 3 critical/high verifiable ones, runs their
 * generated tests in-process, and scores the pass rate.
 *
 * @param {string} response - Assistant response to verify.
 * @param {object} ctx - Verification context: { llmVerifier?, workingDirectory, conversationHistory? }.
 * @returns {Promise<{trustScore: number, summary: string}>} Pass-rate score,
 *   or a neutral 50 when there is nothing to verify.
 */
export async function quickVerify(response, ctx) {
  const claims = await extractClaims(response, ctx);
  const critical = claims
    .filter((c) => c.verifiable && (c.priority === 'critical' || c.priority === 'high'))
    .slice(0, 3);

  if (critical.length === 0) {
    return { trustScore: 50, summary: 'No critical claims to verify' };
  }

  const tests = await generateTests(critical, ctx);
  let verified = 0;
  for (const test of tests) {
    const result = await runIsolatedTest(test, ctx.workingDirectory, ctx.llmVerifier);
    if (result.success) verified++;
  }

  return {
    trustScore: Math.round((verified / critical.length) * 100),
    summary: `${verified}/${critical.length} critical claims verified`,
  };
}
//# sourceMappingURL=responseVerifier.js.map