// erosolar-cli
// Version: (unspecified — registry page header preserved as a comment)
// Unified AI agent framework for the command line - Multi-provider support
// with schema-driven tools, code intelligence, and transparent reasoning.
// 509 lines (493 loc) • 18.7 kB — JavaScript
/**
* AI Response Verification System - Isolated Runtime Only
*
* Verifies assistant claims by spawning fresh CLI instances and running
* actual runtime tests. All verification happens in isolation.
*
* @license MIT
*/
import { exec, spawn } from 'node:child_process';
import { promisify } from 'node:util';
import * as fs from 'node:fs/promises';
import * as path from 'node:path';
const execAsync = promisify(exec);
// ============================================================================
// ISOLATED RUNTIME - Core Functions
// ============================================================================
/**
 * Spawns a fresh isolated erosolar-cli instance for testing.
 *
 * Returns a handle whose `output` and `errors` properties are live getters:
 * they reflect everything the child has printed so far, including data that
 * arrives after this function returns. (Returning the raw strings — as the
 * original did — handed back immutable snapshots, so `sendCommand`'s polling
 * of `cli.output` never saw new data.)
 *
 * @param {string} cwd - Project root containing dist/bin/erosolar.js.
 * @param {number} [timeout=60000] - Hard kill deadline for the child, in ms.
 * @returns {Promise<{process: import('node:child_process').ChildProcess, stdin: NodeJS.WritableStream, output: string, errors: string, exitPromise: Promise<number|null>}>}
 * @throws {Error} When the built CLI binary does not exist at the expected path.
 */
async function spawnIsolatedCLI(cwd, timeout = 60000) {
    const cliPath = path.join(cwd, 'dist/bin/erosolar.js');
    // Verify the built CLI exists before attempting to spawn it.
    try {
        await fs.access(cliPath);
    }
    catch {
        throw new Error(`CLI not found at ${cliPath}. Run build first.`);
    }
    // Shared mutable buffers; exposed through getters on the returned handle.
    const buf = { output: '', errors: '' };
    let exitResolve;
    const exitPromise = new Promise(resolve => { exitResolve = resolve; });
    const child = spawn('node', [cliPath, '--plain'], {
        cwd,
        env: { ...process.env, EROSOLAR_TEST_MODE: '1', NO_COLOR: '1' },
        stdio: ['pipe', 'pipe', 'pipe']
    });
    child.stdout.on('data', (data) => { buf.output += data.toString(); });
    child.stderr.on('data', (data) => { buf.errors += data.toString(); });
    child.on('close', (code) => { exitResolve(code); });
    child.on('error', (err) => { buf.errors += err.message; exitResolve(1); });
    // Hard deadline: kill the child if it outlives `timeout`.
    const timeoutId = setTimeout(() => {
        child.kill('SIGTERM');
        buf.errors += `\nTimeout after ${timeout}ms`;
    }, timeout);
    child.on('close', () => clearTimeout(timeoutId));
    // Wait for a startup banner/prompt, capped at 2 seconds. Both timers are
    // cleared on whichever path resolves first (the original leaked the cap).
    await new Promise(resolve => {
        const startupCap = setTimeout(() => {
            clearInterval(checkStartup);
            resolve();
        }, 2000);
        const checkStartup = setInterval(() => {
            if (buf.output.includes('erosolar') || buf.output.includes('>') || buf.output.length > 100) {
                clearInterval(checkStartup);
                clearTimeout(startupCap);
                resolve();
            }
        }, 100);
    });
    return {
        process: child,
        stdin: child.stdin,
        // Live views over the buffers — callers poll these for new data.
        get output() { return buf.output; },
        get errors() { return buf.errors; },
        exitPromise
    };
}
/**
 * Sends a command to the spawned CLI and waits for the response.
 *
 * Resolution heuristic: poll every 200ms; once output has grown past the
 * pre-command length AND has stopped growing for one tick, assume the
 * response is complete. `waitMs` is a fallback deadline so a silent CLI
 * cannot hang the caller.
 *
 * @param {{ output: string, stdin: { write(s: string): void } }} cli - Handle from spawnIsolatedCLI.
 * @param {string} command - Line to send (a newline is appended).
 * @param {number} [waitMs=5000] - Maximum time to wait for output to settle.
 * @returns {Promise<string>} Output produced after the command was sent.
 */
async function sendCommand(cli, command, waitMs = 5000) {
    const outputBefore = cli.output.length;
    cli.stdin.write(`${command}\n`);
    await new Promise(resolve => {
        let lastLength = cli.output.length;
        // Fallback deadline in case output never settles (or never arrives).
        const deadline = setTimeout(() => { clearInterval(checkInterval); resolve(); }, waitMs);
        const checkInterval = setInterval(() => {
            if (cli.output.length > lastLength) {
                // Still streaming — record the new high-water mark.
                lastLength = cli.output.length;
            }
            else if (cli.output.length > outputBefore) {
                // Output arrived and went quiet: done. Clear BOTH timers
                // (fix: the original leaked the deadline timer on this path).
                clearInterval(checkInterval);
                clearTimeout(deadline);
                resolve();
            }
        }, 200);
    });
    return cli.output.slice(outputBefore);
}
/**
 * Run a shell command for verification (file checks, etc.).
 *
 * A best-effort denylist blocks obviously destructive commands before
 * execution; verification commands are expected to be read-only.
 *
 * @param {string} cmd - Shell command to execute.
 * @param {string} cwd - Working directory for execution.
 * @returns {Promise<{ok: boolean, out: string}>} ok=false when blocked or the command failed.
 */
async function runShellVerification(cmd, cwd) {
    // Safety check - block dangerous commands before they reach the shell.
    const dangerous = [/\brm\s/i, /rmdir/i, /sudo/i, /chmod\s*7/i, /eval\s*\(/i, /DROP\s+TABLE/i, /DELETE\s+FROM/i];
    for (const p of dangerous) {
        if (p.test(cmd)) {
            return { ok: false, out: `Blocked dangerous command: ${p.source}` };
        }
    }
    try {
        const { stdout, stderr } = await execAsync(cmd, { cwd, timeout: 30000 });
        return { ok: true, out: stdout + stderr };
    }
    catch (e) {
        // exec errors carry the partial stdout/stderr of the failed command;
        // surface that so the caller sees WHY it failed, not just the exit
        // message (fix: the original dropped captured output on failure).
        const captured = `${e?.stdout ?? ''}${e?.stderr ?? ''}`;
        const msg = e instanceof Error ? e.message : 'Command failed';
        return { ok: false, out: captured || msg };
    }
}
/**
 * Runs an isolated runtime test and returns a structured result.
 *
 * Order of operations: optional `npm run build`, then read-only shell
 * commands, then CLI commands against a freshly spawned instance, then
 * expected-output pattern matching, then an optional LLM assessment of
 * the expected behavior.
 *
 * @param {object} test - Spec: { requiresBuild?, shellCommands?, commands?, expectedOutputs?, expectedBehavior?, timeout? }.
 * @param {string} cwd - Project root used for build/shell/CLI execution.
 * @param {(prompt: string) => Promise<string>} [llmVerifier] - Optional LLM used to judge expectedBehavior.
 * @returns {Promise<object>} Result with success flag, captured output/errors, exit code, duration, and matched/unmatched patterns. Never rejects.
 */
export async function runIsolatedTest(test, cwd, llmVerifier) {
    const startTime = Date.now();
    const result = {
        test,
        success: false,
        output: '',
        errors: '',
        exitCode: null,
        duration: 0,
        matchedPatterns: [],
        unmatchedPatterns: []
    };
    try {
        // Rebuild first when the claim depends on fresh compiled output.
        if (test.requiresBuild) {
            try {
                await execAsync('npm run build', { cwd, timeout: 120000 });
            }
            catch (buildErr) {
                result.errors = `Build failed: ${buildErr instanceof Error ? buildErr.message : 'unknown'}`;
                result.duration = Date.now() - startTime;
                return result;
            }
        }
        // Shell-level checks first (file existence, git status, etc.).
        if (test.shellCommands && test.shellCommands.length > 0) {
            for (const cmd of test.shellCommands) {
                const shellResult = await runShellVerification(cmd, cwd);
                result.output += `$ ${cmd}\n${shellResult.out}\n`;
                if (!shellResult.ok) {
                    result.errors += `${shellResult.out}\n`;
                }
            }
        }
        // Interactive checks against a freshly spawned CLI instance.
        if (test.commands && test.commands.length > 0) {
            const cli = await spawnIsolatedCLI(cwd, test.timeout || 60000);
            for (const cmd of test.commands) {
                const cmdOutput = await sendCommand(cli, cmd);
                result.output += `> ${cmd}\n${cmdOutput}\n`;
            }
            cli.stdin.write('/quit\n');
            await new Promise(resolve => setTimeout(resolve, 500));
            cli.process.kill('SIGTERM');
            result.exitCode = await cli.exitPromise;
            result.errors += cli.errors;
        }
        // Pattern matching: literal substring first, then case-insensitive regex.
        if (test.expectedOutputs) {
            for (const pattern of test.expectedOutputs) {
                let matched = result.output.includes(pattern);
                if (!matched) {
                    // Patterns come from LLM output and may not be valid regex
                    // syntax. An invalid pattern must count as unmatched rather
                    // than aborting the whole test (fix: `new RegExp` throws a
                    // SyntaxError which the outer catch previously swallowed,
                    // skipping all remaining patterns).
                    try {
                        matched = new RegExp(pattern, 'i').test(result.output);
                    }
                    catch {
                        matched = false;
                    }
                }
                if (matched) {
                    result.matchedPatterns.push(pattern);
                }
                else {
                    result.unmatchedPatterns.push(pattern);
                }
            }
        }
        // Optional LLM judgment of the expected behavior description.
        if (test.expectedBehavior && llmVerifier) {
            const assessPrompt = `Assess if this output demonstrates the expected behavior.
EXPECTED: ${test.expectedBehavior}
OUTPUT:
---
${result.output.slice(0, 4000)}
---
Return JSON: {"matches": true/false, "confidence": 0-100, "reasoning": "explanation"}`;
            try {
                const assessment = await llmVerifier(assessPrompt);
                const match = assessment.match(/\{[\s\S]*\}/);
                if (match) {
                    const parsed = JSON.parse(match[0]);
                    result.llmAssessment = `${parsed.matches ? '✅' : '❌'} [${parsed.confidence}%] ${parsed.reasoning}`;
                    // Low-confidence matches are treated as failures.
                    if (!parsed.matches || parsed.confidence < 70) {
                        result.unmatchedPatterns.push(`behavior: ${test.expectedBehavior}`);
                    }
                    else {
                        result.matchedPatterns.push(`behavior: ${test.expectedBehavior}`);
                    }
                }
            }
            catch {
                result.llmAssessment = 'LLM assessment failed';
            }
        }
        // Success = nothing unmatched AND either something matched or there
        // was nothing to check in the first place.
        result.success = result.unmatchedPatterns.length === 0 &&
            (result.matchedPatterns.length > 0 || (!test.expectedOutputs?.length && !test.expectedBehavior));
    }
    catch (err) {
        result.errors = err instanceof Error ? err.message : 'Unknown error';
    }
    result.duration = Date.now() - startTime;
    return result;
}
// ============================================================================
// CLAIM EXTRACTION - LLM extracts claims from responses
// ============================================================================
// Prompt template consumed by extractClaims(). Placeholders {RESPONSE},
// {CONTEXT} and {WORKING_DIR} are each substituted once via String.replace.
// NOTE: this template is runtime behavior (sent verbatim to the LLM) —
// do not reflow or reformat its contents.
const EXTRACT_CLAIMS_PROMPT = `Extract ALL verifiable claims from this AI assistant response.
RESPONSE:
---
{RESPONSE}
---
CONTEXT: {CONTEXT}
WORKING_DIR: {WORKING_DIR}
For each claim, determine:
1. What specific assertion is being made
2. Category: file_op (created/modified/deleted files), code (compiles/tests pass), command (executed successfully), state (something changed), behavior (feature works), fact (verifiable truth)
3. How it can be verified (shell command, file check, CLI test, etc.)
4. Priority: critical (must verify), high (should verify), medium (nice to verify), low (optional)
Return JSON array:
[{
"id": "c1",
"statement": "the specific claim",
"category": "file_op|code|command|state|behavior|fact",
"verifiable": true,
"priority": "critical|high|medium|low",
"context": {"path": "/path/if/relevant", "command": "if relevant"}
}]
Output ONLY valid JSON array.`;
/**
 * Extract claims from an assistant response using the configured LLM.
 *
 * @param {string} response - Assistant response text (truncated to 8000 chars).
 * @param {object} ctx - { llmVerifier?, conversationHistory?, workingDirectory }.
 * @returns {Promise<object[]>} Parsed claim objects; [] when no verifier is
 *   configured, the LLM call fails, or the reply contains no JSON array.
 */
async function extractClaims(response, ctx) {
    if (!ctx.llmVerifier)
        return [];
    try {
        // Use function replacements so '$' sequences in the interpolated text
        // (e.g. "$&" or "$'" inside the response) are NOT interpreted as
        // String.replace replacement patterns, which would corrupt the prompt.
        const prompt = EXTRACT_CLAIMS_PROMPT
            .replace('{RESPONSE}', () => response.slice(0, 8000))
            .replace('{CONTEXT}', () => ctx.conversationHistory?.slice(-3).join('\n') || '')
            .replace('{WORKING_DIR}', () => ctx.workingDirectory);
        const result = await ctx.llmVerifier(prompt);
        // Grab the outermost JSON array from the (possibly chatty) LLM reply.
        const match = result.match(/\[[\s\S]*\]/);
        if (match) {
            return JSON.parse(match[0]);
        }
    }
    catch {
        // LLM/parse failure → treat as "no claims" rather than crashing the CLI.
    }
    return [];
}
// ============================================================================
// TEST GENERATION - LLM generates isolated tests for claims
// ============================================================================
// Prompt template consumed by generateTests(). Placeholders {CLAIMS},
// {WORKING_DIR} and {PLATFORM} are each substituted once via String.replace.
// NOTE: this template is runtime behavior (sent verbatim to the LLM) —
// do not reflow or reformat its contents.
const GENERATE_TESTS_PROMPT = `Generate isolated runtime tests for these claims.
CLAIMS:
{CLAIMS}
WORKING_DIR: {WORKING_DIR}
PLATFORM: {PLATFORM}
For each claim, generate a test that verifies it using:
- Shell commands (for file checks, git status, etc.)
- CLI commands (for testing CLI behavior in fresh instance)
- Expected output patterns
Return JSON array:
[{
"id": "test-1",
"description": "what we're testing",
"shellCommands": ["ls -la path", "cat file"],
"commands": ["/help", "some input"],
"expectedOutputs": ["pattern1", "pattern2"],
"expectedBehavior": "description for LLM assessment",
"requiresBuild": false,
"timeout": 30000
}]
Use READ-ONLY commands only. No destructive operations.
Output ONLY valid JSON array.`;
/**
 * Generate isolated tests for claims.
 *
 * Asks the LLM for test specs first; when the LLM is unavailable or its reply
 * is unusable, falls back to deterministic basic tests for critical/high
 * priority verifiable claims.
 *
 * @param {object[]} claims - Claim objects from extractClaims() (LLM-produced; fields may be missing).
 * @param {object} ctx - { llmVerifier?, workingDirectory }.
 * @returns {Promise<object[]>} Test specs; [] when there is no verifier or no claims.
 */
async function generateTests(claims, ctx) {
    if (!ctx.llmVerifier || claims.length === 0)
        return [];
    try {
        // Function replacements prevent '$' sequences inside the serialized
        // claims from being interpreted as String.replace patterns.
        const prompt = GENERATE_TESTS_PROMPT
            .replace('{CLAIMS}', () => JSON.stringify(claims.slice(0, 10)))
            .replace('{WORKING_DIR}', () => ctx.workingDirectory)
            .replace('{PLATFORM}', process.platform);
        const result = await ctx.llmVerifier(prompt);
        const match = result.match(/\[[\s\S]*\]/);
        if (match) {
            return JSON.parse(match[0]);
        }
    }
    catch {
        // Fall through to basic tests below.
    }
    // Fallback: deterministic basic tests for high-priority verifiable claims.
    return claims
        .filter(c => c.verifiable && (c.priority === 'critical' || c.priority === 'high'))
        .map((c, i) => {
            const test = {
                id: `test-${i}`,
                description: c.statement,
                commands: [],
                shellCommands: [],
                expectedBehavior: c.statement,
                timeout: 30000
            };
            // LLM-produced claims may omit `context` entirely — optional
            // chaining avoids the TypeError the original threw here.
            const claimPath = c.context?.path;
            if (c.category === 'file_op' && claimPath) {
                test.shellCommands = [`test -f "${claimPath}" && echo "EXISTS" || echo "NOT_FOUND"`];
                test.expectedOutputs = ['EXISTS'];
            }
            else if (c.category === 'code') {
                test.shellCommands = ['npm run build 2>&1 | tail -5'];
            }
            else if (c.category === 'behavior') {
                test.commands = ['/help'];
            }
            return test;
        });
}
// ============================================================================
// MAIN VERIFICATION API
// ============================================================================
/**
 * Verify an assistant response using a completely isolated process.
 *
 * This spawns a separate Node.js process (dist/core/isolatedVerifier.js) so
 * that verification has its own memory space, event loop, and error handling:
 * it cannot interfere with the main CLI process and vice versa.
 *
 * Every failure path (missing verifier script, timeout, spawn error,
 * unparseable output, reported error) resolves to a neutral "unverified"
 * report with a 50/100 trust score — this function never rejects.
 *
 * @param {string} response - Assistant response text to verify.
 * @param {object} ctx - { workingDirectory, conversationHistory?, provider?, model? }.
 * @param {string} [responseId] - Stable id for the report; generated when omitted.
 * @returns {Promise<object>} Verification report.
 */
export async function verifyResponse(response, ctx, responseId) {
    const timestamp = new Date().toISOString();
    const id = responseId || `verify-${Date.now()}`;
    // Single factory for the neutral fallback report (the original duplicated
    // this object literal on five separate failure paths).
    const unverifiedReport = () => ({
        responseId: id,
        timestamp,
        claims: [],
        results: [],
        summary: { total: 0, verified: 0, failed: 0, inconclusive: 0 },
        overallVerdict: 'unverified',
        trustScore: 50
    });
    // Locate the isolated verifier script; without it we cannot verify.
    const verifierPath = path.join(ctx.workingDirectory, 'dist/core/isolatedVerifier.js');
    try {
        await fs.access(verifierPath);
    }
    catch {
        return unverifiedReport();
    }
    // Request payload handed to the isolated process over stdin.
    const request = {
        type: 'verify',
        response,
        workingDirectory: ctx.workingDirectory,
        conversationHistory: ctx.conversationHistory || [],
        provider: ctx.provider,
        model: ctx.model,
    };
    return new Promise((resolve) => {
        // Spawn completely isolated Node.js process.
        const child = spawn('node', [verifierPath], {
            cwd: ctx.workingDirectory,
            env: {
                ...process.env,
                EROSOLAR_ISOLATED_VERIFIER: '1',
                NODE_OPTIONS: '--max-old-space-size=512', // Limit memory for safety
            },
            stdio: ['pipe', 'pipe', 'pipe'],
        });
        let stdout = '';
        let stderr = ''; // collected for parity/debugging; the result travels on stdout
        child.stdout.on('data', (data) => { stdout += data.toString(); });
        child.stderr.on('data', (data) => { stderr += data.toString(); });
        // Hard cap: verification may take at most 2 minutes.
        const timeout = setTimeout(() => {
            child.kill('SIGTERM');
            resolve(unverifiedReport());
        }, 120000);
        child.on('close', (_code) => {
            clearTimeout(timeout);
            try {
                const result = JSON.parse(stdout);
                // A reported error downgrades to the neutral report.
                resolve(result.error ? unverifiedReport() : result);
            }
            catch {
                // stdout was not valid JSON.
                resolve(unverifiedReport());
            }
        });
        child.on('error', () => {
            clearTimeout(timeout);
            resolve(unverifiedReport());
        });
        // Send the request and close stdin so the child can start working.
        child.stdin.write(JSON.stringify(request));
        child.stdin.end();
    });
}
/**
 * Format a verification report for terminal display.
 *
 * @param {object} report - { trustScore, overallVerdict, summary, results }.
 * @returns {string} Multi-line, box-framed report text.
 */
export function formatVerificationReport(report) {
    // Clamp before drawing the bar: a score outside 0..100 would hand
    // '░'.repeat() a negative count, which throws a RangeError. The raw
    // score is still shown in the text so out-of-range values are visible.
    const clamped = Math.max(0, Math.min(100, report.trustScore));
    const filled = Math.round(clamped / 10);
    const bar = '█'.repeat(filled) + '░'.repeat(10 - filled);
    const icon = report.trustScore >= 80 ? '✅' : report.trustScore >= 50 ? '⚠️' : '❌';
    let out = `╔════════════════════════════════════════════════════════════╗
║ ISOLATED RUNTIME VERIFICATION REPORT ║
╚════════════════════════════════════════════════════════════╝
`;
    out += `Trust: ${icon} ${report.trustScore}/100 [${bar}]
Verdict: ${report.overallVerdict.toUpperCase()}
Claims: ${report.summary.total} | ✅ ${report.summary.verified} | ❌ ${report.summary.failed} | ❓ ${report.summary.inconclusive}
`;
    out += `🔬 ISOLATED RUNTIME TESTS:\n`;
    // Show at most 8 result rows to keep the report compact.
    for (const r of report.results.slice(0, 8)) {
        const statusIcon = r.verified ? '✅' : r.confidence === 'high' ? '❌' : '❓';
        out += ` ${statusIcon} [${r.confidence}] ${r.claim.statement.slice(0, 50)}...\n`;
        if (r.reasoning) {
            out += ` └─ ${r.reasoning.slice(0, 60)}\n`;
        }
    }
    if (report.results.length > 8) {
        out += ` ... +${report.results.length - 8} more\n`;
    }
    return out;
}
/**
 * Quick verification - verify only critical/high priority claims.
 *
 * Extracts claims from the response, keeps up to three verifiable
 * critical/high-priority ones, runs an isolated test per claim, and reports
 * the verified ratio as a 0-100 trust score.
 *
 * @param {string} response - Assistant response text.
 * @param {object} ctx - { llmVerifier?, workingDirectory, conversationHistory? }.
 * @returns {Promise<{trustScore: number, summary: string}>}
 */
export async function quickVerify(response, ctx) {
    const allClaims = await extractClaims(response, ctx);
    const important = allClaims
        .filter(c => c.verifiable && (c.priority === 'critical' || c.priority === 'high'))
        .slice(0, 3);
    if (important.length === 0) {
        return { trustScore: 50, summary: 'No critical claims to verify' };
    }
    const tests = await generateTests(important, ctx);
    let passCount = 0;
    // Run tests sequentially: each spawns processes, so no parallelism here.
    for (const spec of tests) {
        const outcome = await runIsolatedTest(spec, ctx.workingDirectory, ctx.llmVerifier);
        if (outcome.success) {
            passCount += 1;
        }
    }
    return {
        trustScore: Math.round((passCount / important.length) * 100),
        summary: `${passCount}/${important.length} critical claims verified`
    };
}
//# sourceMappingURL=responseVerifier.js.map