gepa-ts
Version:
TypeScript implementation of GEPA (Gradient-free Evolution of Prompts and Agents) - Complete port with 100% feature parity
364 lines (318 loc) • 9.58 kB
text/typescript
/**
* Terminal-bench training example - Terminal agent optimization
* Demonstrates optimizing instructions for terminal-using agents
*/
import { GEPAOptimizer } from '../src/core/optimizer.js';
import { TerminalBenchAdapter, TerminalBenchExample, TerminalTrace } from '../src/adapters/terminal-bench-adapter.js';
import { OpenAILanguageModel } from '../src/models/openai.js';
import { WandBIntegration } from '../src/integrations/wandb.js';
import { execSync, spawn } from 'child_process';
import { writeFileSync, mkdirSync, mkdtempSync, readdirSync, readFileSync, rmSync } from 'fs';
import { join, dirname } from 'path';
import { tmpdir } from 'os';
/**
 * Mock terminal executor for testing.
 * In production, this would integrate with actual Terminus agent.
 *
 * For each task it seeds a private scratch directory, asks the language model
 * for shell commands, runs them one by one, and reports what happened together
 * with the files left behind.
 *
 * NOTE(review): the generated commands run through the host shell with the
 * privileges of this process; only `cwd` is confined to the scratch directory.
 * Use a sandbox/container before pointing this at an untrusted model or task.
 */
class MockTerminalExecutor {
  /** Maximum number of generated commands executed for a single task. */
  private static readonly MAX_COMMANDS = 10;

  /**
   * Per-command wall-clock limit in milliseconds. `execSync` blocks the event
   * loop, so without a limit one interactive command would hang the whole run.
   */
  private static readonly COMMAND_TIMEOUT_MS = 30000;

  private readonly lm: OpenAILanguageModel;

  constructor(lm: OpenAILanguageModel) {
    this.lm = lm;
  }

  /**
   * Runs one task in a fresh scratch directory.
   *
   * @param instruction - Prompt text placed at the top of the request sent to the model.
   * @param task - Task description appended to the prompt.
   * @param initialState - Object whose optional `files` map (relative path -> content)
   *   is written into the scratch directory before any command runs.
   * @returns The executed commands with their output and exit codes, plus the files
   *   present afterwards keyed as `./relative/path`. `success` is false as soon as one
   *   command fails; later commands are not run.
   */
  async execute(
    instruction: string,
    task: string,
    initialState: any
  ): Promise<TerminalTrace> {
    // mkdtempSync appends a random suffix, so executions that start within the
    // same millisecond never share (or delete) each other's directory.
    const tmpDir = mkdtempSync(join(tmpdir(), 'terminal-bench-'));

    try {
      // Create initial files
      const seedFiles: Record<string, unknown> = initialState?.files ?? {};
      for (const [relPath, content] of Object.entries(seedFiles)) {
        const fullPath = join(tmpDir, relPath);
        mkdirSync(dirname(fullPath), { recursive: true });
        writeFileSync(fullPath, content as string);
      }

      // Generate commands using LLM
      const prompt = [
        instruction,
        `Task: ${task}`,
        `Current directory: ${tmpDir}`,
        `Available files: ${Object.keys(seedFiles).join(', ')}`,
        'Generate a sequence of bash commands to complete this task.',
        'Output only the commands, one per line, no explanations.'
      ].join('\n');
      const response = await this.lm.generate(prompt);
      const commands = this.parseCommands(response);

      // Execute commands and capture trace
      const trace: TerminalTrace = {
        commands: [],
        finalState: {
          cwd: tmpDir,
          files: {}
        },
        success: true
      };

      for (const cmd of commands) {
        try {
          const stdout = execSync(cmd, {
            cwd: tmpDir,
            encoding: 'utf-8',
            stdio: 'pipe',
            timeout: MockTerminalExecutor.COMMAND_TIMEOUT_MS
          });
          trace.commands.push({
            command: cmd,
            stdout,
            stderr: '',
            exitCode: 0,
            timestamp: Date.now()
          });
        } catch (error: unknown) {
          const failure = this.describeFailure(error);
          trace.commands.push({
            command: cmd,
            stdout: '',
            stderr: failure.stderr,
            exitCode: failure.exitCode,
            timestamp: Date.now()
          });
          trace.success = false;
          break; // Stop at the first failing command
        }
      }

      // Capture final state
      trace.finalState.files = this.snapshotFiles(tmpDir);
      return trace;
    } finally {
      // Cleanup runs on every path, including when the model call or seeding throws.
      try {
        rmSync(tmpDir, { recursive: true, force: true });
      } catch {
        // Best effort: a leftover scratch directory must not mask the real outcome.
      }
    }
  }

  /**
   * Turns the raw model reply into the list of command lines to run: blank lines,
   * `#` comments and markdown code-fence lines are dropped, surrounding whitespace
   * (including a trailing `\r`) is removed, and the list is capped at MAX_COMMANDS.
   */
  private parseCommands(response: string): string[] {
    return response
      .split(/\r?\n/)
      .map(line => line.trim())
      .filter(line => line !== '' && !line.startsWith('#') && !line.startsWith('```'))
      .slice(0, MockTerminalExecutor.MAX_COMMANDS);
  }

  /**
   * Extracts stderr text and exit code from whatever `execSync` threw.
   * Falls back to the error message and to exit code 1 (e.g. when the command
   * was killed by the timeout and has no numeric status).
   */
  private describeFailure(error: unknown): { stderr: string; exitCode: number } {
    const info = error as { stderr?: unknown; message?: unknown; status?: unknown } | null | undefined;
    let stderr: string;
    if (typeof info?.stderr === 'string' && info.stderr !== '') {
      stderr = info.stderr;
    } else if (typeof info?.message === 'string') {
      stderr = info.message;
    } else {
      stderr = String(error);
    }
    const exitCode = typeof info?.status === 'number' && info.status !== 0 ? info.status : 1;
    return { stderr, exitCode };
  }

  /**
   * Reads every regular file below `rootDir` without going through a shell, so
   * names containing spaces or shell metacharacters are handled. Keys keep the
   * `./relative/path` form that `find . -type f` produced. Unreadable entries
   * are skipped individually instead of discarding the rest of the snapshot.
   */
  private snapshotFiles(rootDir: string): Record<string, string> {
    const snapshot: Record<string, string> = {};
    const pending: string[] = ['.'];
    for (let relDir = pending.pop(); relDir !== undefined; relDir = pending.pop()) {
      for (const entry of this.listEntries(join(rootDir, relDir))) {
        const relPath = `${relDir}/${entry.name}`;
        if (entry.isDirectory()) {
          pending.push(relPath);
        } else if (entry.isFile()) {
          try {
            snapshot[relPath] = readFileSync(join(rootDir, relPath), 'utf-8');
          } catch {
            // Skip files that cannot be read
          }
        }
      }
    }
    return snapshot;
  }

  /** Lists a directory's entries, or nothing when the directory cannot be read. */
  private listEntries(dir: string) {
    try {
      return readdirSync(dir, { withFileTypes: true });
    } catch {
      return [];
    }
  }
}
// Sample terminal-bench tasks used by this example.
// Every task starts in cwd '.' and names one output file together with a text
// fragment for it (presumably what the adapter looks for in the file - confirm
// against TerminalBenchAdapter).
const TERMINAL_BENCH_TASKS: TerminalBenchExample[] = (() => {
  type FileMap = Record<string, string>;

  // [task description, files present before the run, expected files afterwards]
  const specs: Array<[string, FileMap, FileMap]> = [
    [
      'Create a Python script that calculates fibonacci numbers',
      {},
      { './fibonacci.py': 'def fibonacci' }
    ],
    [
      'Set up a basic Node.js project with package.json',
      {},
      { './package.json': '"name":' }
    ],
    [
      'Create a README.md with project title and description',
      { 'main.py': 'print("Hello World")' },
      { './README.md': '# ' }
    ],
    [
      'Write a bash script that counts files in a directory',
      {},
      { './count_files.sh': '#!/bin/bash' }
    ],
    [
      'Create a simple HTML page with a title and heading',
      {},
      { './index.html': '<html>' }
    ]
  ];

  return specs.map(([task, seedFiles, expectedFiles]) => ({
    task,
    initialState: { cwd: '.', files: seedFiles },
    expectedOutcome: { files: expectedFiles }
  }));
})();
/**
 * Runs the Terminal-bench example end to end:
 * builds the language model, mock executor and adapter, optionally starts a
 * WandB run (when WANDB_API_KEY is set), optimizes the `terminal_instruction`
 * component with GEPA on the first three tasks, writes the result to
 * `results/terminal-bench/optimized_instructions.json`, and finally evaluates
 * the best candidate on the first validation task.
 *
 * Exits the process with code 1 when OPENAI_API_KEY is not set.
 */
async function main(): Promise<void> {
  console.log('🖥️ Starting Terminal-bench Agent Optimization');

  // Initialize language model
  const apiKey = process.env.OPENAI_API_KEY;
  if (!apiKey) {
    console.error('❌ OPENAI_API_KEY environment variable not set');
    process.exit(1);
  }

  // Single source for values that are both used and logged, so the WandB run
  // config cannot drift from what the optimizer actually runs with.
  const modelName = 'gpt-4-turbo-preview';
  const populationSize = 6;
  const generations = 8;

  const lm = new OpenAILanguageModel({
    apiKey,
    model: modelName,
    temperature: 0.5,
    maxTokens: 500
  });

  // Create mock executor
  const executor = new MockTerminalExecutor(lm);

  // Create adapter
  const adapter = new TerminalBenchAdapter({
    agentExecutor: (instruction, task, state) =>
      executor.execute(instruction, task, state),
    maxCommands: 15,
    timeout: 30000,
    verbose: true
  });

  // Initialize WandB (optional)
  let wandb: WandBIntegration | undefined;
  if (process.env.WANDB_API_KEY) {
    wandb = new WandBIntegration({
      project: 'gepa-terminal-bench',
      name: `terminal-optimization-${Date.now()}`,
      config: {
        dataset: 'Terminal-bench',
        model: modelName,
        populationSize,
        generations
      }
    });
    await wandb.init();
  }

  try {
    // Configure GEPA optimizer
    const optimizer = new GEPAOptimizer({
      adapter,
      config: {
        populationSize,
        generations,
        mutationRate: 0.4,
        mergeRate: 0.2,
        tournamentSize: 2,
        eliteRatio: 0.25,
        verbose: true,
        runDir: join(process.cwd(), 'runs', `terminal-${Date.now()}`)
      },
      wandbIntegration: wandb
    });

    // Define initial instructions to optimize
    const initialComponents = {
      terminal_instruction: `You are a terminal agent that executes bash commands to complete tasks.
Guidelines:
- Read the task carefully
- Plan your approach before executing
- Use standard Unix commands
- Create files and directories as needed
- Verify your work with ls, cat, etc.
Task: {task}
Execute commands one at a time. Be efficient and precise.`,
      error_handling: `If a command fails:
- Check the error message
- Try an alternative approach
- Create parent directories if needed
- Use appropriate permissions`,
      verification: `After completing the task:
- List created files with ls
- Show file contents with cat
- Confirm task requirements are met`
    };

    // Split data: first three tasks for training, the remainder for validation
    const trainTasks = TERMINAL_BENCH_TASKS.slice(0, 3);
    const valTasks = TERMINAL_BENCH_TASKS.slice(3);

    console.log('\n🚀 Starting GEPA optimization...');
    console.log(`Training tasks: ${trainTasks.length}`);
    console.log(`Validation tasks: ${valTasks.length}`);
    console.log('Components to optimize:', Object.keys(initialComponents));

    // Only terminal_instruction is listed for updating; the other two
    // components are passed along as part of the initial candidate.
    const result = await optimizer.optimize({
      initialCandidate: initialComponents,
      trainData: trainTasks,
      valData: valTasks,
      componentsToUpdate: ['terminal_instruction']
    });

    // Save results
    const outputDir = join(process.cwd(), 'results', 'terminal-bench');
    mkdirSync(outputDir, { recursive: true });

    console.log('\n📊 Optimization Results:');
    console.log('Best validation score:', result.bestScore.toFixed(3));
    console.log('Generations completed:', result.generationsCompleted);
    console.log('Total evaluations:', result.totalEvaluations);

    // Save optimized instructions
    const outputPath = join(outputDir, 'optimized_instructions.json');
    writeFileSync(outputPath, JSON.stringify({
      metadata: {
        dataset: 'Terminal-bench',
        score: result.bestScore,
        generations: result.generationsCompleted,
        timestamp: new Date().toISOString()
      },
      instructions: result.bestCandidate,
      history: result.history
    }, null, 2));
    console.log(`\n✅ Results saved to: ${outputPath}`);

    // Show optimized instruction
    console.log('\n📝 Optimized Terminal Instruction:');
    console.log('---');
    console.log(result.bestCandidate.terminal_instruction);
    console.log('---');

    // Test on validation task
    console.log('\n🧪 Testing optimized instruction on validation task:');
    const testTask = valTasks[0];
    if (testTask === undefined) {
      console.log('No validation task available; skipping.');
    } else {
      console.log('Task:', testTask.task);
      // NOTE(review): the third argument presumably asks the adapter to
      // capture trajectories - confirm against TerminalBenchAdapter.evaluate.
      const evalResult = await adapter.evaluate(
        [testTask],
        result.bestCandidate,
        true
      );

      const output = evalResult.outputs[0];
      const score = evalResult.scores[0];
      if (output !== undefined && score !== undefined) {
        // Distinct markers: the success and failure glyphs must differ to be useful.
        console.log('Success:', output.success ? '✓' : '✗');
        console.log('Commands used:', output.commandCount);
        console.log('Score:', score.toFixed(2));
      }

      const trajectory = evalResult.trajectories?.[0];
      if (trajectory) {
        console.log('\nCommand trace:');
        // Only the first five commands are printed to keep the output short
        for (const cmd of trajectory.commands.slice(0, 5)) {
          console.log(` $ ${cmd.command}`);
          if (cmd.exitCode !== 0) {
            console.log(` ✗ Error: ${cmd.stderr}`);
          }
        }
      }
    }
  } finally {
    // Close WandB even when optimization or evaluation throws, so the run is
    // not left open.
    if (wandb) {
      await wandb.finish();
    }
  }

  console.log('\n🎉 Terminal-bench optimization complete!');
}
// Run the example; any failure that escapes main() is logged and the process
// exits with a non-zero status.
main().catch((error: unknown) => {
  console.error('❌ Error:', error);
  process.exit(1);
});