codecrucible-synth
Version:
Production-Ready AI Development Platform with Multi-Voice Synthesis, Smithery MCP Integration, Enterprise Security, and Zero-Timeout Reliability
883 lines (777 loc) โข 26.9 kB
text/typescript
import { UnifiedModelClient } from '../../refactor/unified-model-client.js';
import { logger } from '../logger.js';
import * as fs from 'fs/promises';
import * as path from 'path';
import ivm from 'isolated-vm';
export interface CodingChallenge {
id: string;
title: string;
prompt: string;
testCases: TestCase[];
expectedSolution?: string;
difficulty: 'easy' | 'medium' | 'hard';
category: 'algorithms' | 'data-structures' | 'string-manipulation' | 'math' | 'logic';
language: 'javascript' | 'python' | 'typescript';
timeLimit?: number; // seconds
}
export interface TestCase {
input: any;
expectedOutput: any;
description?: string;
}
export interface BenchmarkResult {
challengeId: string;
passed: boolean;
generatedCode: string;
executionTime: number;
errors: string[];
testResults: TestResult[];
confidence: number;
codeQuality: {
readability: number;
efficiency: number;
correctness: number;
};
}
export interface TestResult {
input: any;
expectedOutput: any;
actualOutput: any;
passed: boolean;
error?: string;
}
export interface BenchmarkSummary {
totalChallenges: number;
passed: number;
failed: number;
successRate: number;
averageTime: number;
averageConfidence: number;
categoryBreakdown: Record<string, { passed: number; total: number }>;
difficultyBreakdown: Record<string, { passed: number; total: number }>;
detailedResults: BenchmarkResult[];
modelUsed: string;
timestamp: number;
}
/**
* HumanEval-inspired benchmark runner for code generation models
* Evaluates model performance on standardized coding challenges
*/
export class BenchmarkRunner {
private challenges: CodingChallenge[];
private hybridClient?: UnifiedModelClient;
private ollamaClient?: UnifiedModelClient;
constructor() {
this.challenges = this.loadDefaultChallenges();
this.initializeClients();
logger.info('Benchmark runner initialized with challenges', {
totalChallenges: this.challenges.length,
categories: this.getCategoryDistribution(),
});
}
/**
* Run benchmark suite on specified model
*/
async runBenchmark(
modelName?: string,
options: {
categories?: string[];
difficulties?: string[];
limit?: number;
timeoutMs?: number;
} = {}
): Promise<BenchmarkSummary> {
const startTime = Date.now();
console.log('๐งช Starting benchmark suite...');
console.log(`๐ Model: ${modelName || 'hybrid'}`);
// Filter challenges based on options
let selectedChallenges = this.challenges;
if (options.categories?.length) {
selectedChallenges = selectedChallenges.filter(c => options.categories!.includes(c.category));
}
if (options.difficulties?.length) {
selectedChallenges = selectedChallenges.filter(c =>
options.difficulties!.includes(c.difficulty)
);
}
if (options.limit) {
selectedChallenges = selectedChallenges.slice(0, options.limit);
}
console.log(`๐ฏ Running ${selectedChallenges.length} challenges`);
const results: BenchmarkResult[] = [];
let passed = 0;
let totalTime = 0;
let totalConfidence = 0;
// Run each challenge
for (let i = 0; i < selectedChallenges.length; i++) {
const challenge = selectedChallenges[i];
console.log(
`\n[${i + 1}/${selectedChallenges.length}] ${challenge.title} (${challenge.difficulty})`
);
try {
const result = await this.runSingleChallenge(challenge, modelName, options.timeoutMs);
results.push(result);
if (result.passed) {
passed++;
console.log(
` โ
PASSED (${result.executionTime}ms, confidence: ${(result.confidence * 100).toFixed(0)}%)`
);
} else {
console.log(` โ FAILED (${result.errors.length} errors)`);
if (result.errors.length > 0) {
console.log(` ${result.errors[0]}`);
}
}
totalTime += result.executionTime;
totalConfidence += result.confidence;
} catch (error) {
console.log(` ๐ฅ ERROR: ${error instanceof Error ? error.message : 'Unknown error'}`);
// Add failed result
results.push({
challengeId: challenge.id,
passed: false,
generatedCode: '',
executionTime: 0,
errors: [error instanceof Error ? error.message : 'Benchmark execution error'],
testResults: [],
confidence: 0,
codeQuality: { readability: 0, efficiency: 0, correctness: 0 },
});
}
// Small delay between challenges
await new Promise(resolve => setTimeout(resolve, 500));
}
// Calculate summary statistics
const successRate = (passed / selectedChallenges.length) * 100;
const averageTime = totalTime / selectedChallenges.length;
const averageConfidence = totalConfidence / selectedChallenges.length;
// Category and difficulty breakdowns
const categoryBreakdown = this.calculateCategoryBreakdown(selectedChallenges, results);
const difficultyBreakdown = this.calculateDifficultyBreakdown(selectedChallenges, results);
const summary: BenchmarkSummary = {
totalChallenges: selectedChallenges.length,
passed,
failed: selectedChallenges.length - passed,
successRate,
averageTime,
averageConfidence,
categoryBreakdown,
difficultyBreakdown,
detailedResults: results,
modelUsed: modelName || 'hybrid',
timestamp: Date.now(),
};
// Save results
await this.saveBenchmarkResults(summary);
const totalDuration = Date.now() - startTime;
console.log('\n๐ Benchmark Results:');
console.log(`โ
Passed: ${passed}/${selectedChallenges.length} (${successRate.toFixed(1)}%)`);
console.log(`โฑ๏ธ Average Time: ${averageTime.toFixed(0)}ms per challenge`);
console.log(`๐ฏ Average Confidence: ${(averageConfidence * 100).toFixed(1)}%`);
console.log(`โณ Total Duration: ${(totalDuration / 1000).toFixed(1)}s`);
// Category breakdown
console.log('\n๐ Category Breakdown:');
Object.entries(categoryBreakdown).forEach(([category, stats]) => {
const rate = ((stats.passed / stats.total) * 100).toFixed(1);
console.log(` ${category}: ${stats.passed}/${stats.total} (${rate}%)`);
});
// Difficulty breakdown
console.log('\n๐๏ธ Difficulty Breakdown:');
Object.entries(difficultyBreakdown).forEach(([difficulty, stats]) => {
const rate = ((stats.passed / stats.total) * 100).toFixed(1);
console.log(` ${difficulty}: ${stats.passed}/${stats.total} (${rate}%)`);
});
return summary;
}
/**
* Run a single coding challenge
*/
private async runSingleChallenge(
challenge: CodingChallenge,
modelName?: string,
timeoutMs: number = 30000
): Promise<BenchmarkResult> {
const startTime = Date.now();
try {
// Generate code using specified model
const generation = await this.generateCodeForChallenge(challenge, modelName, timeoutMs);
const executionTime = Date.now() - startTime;
// Execute tests
const testResults = await this.executeTests(challenge, generation.code);
// Check if all tests passed
const passed = testResults.every(result => result.passed);
// Calculate code quality metrics
const codeQuality = this.assessCodeQuality(generation.code, challenge);
return {
challengeId: challenge.id,
passed,
generatedCode: generation.code,
executionTime,
errors: generation.errors,
testResults,
confidence: generation.confidence,
codeQuality,
};
} catch (error) {
return {
challengeId: challenge.id,
passed: false,
generatedCode: '',
executionTime: Date.now() - startTime,
errors: [error instanceof Error ? error.message : 'Unknown execution error'],
testResults: [],
confidence: 0,
codeQuality: { readability: 0, efficiency: 0, correctness: 0 },
};
}
}
/**
* Generate code for a challenge using specified model
*/
private async generateCodeForChallenge(
challenge: CodingChallenge,
modelName?: string,
timeoutMs: number = 30000
): Promise<{ code: string; confidence: number; errors: string[] }> {
const enhancedPrompt = `${challenge.prompt}
Requirements:
- Write only the function implementation
- Use ${challenge.language}
- Do not include test cases or examples
- Ensure the solution handles edge cases
- Return only the code without explanations
Function signature and implementation:`;
try {
if (modelName && modelName !== 'hybrid') {
// Use specific model (Ollama)
if (this.ollamaClient) {
const result = (await Promise.race([
this.ollamaClient.generateText(enhancedPrompt, { includeContext: false }),
new Promise((_, reject) => setTimeout(() => reject(new Error('Timeout')), timeoutMs)),
])) as any;
const extractedCode = this.extractCodeFromResponse(result.text, challenge.language);
return {
code: extractedCode,
confidence: 0.8, // Default confidence for Ollama
errors: [],
};
} else {
throw new Error('Ollama client not available');
}
} else {
// Use hybrid client
if (this.hybridClient) {
const result = (await Promise.race([
this.hybridClient.generate({
prompt: enhancedPrompt,
}),
new Promise((_, reject) => setTimeout(() => reject(new Error('Timeout')), timeoutMs)),
])) as any;
const extractedCode = this.extractCodeFromResponse(
result.code || result.synthesis,
challenge.language
);
return {
code: extractedCode,
confidence: result.confidence || 0.7,
errors: [],
};
} else {
throw new Error('Hybrid client not available');
}
}
} catch (error) {
return {
code: '',
confidence: 0,
errors: [error instanceof Error ? error.message : 'Code generation failed'],
};
}
}
/**
* Extract clean code from model response
*/
private extractCodeFromResponse(response: string, language: string): string {
// Remove markdown code blocks
let code = response.replace(/```[\w]*\n?/g, '').replace(/```/g, '');
// Remove explanatory text (simple heuristic)
const lines = code.split('\n');
const codeLines = lines.filter(line => {
const trimmed = line.trim();
return (
trimmed &&
!trimmed.startsWith('//') &&
!trimmed.startsWith('#') &&
!trimmed.startsWith('Here') &&
!trimmed.startsWith('This') &&
!trimmed.startsWith('The function')
);
});
code = codeLines.join('\n').trim();
// Language-specific cleanup
if (language === 'javascript' || language === 'typescript') {
// Ensure we have a function
if (!code.includes('function') && !code.includes('=>')) {
// Try to wrap in a function if it looks like function body
if (code.includes('return')) {
code = `function solution() {\n${code}\n}`;
}
}
} else if (language === 'python') {
// Ensure proper Python indentation
const pyLines = code.split('\n');
if (pyLines.length > 1 && !pyLines[1].startsWith(' ')) {
// Add indentation
const indentedLines = pyLines.map((line, i) =>
i === 0 ? line : line.trim() ? ' ' + line : line
);
code = indentedLines.join('\n');
}
}
return code;
}
/**
* Execute test cases against generated code
*/
private async executeTests(challenge: CodingChallenge, code: string): Promise<TestResult[]> {
const testResults: TestResult[] = [];
for (const testCase of challenge.testCases) {
try {
const result = await this.executeTestCase(code, testCase, challenge.language);
testResults.push(result);
} catch (error) {
testResults.push({
input: testCase.input,
expectedOutput: testCase.expectedOutput,
actualOutput: null,
passed: false,
error: error instanceof Error ? error.message : 'Test execution error',
});
}
}
return testResults;
}
/**
* Execute a single test case
*/
private async executeTestCase(
code: string,
testCase: TestCase,
language: string
): Promise<TestResult> {
try {
if (language === 'javascript' || language === 'typescript') {
return await this.executeJavaScriptTest(code, testCase);
} else if (language === 'python') {
return await this.executePythonTest(code, testCase);
} else {
throw new Error(`Language ${language} not supported for execution`);
}
} catch (error) {
return {
input: testCase.input,
expectedOutput: testCase.expectedOutput,
actualOutput: null,
passed: false,
error: error instanceof Error ? error.message : 'Execution error',
};
}
}
/**
* Execute JavaScript test case using vm2
*/
private async executeJavaScriptTest(code: string, testCase: TestCase): Promise<TestResult> {
const isolate = new ivm.Isolate({ memoryLimit: 128 });
const context = await isolate.createContext();
try {
// Prepare test execution code
const testCode = `
${code}
// Extract function name
const functionName = ${this.extractFunctionName(code)};
const result = functionName(${JSON.stringify(testCase.input)});
result;
`;
const script = await isolate.compileScript(testCode);
const actualOutput = await script.run(context, { timeout: 5000 });
const passed = this.compareOutputs(actualOutput, testCase.expectedOutput);
return {
input: testCase.input,
expectedOutput: testCase.expectedOutput,
actualOutput,
passed,
};
} catch (error) {
return {
input: testCase.input,
expectedOutput: testCase.expectedOutput,
actualOutput: null,
passed: false,
error: error instanceof Error ? error.message : 'JavaScript execution error',
};
}
}
/**
* Execute Python test case (placeholder - would need Python runtime)
*/
private async executePythonTest(code: string, testCase: TestCase): Promise<TestResult> {
try {
// Check if python is available in the system
const { spawn } = await import('child_process');
const { writeFile, unlink } = await import('fs/promises');
const { join } = await import('path');
const { randomBytes } = await import('crypto');
// Create temporary Python file
const tempId = randomBytes(8).toString('hex');
const tempFile = join(process.cwd(), `temp_test_${tempId}.py`);
// Write test code to temporary file
const testCode = `
import sys
import json
${code}
try:
result = ${testCase.input}
print(json.dumps({"result": result, "error": None}))
except Exception as e:
print(json.dumps({"result": None, "error": str(e)}))
`;
await writeFile(tempFile, testCode);
// Execute Python code
const result = await new Promise<TestResult>(resolve => {
const python = spawn('python', [tempFile]);
let stdout = '';
let stderr = '';
python.stdout.on('data', data => {
stdout += data.toString();
});
python.stderr.on('data', data => {
stderr += data.toString();
});
python.on('close', async code => {
// Clean up temporary file
try {
await unlink(tempFile);
} catch {
// Ignore cleanup errors
}
if (code !== 0) {
resolve({
input: testCase.input,
expectedOutput: testCase.expectedOutput,
actualOutput: null,
passed: false,
error: stderr || `Python process exited with code ${code}`,
});
return;
}
try {
const output = JSON.parse(stdout.trim());
if (output.error) {
resolve({
input: testCase.input,
expectedOutput: testCase.expectedOutput,
actualOutput: null,
passed: false,
error: output.error,
});
} else {
const passed =
JSON.stringify(output.result) === JSON.stringify(testCase.expectedOutput);
resolve({
input: testCase.input,
expectedOutput: testCase.expectedOutput,
actualOutput: output.result,
passed,
error: undefined,
});
}
} catch (parseError) {
resolve({
input: testCase.input,
expectedOutput: testCase.expectedOutput,
actualOutput: stdout,
passed: false,
error: `Failed to parse Python output: ${parseError}`,
});
}
});
});
return result;
} catch (error) {
return {
input: testCase.input,
expectedOutput: testCase.expectedOutput,
actualOutput: null,
passed: false,
error: `Python execution failed: ${error instanceof Error ? error.message : String(error)}`,
};
}
}
/**
* Extract function name from code
*/
private extractFunctionName(code: string): string {
// Try to find function declaration
const functionMatch = code.match(/function\s+(\w+)/);
if (functionMatch) {
return functionMatch[1];
}
// Try to find arrow function assignment
const arrowMatch = code.match(/(?:const|let|var)\s+(\w+)\s*=/);
if (arrowMatch) {
return arrowMatch[1];
}
// Default to a common name
return 'solution';
}
/**
* Compare actual vs expected outputs
*/
private compareOutputs(actual: any, expected: any): boolean {
if (typeof actual !== typeof expected) {
return false;
}
if (Array.isArray(actual) && Array.isArray(expected)) {
if (actual.length !== expected.length) {
return false;
}
return actual.every((item, index) => this.compareOutputs(item, expected[index]));
}
if (typeof actual === 'object' && actual !== null && expected !== null) {
const actualKeys = Object.keys(actual).sort();
const expectedKeys = Object.keys(expected).sort();
if (actualKeys.length !== expectedKeys.length) {
return false;
}
return actualKeys.every(
key => expectedKeys.includes(key) && this.compareOutputs(actual[key], expected[key])
);
}
return actual === expected;
}
/**
* Assess code quality metrics
*/
private assessCodeQuality(
code: string,
_challenge: CodingChallenge
): {
readability: number;
efficiency: number;
correctness: number;
} {
let readability = 0.5;
let efficiency = 0.5;
let correctness = 0.5;
// Readability factors
if (code.includes('\n')) readability += 0.1; // Multi-line
if (code.match(/\/\/|\/\*/)) readability += 0.1; // Comments
if (code.length > 50) readability += 0.1; // Substantial code
if (code.match(/\s{2,}/)) readability += 0.1; // Proper spacing
// Efficiency factors (basic heuristics)
const codeLength = code.length;
if (codeLength < 200) efficiency += 0.2; // Concise
if (!code.includes('for') || code.includes('while')) efficiency += 0.1; // Not obviously inefficient
if (code.includes('return')) efficiency += 0.1; // Proper return
// Correctness factors
if (code.includes('function') || code.includes('=>')) correctness += 0.2; // Is a function
if (code.includes('return')) correctness += 0.2; // Returns something
if (!code.includes('undefined') && !code.includes('null')) correctness += 0.1; // No obvious nulls
return {
readability: Math.min(1, readability),
efficiency: Math.min(1, efficiency),
correctness: Math.min(1, correctness),
};
}
/**
* Calculate category breakdown
*/
private calculateCategoryBreakdown(
challenges: CodingChallenge[],
results: BenchmarkResult[]
): Record<string, { passed: number; total: number }> {
const breakdown: Record<string, { passed: number; total: number }> = {};
challenges.forEach((challenge, index) => {
const category = challenge.category;
if (!breakdown[category]) {
breakdown[category] = { passed: 0, total: 0 };
}
breakdown[category].total++;
if (results[index]?.passed) {
breakdown[category].passed++;
}
});
return breakdown;
}
/**
* Calculate difficulty breakdown
*/
private calculateDifficultyBreakdown(
challenges: CodingChallenge[],
results: BenchmarkResult[]
): Record<string, { passed: number; total: number }> {
const breakdown: Record<string, { passed: number; total: number }> = {};
challenges.forEach((challenge, index) => {
const difficulty = challenge.difficulty;
if (!breakdown[difficulty]) {
breakdown[difficulty] = { passed: 0, total: 0 };
}
breakdown[difficulty].total++;
if (results[index]?.passed) {
breakdown[difficulty].passed++;
}
});
return breakdown;
}
/**
* Get category distribution of challenges
*/
private getCategoryDistribution(): Record<string, number> {
const distribution: Record<string, number> = {};
this.challenges.forEach(challenge => {
distribution[challenge.category] = (distribution[challenge.category] || 0) + 1;
});
return distribution;
}
/**
* Initialize LLM clients
*/
private initializeClients(): void {
try {
this.hybridClient = new UnifiedModelClient({
providers: [
{ type: 'ollama', endpoint: 'http://localhost:11434', model: 'auto', timeout: 30000 },
{ type: 'lm-studio', endpoint: 'http://localhost:1234', model: 'auto', timeout: 30000 },
],
executionMode: 'auto',
fallbackChain: ['ollama', 'lm-studio'],
performanceThresholds: {
fastModeMaxTokens: 2048,
timeoutMs: 30000,
maxConcurrentRequests: 3,
},
security: {
enableSandbox: true,
maxInputLength: 10000,
allowedCommands: ['node', 'python3'],
},
});
this.ollamaClient = new UnifiedModelClient({
providers: [
{
type: 'ollama',
endpoint: 'http://localhost:11434',
model: 'codellama:34b',
timeout: 60000,
},
],
executionMode: 'quality',
fallbackChain: ['ollama'],
performanceThresholds: {
fastModeMaxTokens: 4096,
timeoutMs: 60000,
maxConcurrentRequests: 1,
},
security: {
enableSandbox: true,
maxInputLength: 20000,
allowedCommands: ['node', 'python3'],
},
});
logger.debug('Benchmark clients initialized');
} catch (error) {
logger.warn('Some benchmark clients failed to initialize:', error);
}
}
/**
* Load default coding challenges (HumanEval-inspired)
*/
private loadDefaultChallenges(): CodingChallenge[] {
return [
{
id: 'reverse-string',
title: 'Reverse String',
prompt: 'Write a function that reverses a string.',
testCases: [
{ input: 'hello', expectedOutput: 'olleh' },
{ input: 'world', expectedOutput: 'dlrow' },
{ input: '', expectedOutput: '' },
{ input: 'a', expectedOutput: 'a' },
],
difficulty: 'easy',
category: 'string-manipulation',
language: 'javascript',
},
{
id: 'fibonacci',
title: 'Fibonacci Sequence',
prompt: 'Write a function that returns the nth Fibonacci number.',
testCases: [
{ input: 0, expectedOutput: 0 },
{ input: 1, expectedOutput: 1 },
{ input: 5, expectedOutput: 5 },
{ input: 10, expectedOutput: 55 },
],
difficulty: 'medium',
category: 'math',
language: 'javascript',
},
{
id: 'two-sum',
title: 'Two Sum',
prompt: 'Write a function that finds two numbers in an array that add up to a target sum.',
testCases: [
{ input: { nums: [2, 7, 11, 15], target: 9 }, expectedOutput: [0, 1] },
{ input: { nums: [3, 2, 4], target: 6 }, expectedOutput: [1, 2] },
{ input: { nums: [3, 3], target: 6 }, expectedOutput: [0, 1] },
],
difficulty: 'medium',
category: 'algorithms',
language: 'javascript',
},
{
id: 'palindrome-check',
title: 'Palindrome Check',
prompt: 'Write a function that checks if a string is a palindrome.',
testCases: [
{ input: 'racecar', expectedOutput: true },
{ input: 'hello', expectedOutput: false },
{ input: 'A man a plan a canal Panama', expectedOutput: true },
{ input: '', expectedOutput: true },
],
difficulty: 'easy',
category: 'string-manipulation',
language: 'javascript',
},
{
id: 'binary-search',
title: 'Binary Search',
prompt: 'Write a function that performs binary search on a sorted array.',
testCases: [
{ input: { arr: [1, 2, 3, 4, 5], target: 3 }, expectedOutput: 2 },
{ input: { arr: [1, 2, 3, 4, 5], target: 6 }, expectedOutput: -1 },
{ input: { arr: [], target: 1 }, expectedOutput: -1 },
],
difficulty: 'medium',
category: 'algorithms',
language: 'javascript',
},
];
}
/**
* Save benchmark results to file
*/
private async saveBenchmarkResults(summary: BenchmarkSummary): Promise<void> {
try {
const resultsDir = path.join(process.cwd(), 'benchmark-results');
await fs.mkdir(resultsDir, { recursive: true });
const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
const filename = `benchmark-${summary.modelUsed}-${timestamp}.json`;
const filepath = path.join(resultsDir, filename);
await fs.writeFile(filepath, JSON.stringify(summary, null, 2));
console.log(`๐ Results saved to: ${filepath}`);
} catch (error) {
logger.warn('Failed to save benchmark results:', error);
}
}
}