codecrucible-synth
Production-Ready AI Development Platform with Multi-Voice Synthesis, Smithery MCP Integration, Enterprise Security, and Zero-Timeout Reliability
import { UnifiedModelClient } from '../../refactor/unified-model-client.js';
import { logger } from '../logger.js';
import * as fs from 'fs/promises';
import * as path from 'path';
import ivm from 'isolated-vm';
/**
* HumanEval-inspired benchmark runner for code generation models
* Evaluates model performance on standardized coding challenges
*/
export class BenchmarkRunner {
challenges;
hybridClient;
ollamaClient;
constructor() {
this.challenges = this.loadDefaultChallenges();
this.initializeClients();
logger.info('Benchmark runner initialized with challenges', {
totalChallenges: this.challenges.length,
categories: this.getCategoryDistribution(),
});
}
/**
* Run benchmark suite on specified model
*/
async runBenchmark(modelName, options = {}) {
const startTime = Date.now();
console.log('🧪 Starting benchmark suite...');
console.log(`🤖 Model: ${modelName || 'hybrid'}`);
// Filter challenges based on options
let selectedChallenges = this.challenges;
if (options.categories?.length) {
selectedChallenges = selectedChallenges.filter(c => options.categories.includes(c.category));
}
if (options.difficulties?.length) {
selectedChallenges = selectedChallenges.filter(c => options.difficulties.includes(c.difficulty));
}
if (options.limit) {
selectedChallenges = selectedChallenges.slice(0, options.limit);
}
console.log(`🎯 Running ${selectedChallenges.length} challenges`);
const results = [];
let passed = 0;
let totalTime = 0;
let totalConfidence = 0;
// Run each challenge
for (let i = 0; i < selectedChallenges.length; i++) {
const challenge = selectedChallenges[i];
console.log(`\n[${i + 1}/${selectedChallenges.length}] ${challenge.title} (${challenge.difficulty})`);
try {
const result = await this.runSingleChallenge(challenge, modelName, options.timeoutMs);
results.push(result);
if (result.passed) {
passed++;
console.log(` ✅ PASSED (${result.executionTime}ms, confidence: ${(result.confidence * 100).toFixed(0)}%)`);
}
else {
console.log(` ❌ FAILED (${result.errors.length} errors)`);
if (result.errors.length > 0) {
console.log(` ${result.errors[0]}`);
}
}
totalTime += result.executionTime;
totalConfidence += result.confidence;
}
catch (error) {
console.log(` 💥 ERROR: ${error instanceof Error ? error.message : 'Unknown error'}`);
// Add failed result
results.push({
challengeId: challenge.id,
passed: false,
generatedCode: '',
executionTime: 0,
errors: [error instanceof Error ? error.message : 'Benchmark execution error'],
testResults: [],
confidence: 0,
codeQuality: { readability: 0, efficiency: 0, correctness: 0 },
});
}
// Small delay between challenges
await new Promise(resolve => setTimeout(resolve, 500));
}
// Calculate summary statistics
const successRate = (passed / selectedChallenges.length) * 100;
const averageTime = totalTime / selectedChallenges.length;
const averageConfidence = totalConfidence / selectedChallenges.length;
// Category and difficulty breakdowns
const categoryBreakdown = this.calculateCategoryBreakdown(selectedChallenges, results);
const difficultyBreakdown = this.calculateDifficultyBreakdown(selectedChallenges, results);
const summary = {
totalChallenges: selectedChallenges.length,
passed,
failed: selectedChallenges.length - passed,
successRate,
averageTime,
averageConfidence,
categoryBreakdown,
difficultyBreakdown,
detailedResults: results,
modelUsed: modelName || 'hybrid',
timestamp: Date.now(),
};
// Save results
await this.saveBenchmarkResults(summary);
const totalDuration = Date.now() - startTime;
console.log('\n📊 Benchmark Results:');
console.log(`✅ Passed: ${passed}/${selectedChallenges.length} (${successRate.toFixed(1)}%)`);
console.log(`⏱️ Average Time: ${averageTime.toFixed(0)}ms per challenge`);
console.log(`🎯 Average Confidence: ${(averageConfidence * 100).toFixed(1)}%`);
console.log(`⏳ Total Duration: ${(totalDuration / 1000).toFixed(1)}s`);
// Category breakdown
console.log('\n📊 Category Breakdown:');
Object.entries(categoryBreakdown).forEach(([category, stats]) => {
const rate = ((stats.passed / stats.total) * 100).toFixed(1);
console.log(` ${category}: ${stats.passed}/${stats.total} (${rate}%)`);
});
// Difficulty breakdown
console.log('\n🏗️ Difficulty Breakdown:');
Object.entries(difficultyBreakdown).forEach(([difficulty, stats]) => {
const rate = ((stats.passed / stats.total) * 100).toFixed(1);
console.log(` ${difficulty}: ${stats.passed}/${stats.total} (${rate}%)`);
});
return summary;
}
/**
* Run a single coding challenge
*/
async runSingleChallenge(challenge, modelName, timeoutMs = 30000) {
const startTime = Date.now();
try {
// Generate code using specified model
const generation = await this.generateCodeForChallenge(challenge, modelName, timeoutMs);
const executionTime = Date.now() - startTime;
// Execute tests
const testResults = await this.executeTests(challenge, generation.code);
// Check if all tests passed
const passed = testResults.every(result => result.passed);
// Calculate code quality metrics
const codeQuality = this.assessCodeQuality(generation.code, challenge);
return {
challengeId: challenge.id,
passed,
generatedCode: generation.code,
executionTime,
errors: generation.errors,
testResults,
confidence: generation.confidence,
codeQuality,
};
}
catch (error) {
return {
challengeId: challenge.id,
passed: false,
generatedCode: '',
executionTime: Date.now() - startTime,
errors: [error instanceof Error ? error.message : 'Unknown execution error'],
testResults: [],
confidence: 0,
codeQuality: { readability: 0, efficiency: 0, correctness: 0 },
};
}
}
/**
* Generate code for a challenge using specified model
*/
async generateCodeForChallenge(challenge, modelName, timeoutMs = 30000) {
const enhancedPrompt = `${challenge.prompt}
Requirements:
- Write only the function implementation
- Use ${challenge.language}
- Do not include test cases or examples
- Ensure the solution handles edge cases
- Return only the code without explanations
Function signature and implementation:`;
try {
if (modelName && modelName !== 'hybrid') {
// Use specific model (Ollama)
if (this.ollamaClient) {
const result = (await Promise.race([
this.ollamaClient.generateText(enhancedPrompt, { includeContext: false }),
new Promise((_, reject) => setTimeout(() => reject(new Error('Timeout')), timeoutMs)),
]));
const extractedCode = this.extractCodeFromResponse(result.text, challenge.language);
return {
code: extractedCode,
confidence: 0.8, // Default confidence for Ollama
errors: [],
};
}
else {
throw new Error('Ollama client not available');
}
}
else {
// Use hybrid client
if (this.hybridClient) {
const result = (await Promise.race([
this.hybridClient.generate({
prompt: enhancedPrompt,
}),
new Promise((_, reject) => setTimeout(() => reject(new Error('Timeout')), timeoutMs)),
]));
const extractedCode = this.extractCodeFromResponse(result.code || result.synthesis, challenge.language);
return {
code: extractedCode,
confidence: result.confidence || 0.7,
errors: [],
};
}
else {
throw new Error('Hybrid client not available');
}
}
}
catch (error) {
return {
code: '',
confidence: 0,
errors: [error instanceof Error ? error.message : 'Code generation failed'],
};
}
}
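// Routing note for the method above: a concrete modelName (anything other than
// 'hybrid') is served by the dedicated Ollama client, while an omitted or 'hybrid'
// modelName goes through the UnifiedModelClient fallback chain.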
/**
* Extract clean code from model response
*/
extractCodeFromResponse(response, language) {
// Remove markdown code blocks
let code = response.replace(/```[\w]*\n?/g, '').replace(/```/g, '');
// Remove explanatory text (simple heuristic)
const lines = code.split('\n');
const codeLines = lines.filter(line => {
const trimmed = line.trim();
return (trimmed &&
!trimmed.startsWith('//') &&
!trimmed.startsWith('#') &&
!trimmed.startsWith('Here') &&
!trimmed.startsWith('This') &&
!trimmed.startsWith('The function'));
});
code = codeLines.join('\n').trim();
// Language-specific cleanup
if (language === 'javascript' || language === 'typescript') {
// Ensure we have a function
if (!code.includes('function') && !code.includes('=>')) {
// Try to wrap in a function if it looks like function body
if (code.includes('return')) {
code = `function solution() {\n${code}\n}`;
}
}
}
else if (language === 'python') {
// Ensure proper Python indentation
const pyLines = code.split('\n');
if (pyLines.length > 1 && !pyLines[1].startsWith(' ')) {
// Add indentation
const indentedLines = pyLines.map((line, i) => i === 0 ? line : line.trim() ? ' ' + line : line);
code = indentedLines.join('\n');
}
}
return code;
}
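// A sketch of the cleanup above (illustrative input, not a fixture):
//   extractCodeFromResponse('Here is the code:\n```js\nfunction rev(s) { return [...s].reverse().join(""); }\n```', 'javascript')
//   -> 'function rev(s) { return [...s].reverse().join(""); }'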
/**
* Execute test cases against generated code
*/
async executeTests(challenge, code) {
const testResults = [];
for (const testCase of challenge.testCases) {
try {
const result = await this.executeTestCase(code, testCase, challenge.language);
testResults.push(result);
}
catch (error) {
testResults.push({
input: testCase.input,
expectedOutput: testCase.expectedOutput,
actualOutput: null,
passed: false,
error: error instanceof Error ? error.message : 'Test execution error',
});
}
}
return testResults;
}
/**
* Execute a single test case
*/
async executeTestCase(code, testCase, language) {
try {
if (language === 'javascript' || language === 'typescript') {
return await this.executeJavaScriptTest(code, testCase);
}
else if (language === 'python') {
return await this.executePythonTest(code, testCase);
}
else {
throw new Error(`Language ${language} not supported for execution`);
}
}
catch (error) {
return {
input: testCase.input,
expectedOutput: testCase.expectedOutput,
actualOutput: null,
passed: false,
error: error instanceof Error ? error.message : 'Execution error',
};
}
}
/**
* Execute JavaScript test case in an isolated-vm sandbox
*/
async executeJavaScriptTest(code, testCase) {
const isolate = new ivm.Isolate({ memoryLimit: 128 });
const context = await isolate.createContext();
try {
// Prepare test execution code: bind the generated function by its
// extracted name, then call it with the test input as a single argument
const testCode = `
${code}
const __fn = ${this.extractFunctionName(code)};
__fn(${JSON.stringify(testCase.input)});
`;
const script = await isolate.compileScript(testCode);
// copy: true transfers non-primitive results (arrays, objects) out of the
// isolate; without it, isolated-vm rejects non-transferable return values
const actualOutput = await script.run(context, { timeout: 5000, copy: true });
const passed = this.compareOutputs(actualOutput, testCase.expectedOutput);
return {
input: testCase.input,
expectedOutput: testCase.expectedOutput,
actualOutput,
passed,
};
}
catch (error) {
return {
input: testCase.input,
expectedOutput: testCase.expectedOutput,
actualOutput: null,
passed: false,
error: error instanceof Error ? error.message : 'JavaScript execution error',
};
}
finally {
// Always release the isolate so repeated runs do not leak memory
isolate.dispose();
}
}
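// Design note: isolated-vm runs the generated code in a separate V8 isolate with a
// 128 MB memory cap and a 5-second timeout, so a runaway solution cannot hang or
// exhaust the benchmark process.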
/**
* Execute Python test case by spawning the system Python interpreter (requires Python on the PATH)
*/
async executePythonTest(code, testCase) {
try {
// Lazily import the Node built-ins used for the subprocess round-trip
const { spawn } = await import('child_process');
const { writeFile, unlink } = await import('fs/promises');
const { join } = await import('path');
const { randomBytes } = await import('crypto');
// Create temporary Python file
const tempId = randomBytes(8).toString('hex');
const tempFile = join(process.cwd(), `temp_test_${tempId}.py`);
// Write test code to a temporary file. The generated function is located by name
// and called with the JSON-encoded input (JSON strings, numbers, arrays, and plain
// objects are also valid Python literals; true/false/null inputs would need mapping)
const fnMatch = code.match(/def\s+(\w+)/);
const fnName = fnMatch ? fnMatch[1] : 'solution';
const testCode = `
import json

${code}

try:
    result = ${fnName}(${JSON.stringify(testCase.input)})
    print(json.dumps({"result": result, "error": None}))
except Exception as e:
    print(json.dumps({"result": None, "error": str(e)}))
`;
await writeFile(tempFile, testCode);
// Execute Python code
const result = await new Promise(resolve => {
const python = spawn('python', [tempFile]);
let stdout = '';
let stderr = '';
python.stdout.on('data', data => {
stdout += data.toString();
});
python.stderr.on('data', data => {
stderr += data.toString();
});
python.on('close', async (exitCode) => {
// Clean up temporary file
try {
await unlink(tempFile);
}
catch {
// Ignore cleanup errors
}
if (exitCode !== 0) {
resolve({
input: testCase.input,
expectedOutput: testCase.expectedOutput,
actualOutput: null,
passed: false,
error: stderr || `Python process exited with code ${exitCode}`,
});
return;
}
try {
const output = JSON.parse(stdout.trim());
if (output.error) {
resolve({
input: testCase.input,
expectedOutput: testCase.expectedOutput,
actualOutput: null,
passed: false,
error: output.error,
});
}
else {
const passed = JSON.stringify(output.result) === JSON.stringify(testCase.expectedOutput);
resolve({
input: testCase.input,
expectedOutput: testCase.expectedOutput,
actualOutput: output.result,
passed,
error: undefined,
});
}
}
catch (parseError) {
resolve({
input: testCase.input,
expectedOutput: testCase.expectedOutput,
actualOutput: stdout,
passed: false,
error: `Failed to parse Python output: ${parseError}`,
});
}
});
});
return result;
}
catch (error) {
return {
input: testCase.input,
expectedOutput: testCase.expectedOutput,
actualOutput: null,
passed: false,
error: `Python execution failed: ${error instanceof Error ? error.message : String(error)}`,
};
}
}
/**
* Extract function name from code
*/
extractFunctionName(code) {
// Try to find function declaration
const functionMatch = code.match(/function\s+(\w+)/);
if (functionMatch) {
return functionMatch[1];
}
// Try to find arrow function assignment
const arrowMatch = code.match(/(?:const|let|var)\s+(\w+)\s*=/);
if (arrowMatch) {
return arrowMatch[1];
}
// Default to a common name
return 'solution';
}
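// Examples of the extraction above:
//   extractFunctionName('function reverseString(s) { ... }')        -> 'reverseString'
//   extractFunctionName('const twoSum = (nums, target) => { ... }') -> 'twoSum'
//   extractFunctionName('return 42;')                               -> 'solution' (fallback)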
/**
* Compare actual vs expected outputs
*/
compareOutputs(actual, expected) {
if (typeof actual !== typeof expected) {
return false;
}
if (Array.isArray(actual) && Array.isArray(expected)) {
if (actual.length !== expected.length) {
return false;
}
return actual.every((item, index) => this.compareOutputs(item, expected[index]));
}
if (typeof actual === 'object' && actual !== null && expected !== null) {
const actualKeys = Object.keys(actual).sort();
const expectedKeys = Object.keys(expected).sort();
if (actualKeys.length !== expectedKeys.length) {
return false;
}
return actualKeys.every(key => expectedKeys.includes(key) && this.compareOutputs(actual[key], expected[key]));
}
return actual === expected;
}
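// Behavior of the deep comparison above:
//   compareOutputs([0, 1], [0, 1])           -> true  (element-wise recursion)
//   compareOutputs({ a: 1 }, { a: 1, b: 2 }) -> false (key count mismatch)
//   compareOutputs('5', 5)                   -> false (typeof mismatch)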
/**
* Assess code quality metrics
*/
assessCodeQuality(code, _challenge) {
let readability = 0.5;
let efficiency = 0.5;
let correctness = 0.5;
// Readability factors
if (code.includes('\n'))
readability += 0.1; // Multi-line
if (code.match(/\/\/|\/\*/))
readability += 0.1; // Comments
if (code.length > 50)
readability += 0.1; // Substantial code
if (code.match(/\s{2,}/))
readability += 0.1; // Proper spacing
// Efficiency factors (basic heuristics)
const codeLength = code.length;
if (codeLength < 200)
efficiency += 0.2; // Concise
if (!code.includes('for') && !code.includes('while'))
efficiency += 0.1; // No explicit loops (rough proxy; does not account for recursion)
if (code.includes('return'))
efficiency += 0.1; // Proper return
// Correctness factors
if (code.includes('function') || code.includes('=>'))
correctness += 0.2; // Is a function
if (code.includes('return'))
correctness += 0.2; // Returns something
if (!code.includes('undefined') && !code.includes('null'))
correctness += 0.1; // No obvious nulls
return {
readability: Math.min(1, readability),
efficiency: Math.min(1, efficiency),
correctness: Math.min(1, correctness),
};
}
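// Rough calibration of the heuristics above: a multi-line, commented function under
// 200 characters that returns a value scores about { readability: 0.9, efficiency:
// 0.8 to 0.9, correctness: up to 1.0 }; each axis is capped at 1.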
/**
* Calculate category breakdown
*/
calculateCategoryBreakdown(challenges, results) {
const breakdown = {};
challenges.forEach((challenge, index) => {
const category = challenge.category;
if (!breakdown[category]) {
breakdown[category] = { passed: 0, total: 0 };
}
breakdown[category].total++;
if (results[index]?.passed) {
breakdown[category].passed++;
}
});
return breakdown;
}
/**
* Calculate difficulty breakdown
*/
calculateDifficultyBreakdown(challenges, results) {
const breakdown = {};
challenges.forEach((challenge, index) => {
const difficulty = challenge.difficulty;
if (!breakdown[difficulty]) {
breakdown[difficulty] = { passed: 0, total: 0 };
}
breakdown[difficulty].total++;
if (results[index]?.passed) {
breakdown[difficulty].passed++;
}
});
return breakdown;
}
/**
* Get category distribution of challenges
*/
getCategoryDistribution() {
const distribution = {};
this.challenges.forEach(challenge => {
distribution[challenge.category] = (distribution[challenge.category] || 0) + 1;
});
return distribution;
}
/**
* Initialize LLM clients
*/
initializeClients() {
try {
this.hybridClient = new UnifiedModelClient({
providers: [
{ type: 'ollama', endpoint: 'http://localhost:11434', model: 'auto', timeout: 30000 },
{ type: 'lm-studio', endpoint: 'http://localhost:1234', model: 'auto', timeout: 30000 },
],
executionMode: 'auto',
fallbackChain: ['ollama', 'lm-studio'],
performanceThresholds: {
fastModeMaxTokens: 2048,
timeoutMs: 30000,
maxConcurrentRequests: 3,
},
security: {
enableSandbox: true,
maxInputLength: 10000,
allowedCommands: ['node', 'python3'],
},
});
this.ollamaClient = new UnifiedModelClient({
providers: [
{
type: 'ollama',
endpoint: 'http://localhost:11434',
model: 'codellama:34b',
timeout: 60000,
},
],
executionMode: 'quality',
fallbackChain: ['ollama'],
performanceThresholds: {
fastModeMaxTokens: 4096,
timeoutMs: 60000,
maxConcurrentRequests: 1,
},
security: {
enableSandbox: true,
maxInputLength: 20000,
allowedCommands: ['node', 'python3'],
},
});
logger.debug('Benchmark clients initialized');
}
catch (error) {
logger.warn('Some benchmark clients failed to initialize:', error);
}
}
/**
* Load default coding challenges (HumanEval-inspired)
*/
loadDefaultChallenges() {
return [
{
id: 'reverse-string',
title: 'Reverse String',
prompt: 'Write a function that reverses a string.',
testCases: [
{ input: 'hello', expectedOutput: 'olleh' },
{ input: 'world', expectedOutput: 'dlrow' },
{ input: '', expectedOutput: '' },
{ input: 'a', expectedOutput: 'a' },
],
difficulty: 'easy',
category: 'string-manipulation',
language: 'javascript',
},
{
id: 'fibonacci',
title: 'Fibonacci Sequence',
prompt: 'Write a function that returns the nth Fibonacci number.',
testCases: [
{ input: 0, expectedOutput: 0 },
{ input: 1, expectedOutput: 1 },
{ input: 5, expectedOutput: 5 },
{ input: 10, expectedOutput: 55 },
],
difficulty: 'medium',
category: 'math',
language: 'javascript',
},
{
id: 'two-sum',
title: 'Two Sum',
prompt: 'Write a function that finds two numbers in an array that add up to a target sum.',
testCases: [
{ input: { nums: [2, 7, 11, 15], target: 9 }, expectedOutput: [0, 1] },
{ input: { nums: [3, 2, 4], target: 6 }, expectedOutput: [1, 2] },
{ input: { nums: [3, 3], target: 6 }, expectedOutput: [0, 1] },
],
difficulty: 'medium',
category: 'algorithms',
language: 'javascript',
},
{
id: 'palindrome-check',
title: 'Palindrome Check',
prompt: 'Write a function that checks if a string is a palindrome.',
testCases: [
{ input: 'racecar', expectedOutput: true },
{ input: 'hello', expectedOutput: false },
{ input: 'A man a plan a canal Panama', expectedOutput: true },
{ input: '', expectedOutput: true },
],
difficulty: 'easy',
category: 'string-manipulation',
language: 'javascript',
},
{
id: 'binary-search',
title: 'Binary Search',
prompt: 'Write a function that performs binary search on a sorted array.',
testCases: [
{ input: { arr: [1, 2, 3, 4, 5], target: 3 }, expectedOutput: 2 },
{ input: { arr: [1, 2, 3, 4, 5], target: 6 }, expectedOutput: -1 },
{ input: { arr: [], target: 1 }, expectedOutput: -1 },
],
difficulty: 'medium',
category: 'algorithms',
language: 'javascript',
},
];
}
/**
* Save benchmark results to file
*/
async saveBenchmarkResults(summary) {
try {
const resultsDir = path.join(process.cwd(), 'benchmark-results');
await fs.mkdir(resultsDir, { recursive: true });
const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
const filename = `benchmark-${summary.modelUsed}-${timestamp}.json`;
const filepath = path.join(resultsDir, filename);
await fs.writeFile(filepath, JSON.stringify(summary, null, 2));
console.log(`💾 Results saved to: ${filepath}`);
}
catch (error) {
logger.warn('Failed to save benchmark results:', error);
}
}
}
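// Example usage (a sketch: assumes a local Ollama server on its default port and an
// async caller; the model name is illustrative):
//
//   const runner = new BenchmarkRunner();
//   const summary = await runner.runBenchmark('codellama:34b', {
//     categories: ['algorithms'],
//     difficulties: ['medium'],
//     limit: 2,
//     timeoutMs: 30000,
//   });
//   console.log(`Pass rate: ${summary.successRate.toFixed(1)}%`);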
//# sourceMappingURL=benchmark-runner.js.map