codecrucible-synth
Production-Ready AI Development Platform with Multi-Voice Synthesis, Smithery MCP Integration, Enterprise Security, and Zero-Timeout Reliability
benchmark-runner.d.ts (TypeScript)
export interface CodingChallenge {
    id: string;
    title: string;
    prompt: string;
    testCases: TestCase[];
    expectedSolution?: string;
    difficulty: 'easy' | 'medium' | 'hard';
    category: 'algorithms' | 'data-structures' | 'string-manipulation' | 'math' | 'logic';
    language: 'javascript' | 'python' | 'typescript';
    timeLimit?: number;
}
export interface TestCase {
    input: any;
    expectedOutput: any;
    description?: string;
}
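/*
 * Illustrative only (not part of the published typings): a minimal challenge
 * definition against the two interfaces above. Every concrete value here is an
 * assumption made for demonstration, written as it would appear in a regular
 * .ts module:
 *
 *   const reverseString: CodingChallenge = {
 *     id: 'reverse-string',
 *     title: 'Reverse a string',
 *     prompt: 'Write a function reverseString(s) that returns s reversed.',
 *     testCases: [
 *       { input: 'abc', expectedOutput: 'cba', description: 'simple case' },
 *       { input: '', expectedOutput: '', description: 'empty string' },
 *     ],
 *     difficulty: 'easy',
 *     category: 'string-manipulation',
 *     language: 'javascript',
 *     timeLimit: 5000,
 *   };
 */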
export interface BenchmarkResult {
    challengeId: string;
    passed: boolean;
    generatedCode: string;
    executionTime: number;
    errors: string[];
    testResults: TestResult[];
    confidence: number;
    codeQuality: {
        readability: number;
        efficiency: number;
        correctness: number;
    };
}
export interface TestResult {
    input: any;
    expectedOutput: any;
    actualOutput: any;
    passed: boolean;
    error?: string;
}
export interface BenchmarkSummary {
    totalChallenges: number;
    passed: number;
    failed: number;
    successRate: number;
    averageTime: number;
    averageConfidence: number;
    categoryBreakdown: Record<string, {
        passed: number;
        total: number;
    }>;
    difficultyBreakdown: Record<string, {
        passed: number;
        total: number;
    }>;
    detailedResults: BenchmarkResult[];
    modelUsed: string;
    timestamp: number;
}
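/*
 * Illustrative only (an assumption, not taken from the package): the breakdown
 * records appear to be keyed by category/difficulty name. successRate is shown
 * here as passed over totalChallenges; it could equally be stored as a
 * percentage. For example:
 *
 *   const fragment: Pick<BenchmarkSummary, 'totalChallenges' | 'passed' | 'successRate' | 'categoryBreakdown'> = {
 *     totalChallenges: 4,
 *     passed: 3,
 *     successRate: 3 / 4,
 *     categoryBreakdown: {
 *       algorithms: { passed: 2, total: 2 },
 *       'string-manipulation': { passed: 1, total: 2 },
 *     },
 *   };
 */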
/**
 * HumanEval-inspired benchmark runner for code generation models
 * Evaluates model performance on standardized coding challenges
 */
export declare class BenchmarkRunner {
    private challenges;
    private hybridClient?;
    private ollamaClient?;
    constructor();
    /**
     * Run benchmark suite on specified model
     */
    runBenchmark(modelName?: string, options?: {
        categories?: string[];
        difficulties?: string[];
        limit?: number;
        timeoutMs?: number;
    }): Promise<BenchmarkSummary>;
    /**
     * Run a single coding challenge
     */
    private runSingleChallenge;
    /**
     * Generate code for a challenge using specified model
     */
    private generateCodeForChallenge;
    /**
     * Extract clean code from model response
     */
    private extractCodeFromResponse;
    /**
     * Execute test cases against generated code
     */
    private executeTests;
    /**
     * Execute a single test case
     */
    private executeTestCase;
    /**
     * Execute JavaScript test case using vm2
     */
    private executeJavaScriptTest;
    /**
     * Execute Python test case (placeholder - would need Python runtime)
     */
    private executePythonTest;
    /**
     * Extract function name from code
     */
    private extractFunctionName;
    /**
     * Compare actual vs expected outputs
     */
    private compareOutputs;
    /**
     * Assess code quality metrics
     */
    private assessCodeQuality;
    /**
     * Calculate category breakdown
     */
    private calculateCategoryBreakdown;
    /**
     * Calculate difficulty breakdown
     */
    private calculateDifficultyBreakdown;
    /**
     * Get category distribution of challenges
     */
    private getCategoryDistribution;
    /**
     * Initialize LLM clients
     */
    private initializeClients;
    /**
     * Load default coding challenges (HumanEval-inspired)
     */
    private loadDefaultChallenges;
    /**
     * Save benchmark results to file
     */
    private saveBenchmarkResults;
}
//# sourceMappingURL=benchmark-runner.d.ts.map
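A minimal usage sketch of the runner, assuming the class and summary type are re-exported from the package root. The import path, model name, and option values below are assumptions for illustration; only the runBenchmark signature and the summary fields come from the declarations above.

// Usage sketch only; the import path and concrete values are assumptions.
import { BenchmarkRunner, BenchmarkSummary } from 'codecrucible-synth';

async function main(): Promise<void> {
  const runner = new BenchmarkRunner();

  // Option names come from the runBenchmark declaration; values are illustrative.
  const summary: BenchmarkSummary = await runner.runBenchmark('llama3', {
    categories: ['algorithms', 'string-manipulation'],
    difficulties: ['easy', 'medium'],
    limit: 10,
    timeoutMs: 60000,
  });

  console.log(`Model: ${summary.modelUsed}`);
  console.log(`Passed ${summary.passed}/${summary.totalChallenges} (rate: ${summary.successRate})`);
  for (const [category, stats] of Object.entries(summary.categoryBreakdown)) {
    console.log(`  ${category}: ${stats.passed}/${stats.total}`);
  }
}

main().catch(console.error);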