codecrucible-synth

Production-Ready AI Development Platform with Multi-Voice Synthesis, Smithery MCP Integration, Enterprise Security, and Zero-Timeout Reliability

benchmark-runner.d.ts
export interface CodingChallenge {
    id: string;
    title: string;
    prompt: string;
    testCases: TestCase[];
    expectedSolution?: string;
    difficulty: 'easy' | 'medium' | 'hard';
    category: 'algorithms' | 'data-structures' | 'string-manipulation' | 'math' | 'logic';
    language: 'javascript' | 'python' | 'typescript';
    timeLimit?: number;
}
export interface TestCase {
    input: any;
    expectedOutput: any;
    description?: string;
}
export interface BenchmarkResult {
    challengeId: string;
    passed: boolean;
    generatedCode: string;
    executionTime: number;
    errors: string[];
    testResults: TestResult[];
    confidence: number;
    codeQuality: {
        readability: number;
        efficiency: number;
        correctness: number;
    };
}
export interface TestResult {
    input: any;
    expectedOutput: any;
    actualOutput: any;
    passed: boolean;
    error?: string;
}
export interface BenchmarkSummary {
    totalChallenges: number;
    passed: number;
    failed: number;
    successRate: number;
    averageTime: number;
    averageConfidence: number;
    categoryBreakdown: Record<string, {
        passed: number;
        total: number;
    }>;
    difficultyBreakdown: Record<string, {
        passed: number;
        total: number;
    }>;
    detailedResults: BenchmarkResult[];
    modelUsed: string;
    timestamp: number;
}
/**
 * HumanEval-inspired benchmark runner for code generation models
 * Evaluates model performance on standardized coding challenges
 */
export declare class BenchmarkRunner {
    private challenges;
    private hybridClient?;
    private ollamaClient?;
    constructor();
    /**
     * Run benchmark suite on specified model
     */
    runBenchmark(modelName?: string, options?: {
        categories?: string[];
        difficulties?: string[];
        limit?: number;
        timeoutMs?: number;
    }): Promise<BenchmarkSummary>;
    /**
     * Run a single coding challenge
     */
    private runSingleChallenge;
    /**
     * Generate code for a challenge using specified model
     */
    private generateCodeForChallenge;
    /**
     * Extract clean code from model response
     */
    private extractCodeFromResponse;
    /**
     * Execute test cases against generated code
     */
    private executeTests;
    /**
     * Execute a single test case
     */
    private executeTestCase;
    /**
     * Execute JavaScript test case using vm2
     */
    private executeJavaScriptTest;
    /**
     * Execute Python test case (placeholder - would need Python runtime)
     */
    private executePythonTest;
    /**
     * Extract function name from code
     */
    private extractFunctionName;
    /**
     * Compare actual vs expected outputs
     */
    private compareOutputs;
    /**
     * Assess code quality metrics
     */
    private assessCodeQuality;
    /**
     * Calculate category breakdown
     */
    private calculateCategoryBreakdown;
    /**
     * Calculate difficulty breakdown
     */
    private calculateDifficultyBreakdown;
    /**
     * Get category distribution of challenges
     */
    private getCategoryDistribution;
    /**
     * Initialize LLM clients
     */
    private initializeClients;
    /**
     * Load default coding challenges (HumanEval-inspired)
     */
    private loadDefaultChallenges;
    /**
     * Save benchmark results to file
     */
    private saveBenchmarkResults;
}
//# sourceMappingURL=benchmark-runner.d.ts.map
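
A minimal sketch of how the challenge data shapes above compose. The import path (the package root) and every field value are illustrative assumptions, not taken from the package; only the type names and members come from the declaration file.

// Assumed import path; the actual entry point of codecrucible-synth may differ.
import { CodingChallenge, TestCase } from 'codecrucible-synth';

// Illustrative test cases: each pairs an input with its expected output.
const testCases: TestCase[] = [
    { input: [1, 2], expectedOutput: 3, description: 'small positive integers' },
    { input: [-1, 1], expectedOutput: 0 },
];

// Illustrative challenge matching the CodingChallenge interface.
const sampleChallenge: CodingChallenge = {
    id: 'add-two-numbers',
    title: 'Add two numbers',
    prompt: 'Write a function add(a, b) that returns the sum of a and b.',
    testCases,
    expectedSolution: 'function add(a, b) { return a + b; }', // optional reference solution
    difficulty: 'easy',
    category: 'math',
    language: 'javascript',
    timeLimit: 5000, // assumed to be milliseconds; the declaration does not state the unit
};

console.log(`Challenge "${sampleChallenge.title}" has ${sampleChallenge.testCases.length} test cases.`);

Note that this .d.ts exposes no public method for registering such a challenge; the runner loads its own defaults via the private loadDefaultChallenges member.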
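A minimal usage sketch of the public API, assuming BenchmarkRunner and BenchmarkSummary are re-exported from the package root. The model name 'llama3' and the option values are assumptions for illustration; only runBenchmark's signature and the BenchmarkSummary fields come from the declarations above.

// Assumed import path; the actual entry point of codecrucible-synth may differ.
import { BenchmarkRunner, BenchmarkSummary } from 'codecrucible-synth';

async function report(): Promise<void> {
    const runner = new BenchmarkRunner();

    // Model name and options are illustrative; all option fields are optional.
    const summary: BenchmarkSummary = await runner.runBenchmark('llama3', {
        categories: ['math', 'algorithms'],
        difficulties: ['easy', 'medium'],
        limit: 10,
        timeoutMs: 30_000,
    });

    console.log(`Model: ${summary.modelUsed}`);
    console.log(`Passed ${summary.passed}/${summary.totalChallenges} (${(summary.successRate * 100).toFixed(1)}%)`);
    console.log(`Average time: ${summary.averageTime.toFixed(0)} ms, average confidence: ${summary.averageConfidence.toFixed(2)}`);

    // Per-category and per-difficulty breakdowns are plain records keyed by name.
    for (const [category, stats] of Object.entries(summary.categoryBreakdown)) {
        console.log(`  ${category}: ${stats.passed}/${stats.total}`);
    }
    for (const [difficulty, stats] of Object.entries(summary.difficultyBreakdown)) {
        console.log(`  ${difficulty}: ${stats.passed}/${stats.total}`);
    }

    // Individual failures can be inspected via detailedResults.
    for (const result of summary.detailedResults.filter(r => !r.passed)) {
        console.error(`Challenge ${result.challengeId} failed:`, result.errors.join('; '));
    }
}

report().catch(console.error);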