UNPKG

shipdeck

Version:

Ship MVPs in 48 hours. Fix bugs in 30 seconds. The command deck for developers who ship.

118 lines (99 loc) • 3.75 kB
/**
 * Test file for LLM Judge Agent
 * Quick validation of basic functionality
 */

const { LLMJudgeAgent } = require('./llm-judge');

/**
 * Runs a series of smoke checks against an LLMJudgeAgent and logs each result.
 *
 * The agent is constructed with `anthropicConfig.skipValidation: true`, and the
 * function only calls its synchronous-looking helpers (`getMetadata`,
 * `getCapabilities`, `validateTask`, `calculateOverallScore`, `scoreToRating`,
 * `shuffleArray`); nothing is awaited.
 *
 * Errors are never propagated to the caller: an unexpected exception is logged
 * with its stack and reported through the return value instead.
 *
 * @returns {Promise<boolean>} `true` when every check passed; `false` when at
 *   least one check failed or an unexpected error was thrown.
 */
async function testLLMJudge() {
  console.log('🧪 Testing LLM Judge Agent...\n');

  // Number of checks that did not pass; drives the final summary and the
  // return value so a failing run can no longer report success.
  let failedChecks = 0;

  try {
    // Initialize agent
    const judge = new LLMJudgeAgent({
      anthropicConfig: {
        skipValidation: true, // Skip API validation for testing
      },
    });

    // Test 1: Agent metadata
    console.log('✅ Test 1: Agent Metadata');
    const metadata = judge.getMetadata();
    console.log('Agent Name:', metadata.name);
    console.log('Capabilities:', metadata.capabilities);
    console.log('Version:', metadata.version);
    console.log();

    // Test 2: Capabilities check
    console.log('✅ Test 2: Capabilities Check');
    const capabilities = judge.getCapabilities();
    const expectedCapabilities = ['compare', 'evaluate', 'judge', 'rank', 'select-best'];
    const hasRequiredCapabilities = expectedCapabilities.every((cap) => capabilities.includes(cap));
    if (!hasRequiredCapabilities) {
      failedChecks += 1;
    }
    console.log('Has required capabilities:', hasRequiredCapabilities);
    console.log('All capabilities:', capabilities);
    console.log();

    // Test 3: Validation
    console.log('✅ Test 3: Task Validation');

    // Should pass
    try {
      judge.validateTask({ type: 'compare', solutions: [] });
      console.log('Valid task validation: PASSED');
    } catch (error) {
      failedChecks += 1;
      console.log('Valid task validation: FAILED -', error.message);
    }

    // Should fail: reaching the line after validateTask means nothing was thrown.
    try {
      judge.validateTask({});
      failedChecks += 1;
      console.log('Invalid task validation: FAILED (should have thrown)');
    } catch (error) {
      console.log('Invalid task validation: PASSED (correctly rejected)');
    }
    console.log();

    // Test 4: Comparison task setup (without actual AI call)
    console.log('✅ Test 4: Comparison Task Structure');
    const mockSolutions = [
      { id: 'sol1', code: 'function add(a, b) { return a + b; }' },
      { id: 'sol2', code: 'const add = (a, b) => a + b;' },
    ];
    const comparisonTask = {
      type: 'compare',
      solutions: mockSolutions,
      criteria: {
        correctness: { weight: 0.4 },
        readability: { weight: 0.6 },
      },
    };

    try {
      judge.validateTask(comparisonTask);
      console.log('Comparison task structure: VALID');
      console.log('Solutions count:', comparisonTask.solutions.length);
    } catch (error) {
      failedChecks += 1;
      console.log('Comparison task structure: INVALID -', error.message);
    }
    console.log();

    // Test 5: Utility functions
    console.log('✅ Test 5: Utility Functions');

    // Test score calculation
    const mockScores = { correctness: 8, readability: 6 };
    const mockCriteria = {
      correctness: { weight: 0.4 },
      readability: { weight: 0.6 },
    };
    const overallScore = judge.calculateOverallScore(mockScores, mockCriteria);
    console.log('Overall score calculation:', overallScore.toFixed(2));

    // Test rating conversion
    const rating = judge.scoreToRating(overallScore);
    console.log('Score to rating:', rating);

    // Test array shuffle (only the length is compared, as in the original check)
    const testArray = [1, 2, 3, 4, 5];
    const shuffled = judge.shuffleArray(testArray);
    const shuffleKeptLength = shuffled.length === testArray.length;
    if (!shuffleKeptLength) {
      failedChecks += 1;
    }
    console.log('Array shuffle test:', shuffleKeptLength ? 'PASSED' : 'FAILED');
    console.log();

    // Only claim success when every check above actually passed.
    if (failedChecks === 0) {
      console.log('🎉 All tests completed successfully!');
      console.log('\nLLM Judge Agent is ready for use with the following capabilities:');
      capabilities.forEach((cap) => console.log(` - ${cap}`));
    } else {
      console.error(`❌ ${failedChecks} check(s) failed`);
    }

    return failedChecks === 0;
  } catch (error) {
    console.error('❌ Test failed:', error.message);
    console.error(error.stack);
    return false;
  }
}

// Run tests if called directly
if (require.main === module) {
  testLLMJudge()
    .then((passed) => {
      // Signal failure to the shell/CI without cutting off pending output.
      if (!passed) {
        process.exitCode = 1;
      }
    })
    .catch((error) => {
      // testLLMJudge handles its own errors; this is a last-resort guard so
      // the promise is never left without a rejection handler.
      console.error(error);
      process.exitCode = 1;
    });
}

module.exports = { testLLMJudge };