judgeval

Judgment SDK for TypeScript/JavaScript
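
The test file below mocks JudgmentClient and drives its evaluate/runEvaluation methods. As a rough sketch of the calling pattern those tests exercise (a sketch only: the file itself imports from relative compiled paths, so the package-level import shown here is an assumption):

    import { JudgmentClient, ExampleBuilder, AnswerCorrectnessScorer } from 'judgeval'; // assumed entry-point re-exports

    const client = JudgmentClient.getInstance();

    // Build a single example and score it, mirroring the first test case.
    const example = new ExampleBuilder()
        .input('What is the capital of France?')
        .actualOutput('The capital of France is Paris.')
        .expectedOutput('Paris is the capital of France.')
        .build();

    const results = await client.evaluate({
        examples: [example],
        scorers: [new AnswerCorrectnessScorer()],
    });
    console.log(results[0].scorersData[0].score, results[0].scorersData[0].success);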

basic_evaluation.test.js (221 lines, 10.4 kB)
"use strict"; /** * @file basic_evaluation.test.ts * @description Tests for basic evaluation functionality. * This file tests: * - Evaluation execution * - Result handling * - Trace generation and management * - Trace comparison * - Error handling * - Async behavior * - Timeout handling */ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } return new (P || (P = Promise))(function (resolve, reject) { function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } step((generator = generator.apply(thisArg, _arguments || [])).next()); }); }; Object.defineProperty(exports, "__esModule", { value: true }); const globals_1 = require("@jest/globals"); const judgment_client_js_1 = require("../../judgment-client.js"); const example_js_1 = require("../../data/example.js"); const result_js_1 = require("../../data/result.js"); const api_scorer_js_1 = require("../../scorers/api-scorer.js"); // Mock the JudgmentClient globals_1.jest.mock('../../judgment-client.js'); (0, globals_1.describe)('Basic Evaluation Example', () => { let client; let mockEvaluate; let mockRunEvaluation; (0, globals_1.beforeEach)(() => { mockEvaluate = globals_1.jest.fn(); mockRunEvaluation = globals_1.jest.fn(); // Reset and setup mocks judgment_client_js_1.JudgmentClient.getInstance.mockReturnValue({ evaluate: mockEvaluate, runEvaluation: mockRunEvaluation, }); client = judgment_client_js_1.JudgmentClient.getInstance(); }); (0, globals_1.describe)('Single Example Evaluation', () => { (0, globals_1.it)('should evaluate a single example with one scorer', () => __awaiter(void 0, void 0, void 0, function* () { const example = new example_js_1.ExampleBuilder() .input("What is the capital of France?") .actualOutput("The capital of France is Paris.") .expectedOutput("Paris is the capital of France.") .build(); const scorer = new api_scorer_js_1.AnswerCorrectnessScorer(); const expectedResult = new result_js_1.ScoringResult({ dataObject: example, scorersData: [{ name: 'answer_correctness', score: 0.85, threshold: 0.7, success: true, reason: 'Test passed', strict_mode: false, evaluation_model: null, error: null, evaluation_cost: 0, verbose_logs: null, additional_metadata: {} }] }); mockEvaluate.mockResolvedValue([expectedResult]); const results = yield client.evaluate({ examples: [example], scorers: [scorer], }); (0, globals_1.expect)(mockEvaluate).toHaveBeenCalledWith({ examples: [example], scorers: [scorer], }); (0, globals_1.expect)(results).toHaveLength(1); (0, globals_1.expect)(results[0].scorersData).toHaveLength(1); (0, globals_1.expect)(results[0].scorersData[0].score).toBe(0.85); (0, globals_1.expect)(results[0].scorersData[0].success).toBe(true); (0, globals_1.expect)(results[0].dataObject).toBe(example); })); (0, globals_1.it)('should evaluate a single example with multiple scorers', () => __awaiter(void 0, void 0, void 0, function* () { const example = new example_js_1.ExampleBuilder() .input("Based on the context, what is the capital of France?") .actualOutput("According to the context, Paris is the capital of France.") .context(["France is a country in Western Europe.", "Paris is the capital of France."]) .build(); const scorers = [ new 
api_scorer_js_1.AnswerCorrectnessScorer(), new api_scorer_js_1.ContextualRelevancyScorer(), new api_scorer_js_1.FaithfulnessScorer(), ]; const expectedResult = new result_js_1.ScoringResult({ dataObject: example, scorersData: scorers.map(scorer => ({ name: scorer.type, score: 0.9, threshold: 0.7, success: true, reason: 'Test passed', strict_mode: false, evaluation_model: null, error: null, evaluation_cost: 0, verbose_logs: null, additional_metadata: {} })) }); mockEvaluate.mockResolvedValue([expectedResult]); const results = yield client.evaluate({ examples: [example], scorers, }); (0, globals_1.expect)(mockEvaluate).toHaveBeenCalledWith({ examples: [example], scorers, }); (0, globals_1.expect)(results).toHaveLength(1); (0, globals_1.expect)(results[0].scorersData).toHaveLength(3); (0, globals_1.expect)(results[0].dataObject).toBe(example); })); }); (0, globals_1.describe)('Batch Evaluation', () => { (0, globals_1.it)('should evaluate multiple examples in batches', () => __awaiter(void 0, void 0, void 0, function* () { const examples = [ new example_js_1.ExampleBuilder() .input("What is the capital of France?") .actualOutput("Paris is the capital of France.") .build(), new example_js_1.ExampleBuilder() .input("What is the capital of Japan?") .actualOutput("Tokyo is the capital of Japan.") .build(), new example_js_1.ExampleBuilder() .input("What is the capital of Germany?") .actualOutput("Berlin is the capital of Germany.") .build(), ]; const scorers = [new api_scorer_js_1.AnswerCorrectnessScorer()]; const expectedResults = examples.map(example => new result_js_1.ScoringResult({ dataObject: example, scorersData: [{ name: 'answer_correctness', score: 1.0, threshold: 0.7, success: true, reason: 'Test passed', strict_mode: false, evaluation_model: 'gpt-4', error: null, evaluation_cost: 0, verbose_logs: null, additional_metadata: {} }] })); mockRunEvaluation.mockResolvedValue(expectedResults); const results = yield client.runEvaluation(examples, scorers, 'gpt-4'); (0, globals_1.expect)(mockRunEvaluation).toHaveBeenCalledWith(examples, scorers, 'gpt-4'); (0, globals_1.expect)(results).toHaveLength(3); results.forEach(result => { (0, globals_1.expect)(result.scorersData).toHaveLength(1); (0, globals_1.expect)(result.scorersData[0].evaluation_model).toBe('gpt-4'); }); })); }); (0, globals_1.describe)('Error Handling', () => { (0, globals_1.it)('should handle evaluation errors', () => __awaiter(void 0, void 0, void 0, function* () { const example = new example_js_1.ExampleBuilder() .input('What is 2+2?') .actualOutput('4') .expectedOutput('4') .build(); const scorer = new api_scorer_js_1.AnswerCorrectnessScorer(); mockEvaluate.mockRejectedValue(new Error('Evaluation failed')); yield (0, globals_1.expect)(client.evaluate({ examples: [example], scorers: [scorer], model: 'gpt-4' })).rejects.toThrow('Evaluation failed'); })); }); (0, globals_1.describe)('Async Behavior', () => { (0, globals_1.it)('should handle timeout', () => __awaiter(void 0, void 0, void 0, function* () { const example = new example_js_1.ExampleBuilder() .input('What is 2+2?') .actualOutput('4') .expectedOutput('4') .build(); const scorer = new api_scorer_js_1.AnswerCorrectnessScorer(); mockEvaluate.mockImplementation(() => new Promise((resolve) => setTimeout(() => resolve([ new result_js_1.ScoringResult({ dataObject: example, scorersData: [{ name: 'answer_correctness', score: 1.0, threshold: 0.7, success: true, reason: 'Correct answer', strict_mode: false, evaluation_model: 'gpt-4', error: null, evaluation_cost: 0, verbose_logs: 
null, additional_metadata: {} }] }) ]), 1000))); yield (0, globals_1.expect)(Promise.race([ client.evaluate({ examples: [example], scorers: [scorer], model: 'gpt-4' }), new Promise((_, reject) => setTimeout(() => reject(new Error('Timeout')), 500)) ])).rejects.toThrow('Timeout'); })); }); }); //# sourceMappingURL=basic_evaluation.test.js.map
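
The timeout test above races client.evaluate against a timer that rejects after 500 ms, while the mock resolves only after 1000 ms. That pattern generalizes into a small helper; this is a sketch of the technique, not part of the SDK:

    // Reject with 'Timeout' if `work` does not settle within `ms` milliseconds.
    function withTimeout<T>(work: Promise<T>, ms: number): Promise<T> {
        return Promise.race([
            work,
            new Promise<never>((_, reject) =>
                setTimeout(() => reject(new Error('Timeout')), ms)),
        ]);
    }

    // Usage mirroring the 'should handle timeout' test:
    // await expect(withTimeout(client.evaluate({ ... }), 500)).rejects.toThrow('Timeout');

Note that the timer is not cleared when the work settles first; that is harmless in a short-lived test process, but longer-running code would typically capture the timer handle and call clearTimeout.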