UNPKG

judgeval

Version:

Judgment SDK for TypeScript/JavaScript

176 lines 8.95 kB
"use strict"; /** * @file api_scorer.test.ts * @description Tests for API-based scorers. * This file tests: * - Scorer initialization and configuration * - Trace-aware scoring * - Trace context handling * - Trace metadata validation * - Error handling * - Threshold validation * - Scorer metadata */ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } return new (P || (P = Promise))(function (resolve, reject) { function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } step((generator = generator.apply(thisArg, _arguments || [])).next()); }); }; Object.defineProperty(exports, "__esModule", { value: true }); const globals_1 = require("@jest/globals"); const example_js_1 = require("../../data/example.js"); const api_scorer_js_1 = require("../../scorers/api-scorer.js"); // Mock the APIJudgmentScorer base class globals_1.jest.mock('../../scorers/base-scorer.js', () => { const mockScorer = globals_1.jest.fn().mockImplementation((...args) => { const [type, threshold = 0.7, additional_metadata = {}, strict_mode = false, async_mode = true, verbose_mode = true, include_reason = true] = args; const mockInstance = { type, scoreType: type, threshold, additional_metadata, strict_mode, async_mode, verbose_mode, include_reason, requiredFields: ['input', 'actual_output'], validateThreshold: globals_1.jest.fn(), toJSON: function () { return { score_type: this.type, threshold: this.threshold, additional_metadata: this.additional_metadata, strict_mode: this.strict_mode, async_mode: this.async_mode, verbose_mode: this.verbose_mode, include_reason: this.include_reason }; }, a_score_example: globals_1.jest.fn().mockImplementation(() => Promise.reject(new Error('API scorers are evaluated on the server side'))), }; return mockInstance; }); return { APIJudgmentScorer: mockScorer, }; }); (0, globals_1.describe)('API Scorers', () => { const mockExample = new example_js_1.ExampleBuilder() .input('What is the capital of France?') .actualOutput('The capital of France is Paris.') .expectedOutput('Paris is the capital of France.') .context(['France is a country in Western Europe.', 'Paris is the capital of France.']) .retrievalContext(['France is a country in Western Europe.', 'Paris is the capital of France.']) .build(); (0, globals_1.describe)('Initialization', () => { (0, globals_1.it)('should initialize scorers with default values', () => { const scorer = new api_scorer_js_1.AnswerCorrectnessScorer(); (0, globals_1.expect)(scorer.threshold).toBe(0.7); (0, globals_1.expect)(scorer.strict_mode).toBe(false); (0, globals_1.expect)(scorer.async_mode).toBe(true); (0, globals_1.expect)(scorer.verbose_mode).toBe(true); (0, globals_1.expect)(scorer.include_reason).toBe(true); }); (0, globals_1.it)('should initialize scorers with custom values', () => { const scorer = new api_scorer_js_1.AnswerCorrectnessScorer(0.8, { custom: 'metadata' }, true, false, false, false); (0, globals_1.expect)(scorer.threshold).toBe(0.8); (0, globals_1.expect)(scorer.additional_metadata).toEqual({ custom: 'metadata' }); (0, globals_1.expect)(scorer.strict_mode).toBe(true); (0, globals_1.expect)(scorer.async_mode).toBe(false); (0, globals_1.expect)(scorer.verbose_mode).toBe(false); (0, globals_1.expect)(scorer.include_reason).toBe(false); }); }); (0, globals_1.describe)('Trace Context Handling', () => { (0, globals_1.it)('should validate required context fields', () => { const scorer = new api_scorer_js_1.ContextualRelevancyScorer(); (0, globals_1.expect)(scorer.requiredFields).toContain('retrieval_context'); }); (0, globals_1.it)('should handle trace context in comparison scorer', () => { const criteria = ['Accuracy', 'Relevance']; const description = 'Compare outputs'; const scorer = new api_scorer_js_1.ComparisonScorer(0.5, criteria, description); (0, globals_1.expect)(scorer.criteria).toEqual(criteria); (0, globals_1.expect)(scorer.description).toBe(description); }); (0, globals_1.it)('should handle execution order with trace context', () => { const expectedTools = ['tool1', 'tool2']; const scorer = new api_scorer_js_1.ExecutionOrderScorer(1.0, expectedTools); (0, globals_1.expect)(scorer.expectedTools).toEqual(expectedTools); (0, globals_1.expect)(scorer.strictMode).toBe(false); }); }); (0, globals_1.describe)('Trace Metadata', () => { (0, globals_1.it)('should handle JSON schema in trace context', () => { const jsonSchema = { type: 'object', properties: { name: { type: 'string' }, age: { type: 'number' } } }; const scorer = new api_scorer_js_1.JsonCorrectnessScorer(0.7, jsonSchema); (0, globals_1.expect)(scorer.jsonSchema).toEqual(jsonSchema); }); (0, globals_1.it)('should serialize scorer with trace metadata', () => { const criteria = ['Accuracy', 'Relevance']; const description = 'Compare outputs'; const scorer = new api_scorer_js_1.ComparisonScorer(0.5, criteria, description, { trace: 'metadata' }); const json = scorer.toJSON(); (0, globals_1.expect)(json).toEqual({ score_type: 'comparison', threshold: 0.5, additional_metadata: { trace: 'metadata' }, strict_mode: false, async_mode: true, verbose_mode: true, include_reason: true }); }); }); (0, globals_1.describe)('Error Handling', () => { (0, globals_1.it)('should throw error for invalid threshold', () => { (0, globals_1.expect)(() => new api_scorer_js_1.ComparisonScorer(-0.1)).toThrow('Threshold for comparison must be greater than or equal to 0'); }); (0, globals_1.it)('should throw error for server-side evaluation', () => __awaiter(void 0, void 0, void 0, function* () { const scorer = new api_scorer_js_1.AnswerCorrectnessScorer(); const example = new example_js_1.ExampleBuilder() .input('test') .actualOutput('test') .build(); yield (0, globals_1.expect)(scorer.a_score_example(example)).rejects.toThrow('API scorers are evaluated on the server side'); })); }); (0, globals_1.describe)('Specialized Scorers', () => { (0, globals_1.it)('should initialize contextual scorers with required fields', () => { const contextualScorers = [ new api_scorer_js_1.ContextualPrecisionScorer(), new api_scorer_js_1.ContextualRecallScorer(), new api_scorer_js_1.ContextualRelevancyScorer(), new api_scorer_js_1.FaithfulnessScorer(), new api_scorer_js_1.HallucinationScorer() ]; for (const scorer of contextualScorers) { (0, globals_1.expect)(scorer.requiredFields).toContain('input'); (0, globals_1.expect)(scorer.requiredFields).toContain('actual_output'); } }); (0, globals_1.it)('should initialize specialized scorers with correct types', () => { const scorers = [ new api_scorer_js_1.GroundednessScorer(), new api_scorer_js_1.InstructionAdherenceScorer(), new api_scorer_js_1.SummarizationScorer(), new api_scorer_js_1.Text2SQLScorer() ]; for (const scorer of scorers) { (0, globals_1.expect)(scorer.threshold).toBe(0.7); (0, globals_1.expect)(typeof scorer.validateThreshold).toBe('function'); } }); }); }); //# sourceMappingURL=api_scorer.test.js.map