// judgeval
// Version:
// Judgment SDK for TypeScript/JavaScript
// 176 lines • 8.95 kB
// JavaScript
;
/**
* @file api_scorer.test.ts
* @description Tests for API-based scorers.
* This file tests:
* - Scorer initialization and configuration
* - Trace-aware scoring
* - Trace context handling
* - Trace metadata validation
* - Error handling
* - Threshold validation
* - Scorer metadata
*/
/**
 * Runs a generator function as an asynchronous routine: each yielded value is
 * waited on and its settled result is fed back into the generator.
 *
 * An `__awaiter` already present on `this` is reused when there is one.
 *
 * @param {*} thisArg - `this` value the generator function is applied with.
 * @param {ArrayLike<*>} [_arguments] - Arguments for the generator function; an empty list is used when falsy.
 * @param {Function} [P] - Promise constructor to use; the global `Promise` when falsy.
 * @param {Function} generator - Generator function to drive.
 * @returns {Promise<*>} Resolves with the generator's return value; errors
 *   caught while resuming the generator reject it.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Instances of P pass through untouched; anything else is wrapped in a new P.
    function toPromise(value) {
        if (value instanceof P) {
            return value;
        }
        return new P(function (resolve) {
            resolve(value);
        });
    }
    return new (P || (P = Promise))(function (resolve, reject) {
        // Resume the generator with the fulfilled value.
        function onFulfilled(value) {
            try {
                advance(generator.next(value));
            }
            catch (err) {
                reject(err);
            }
        }
        // Throw the rejection reason inside the generator so it may catch it.
        function onRejected(reason) {
            try {
                advance(generator["throw"](reason));
            }
            catch (err) {
                reject(err);
            }
        }
        // Settle once the generator is done; otherwise wait on what it yielded.
        function advance(result) {
            if (result.done) {
                resolve(result.value);
                return;
            }
            toPromise(result.value).then(onFulfilled, onRejected);
        }
        // From here on `generator` holds the iterator, not the generator function.
        generator = generator.apply(thisArg, _arguments || []);
        advance(generator.next());
    });
};
Object.defineProperty(exports, "__esModule", { value: true });
const globals_1 = require("@jest/globals");
const example_js_1 = require("../../data/example.js");
const api_scorer_js_1 = require("../../scorers/api-scorer.js");
// Replace the '../../scorers/base-scorer.js' module with a factory-built stub
// that exposes a mocked APIJudgmentScorer.
globals_1.jest.mock('../../scorers/base-scorer.js', () => {
    /**
     * Builds the plain object handed back whenever the mocked
     * APIJudgmentScorer is invoked. Arguments left undefined take the
     * defaults declared in the parameter list.
     */
    const createScorerStub = (type, threshold = 0.7, additional_metadata = {}, strict_mode = false, async_mode = true, verbose_mode = true, include_reason = true) => ({
        type,
        scoreType: type,
        threshold,
        additional_metadata,
        strict_mode,
        async_mode,
        verbose_mode,
        include_reason,
        requiredFields: ['input', 'actual_output'],
        // No-op spy: the stub itself performs no threshold validation.
        validateThreshold: globals_1.jest.fn(),
        // Reads from `this` at call time, so values changed on the instance
        // after construction are what get serialized.
        toJSON() {
            const { type: score_type, threshold, additional_metadata, strict_mode, async_mode, verbose_mode, include_reason } = this;
            return {
                score_type,
                threshold,
                additional_metadata,
                strict_mode,
                async_mode,
                verbose_mode,
                include_reason
            };
        },
        // Always rejects with a fresh Error on every call.
        a_score_example: globals_1.jest.fn().mockImplementation(() => Promise.reject(new Error('API scorers are evaluated on the server side'))),
    });
    return {
        APIJudgmentScorer: globals_1.jest.fn().mockImplementation(createScorerStub),
    };
});
(0, globals_1.describe)('API Scorers', () => {
// Fixture example with input, actual/expected output, context and retrieval
// context all populated.
// NOTE(review): none of the tests visible in this suite read mockExample — confirm whether it is still needed.
const capitalQuestion = 'What is the capital of France?';
// Returns a fresh array on each call so context and retrieval context do not share one instance.
const franceFacts = () => ['France is a country in Western Europe.', 'Paris is the capital of France.'];
const mockExample = new example_js_1.ExampleBuilder()
    .input(capitalQuestion)
    .actualOutput('The capital of France is Paris.')
    .expectedOutput('Paris is the capital of France.')
    .context(franceFacts())
    .retrievalContext(franceFacts())
    .build();
// Constructor behaviour of AnswerCorrectnessScorer: values seen with no
// arguments, and values seen when every positional argument is supplied.
(0, globals_1.describe)('Initialization', () => {
    const { it, expect } = globals_1;
    it('should initialize scorers with default values', () => {
        const scorer = new api_scorer_js_1.AnswerCorrectnessScorer();
        // [property, value expected when the constructor receives no arguments]
        const expectedDefaults = [
            ['threshold', 0.7],
            ['strict_mode', false],
            ['async_mode', true],
            ['verbose_mode', true],
            ['include_reason', true],
        ];
        for (const [field, expected] of expectedDefaults) {
            expect(scorer[field]).toBe(expected);
        }
    });
    it('should initialize scorers with custom values', () => {
        const scorer = new api_scorer_js_1.AnswerCorrectnessScorer(0.8, { custom: 'metadata' }, true, false, false, false);
        expect(scorer.threshold).toBe(0.8);
        // Compared structurally; the remaining fields are primitives.
        expect(scorer.additional_metadata).toEqual({ custom: 'metadata' });
        // [property, value passed positionally above]
        const expectedFlags = [
            ['strict_mode', true],
            ['async_mode', false],
            ['verbose_mode', false],
            ['include_reason', false],
        ];
        for (const [field, expected] of expectedFlags) {
            expect(scorer[field]).toBe(expected);
        }
    });
});
// Scorer-specific constructor arguments should be readable back from the
// constructed scorer.
(0, globals_1.describe)('Trace Context Handling', () => {
    const { it, expect } = globals_1;
    it('should validate required context fields', () => {
        const { requiredFields } = new api_scorer_js_1.ContextualRelevancyScorer();
        expect(requiredFields).toContain('retrieval_context');
    });
    it('should handle trace context in comparison scorer', () => {
        const criteria = ['Accuracy', 'Relevance'];
        const description = 'Compare outputs';
        const comparison = new api_scorer_js_1.ComparisonScorer(0.5, criteria, description);
        expect(comparison.criteria).toEqual(criteria);
        expect(comparison.description).toBe(description);
    });
    it('should handle execution order with trace context', () => {
        const expectedTools = ['tool1', 'tool2'];
        const executionOrder = new api_scorer_js_1.ExecutionOrderScorer(1.0, expectedTools);
        expect(executionOrder.expectedTools).toEqual(expectedTools);
        // No strict-mode argument was passed, and the test expects false here.
        expect(executionOrder.strictMode).toBe(false);
    });
});
// Schema storage on JsonCorrectnessScorer and the serialized shape produced
// by ComparisonScorer#toJSON.
(0, globals_1.describe)('Trace Metadata', () => {
    const { it, expect } = globals_1;
    it('should handle JSON schema in trace context', () => {
        const schemaProperties = {
            name: { type: 'string' },
            age: { type: 'number' }
        };
        const jsonSchema = { type: 'object', properties: schemaProperties };
        const { jsonSchema: storedSchema } = new api_scorer_js_1.JsonCorrectnessScorer(0.7, jsonSchema);
        expect(storedSchema).toEqual(jsonSchema);
    });
    it('should serialize scorer with trace metadata', () => {
        const criteria = ['Accuracy', 'Relevance'];
        const description = 'Compare outputs';
        const scorer = new api_scorer_js_1.ComparisonScorer(0.5, criteria, description, { trace: 'metadata' });
        // Exactly these keys are expected in the serialized form.
        const expectedJson = {
            score_type: 'comparison',
            threshold: 0.5,
            additional_metadata: { trace: 'metadata' },
            strict_mode: false,
            async_mode: true,
            verbose_mode: true,
            include_reason: true
        };
        expect(scorer.toJSON()).toEqual(expectedJson);
    });
});
// Failure paths: a negative threshold at construction time, and local scoring
// through a_score_example.
(0, globals_1.describe)('Error Handling', () => {
    const { it, expect } = globals_1;
    it('should throw error for invalid threshold', () => {
        // Wrapped in a function so the matcher can capture the throw.
        const constructWithNegativeThreshold = () => new api_scorer_js_1.ComparisonScorer(-0.1);
        expect(constructWithNegativeThreshold).toThrow('Threshold for comparison must be greater than or equal to 0');
    });
    it('should throw error for server-side evaluation', () => __awaiter(void 0, void 0, void 0, function* () {
        const scorer = new api_scorer_js_1.AnswerCorrectnessScorer();
        const exampleBuilder = new example_js_1.ExampleBuilder();
        const example = exampleBuilder.input('test').actualOutput('test').build();
        const pendingScore = scorer.a_score_example(example);
        // Yielding the assertion keeps the test open until the rejection is checked.
        yield expect(pendingScore).rejects.toThrow('API scorers are evaluated on the server side');
    }));
});
// Smoke checks across groups of scorers, each constructed with no arguments.
(0, globals_1.describe)('Specialized Scorers', () => {
    const { it, expect } = globals_1;
    // Instantiates each constructor, in order, with no arguments.
    const instantiateAll = (scorerClasses) => scorerClasses.map((ScorerClass) => new ScorerClass());
    it('should initialize contextual scorers with required fields', () => {
        const contextualScorers = instantiateAll([
            api_scorer_js_1.ContextualPrecisionScorer,
            api_scorer_js_1.ContextualRecallScorer,
            api_scorer_js_1.ContextualRelevancyScorer,
            api_scorer_js_1.FaithfulnessScorer,
            api_scorer_js_1.HallucinationScorer,
        ]);
        contextualScorers.forEach(({ requiredFields }) => {
            expect(requiredFields).toContain('input');
            expect(requiredFields).toContain('actual_output');
        });
    });
    it('should initialize specialized scorers with correct types', () => {
        const scorers = instantiateAll([
            api_scorer_js_1.GroundednessScorer,
            api_scorer_js_1.InstructionAdherenceScorer,
            api_scorer_js_1.SummarizationScorer,
            api_scorer_js_1.Text2SQLScorer,
        ]);
        scorers.forEach((scorer) => {
            expect(scorer.threshold).toBe(0.7);
            expect(typeof scorer.validateThreshold).toBe('function');
        });
    });
});
});
//# sourceMappingURL=api_scorer.test.js.map