judgeval
Judgment SDK for TypeScript/JavaScript
/**
* @file api_scorer.test.ts
* @description Tests for API-based scorers.
* This file tests:
* - Scorer initialization and configuration (defaults and custom values)
* - Trace context handling (required fields, comparison and execution-order scorers)
* - Trace metadata (JSON schema handling and serialization)
* - Error handling (threshold validation and server-side-only evaluation)
* - Specialized scorers (contextual, groundedness, summarization, Text2SQL)
*/
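// Usage sketch (illustrative only, not executed by these tests): API scorers are
// configured locally and evaluated server-side, so typical use is to construct a
// scorer with a threshold and serialize it for the Judgment API. The constructor
// signature shown here is an assumption based on the mocks and assertions below:
//   const scorer = new AnswerCorrectnessScorer(0.8);  // custom threshold (assumed signature)
//   const payload = scorer.toJSON();                   // { score_type, threshold, ... } sent to the server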
import { describe, expect, it, jest } from '@jest/globals';
import { ExampleBuilder } from '../../data/example.js';
import { AnswerCorrectnessScorer, ComparisonScorer, ContextualPrecisionScorer, ContextualRecallScorer, ContextualRelevancyScorer, ExecutionOrderScorer, FaithfulnessScorer, GroundednessScorer, HallucinationScorer, InstructionAdherenceScorer, JsonCorrectnessScorer, SummarizationScorer, Text2SQLScorer } from '../../scorers/api-scorer.js';
// Mock the APIJudgmentScorer base class
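// The mock records the constructor arguments (score type, threshold, metadata, and
// the strict/async/verbose/include_reason flags), exposes toJSON() for the
// serialization tests, and makes a_score_example() reject to mirror the
// server-side-only evaluation of API scorers.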
jest.mock('../../scorers/base-scorer.js', () => {
const mockScorer = jest.fn().mockImplementation((...args) => {
const [type, threshold = 0.7, additional_metadata = {}, strict_mode = false, async_mode = true, verbose_mode = true, include_reason = true] = args;
const mockInstance = {
type,
scoreType: type,
threshold,
additional_metadata,
strict_mode,
async_mode,
verbose_mode,
include_reason,
requiredFields: ['input', 'actual_output'],
validateThreshold: jest.fn(),
toJSON: function () {
return {
score_type: this.type,
threshold: this.threshold,
additional_metadata: this.additional_metadata,
strict_mode: this.strict_mode,
async_mode: this.async_mode,
verbose_mode: this.verbose_mode,
include_reason: this.include_reason
};
},
a_score_example: jest.fn().mockImplementation(() => Promise.reject(new Error('API scorers are evaluated on the server side'))),
};
return mockInstance;
});
return {
APIJudgmentScorer: mockScorer,
};
});
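// Only the APIJudgmentScorer base class is mocked; subclass-specific fields asserted
// below (criteria, description, expectedTools, jsonSchema, and per-scorer
// requiredFields) come from the real scorer classes in api-scorer.js.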
describe('API Scorers', () => {
const mockExample = new ExampleBuilder()
.input('What is the capital of France?')
.actualOutput('The capital of France is Paris.')
.expectedOutput('Paris is the capital of France.')
.context(['France is a country in Western Europe.', 'Paris is the capital of France.'])
.retrievalContext(['France is a country in Western Europe.', 'Paris is the capital of France.'])
.build();
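// Note: mockExample is a shared fixture with context and retrieval_context populated;
// it is not referenced directly by the tests below, which build their own minimal
// examples where needed.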
describe('Initialization', () => {
it('should initialize scorers with default values', () => {
const scorer = new AnswerCorrectnessScorer();
expect(scorer.threshold).toBe(0.7);
expect(scorer.strict_mode).toBe(false);
expect(scorer.async_mode).toBe(true);
expect(scorer.verbose_mode).toBe(true);
expect(scorer.include_reason).toBe(true);
});
it('should initialize scorers with custom values', () => {
const scorer = new AnswerCorrectnessScorer(0.8, { custom: 'metadata' }, true, false, false, false);
expect(scorer.threshold).toBe(0.8);
expect(scorer.additional_metadata).toEqual({ custom: 'metadata' });
expect(scorer.strict_mode).toBe(true);
expect(scorer.async_mode).toBe(false);
expect(scorer.verbose_mode).toBe(false);
expect(scorer.include_reason).toBe(false);
});
});
describe('Trace Context Handling', () => {
it('should validate required context fields', () => {
const scorer = new ContextualRelevancyScorer();
expect(scorer.requiredFields).toContain('retrieval_context');
});
it('should handle trace context in comparison scorer', () => {
const criteria = ['Accuracy', 'Relevance'];
const description = 'Compare outputs';
const scorer = new ComparisonScorer(0.5, criteria, description);
expect(scorer.criteria).toEqual(criteria);
expect(scorer.description).toBe(description);
});
it('should handle execution order with trace context', () => {
const expectedTools = ['tool1', 'tool2'];
const scorer = new ExecutionOrderScorer(1.0, expectedTools);
expect(scorer.expectedTools).toEqual(expectedTools);
expect(scorer.strictMode).toBe(false);
});
});
describe('Trace Metadata', () => {
it('should handle JSON schema in trace context', () => {
const jsonSchema = {
type: 'object',
properties: {
name: { type: 'string' },
age: { type: 'number' }
}
};
const scorer = new JsonCorrectnessScorer(0.7, jsonSchema);
expect(scorer.jsonSchema).toEqual(jsonSchema);
});
it('should serialize scorer with trace metadata', () => {
const criteria = ['Accuracy', 'Relevance'];
const description = 'Compare outputs';
const scorer = new ComparisonScorer(0.5, criteria, description, { trace: 'metadata' });
const json = scorer.toJSON();
expect(json).toEqual({
score_type: 'comparison',
threshold: 0.5,
additional_metadata: { trace: 'metadata' },
strict_mode: false,
async_mode: true,
verbose_mode: true,
include_reason: true
});
});
});
describe('Error Handling', () => {
it('should throw error for invalid threshold', () => {
expect(() => new ComparisonScorer(-0.1)).toThrow('Threshold for comparison must be greater than or equal to 0');
});
it('should throw error for server-side evaluation', async () => {
const scorer = new AnswerCorrectnessScorer();
const example = new ExampleBuilder()
.input('test')
.actualOutput('test')
.build();
await expect(scorer.a_score_example(example)).rejects.toThrow('API scorers are evaluated on the server side');
});
});
describe('Specialized Scorers', () => {
it('should initialize contextual scorers with required fields', () => {
const contextualScorers = [
new ContextualPrecisionScorer(),
new ContextualRecallScorer(),
new ContextualRelevancyScorer(),
new FaithfulnessScorer(),
new HallucinationScorer()
];
for (const scorer of contextualScorers) {
expect(scorer.requiredFields).toContain('input');
expect(scorer.requiredFields).toContain('actual_output');
}
});
it('should initialize specialized scorers with correct types', () => {
const scorers = [
new GroundednessScorer(),
new InstructionAdherenceScorer(),
new SummarizationScorer(),
new Text2SQLScorer()
];
for (const scorer of scorers) {
expect(scorer.threshold).toBe(0.7);
expect(typeof scorer.validateThreshold).toBe('function');
}
});
});
});