judgeval
Judgment SDK for TypeScript/JavaScript
JavaScript
"use strict";
/**
* @file basic_evaluation.test.ts
* @description Tests for basic evaluation functionality.
* This file tests:
* - Evaluation execution
* - Result handling
* - Error handling
* - Async behavior
* - Timeout handling
*/
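// A minimal sketch of the client calls exercised below (method names come
// from the mocks in this file; the exact SDK signatures are assumed):
//
//   const client = JudgmentClient.getInstance();
//   const results = await client.evaluate({ examples, scorers, model: 'gpt-4' });
//   const batch = await client.runEvaluation(examples, scorers, 'gpt-4');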
const { describe, it, expect, beforeEach, jest } = require('@jest/globals');
const { JudgmentClient } = require('../../judgment-client.js');
const { ExampleBuilder } = require('../../data/example.js');
const { ScoringResult } = require('../../data/result.js');
const {
  AnswerCorrectnessScorer,
  ContextualRelevancyScorer,
  FaithfulnessScorer,
} = require('../../scorers/api-scorer.js');

// Mock the JudgmentClient so no test touches the real evaluation API
jest.mock('../../judgment-client.js');
describe('Basic Evaluation Example', () => {
  let client;
  let mockEvaluate;
  let mockRunEvaluation;

  beforeEach(() => {
    mockEvaluate = jest.fn();
    mockRunEvaluation = jest.fn();
    // Reset and set up mocks: jest.mock() automocks the module, so the
    // singleton accessor can be stubbed to return our fake client
    JudgmentClient.getInstance.mockReturnValue({
      evaluate: mockEvaluate,
      runEvaluation: mockRunEvaluation,
    });
    client = JudgmentClient.getInstance();
  });
  describe('Single Example Evaluation', () => {
    it('should evaluate a single example with one scorer', async () => {
      const example = new ExampleBuilder()
        .input("What is the capital of France?")
        .actualOutput("The capital of France is Paris.")
        .expectedOutput("Paris is the capital of France.")
        .build();
      const scorer = new AnswerCorrectnessScorer();
      const expectedResult = new ScoringResult({
        dataObject: example,
        scorersData: [{
          name: 'answer_correctness',
          score: 0.85,
          threshold: 0.7,
          success: true,
          reason: 'Test passed',
          strict_mode: false,
          evaluation_model: null,
          error: null,
          evaluation_cost: 0,
          verbose_logs: null,
          additional_metadata: {}
        }]
      });
      mockEvaluate.mockResolvedValue([expectedResult]);

      const results = await client.evaluate({
        examples: [example],
        scorers: [scorer],
      });

      expect(mockEvaluate).toHaveBeenCalledWith({
        examples: [example],
        scorers: [scorer],
      });
      expect(results).toHaveLength(1);
      expect(results[0].scorersData).toHaveLength(1);
      expect(results[0].scorersData[0].score).toBe(0.85);
      expect(results[0].scorersData[0].success).toBe(true);
      expect(results[0].dataObject).toBe(example);
    });
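    // The scorer-data entries built in these tests share one field set
    // (name, score, threshold, success, reason, strict_mode,
    // evaluation_model, error, evaluation_cost, verbose_logs,
    // additional_metadata); that shape is inferred from this file, not
    // from the SDK's canonical schema.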
    it('should evaluate a single example with multiple scorers', async () => {
      const example = new ExampleBuilder()
        .input("Based on the context, what is the capital of France?")
        .actualOutput("According to the context, Paris is the capital of France.")
        .context(["France is a country in Western Europe.", "Paris is the capital of France."])
        .build();
      const scorers = [
        new AnswerCorrectnessScorer(),
        new ContextualRelevancyScorer(),
        new FaithfulnessScorer(),
      ];
      const expectedResult = new ScoringResult({
        dataObject: example,
        // One scorer-data entry per scorer, named after the scorer's type
        scorersData: scorers.map(scorer => ({
          name: scorer.type,
          score: 0.9,
          threshold: 0.7,
          success: true,
          reason: 'Test passed',
          strict_mode: false,
          evaluation_model: null,
          error: null,
          evaluation_cost: 0,
          verbose_logs: null,
          additional_metadata: {}
        }))
      });
      mockEvaluate.mockResolvedValue([expectedResult]);

      const results = await client.evaluate({
        examples: [example],
        scorers,
      });

      expect(mockEvaluate).toHaveBeenCalledWith({
        examples: [example],
        scorers,
      });
      expect(results).toHaveLength(1);
      expect(results[0].scorersData).toHaveLength(3);
      expect(results[0].dataObject).toBe(example);
    });
  });
  describe('Batch Evaluation', () => {
    it('should evaluate multiple examples in batches', async () => {
      const examples = [
        new ExampleBuilder()
          .input("What is the capital of France?")
          .actualOutput("Paris is the capital of France.")
          .build(),
        new ExampleBuilder()
          .input("What is the capital of Japan?")
          .actualOutput("Tokyo is the capital of Japan.")
          .build(),
        new ExampleBuilder()
          .input("What is the capital of Germany?")
          .actualOutput("Berlin is the capital of Germany.")
          .build(),
      ];
      const scorers = [new AnswerCorrectnessScorer()];
      const expectedResults = examples.map(example => new ScoringResult({
        dataObject: example,
        scorersData: [{
          name: 'answer_correctness',
          score: 1.0,
          threshold: 0.7,
          success: true,
          reason: 'Test passed',
          strict_mode: false,
          evaluation_model: 'gpt-4',
          error: null,
          evaluation_cost: 0,
          verbose_logs: null,
          additional_metadata: {}
        }]
      }));
      mockRunEvaluation.mockResolvedValue(expectedResults);

      const results = await client.runEvaluation(examples, scorers, 'gpt-4');

      expect(mockRunEvaluation).toHaveBeenCalledWith(examples, scorers, 'gpt-4');
      expect(results).toHaveLength(3);
      results.forEach(result => {
        expect(result.scorersData).toHaveLength(1);
        expect(result.scorersData[0].evaluation_model).toBe('gpt-4');
      });
    });
  });
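  // Note the two call styles above: evaluate() takes a single options
  // object, while runEvaluation() is called positionally with
  // (examples, scorers, model).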
  describe('Error Handling', () => {
    it('should handle evaluation errors', async () => {
      const example = new ExampleBuilder()
        .input('What is 2+2?')
        .actualOutput('4')
        .expectedOutput('4')
        .build();
      const scorer = new AnswerCorrectnessScorer();
      mockEvaluate.mockRejectedValue(new Error('Evaluation failed'));

      await expect(client.evaluate({
        examples: [example],
        scorers: [scorer],
        model: 'gpt-4'
      })).rejects.toThrow('Evaluation failed');
    });
  });
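  // The mock above rejects outright, so the assertion checks that
  // evaluate() propagates the rejection to the caller instead of
  // swallowing it.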
  describe('Async Behavior', () => {
    it('should handle timeout', async () => {
      const example = new ExampleBuilder()
        .input('What is 2+2?')
        .actualOutput('4')
        .expectedOutput('4')
        .build();
      const scorer = new AnswerCorrectnessScorer();
      // The mocked evaluation resolves only after 1000 ms...
      mockEvaluate.mockImplementation(() => new Promise((resolve) => setTimeout(() => resolve([
        new ScoringResult({
          dataObject: example,
          scorersData: [{
            name: 'answer_correctness',
            score: 1.0,
            threshold: 0.7,
            success: true,
            reason: 'Correct answer',
            strict_mode: false,
            evaluation_model: 'gpt-4',
            error: null,
            evaluation_cost: 0,
            verbose_logs: null,
            additional_metadata: {}
          }]
        })
      ]), 1000)));

      // ...so racing it against a 500 ms timer must reject with 'Timeout'
      await expect(Promise.race([
        client.evaluate({
          examples: [example],
          scorers: [scorer],
          model: 'gpt-4'
        }),
        new Promise((_, reject) => setTimeout(() => reject(new Error('Timeout')), 500))
      ])).rejects.toThrow('Timeout');
    });
  });
});