judgeval: Judgment SDK for TypeScript/JavaScript
"use strict";
/**
* E2E tests for evaluation operations in the JudgmentClient.
* Migrated from the Python SDK's test_eval_operations.py
*/
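// Standard helpers emitted by the TypeScript compiler for CommonJS
// module interop and down-leveled async/await.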
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const dotenv = __importStar(require("dotenv"));
const judgment_client_js_1 = require("../judgment-client.js");
const example_js_1 = require("../data/example.js");
const api_scorer_js_1 = require("../scorers/api-scorer.js");
const axios_1 = __importDefault(require("axios"));
// Load environment variables
dotenv.config();
// Generate a random string for test names
const generateRandomString = (length = 20) => {
const characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789';
let result = '';
for (let i = 0; i < length; i++) {
result += characters.charAt(Math.floor(Math.random() * characters.length));
}
return result;
};
describe('Evaluation Operations', () => {
let client;
beforeAll(() => {
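// Reuse a single shared JudgmentClient instance, configured from the
// environment variables loaded by dotenv above.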
client = judgment_client_js_1.JudgmentClient.getInstance();
});
/**
 * Helper that runs an evaluation over two cold-outreach examples
 * with the Faithfulness and Hallucination scorers.
 */
const runEvalHelper = (projectName, evalRunName) => __awaiter(void 0, void 0, void 0, function* () {
// Single step in our workflow: a cold-outreach sales agent
const example1 = new example_js_1.ExampleBuilder()
.input("Generate a cold outreach email for TechCorp. Facts: They recently launched an AI-powered analytics platform. Their CEO Sarah Chen previously worked at Google. They have 50+ enterprise clients.")
.actualOutput("Dear Ms. Chen,\n\nI noticed TechCorp's recent launch of your AI analytics platform and was impressed by its enterprise-focused approach. Your experience from Google clearly shines through in building scalable solutions, as evidenced by your impressive 50+ enterprise client base.\n\nWould you be open to a brief call to discuss how we could potentially collaborate?\n\nBest regards,\nAlex")
.retrievalContext(["TechCorp launched AI analytics platform in 2024", "Sarah Chen is CEO, ex-Google executive", "Current client base: 50+ enterprise customers"])
.build();
const example2 = new example_js_1.ExampleBuilder()
.input("Generate a cold outreach email for GreenEnergy Solutions. Facts: They're developing solar panel technology that's 30% more efficient. They're looking to expand into the European market. They won a sustainability award in 2023.")
.actualOutput("Dear GreenEnergy Solutions team,\n\nCongratulations on your 2023 sustainability award! Your innovative solar panel technology with 30% higher efficiency is exactly what the European market needs right now.\n\nI'd love to discuss how we could support your European expansion plans.\n\nBest regards,\nAlex")
.expectedOutput("A professional cold email mentioning the sustainability award, solar technology innovation, and European expansion plans")
.context(["Business Development"])
.retrievalContext(["GreenEnergy Solutions won 2023 sustainability award", "New solar technology 30% more efficient", "Planning European market expansion"])
.build();
const scorer = new api_scorer_js_1.FaithfulnessScorer(0.5);
const scorer2 = new api_scorer_js_1.HallucinationScorer(0.5);
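// Positional arguments mirror the override test below:
// examples, scorers, model, an optional argument left undefined here,
// metadata, log_results=true, projectName, evalRunName, override=true.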
return client.runEvaluation([example1, example2], [scorer, scorer2], "Qwen/Qwen2.5-72B-Instruct-Turbo", undefined, { batch: "test" }, true, projectName, evalRunName, true);
});
test('Basic evaluation workflow', () => __awaiter(void 0, void 0, void 0, function* () {
const PROJECT_NAME = "OutreachWorkflow";
const EVAL_RUN_NAME = "ColdEmailGenerator-Improve-BasePrompt";
yield runEvalHelper(PROJECT_NAME, EVAL_RUN_NAME);
const results = yield client.pullEval(PROJECT_NAME, EVAL_RUN_NAME);
expect(results).toBeTruthy();
expect(results.length).toBeGreaterThan(0);
// Clean up
yield client.deleteProject(PROJECT_NAME);
}));
test('Delete evaluation by project and run names', () => __awaiter(void 0, void 0, void 0, function* () {
var _a;
const PROJECT_NAME = generateRandomString();
const EVAL_RUN_NAMES = Array(3).fill(0).map(() => generateRandomString());
// Run evaluations with different run names
for (const evalRunName of EVAL_RUN_NAMES) {
yield runEvalHelper(PROJECT_NAME, evalRunName);
}
// Delete evaluations
yield client.deleteEval(PROJECT_NAME, EVAL_RUN_NAMES);
// Delete project
yield client.deleteProject(PROJECT_NAME);
// Verify evaluations are deleted
for (const evalRunName of EVAL_RUN_NAMES) {
try {
yield client.pullEval(PROJECT_NAME, evalRunName);
// If pullEval succeeds, the test should fail
throw new Error(`pullEval for ${evalRunName} should have failed after project deletion, but it succeeded.`);
}
catch (error) {
// Expect either 404 (ideal) or 500 (current behavior)
expect(axios_1.default.isAxiosError(error)).toBe(true);
if (axios_1.default.isAxiosError(error)) {
expect([404, 500]).toContain((_a = error.response) === null || _a === void 0 ? void 0 : _a.status);
}
else {
// If it's not an AxiosError, rethrow to fail the test
throw error;
}
}
}
}));
test('Delete evaluation by project', () => __awaiter(void 0, void 0, void 0, function* () {
var _a, _b;
const PROJECT_NAME = generateRandomString();
const EVAL_RUN_NAME = generateRandomString();
const EVAL_RUN_NAME2 = generateRandomString();
yield runEvalHelper(PROJECT_NAME, EVAL_RUN_NAME);
yield runEvalHelper(PROJECT_NAME, EVAL_RUN_NAME2);
// Delete project
yield client.deleteProject(PROJECT_NAME);
// Verify evaluations are deleted
try {
yield client.pullEval(PROJECT_NAME, EVAL_RUN_NAME);
throw new Error(`pullEval for ${EVAL_RUN_NAME} should have failed after project deletion, but it succeeded.`);
}
catch (error) {
expect(axios_1.default.isAxiosError(error)).toBe(true);
if (axios_1.default.isAxiosError(error)) {
expect([404, 500]).toContain((_a = error.response) === null || _a === void 0 ? void 0 : _a.status);
}
else {
throw error;
}
}
try {
yield client.pullEval(PROJECT_NAME, EVAL_RUN_NAME2);
throw new Error(`pullEval for ${EVAL_RUN_NAME2} should have failed after project deletion, but it succeeded.`);
}
catch (error) {
expect(axios_1.default.isAxiosError(error)).toBe(true);
if (axios_1.default.isAxiosError(error)) {
expect([404, 500]).toContain((_b = error.response) === null || _b === void 0 ? void 0 : _b.status);
}
else {
throw error;
}
}
}));
test('Assert test functionality', () => __awaiter(void 0, void 0, void 0, function* () {
// Create examples and scorers
const example = new example_js_1.ExampleBuilder()
.input("What if these shoes don't fit?")
.actualOutput("We offer a 30-day full refund at no extra cost.")
.retrievalContext(["All customers are eligible for a 30 day full refund at no extra cost."])
.build();
const example1 = new example_js_1.ExampleBuilder()
.input("How much are your croissants?")
.actualOutput("Sorry, we don't accept electronic returns.")
.build();
const example2 = new example_js_1.ExampleBuilder()
.input("Who is the best basketball player in the world?")
.actualOutput("No, the room is too small.")
.build();
const scorer = new api_scorer_js_1.FaithfulnessScorer(0.5);
const scorer1 = new api_scorer_js_1.AnswerRelevancyScorer(0.5);
const projectName = `test_project_${generateRandomString(8)}`;
const evalName = `test_eval_${generateRandomString(8)}`;
try {
// assertTest should reject because not all examples pass the scorer thresholds
yield expect(client.assertTest([example, example1, example2], [scorer, scorer1], "Qwen/Qwen2.5-72B-Instruct-Turbo", undefined, {}, true, projectName, evalName, true)).rejects.toThrow();
}
finally {
// Clean up resources to prevent leaks
try {
yield client.deleteProject(projectName);
}
catch (error) {
console.warn(`Failed to clean up project ${projectName}:`, error);
}
}
}), 120000);
test('Evaluate dataset', () => __awaiter(void 0, void 0, void 0, function* () {
const example1 = new example_js_1.ExampleBuilder()
.input("What if these shoes don't fit?")
.actualOutput("We offer a 30-day full refund at no extra cost.")
.retrievalContext(["All customers are eligible for a 30 day full refund at no extra cost."])
.build();
const example2 = new example_js_1.ExampleBuilder()
.input("How do I reset my password?")
.actualOutput("You can reset your password by clicking on 'Forgot Password' at the login screen.")
.expectedOutput("You can reset your password by clicking on 'Forgot Password' at the login screen.")
.additionalMetadata({ name: "Password Reset", difficulty: "medium" })
.context(["User Account"])
.retrievalContext(["Password reset instructions"])
.toolsCalled(["authentication"])
.expectedTools(["authentication"])
.build();
const projectName = `test_project_${generateRandomString(8)}`;
const evalName = `test_eval_run_${generateRandomString(8)}`;
// Use the evaluate method with examples directly
const res = yield client.evaluate({
examples: [example1, example2],
scorers: [new api_scorer_js_1.FaithfulnessScorer(0.5)],
model: "Qwen/Qwen2.5-72B-Instruct-Turbo",
metadata: { batch: "test" },
projectName,
evalName
});
expect(res).toBeTruthy();
expect(res.length).toBeGreaterThan(0);
// Clean up
yield client.deleteProject(projectName);
}));
test('Override eval behavior', () => __awaiter(void 0, void 0, void 0, function* () {
const example1 = new example_js_1.ExampleBuilder()
.input("What if these shoes don't fit?")
.actualOutput("We offer a 30-day full refund at no extra cost.")
.retrievalContext(["All customers are eligible for a 30 day full refund at no extra cost."])
.build();
const scorer = new api_scorer_js_1.FaithfulnessScorer(0.5);
const PROJECT_NAME = "test_eval_run_naming_collisions";
const EVAL_RUN_NAME = generateRandomString();
// First run should succeed (log_results=true, override=false)
yield client.runEvaluation([example1], [scorer], "Qwen/Qwen2.5-72B-Instruct-Turbo", undefined, { batch: "test" }, true, PROJECT_NAME, EVAL_RUN_NAME, false);
// Second run with log_results=false should succeed (override=false)
yield client.runEvaluation([example1], [scorer], "Qwen/Qwen2.5-72B-Instruct-Turbo", undefined, { batch: "test" }, false, PROJECT_NAME, EVAL_RUN_NAME, false);
// Third run with override=true should succeed despite the run-name collision
yield client.runEvaluation([example1], [scorer], "Qwen/Qwen2.5-72B-Instruct-Turbo", undefined, { batch: "test" }, true, PROJECT_NAME, EVAL_RUN_NAME, true);
// Fourth run with log_results=true and override=false should fail on the run-name collision
yield expect(client.runEvaluation([example1], [scorer], "Qwen/Qwen2.5-72B-Instruct-Turbo", undefined, { batch: "test" }, true, PROJECT_NAME, EVAL_RUN_NAME, false)).rejects.toThrow();
// Clean up
yield client.deleteProject(PROJECT_NAME);
}));
});
// Advanced evaluation operations tests
describe('Advanced Evaluation Operations', () => {
let client;
beforeAll(() => {
client = judgment_client_js_1.JudgmentClient.getInstance();
});
test('JSON scorer functionality', () => __awaiter(void 0, void 0, void 0, function* () {
var _a, _b;
// Test data for JSON scorer
const jsonExample = new example_js_1.ExampleBuilder()
.input("Extract the following information as JSON: Name: John Smith, Age: 35, Occupation: Software Engineer")
.actualOutput('{"name": "John Smith", "age": 35, "occupation": "Software Engineer"}')
.expectedOutput('{"name": "John Smith", "age": 35, "occupation": "Software Engineer"}')
.build();
const jsonScorer = new api_scorer_js_1.JsonCorrectnessScorer(0.8);
const results = yield client.evaluate({
examples: [jsonExample],
scorers: [jsonScorer],
model: "Qwen/Qwen2.5-72B-Instruct-Turbo",
projectName: "json-scorer-test",
evalName: `json-scorer-${generateRandomString()}`
});
expect(results).toBeTruthy();
expect(results.length).toBe(1);
expect((_a = results[0].scorersData) === null || _a === void 0 ? void 0 : _a.length).toBe(1);
expect((_b = results[0].scorersData) === null || _b === void 0 ? void 0 : _b[0].name).toBe("json_correctness");
}));
});
//# sourceMappingURL=eval-operations.test.js.map