// Source: UNPKG — package "judgeval" (Judgment SDK for TypeScript/JavaScript)
// Original compiled artifact: 282 lines, 15.7 kB
"use strict";
/**
 * E2E tests for evaluation operations in the JudgmentClient.
 * Migrated from the Python SDK's test_eval_operations.py
 *
 * These tests talk to the live Judgment backend (credentials come from the
 * environment via dotenv), so each test creates uniquely-named projects and
 * cleans them up afterwards.
 */
const dotenv = require("dotenv");
const { JudgmentClient } = require("../judgment-client.js");
const { ExampleBuilder } = require("../data/example.js");
const {
    FaithfulnessScorer,
    HallucinationScorer,
    AnswerRelevancyScorer,
    JsonCorrectnessScorer,
} = require("../scorers/api-scorer.js");
const axios = require("axios");

// Load environment variables (API keys etc.) before any client is created.
dotenv.config();

/**
 * Generate a random alphanumeric string, used to build unique project and
 * eval-run names so concurrent test runs don't collide.
 * @param {number} [length=20] - Number of characters to generate.
 * @returns {string} Random string of the requested length.
 */
const generateRandomString = (length = 20) => {
    const characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789';
    let result = '';
    for (let i = 0; i < length; i++) {
        result += characters.charAt(Math.floor(Math.random() * characters.length));
    }
    return result;
};

describe('Evaluation Operations', () => {
    let client;
    beforeAll(() => {
        client = JudgmentClient.getInstance();
    });

    /**
     * Run a two-example evaluation (outreach Sales Agent scenario) with a
     * faithfulness and a hallucination scorer under the given project/run name.
     * @returns {Promise<*>} The promise returned by client.runEvaluation.
     */
    const runEvalHelper = async (projectName, evalRunName) => {
        // Single step in our workflow, an outreach Sales Agent
        const example1 = new ExampleBuilder()
            .input("Generate a cold outreach email for TechCorp. Facts: They recently launched an AI-powered analytics platform. Their CEO Sarah Chen previously worked at Google. They have 50+ enterprise clients.")
            .actualOutput("Dear Ms. Chen,\n\nI noticed TechCorp's recent launch of your AI analytics platform and was impressed by its enterprise-focused approach. Your experience from Google clearly shines through in building scalable solutions, as evidenced by your impressive 50+ enterprise client base.\n\nWould you be open to a brief call to discuss how we could potentially collaborate?\n\nBest regards,\nAlex")
            .retrievalContext([
                "TechCorp launched AI analytics platform in 2024",
                "Sarah Chen is CEO, ex-Google executive",
                "Current client base: 50+ enterprise customers",
            ])
            .build();
        const example2 = new ExampleBuilder()
            .input("Generate a cold outreach email for GreenEnergy Solutions. Facts: They're developing solar panel technology that's 30% more efficient. They're looking to expand into the European market. They won a sustainability award in 2023.")
            .actualOutput("Dear GreenEnergy Solutions team,\n\nCongratulations on your 2023 sustainability award! Your innovative solar panel technology with 30% higher efficiency is exactly what the European market needs right now.\n\nI'd love to discuss how we could support your European expansion plans.\n\nBest regards,\nAlex")
            .expectedOutput("A professional cold email mentioning the sustainability award, solar technology innovation, and European expansion plans")
            .context(["Business Development"])
            .retrievalContext([
                "GreenEnergy Solutions won 2023 sustainability award",
                "New solar technology 30% more efficient",
                "Planning European market expansion",
            ])
            .build();
        const scorer = new FaithfulnessScorer(0.5);
        const scorer2 = new HallucinationScorer(0.5);
        return client.runEvaluation(
            [example1, example2],
            [scorer, scorer2],
            "Qwen/Qwen2.5-72B-Instruct-Turbo",
            undefined,
            { batch: "test" },
            true,
            projectName,
            evalRunName,
            true,
        );
    };

    /**
     * Assert that pullEval for the given project/run fails with an HTTP error.
     *
     * The sentinel error is raised OUTSIDE the try block so a wrongly-succeeding
     * pullEval is reported with its own message instead of being swallowed by
     * the catch and surfacing as an unrelated isAxiosError assertion failure
     * (the bug in the original version of this file).
     */
    const expectPullEvalToFail = async (projectName, evalRunName) => {
        let caught;
        try {
            await client.pullEval(projectName, evalRunName);
        } catch (error) {
            caught = error;
        }
        if (caught === undefined) {
            throw new Error(`pullEval for ${evalRunName} should have failed after project deletion, but it succeeded.`);
        }
        expect(axios.isAxiosError(caught)).toBe(true);
        // Expect either 404 (ideal) or 500 (current backend behavior).
        expect([404, 500]).toContain(caught.response?.status);
    };

    test('Basic evaluation workflow', async () => {
        const PROJECT_NAME = "OutreachWorkflow";
        const EVAL_RUN_NAME = "ColdEmailGenerator-Improve-BasePrompt";
        await runEvalHelper(PROJECT_NAME, EVAL_RUN_NAME);
        const results = await client.pullEval(PROJECT_NAME, EVAL_RUN_NAME);
        expect(results).toBeTruthy();
        expect(results.length).toBeGreaterThan(0);
        // Clean up
        await client.deleteProject(PROJECT_NAME);
    });

    test('Delete evaluation by project and run names', async () => {
        const PROJECT_NAME = generateRandomString();
        const EVAL_RUN_NAMES = Array(3).fill(0).map(() => generateRandomString());
        // Run evaluations with different run names
        for (const evalRunName of EVAL_RUN_NAMES) {
            await runEvalHelper(PROJECT_NAME, evalRunName);
        }
        // Delete evaluations, then the project itself
        await client.deleteEval(PROJECT_NAME, EVAL_RUN_NAMES);
        await client.deleteProject(PROJECT_NAME);
        // Verify evaluations are deleted
        for (const evalRunName of EVAL_RUN_NAMES) {
            await expectPullEvalToFail(PROJECT_NAME, evalRunName);
        }
    });

    test('Delete evaluation by project', async () => {
        const PROJECT_NAME = generateRandomString();
        const EVAL_RUN_NAME = generateRandomString();
        const EVAL_RUN_NAME2 = generateRandomString();
        await runEvalHelper(PROJECT_NAME, EVAL_RUN_NAME);
        await runEvalHelper(PROJECT_NAME, EVAL_RUN_NAME2);
        // Deleting the project should remove both eval runs with it.
        await client.deleteProject(PROJECT_NAME);
        await expectPullEvalToFail(PROJECT_NAME, EVAL_RUN_NAME);
        await expectPullEvalToFail(PROJECT_NAME, EVAL_RUN_NAME2);
    });

    test('Assert test functionality', async () => {
        // Create examples and scorers; two of the three answers are irrelevant,
        // so assertTest is expected to reject.
        const example = new ExampleBuilder()
            .input("What if these shoes don't fit?")
            .actualOutput("We offer a 30-day full refund at no extra cost.")
            .retrievalContext(["All customers are eligible for a 30 day full refund at no extra cost."])
            .build();
        const example1 = new ExampleBuilder()
            .input("How much are your croissants?")
            .actualOutput("Sorry, we don't accept electronic returns.")
            .build();
        const example2 = new ExampleBuilder()
            .input("Who is the best basketball player in the world?")
            .actualOutput("No, the room is too small.")
            .build();
        const scorer = new FaithfulnessScorer(0.5);
        const scorer1 = new AnswerRelevancyScorer(0.5);
        const projectName = `test_project_${generateRandomString(8)}`;
        const evalName = `test_eval_${generateRandomString(8)}`;
        try {
            // This should fail with an assertion error
            await expect(client.assertTest(
                [example, example1, example2],
                [scorer, scorer1],
                "Qwen/Qwen2.5-72B-Instruct-Turbo",
                undefined,
                {},
                true,
                projectName,
                evalName,
                true,
            )).rejects.toThrow();
        } finally {
            // Clean up resources to prevent leaks
            try {
                await client.deleteProject(projectName);
            } catch (error) {
                console.warn(`Failed to clean up project ${projectName}:`, error);
            }
        }
    }, 120000);

    test('Evaluate dataset', async () => {
        const example1 = new ExampleBuilder()
            .input("What if these shoes don't fit?")
            .actualOutput("We offer a 30-day full refund at no extra cost.")
            .retrievalContext(["All customers are eligible for a 30 day full refund at no extra cost."])
            .build();
        const example2 = new ExampleBuilder()
            .input("How do I reset my password?")
            .actualOutput("You can reset your password by clicking on 'Forgot Password' at the login screen.")
            .expectedOutput("You can reset your password by clicking on 'Forgot Password' at the login screen.")
            .additionalMetadata({ name: "Password Reset", difficulty: "medium" })
            .context(["User Account"])
            .retrievalContext(["Password reset instructions"])
            .toolsCalled(["authentication"])
            .expectedTools(["authentication"])
            .build();
        const projectName = `test_project_${generateRandomString(8)}`;
        const evalName = `test_eval_run_${generateRandomString(8)}`;
        // Use the evaluate method with examples directly
        const res = await client.evaluate({
            examples: [example1, example2],
            scorers: [new FaithfulnessScorer(0.5)],
            model: "Qwen/Qwen2.5-72B-Instruct-Turbo",
            metadata: { batch: "test" },
            projectName,
            evalName,
        });
        expect(res).toBeTruthy();
        expect(res.length).toBeGreaterThan(0);
        // Clean up
        await client.deleteProject(projectName);
    });

    test('Override eval behavior', async () => {
        const example1 = new ExampleBuilder()
            .input("What if these shoes don't fit?")
            .actualOutput("We offer a 30-day full refund at no extra cost.")
            .retrievalContext(["All customers are eligible for a 30 day full refund at no extra cost."])
            .build();
        const scorer = new FaithfulnessScorer(0.5);
        const PROJECT_NAME = "test_eval_run_naming_collisions";
        const EVAL_RUN_NAME = generateRandomString();
        const MODEL = "Qwen/Qwen2.5-72B-Instruct-Turbo";
        // First run should succeed (log_results=true, override=false).
        await client.runEvaluation([example1], [scorer], MODEL, undefined, { batch: "test" }, true, PROJECT_NAME, EVAL_RUN_NAME, false);
        // Second run with log_results=false should succeed despite the name collision.
        await client.runEvaluation([example1], [scorer], MODEL, undefined, { batch: "test" }, false, PROJECT_NAME, EVAL_RUN_NAME, false);
        // Third run with override=true should succeed, replacing the logged run.
        await client.runEvaluation([example1], [scorer], MODEL, undefined, { batch: "test" }, true, PROJECT_NAME, EVAL_RUN_NAME, true);
        // Fourth run with override=false should fail on the name collision.
        await expect(client.runEvaluation([example1], [scorer], MODEL, undefined, { batch: "test" }, true, PROJECT_NAME, EVAL_RUN_NAME, false)).rejects.toThrow();
        // Clean up
        await client.deleteProject(PROJECT_NAME);
    });
});

// Advanced evaluation operations tests
describe('Advanced Evaluation Operations', () => {
    let client;
    beforeAll(() => {
        client = JudgmentClient.getInstance();
    });

    test('JSON scorer functionality', async () => {
        // Test data for JSON scorer
        const jsonExample = new ExampleBuilder()
            .input("Extract the following information as JSON: Name: John Smith, Age: 35, Occupation: Software Engineer")
            .actualOutput('{"name": "John Smith", "age": 35, "occupation": "Software Engineer"}')
            .expectedOutput('{"name": "John Smith", "age": 35, "occupation": "Software Engineer"}')
            .build();
        const jsonScorer = new JsonCorrectnessScorer(0.8);
        const results = await client.evaluate({
            examples: [jsonExample],
            scorers: [jsonScorer],
            model: "Qwen/Qwen2.5-72B-Instruct-Turbo",
            projectName: "json-scorer-test",
            evalName: `json-scorer-${generateRandomString()}`,
        });
        expect(results).toBeTruthy();
        expect(results.length).toBe(1);
        expect(results[0].scorersData?.length).toBe(1);
        expect(results[0].scorersData?.[0].name).toBe("json_correctness");
    });
});