judgeval

Judgment SDK for TypeScript/JavaScript

"use strict"; var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } return new (P || (P = Promise))(function (resolve, reject) { function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } step((generator = generator.apply(thisArg, _arguments || [])).next()); }); }; Object.defineProperty(exports, "__esModule", { value: true }); exports.AnswerCorrectnessScorer = void 0; const logger_js_1 = require("../../../common/logger.js"); const base_scorer_js_1 = require("../../base-scorer.js"); const constants_js_1 = require("../../../constants.js"); const prompts_js_1 = require("./prompts.js"); const index_js_1 = require("../../../judges/index.js"); // Required parameters for this scorer const requiredParams = ['input', 'actualOutput', 'expectedOutput']; /** * Answer Correctness Scorer * * This scorer evaluates whether the actual output correctly represents * the expected output by extracting statements from the expected output * and checking if they are correctly represented in the actual output. */ class AnswerCorrectnessScorer extends base_scorer_js_1.JudgevalScorer { /** * Constructor for AnswerCorrectnessScorer * @param threshold Minimum score to consider the evaluation successful (default: 0.5) * @param model LLM to use for evaluation (string or Judge instance) * @param include_reason Whether to generate a reason for the score * @param async_mode Whether to use asynchronous evaluation * @param strict_mode If true, sets threshold to 1.0 (requiring perfect match) * @param verbose_mode Enables detailed logging * @param user Optional user identifier for the LLM * @param additional_metadata Additional metadata to include in the result */ constructor(threshold = 0.5, model, include_reason = true, async_mode = true, strict_mode = false, verbose_mode = true, user, additional_metadata) { super(constants_js_1.APIScorer.ANSWER_CORRECTNESS, strict_mode ? 1.0 : threshold, additional_metadata, include_reason, async_mode, strict_mode, verbose_mode); (0, logger_js_1.info)(`Initializing AnswerCorrectnessScorer with threshold=${this.threshold}, model=${model}, strict_mode=${strict_mode}`); const { judge, usingNativeModel } = (0, index_js_1.createJudge)(model, user); this.model = judge; this.usingNativeModel = usingNativeModel; this.evaluation_model = this.model.getModelName(); (0, logger_js_1.log)(`Using model: ${this.evaluation_model}`); // Set required fields for this scorer this.requiredFields = ['input', 'actualOutput', 'expectedOutput']; } /** * Get statements from expected output asynchronously */ _aGetStatements(expectedOutput) { return __awaiter(this, void 0, void 0, function* () { (0, logger_js_1.log)("Getting statements asynchronously"); // Handle string array const expectedOutputStr = Array.isArray(expectedOutput) ? 
expectedOutput.join('\n') : expectedOutput; const prompt = prompts_js_1.AnswerCorrectnessTemplate.deduceStatements(expectedOutputStr); try { const response = yield this.model.aGenerate(prompt); // Parse the response try { const jsonResponse = JSON.parse(response); const parsed = prompts_js_1.StatementsSchema.safeParse(jsonResponse); if (parsed.success) { return parsed.data.statements; } else { // Fallback to direct access if schema validation fails (0, logger_js_1.warn)("Schema validation failed, falling back to raw response parsing"); if (jsonResponse.statements && Array.isArray(jsonResponse.statements)) { return jsonResponse.statements; } } } catch (parseError) { (0, logger_js_1.warn)(`Error parsing JSON response: ${parseError}`); // Try to extract JSON from the response text const jsonMatch = response.match(/\{[\s\S]*\}/); if (jsonMatch) { try { const extractedJson = JSON.parse(jsonMatch[0]); if (extractedJson.statements && Array.isArray(extractedJson.statements)) { return extractedJson.statements; } } catch (e) { (0, logger_js_1.error)(`Failed to extract JSON from response: ${e}`); } } } // If all parsing attempts fail, return empty array (0, logger_js_1.error)("Failed to parse statements from model response"); return []; } catch (e) { (0, logger_js_1.error)(`Error getting statements: ${e}`); return []; } }); } /** * Get statements from expected output synchronously */ _getStatements(expectedOutput) { // Handle string array const expectedOutputStr = Array.isArray(expectedOutput) ? expectedOutput.join('\n') : expectedOutput; const prompt = prompts_js_1.AnswerCorrectnessTemplate.deduceStatements(expectedOutputStr); try { const response = this.model.generate(prompt); // Parse the response try { const jsonResponse = JSON.parse(response); const parsed = prompts_js_1.StatementsSchema.safeParse(jsonResponse); if (parsed.success) { return parsed.data.statements; } else { // Fallback to direct access if schema validation fails (0, logger_js_1.warn)("Schema validation failed, falling back to raw response parsing"); if (jsonResponse.statements && Array.isArray(jsonResponse.statements)) { return jsonResponse.statements; } } } catch (parseError) { (0, logger_js_1.warn)(`Error parsing JSON response: ${parseError}`); // Try to extract JSON from the response text const jsonMatch = response.match(/\{[\s\S]*\}/); if (jsonMatch) { try { const extractedJson = JSON.parse(jsonMatch[0]); if (extractedJson.statements && Array.isArray(extractedJson.statements)) { return extractedJson.statements; } } catch (e) { (0, logger_js_1.error)(`Failed to extract JSON from response: ${e}`); } } } // If all parsing attempts fail, return empty array (0, logger_js_1.error)("Failed to parse statements from model response"); return []; } catch (e) { (0, logger_js_1.error)(`Error getting statements: ${e}`); return []; } } /** * Get verdicts for statements against actual output asynchronously */ _aGetVerdicts(actualOutput) { return __awaiter(this, void 0, void 0, function* () { (0, logger_js_1.log)("Getting verdicts asynchronously"); if (!this.statements || this.statements.length === 0) { (0, logger_js_1.warn)("No statements to evaluate"); return []; } // Handle string array const actualOutputStr = Array.isArray(actualOutput) ? 
actualOutput.join('\n') : actualOutput; const prompt = prompts_js_1.AnswerCorrectnessTemplate.generateVerdicts(this.statements, actualOutputStr); try { const response = yield this.model.aGenerate(prompt); // Parse the response try { const jsonResponse = JSON.parse(response); const parsed = prompts_js_1.VerdictsSchema.safeParse(jsonResponse); if (parsed.success) { return parsed.data.verdicts; } else { // Fallback to direct access if schema validation fails (0, logger_js_1.warn)("Schema validation failed, falling back to raw response parsing"); if (jsonResponse.verdicts && Array.isArray(jsonResponse.verdicts)) { return jsonResponse.verdicts.map((v) => ({ verdict: v.verdict, reason: v.reason })); } } } catch (parseError) { (0, logger_js_1.warn)(`Error parsing JSON response: ${parseError}`); // Try to extract JSON from the response text const jsonMatch = response.match(/\{[\s\S]*\}/); if (jsonMatch) { try { const extractedJson = JSON.parse(jsonMatch[0]); if (extractedJson.verdicts && Array.isArray(extractedJson.verdicts)) { return extractedJson.verdicts.map((v) => ({ verdict: v.verdict, reason: v.reason })); } } catch (e) { (0, logger_js_1.error)(`Failed to extract JSON from response: ${e}`); } } } // If all parsing attempts fail, return empty array (0, logger_js_1.error)("Failed to parse verdicts from model response"); return []; } catch (e) { (0, logger_js_1.error)(`Error getting verdicts: ${e}`); return []; } }); } /** * Get verdicts for statements against actual output synchronously */ _getVerdicts(actualOutput) { if (!this.statements || this.statements.length === 0) { (0, logger_js_1.warn)("No statements to evaluate"); return []; } // Handle string array const actualOutputStr = Array.isArray(actualOutput) ? actualOutput.join('\n') : actualOutput; const prompt = prompts_js_1.AnswerCorrectnessTemplate.generateVerdicts(this.statements, actualOutputStr); try { const response = this.model.generate(prompt); // Parse the response try { const jsonResponse = JSON.parse(response); const parsed = prompts_js_1.VerdictsSchema.safeParse(jsonResponse); if (parsed.success) { return parsed.data.verdicts; } else { // Fallback to direct access if schema validation fails (0, logger_js_1.warn)("Schema validation failed, falling back to raw response parsing"); if (jsonResponse.verdicts && Array.isArray(jsonResponse.verdicts)) { return jsonResponse.verdicts.map((v) => ({ verdict: v.verdict, reason: v.reason })); } } } catch (parseError) { (0, logger_js_1.warn)(`Error parsing JSON response: ${parseError}`); // Try to extract JSON from the response text const jsonMatch = response.match(/\{[\s\S]*\}/); if (jsonMatch) { try { const extractedJson = JSON.parse(jsonMatch[0]); if (extractedJson.verdicts && Array.isArray(extractedJson.verdicts)) { return extractedJson.verdicts.map((v) => ({ verdict: v.verdict, reason: v.reason })); } } catch (e) { (0, logger_js_1.error)(`Failed to extract JSON from response: ${e}`); } } } // If all parsing attempts fail, return empty array (0, logger_js_1.error)("Failed to parse verdicts from model response"); return []; } catch (e) { (0, logger_js_1.error)(`Error getting verdicts: ${e}`); return []; } } /** * Get reason for the score asynchronously */ _aGetReason() { return __awaiter(this, void 0, void 0, function* () { var _a; if (!this.include_reason) { return undefined; } if (!this.verdicts || this.verdicts.length === 0) { return undefined; } try { // Get incorrect statements with their verdicts const incorrectStatements = []; for (let i = 0; i < this.statements.length; i++) { if (i 
< this.verdicts.length && this.verdicts[i].verdict.toLowerCase() === "no") { incorrectStatements.push([this.statements[i], this.verdicts[i].reason]); } } if (incorrectStatements.length === 0) { return "All statements in the expected output are correctly represented in the actual output."; } // Generate reason const prompt = prompts_js_1.AnswerCorrectnessTemplate.generateReason(incorrectStatements, ((_a = this.score) === null || _a === void 0 ? void 0 : _a.toString()) || "0"); const reasonText = yield this.model.aGenerate(prompt); const parsedReason = prompts_js_1.ReasonSchema.safeParse(JSON.parse(reasonText)); if (!parsedReason.success) { (0, logger_js_1.error)(`Failed to parse reason: ${parsedReason.error}`); return undefined; } return parsedReason.data.reason; } catch (err) { (0, logger_js_1.error)(`Error getting reason: ${err}`); return undefined; } }); } /** * Get reason for the score synchronously */ _getReason() { var _a; if (!this.include_reason) { return undefined; } if (!this.verdicts || this.verdicts.length === 0) { return undefined; } try { // Get incorrect statements with their verdicts const incorrectStatements = []; for (let i = 0; i < this.statements.length; i++) { if (i < this.verdicts.length && this.verdicts[i].verdict.toLowerCase() === "no") { incorrectStatements.push([this.statements[i], this.verdicts[i].reason]); } } if (incorrectStatements.length === 0) { return "All statements in the expected output are correctly represented in the actual output."; } // Generate reason const prompt = prompts_js_1.AnswerCorrectnessTemplate.generateReason(incorrectStatements, ((_a = this.score) === null || _a === void 0 ? void 0 : _a.toString()) || "0"); const reasonText = this.model.generate(prompt); const parsedReason = prompts_js_1.ReasonSchema.safeParse(JSON.parse(reasonText)); if (!parsedReason.success) { (0, logger_js_1.error)(`Failed to parse reason: ${parsedReason.error}`); return undefined; } return parsedReason.data.reason; } catch (err) { (0, logger_js_1.error)(`Error getting reason: ${err}`); return undefined; } } /** * Compute score based on verdicts */ _computeScore() { (0, logger_js_1.log)("Computing score"); // If we have no statements or verdicts due to API errors, return 0 instead of 1 // This ensures that when API calls fail, we don't incorrectly return a perfect score if (!this.statements || this.statements.length === 0) { return 0; } if (!this.verdicts || this.verdicts.length === 0) { return 0; } let correctCount = 0; for (const verdict of this.verdicts) { if (verdict.verdict.trim().toLowerCase() === "yes") { correctCount++; } } const score = correctCount / this.verdicts.length; // Match Python implementation's handling of strict_mode return this.strict_mode && score < this.threshold ? 
0 : score; } /** * Check if example has required parameters */ _checkExampleParams(example) { for (const param of this.requiredFields) { if (param === 'input' && !example.input) { throw new Error(`Example is missing required parameter: input`); } else if (param === 'actualOutput' && !example.actualOutput) { throw new Error(`Example is missing required parameter: actualOutput`); } else if (param === 'expectedOutput' && !example.expectedOutput) { throw new Error(`Example is missing required parameter: expectedOutput`); } else if (param === 'context' && (!example.context || !Array.isArray(example.context))) { throw new Error(`Example is missing required parameter: context (must be an array)`); } else if (param === 'retrievalContext' && (!example.retrievalContext || !Array.isArray(example.retrievalContext))) { throw new Error(`Example is missing required parameter: retrievalContext (must be an array)`); } } } /** * Create verbose logs for debugging */ _createVerboseLogs() { if (!this.verbose_mode) { return ''; } const steps = [ `Statements:\n${JSON.stringify(this.statements, null, 2)}`, `Verdicts:\n${JSON.stringify(this.verdicts, null, 2)}`, `Score: ${this.score}\nReason: ${this.reason}` ]; return steps.join('\n\n'); } /** * Score an example synchronously - this is for compatibility with the Python SDK */ syncScoreExample(example) { (0, logger_js_1.info)("Starting example scoring (sync mode)"); try { // Check required parameters this._checkExampleParams(example); // Process example if (this.async_mode) { throw new Error("Cannot use synchronous scoreExample with async_mode=true. Use async scoreExample instead."); } // Track token usage let promptTokens = 0; let completionTokens = 0; // Get statements and track tokens this.statements = this._getStatements(example.expectedOutput); promptTokens += 500; // Approximate tokens for statements prompt completionTokens += 100; // Approximate tokens for statements response // Get verdicts and track tokens this.verdicts = this._getVerdicts(example.actualOutput); promptTokens += 800; // Approximate tokens for verdicts prompt completionTokens += 200; // Approximate tokens for verdicts response // Compute score this.score = this._computeScore(); // Get reason if needed if (this.include_reason) { this.reason = this._getReason(); promptTokens += 300; // Approximate tokens for reason prompt completionTokens += 100; // Approximate tokens for reason response } // Calculate evaluation cost this.evaluation_cost = this._calculateTokenCosts(this.evaluation_model || 'gpt-3.5-turbo', promptTokens, completionTokens); // Set success flag this.success = this.score >= this.threshold; // Create verbose logs if needed if (this.verbose_mode) { this.verbose_logs = this._createVerboseLogs(); } (0, logger_js_1.info)(`Scoring completed with score: ${this.score}`); // Ensure all fields match the ScorerData interface return { name: this.type, threshold: this.threshold, success: this.success || false, score: this.score || 0, reason: this.reason !== undefined ? this.reason : null, strict_mode: this.strict_mode || false, evaluation_model: this.evaluation_model || null, error: null, evaluation_cost: this.evaluation_cost, verbose_logs: this.verbose_logs ? this.verbose_logs : null, additional_metadata: this.additional_metadata || {} }; } catch (error) { // Handle errors const errorMessage = error instanceof Error ? 
error.message : String(error); this.error = errorMessage; this.success = false; return { name: this.type, threshold: this.threshold, success: false, score: 0, reason: `Error during scoring: ${errorMessage}`, strict_mode: this.strict_mode || false, evaluation_model: this.evaluation_model || null, error: errorMessage, evaluation_cost: null, verbose_logs: null, additional_metadata: this.additional_metadata || {} }; } } /** * Score an example - this is the main method that should be called * It will use async or sync methods based on the async_mode setting */ scoreExample(example) { return __awaiter(this, void 0, void 0, function* () { if (!this.async_mode) { return this.syncScoreExample(example); } (0, logger_js_1.info)("Starting example scoring (async mode)"); try { // Check required parameters this._checkExampleParams(example); // Track token usage let promptTokens = 0; let completionTokens = 0; // Process example this.statements = yield this._aGetStatements(example.expectedOutput); promptTokens += 500; // Approximate tokens for statements prompt completionTokens += 100; // Approximate tokens for statements response this.verdicts = yield this._aGetVerdicts(example.actualOutput); promptTokens += 800; // Approximate tokens for verdicts prompt completionTokens += 200; // Approximate tokens for verdicts response this.score = this._computeScore(); if (this.include_reason) { this.reason = yield this._aGetReason(); promptTokens += 300; // Approximate tokens for reason prompt completionTokens += 100; // Approximate tokens for reason response } this.success = this._successCheck(); if (this.verbose_mode) { this.verbose_logs = this._createVerboseLogs(); } // Calculate evaluation cost this.evaluation_cost = this._calculateTokenCosts(this.evaluation_model || 'gpt-3.5-turbo', promptTokens, completionTokens); (0, logger_js_1.info)(`Scoring completed with score: ${this.score}`); // Ensure all fields match the ScorerData interface return { name: this.type, threshold: this.threshold, success: this.success || false, score: this.score || 0, reason: this.reason !== undefined ? this.reason : null, strict_mode: this.strict_mode || false, evaluation_model: this.evaluation_model || null, error: null, evaluation_cost: this.evaluation_cost, verbose_logs: this.verbose_logs ? this.verbose_logs : null, additional_metadata: this.additional_metadata || {} }; } catch (error) { // Handle errors const errorMessage = error instanceof Error ? error.message : String(error); this.error = errorMessage; this.success = false; return { name: this.type, threshold: this.threshold, success: false, score: 0, reason: `Error during scoring: ${errorMessage}`, strict_mode: this.strict_mode || false, evaluation_model: this.evaluation_model || null, error: errorMessage, evaluation_cost: null, verbose_logs: null, additional_metadata: this.additional_metadata || {} }; } }); } /** * Get the name of the scorer */ get name() { return "Answer Correctness"; } } exports.AnswerCorrectnessScorer = AnswerCorrectnessScorer; //# sourceMappingURL=answer-correctness.js.map
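
For orientation, a minimal usage sketch of the class above. This is illustrative only: the require path and the model name are assumptions, not taken from this file (consult the judgeval package's public exports for the real entry point); the example object carries the input, actualOutput, and expectedOutput fields that _checkExampleParams requires.

// Usage sketch (hypothetical entry point; the scorer may be re-exported elsewhere)
const { AnswerCorrectnessScorer } = require('judgeval');

async function main() {
    // threshold = 0.7; 'gpt-4o' is an illustrative model name resolved by createJudge
    const scorer = new AnswerCorrectnessScorer(0.7, 'gpt-4o');
    const result = await scorer.scoreExample({
        input: 'What is the capital of France?',
        actualOutput: 'Paris is the capital of France.',
        expectedOutput: 'The capital of France is Paris.'
    });
    // score = correct verdicts / total verdicts; success = score >= threshold
    console.log(result.score, result.success, result.reason);
}

main().catch(console.error);

Note that with strict_mode = true the constructor forces the threshold to 1.0 and _computeScore collapses any score below the threshold to 0, so only a perfect match passes.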