judgeval
Judgment SDK for TypeScript/JavaScript
JavaScript
"use strict";
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.AnswerCorrectnessScorer = void 0;
const logger_js_1 = require("../../../common/logger.js");
const base_scorer_js_1 = require("../../base-scorer.js");
const constants_js_1 = require("../../../constants.js");
const prompts_js_1 = require("./prompts.js");
const index_js_1 = require("../../../judges/index.js");
// Required parameters for this scorer
const requiredParams = ['input', 'actualOutput', 'expectedOutput'];
/**
* Answer Correctness Scorer
*
* This scorer evaluates whether the actual output correctly represents
* the expected output by extracting statements from the expected output
* and checking if they are correctly represented in the actual output.
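*
* @example
* // A minimal usage sketch; the model name and example values are
* // illustrative assumptions, not prescribed by this file:
* const scorer = new AnswerCorrectnessScorer(0.7, 'gpt-4o');
* const result = await scorer.scoreExample({
*   input: 'What is the capital of France?',
*   actualOutput: 'Paris is the capital of France.',
*   expectedOutput: 'Paris'
* });
* // result.success is true when result.score >= threshold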
*/
class AnswerCorrectnessScorer extends base_scorer_js_1.JudgevalScorer {
/**
* Constructor for AnswerCorrectnessScorer
* @param threshold Minimum score to consider the evaluation successful (default: 0.5)
* @param model LLM to use for evaluation (string or Judge instance)
* @param include_reason Whether to generate a reason for the score
* @param async_mode Whether to use asynchronous evaluation
* @param strict_mode If true, sets threshold to 1.0 (requiring perfect match)
* @param verbose_mode Enables detailed logging
* @param user Optional user identifier for the LLM
* @param additional_metadata Additional metadata to include in the result
*/
constructor(threshold = 0.5, model, include_reason = true, async_mode = true, strict_mode = false, verbose_mode = true, user, additional_metadata) {
super(constants_js_1.APIScorer.ANSWER_CORRECTNESS, strict_mode ? 1.0 : threshold, additional_metadata, include_reason, async_mode, strict_mode, verbose_mode);
(0, logger_js_1.info)(`Initializing AnswerCorrectnessScorer with threshold=${this.threshold}, model=${model}, strict_mode=${strict_mode}`);
const { judge, usingNativeModel } = (0, index_js_1.createJudge)(model, user);
this.model = judge;
this.usingNativeModel = usingNativeModel;
this.evaluation_model = this.model.getModelName();
(0, logger_js_1.log)(`Using model: ${this.evaluation_model}`);
// Set required fields for this scorer (reuses the module-level requiredParams list)
this.requiredFields = requiredParams;
}
/**
* Get statements from expected output asynchronously
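*
* The judge is expected to reply with JSON shaped like
* {"statements": ["..."]}; the exact contract is defined by
* StatementsSchema in prompts.js.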
*/
_aGetStatements(expectedOutput) {
return __awaiter(this, void 0, void 0, function* () {
(0, logger_js_1.log)("Getting statements asynchronously");
// Handle string array
const expectedOutputStr = Array.isArray(expectedOutput) ? expectedOutput.join('\n') : expectedOutput;
const prompt = prompts_js_1.AnswerCorrectnessTemplate.deduceStatements(expectedOutputStr);
try {
const response = yield this.model.aGenerate(prompt);
// Parse the response
try {
const jsonResponse = JSON.parse(response);
const parsed = prompts_js_1.StatementsSchema.safeParse(jsonResponse);
if (parsed.success) {
return parsed.data.statements;
}
else {
// Fallback to direct access if schema validation fails
(0, logger_js_1.warn)("Schema validation failed, falling back to raw response parsing");
if (jsonResponse.statements && Array.isArray(jsonResponse.statements)) {
return jsonResponse.statements;
}
}
}
catch (parseError) {
(0, logger_js_1.warn)(`Error parsing JSON response: ${parseError}`);
// Try to extract JSON from the response text
const jsonMatch = response.match(/\{[\s\S]*\}/);
if (jsonMatch) {
try {
const extractedJson = JSON.parse(jsonMatch[0]);
if (extractedJson.statements && Array.isArray(extractedJson.statements)) {
return extractedJson.statements;
}
}
catch (e) {
(0, logger_js_1.error)(`Failed to extract JSON from response: ${e}`);
}
}
}
// If all parsing attempts fail, return empty array
(0, logger_js_1.error)("Failed to parse statements from model response");
return [];
}
catch (e) {
(0, logger_js_1.error)(`Error getting statements: ${e}`);
return [];
}
});
}
/**
* Get statements from expected output synchronously
*/
_getStatements(expectedOutput) {
// Handle string array
const expectedOutputStr = Array.isArray(expectedOutput) ? expectedOutput.join('\n') : expectedOutput;
const prompt = prompts_js_1.AnswerCorrectnessTemplate.deduceStatements(expectedOutputStr);
try {
const response = this.model.generate(prompt);
// Parse the response
try {
const jsonResponse = JSON.parse(response);
const parsed = prompts_js_1.StatementsSchema.safeParse(jsonResponse);
if (parsed.success) {
return parsed.data.statements;
}
else {
// Fallback to direct access if schema validation fails
(0, logger_js_1.warn)("Schema validation failed, falling back to raw response parsing");
if (jsonResponse.statements && Array.isArray(jsonResponse.statements)) {
return jsonResponse.statements;
}
}
}
catch (parseError) {
(0, logger_js_1.warn)(`Error parsing JSON response: ${parseError}`);
// Try to extract JSON from the response text
const jsonMatch = response.match(/\{[\s\S]*\}/);
if (jsonMatch) {
try {
const extractedJson = JSON.parse(jsonMatch[0]);
if (extractedJson.statements && Array.isArray(extractedJson.statements)) {
return extractedJson.statements;
}
}
catch (e) {
(0, logger_js_1.error)(`Failed to extract JSON from response: ${e}`);
}
}
}
// If all parsing attempts fail, return empty array
(0, logger_js_1.error)("Failed to parse statements from model response");
return [];
}
catch (e) {
(0, logger_js_1.error)(`Error getting statements: ${e}`);
return [];
}
}
/**
* Get verdicts for statements against actual output asynchronously
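*
* The judge is expected to reply with JSON shaped like
* {"verdicts": [{"verdict": "yes" | "no", "reason": "..."}]}; the exact
* contract is defined by VerdictsSchema in prompts.js.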
*/
_aGetVerdicts(actualOutput) {
return __awaiter(this, void 0, void 0, function* () {
(0, logger_js_1.log)("Getting verdicts asynchronously");
if (!this.statements || this.statements.length === 0) {
(0, logger_js_1.warn)("No statements to evaluate");
return [];
}
// Handle string array
const actualOutputStr = Array.isArray(actualOutput) ? actualOutput.join('\n') : actualOutput;
const prompt = prompts_js_1.AnswerCorrectnessTemplate.generateVerdicts(this.statements, actualOutputStr);
try {
const response = yield this.model.aGenerate(prompt);
// Parse the response
try {
const jsonResponse = JSON.parse(response);
const parsed = prompts_js_1.VerdictsSchema.safeParse(jsonResponse);
if (parsed.success) {
return parsed.data.verdicts;
}
else {
// Fallback to direct access if schema validation fails
(0, logger_js_1.warn)("Schema validation failed, falling back to raw response parsing");
if (jsonResponse.verdicts && Array.isArray(jsonResponse.verdicts)) {
return jsonResponse.verdicts.map((v) => ({
verdict: v.verdict,
reason: v.reason
}));
}
}
}
catch (parseError) {
(0, logger_js_1.warn)(`Error parsing JSON response: ${parseError}`);
// Try to extract JSON from the response text
const jsonMatch = response.match(/\{[\s\S]*\}/);
if (jsonMatch) {
try {
const extractedJson = JSON.parse(jsonMatch[0]);
if (extractedJson.verdicts && Array.isArray(extractedJson.verdicts)) {
return extractedJson.verdicts.map((v) => ({
verdict: v.verdict,
reason: v.reason
}));
}
}
catch (e) {
(0, logger_js_1.error)(`Failed to extract JSON from response: ${e}`);
}
}
}
// If all parsing attempts fail, return empty array
(0, logger_js_1.error)("Failed to parse verdicts from model response");
return [];
}
catch (e) {
(0, logger_js_1.error)(`Error getting verdicts: ${e}`);
return [];
}
});
}
/**
* Get verdicts for statements against actual output synchronously
*/
_getVerdicts(actualOutput) {
if (!this.statements || this.statements.length === 0) {
(0, logger_js_1.warn)("No statements to evaluate");
return [];
}
// Handle string array
const actualOutputStr = Array.isArray(actualOutput) ? actualOutput.join('\n') : actualOutput;
const prompt = prompts_js_1.AnswerCorrectnessTemplate.generateVerdicts(this.statements, actualOutputStr);
try {
const response = this.model.generate(prompt);
// Parse the response
try {
const jsonResponse = JSON.parse(response);
const parsed = prompts_js_1.VerdictsSchema.safeParse(jsonResponse);
if (parsed.success) {
return parsed.data.verdicts;
}
else {
// Fallback to direct access if schema validation fails
(0, logger_js_1.warn)("Schema validation failed, falling back to raw response parsing");
if (jsonResponse.verdicts && Array.isArray(jsonResponse.verdicts)) {
return jsonResponse.verdicts.map((v) => ({
verdict: v.verdict,
reason: v.reason
}));
}
}
}
catch (parseError) {
(0, logger_js_1.warn)(`Error parsing JSON response: ${parseError}`);
// Try to extract JSON from the response text
const jsonMatch = response.match(/\{[\s\S]*\}/);
if (jsonMatch) {
try {
const extractedJson = JSON.parse(jsonMatch[0]);
if (extractedJson.verdicts && Array.isArray(extractedJson.verdicts)) {
return extractedJson.verdicts.map((v) => ({
verdict: v.verdict,
reason: v.reason
}));
}
}
catch (e) {
(0, logger_js_1.error)(`Failed to extract JSON from response: ${e}`);
}
}
}
// If all parsing attempts fail, return empty array
(0, logger_js_1.error)("Failed to parse verdicts from model response");
return [];
}
catch (e) {
(0, logger_js_1.error)(`Error getting verdicts: ${e}`);
return [];
}
}
/**
* Get reason for the score asynchronously
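*
* The judge is expected to reply with JSON shaped like {"reason": "..."},
* as validated by ReasonSchema in prompts.js.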
*/
_aGetReason() {
return __awaiter(this, void 0, void 0, function* () {
var _a;
if (!this.include_reason) {
return undefined;
}
if (!this.verdicts || this.verdicts.length === 0) {
return undefined;
}
try {
// Get incorrect statements with their verdicts
const incorrectStatements = [];
for (let i = 0; i < this.statements.length; i++) {
if (i < this.verdicts.length && this.verdicts[i].verdict.toLowerCase() === "no") {
incorrectStatements.push([this.statements[i], this.verdicts[i].reason]);
}
}
if (incorrectStatements.length === 0) {
return "All statements in the expected output are correctly represented in the actual output.";
}
// Generate reason
const prompt = prompts_js_1.AnswerCorrectnessTemplate.generateReason(incorrectStatements, ((_a = this.score) === null || _a === void 0 ? void 0 : _a.toString()) || "0");
const reasonText = yield this.model.aGenerate(prompt);
const parsedReason = prompts_js_1.ReasonSchema.safeParse(JSON.parse(reasonText));
if (!parsedReason.success) {
(0, logger_js_1.error)(`Failed to parse reason: ${parsedReason.error}`);
return undefined;
}
return parsedReason.data.reason;
}
catch (err) {
(0, logger_js_1.error)(`Error getting reason: ${err}`);
return undefined;
}
});
}
/**
* Get reason for the score synchronously
*/
_getReason() {
var _a;
if (!this.include_reason) {
return undefined;
}
if (!this.verdicts || this.verdicts.length === 0) {
return undefined;
}
try {
// Get incorrect statements with their verdicts
const incorrectStatements = [];
for (let i = 0; i < this.statements.length; i++) {
if (i < this.verdicts.length && this.verdicts[i].verdict.toLowerCase() === "no") {
incorrectStatements.push([this.statements[i], this.verdicts[i].reason]);
}
}
if (incorrectStatements.length === 0) {
return "All statements in the expected output are correctly represented in the actual output.";
}
// Generate reason
const prompt = prompts_js_1.AnswerCorrectnessTemplate.generateReason(incorrectStatements, ((_a = this.score) === null || _a === void 0 ? void 0 : _a.toString()) || "0");
const reasonText = this.model.generate(prompt);
const parsedReason = prompts_js_1.ReasonSchema.safeParse(JSON.parse(reasonText));
if (!parsedReason.success) {
(0, logger_js_1.error)(`Failed to parse reason: ${parsedReason.error}`);
return undefined;
}
return parsedReason.data.reason;
}
catch (err) {
(0, logger_js_1.error)(`Error getting reason: ${err}`);
return undefined;
}
}
/**
* Compute score based on verdicts
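*
* Score = (number of "yes" verdicts) / (total verdicts). For example,
* verdicts of ["yes", "yes", "no", "yes"] yield 3/4 = 0.75; in strict
* mode, a score below the threshold collapses to 0.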
*/
_computeScore() {
(0, logger_js_1.log)("Computing score");
// If we have no statements or verdicts due to API errors, return 0 instead of 1
// This ensures that when API calls fail, we don't incorrectly return a perfect score
if (!this.statements || this.statements.length === 0) {
return 0;
}
if (!this.verdicts || this.verdicts.length === 0) {
return 0;
}
let correctCount = 0;
for (const verdict of this.verdicts) {
if (verdict.verdict.trim().toLowerCase() === "yes") {
correctCount++;
}
}
const score = correctCount / this.verdicts.length;
// Match Python implementation's handling of strict_mode
return this.strict_mode && score < this.threshold ? 0 : score;
}
/**
* Check if example has required parameters
*/
_checkExampleParams(example) {
for (const param of this.requiredFields) {
if (param === 'input' && !example.input) {
throw new Error(`Example is missing required parameter: input`);
}
else if (param === 'actualOutput' && !example.actualOutput) {
throw new Error(`Example is missing required parameter: actualOutput`);
}
else if (param === 'expectedOutput' && !example.expectedOutput) {
throw new Error(`Example is missing required parameter: expectedOutput`);
}
else if (param === 'context' && (!example.context || !Array.isArray(example.context))) {
throw new Error(`Example is missing required parameter: context (must be an array)`);
}
else if (param === 'retrievalContext' && (!example.retrievalContext || !Array.isArray(example.retrievalContext))) {
throw new Error(`Example is missing required parameter: retrievalContext (must be an array)`);
}
}
}
/**
* Create verbose logs for debugging
*/
_createVerboseLogs() {
if (!this.verbose_mode) {
return '';
}
const steps = [
`Statements:\n${JSON.stringify(this.statements, null, 2)}`,
`Verdicts:\n${JSON.stringify(this.verdicts, null, 2)}`,
`Score: ${this.score}\nReason: ${this.reason}`
];
return steps.join('\n\n');
}
/**
* Score an example synchronously - this is for compatibility with the Python SDK
*/
syncScoreExample(example) {
(0, logger_js_1.info)("Starting example scoring (sync mode)");
try {
// Check required parameters
this._checkExampleParams(example);
// Process example
if (this.async_mode) {
throw new Error("Cannot use synchronous scoreExample with async_mode=true. Use async scoreExample instead.");
}
// Track token usage
let promptTokens = 0;
let completionTokens = 0;
// Get statements and track tokens
this.statements = this._getStatements(example.expectedOutput);
promptTokens += 500; // Approximate tokens for statements prompt
completionTokens += 100; // Approximate tokens for statements response
// Get verdicts and track tokens
this.verdicts = this._getVerdicts(example.actualOutput);
promptTokens += 800; // Approximate tokens for verdicts prompt
completionTokens += 200; // Approximate tokens for verdicts response
// Compute score
this.score = this._computeScore();
// Get reason if needed
if (this.include_reason) {
this.reason = this._getReason();
promptTokens += 300; // Approximate tokens for reason prompt
completionTokens += 100; // Approximate tokens for reason response
}
// Calculate evaluation cost
this.evaluation_cost = this._calculateTokenCosts(this.evaluation_model || 'gpt-3.5-turbo', promptTokens, completionTokens);
// Set success flag
this.success = this._successCheck(); // keep the sync path consistent with the async path
// Create verbose logs if needed
if (this.verbose_mode) {
this.verbose_logs = this._createVerboseLogs();
}
(0, logger_js_1.info)(`Scoring completed with score: ${this.score}`);
// Ensure all fields match the ScorerData interface
return {
name: this.type,
threshold: this.threshold,
success: this.success || false,
score: this.score || 0,
reason: this.reason !== undefined ? this.reason : null,
strict_mode: this.strict_mode || false,
evaluation_model: this.evaluation_model || null,
error: null,
evaluation_cost: this.evaluation_cost,
verbose_logs: this.verbose_logs ? this.verbose_logs : null,
additional_metadata: this.additional_metadata || {}
};
}
catch (error) {
// Handle errors
const errorMessage = error instanceof Error ? error.message : String(error);
this.error = errorMessage;
this.success = false;
return {
name: this.type,
threshold: this.threshold,
success: false,
score: 0,
reason: `Error during scoring: ${errorMessage}`,
strict_mode: this.strict_mode || false,
evaluation_model: this.evaluation_model || null,
error: errorMessage,
evaluation_cost: null,
verbose_logs: null,
additional_metadata: this.additional_metadata || {}
};
}
}
/**
* Score an example - this is the main method that should be called
* It will use async or sync methods based on the async_mode setting
*/
scoreExample(example) {
return __awaiter(this, void 0, void 0, function* () {
if (!this.async_mode) {
return this.syncScoreExample(example);
}
(0, logger_js_1.info)("Starting example scoring (async mode)");
try {
// Check required parameters
this._checkExampleParams(example);
// Track token usage
let promptTokens = 0;
let completionTokens = 0;
// Process example
this.statements = yield this._aGetStatements(example.expectedOutput);
promptTokens += 500; // Approximate tokens for statements prompt
completionTokens += 100; // Approximate tokens for statements response
this.verdicts = yield this._aGetVerdicts(example.actualOutput);
promptTokens += 800; // Approximate tokens for verdicts prompt
completionTokens += 200; // Approximate tokens for verdicts response
this.score = this._computeScore();
if (this.include_reason) {
this.reason = yield this._aGetReason();
promptTokens += 300; // Approximate tokens for reason prompt
completionTokens += 100; // Approximate tokens for reason response
}
this.success = this._successCheck();
if (this.verbose_mode) {
this.verbose_logs = this._createVerboseLogs();
}
// Calculate evaluation cost
this.evaluation_cost = this._calculateTokenCosts(this.evaluation_model || 'gpt-3.5-turbo', promptTokens, completionTokens);
(0, logger_js_1.info)(`Scoring completed with score: ${this.score}`);
// Ensure all fields match the ScorerData interface
return {
name: this.type,
threshold: this.threshold,
success: this.success || false,
score: this.score || 0,
reason: this.reason !== undefined ? this.reason : null,
strict_mode: this.strict_mode || false,
evaluation_model: this.evaluation_model || null,
error: null,
evaluation_cost: this.evaluation_cost,
verbose_logs: this.verbose_logs ? this.verbose_logs : null,
additional_metadata: this.additional_metadata || {}
};
}
catch (error) {
// Handle errors
const errorMessage = error instanceof Error ? error.message : String(error);
this.error = errorMessage;
this.success = false;
return {
name: this.type,
threshold: this.threshold,
success: false,
score: 0,
reason: `Error during scoring: ${errorMessage}`,
strict_mode: this.strict_mode || false,
evaluation_model: this.evaluation_model || null,
error: errorMessage,
evaluation_cost: null,
verbose_logs: null,
additional_metadata: this.additional_metadata || {}
};
}
});
}
/**
* Get the name of the scorer
*/
get name() {
return "Answer Correctness";
}
}
exports.AnswerCorrectnessScorer = AnswerCorrectnessScorer;
//# sourceMappingURL=answer-correctness.js.map