@future-agi/ai-evaluation
We help GenAI teams maintain high accuracy for their models in production.
import { APIKeyAuth, ResponseHandler, HttpMethod, Routes, InvalidAuthError } from '@future-agi/sdk';
/**
* Handles responses for evaluation requests
*/
export class EvalResponseHandler extends ResponseHandler {
static _parseSuccess(response) {
const data = response.data || {};
const evalResults = [];
if (Array.isArray(data.result)) {
for (const result of data.result) {
if (result && Array.isArray(result.evaluations)) {
for (const evaluation of result.evaluations) {
const newMetadata = {};
if (evaluation?.metadata) {
let metadata = evaluation.metadata;
if (typeof metadata === "string") {
try {
metadata = JSON.parse(metadata);
}
catch { /* ignore parse errors */ }
}
if (metadata && typeof metadata === "object") {
newMetadata["usage"] = metadata.usage ?? {};
newMetadata["cost"] = metadata.cost ?? {};
newMetadata["explanation"] = metadata.explanation ?? {};
}
}
evalResults.push({
data: evaluation?.data,
failure: evaluation?.failure,
reason: evaluation?.reason ?? "",
runtime: evaluation?.runtime ?? 0,
metadata: newMetadata,
metrics: Array.isArray(evaluation?.metrics)
? evaluation.metrics.map((m) => ({
id: m.id,
value: m.value,
}))
: [],
});
}
}
}
}
return { eval_results: evalResults };
}
static _handleError(response) {
if (response.status === 400) {
            throw new Error(`Evaluation failed with a 400 Bad Request. Please check your input data and evaluation configuration. Response: ${JSON.stringify(response.data)}`);
}
else if (response.status === 403) {
throw new InvalidAuthError();
}
else {
            throw new Error(`Error in evaluation: ${response.status}, response: ${JSON.stringify(response.data)}`);
}
}
}
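// For reference, a successfully parsed evaluation response from the handler above
// has the shape below (the metric id and values are illustrative placeholders,
// not values returned by the API):
//
//   {
//     eval_results: [{
//       data: undefined,
//       failure: false,
//       reason: "",
//       runtime: 0,
//       metadata: { usage: {}, cost: {}, explanation: {} },
//       metrics: [{ id: "score", value: 0.9 }],
//     }],
//   }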
/**
* Handles responses for evaluation info requests
*/
export class EvalInfoResponseHandler extends ResponseHandler {
static _parseSuccess(response) {
const data = response.data;
if (data.result) {
return data.result;
}
else {
            throw new Error(`Failed to get evaluation info: ${JSON.stringify(data)}`);
}
}
static _handleError(response) {
if (response.status === 400) {
            // Surface the server's error payload so callers can inspect what was rejected
            throw new Error(`Bad request: ${JSON.stringify(response.data)}`);
}
if (response.status === 403) {
throw new InvalidAuthError();
}
throw new Error(`Failed to get evaluation info: ${response.status}`);
}
}
/**
* Client for evaluating LLM test cases
*/
export class Evaluator extends APIKeyAuth {
constructor(options = {}) {
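        // Note: environment variables (FI_API_KEY, FI_SECRET_KEY, FI_BASE_URL) take
        // precedence over the corresponding options when both are provided.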
const fiApiKey = process.env.FI_API_KEY || options.fiApiKey;
const fiSecretKey = process.env.FI_SECRET_KEY || options.fiSecretKey;
const fiBaseUrl = process.env.FI_BASE_URL || options.fiBaseUrl;
super({ ...options, fiApiKey, fiSecretKey, fiBaseUrl });
this.evalInfoCache = new Map();
this.maxWorkers = options.maxWorkers || 8;
}
    async evaluate(evalTemplates, inputs, options = {}) {
const { timeout, modelName, customEvalName } = options;
if (!modelName || modelName.trim() === "") {
throw new TypeError("'modelName' is a required option and must be a non-empty string.");
}
let traceEval = options.traceEval || false;
let spanId = undefined;
const extractName = (t) => {
if (typeof t === 'string') {
return t;
}
if (typeof t === 'object' && t.eval_name) {
return t.eval_name;
}
return undefined;
};
const firstTemplate = Array.isArray(evalTemplates) ? evalTemplates[0] : evalTemplates;
const evalName = extractName(firstTemplate);
if (!evalName) {
throw new TypeError("Unsupported eval_templates argument. Expect eval template class/obj or name str.");
}
        // Optional OpenTelemetry integration: link this evaluation to the active span,
        // unless a custom eval configuration with the same name already exists.
if (traceEval) {
if (!customEvalName) {
traceEval = false;
console.warn("Failed to trace the evaluation. Please set the customEvalName.");
}
else {
try {
// Dynamically import to avoid making OTEL a hard dependency
const otel = await import('@opentelemetry/api');
const { checkCustomEvalConfigExists } = await import('@traceai/fi-core');
const currentSpan = otel.trace.getSpan(otel.context.active());
if (currentSpan && currentSpan.isRecording()) {
const spanContext = currentSpan.spanContext();
if (otel.isSpanContextValid(spanContext)) {
spanId = spanContext.spanId;
// Accessing the resource is not part of the public API interface,
// but is available on SDK implementations. This mirrors the Python SDK's approach.
const tracerProvider = otel.trace.getTracerProvider();
// @ts-ignore
                            const resource = tracerProvider.resource || (currentSpan && currentSpan.resource);
let projectName = resource?.attributes['project_name'];
if (!projectName) {
// Fallback to standard OTEL service.name if custom attribute is absent
projectName = resource?.attributes['service.name'];
}
if (projectName) {
const evalTags = [{
custom_eval_name: customEvalName,
eval_name: evalName,
mapping: {},
config: {},
}];
const customEvalExists = await checkCustomEvalConfigExists(projectName, evalTags);
if (customEvalExists) {
traceEval = false;
console.warn("Failed to trace the evaluation. Custom eval configuration with the same name already exists for this project");
}
}
else {
traceEval = false;
console.warn("Could not determine project_name from OpenTelemetry context. " +
"Skipping check for existing custom eval configuration.");
}
}
}
}
catch (error) {
console.warn("OpenTelemetry API not found. Please install '@opentelemetry/api' to enable tracing. " +
"Skipping trace for this evaluation.", error);
traceEval = false;
}
}
}
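        // Normalize inputs: every key must map to an array of strings in the API
        // payload, so bare string values are wrapped in a single-element array.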
const transformedApiInputs = {};
if (Array.isArray(inputs)) {
            // Explicitly disallow an array of test cases; 'inputs' must be a single object
            throw new TypeError("'inputs' must be an object mapping keys to a string or an array of strings; an array of objects is not supported.");
}
for (const [key, value] of Object.entries(inputs)) {
if (Array.isArray(value)) {
if (!value.every(v => typeof v === "string")) {
throw new TypeError(`All values in array for key '${key}' must be strings.`);
}
transformedApiInputs[key] = value;
}
else if (typeof value === "string") {
transformedApiInputs[key] = [value];
}
else {
throw new TypeError(`Invalid input type for key '${key}'. Expected string or string[].`);
}
}
const finalApiPayload = {
eval_name: evalName,
inputs: transformedApiInputs,
model: modelName,
span_id: spanId,
custom_eval_name: customEvalName,
trace_eval: traceEval,
};
        // Convert timeout (seconds) to milliseconds for axios; fall back to the client's default timeout when none is provided.
const timeoutMs = timeout !== undefined ? timeout * 1000 : this.defaultTimeout * 1000;
try {
const response = await this.request({
method: HttpMethod.POST,
url: `${this.baseUrl}/${Routes.evaluatev2}`,
json: finalApiPayload,
timeout: timeoutMs,
}, EvalResponseHandler);
return response;
}
catch (error) {
console.error("Evaluation failed:", error);
throw error;
}
}
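    // Fetches the evaluation template list and caches the matching entry per
    // evaluation name for the lifetime of this Evaluator instance.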
async _get_eval_info(evalName) {
if (this.evalInfoCache.has(evalName)) {
return this.evalInfoCache.get(evalName);
}
const response = await this.request({
method: HttpMethod.GET,
url: `${this.baseUrl}/${Routes.get_eval_templates}`,
}, EvalInfoResponseHandler);
const evalInfo = response.find(item => item.name === evalName);
if (!evalInfo) {
throw new Error(`Evaluation template with name '${evalName}' not found`);
}
this.evalInfoCache.set(evalName, evalInfo);
return evalInfo;
}
async list_evaluations() {
const config = {
method: HttpMethod.GET,
url: `${this.baseUrl}/${Routes.get_eval_templates}`
};
const response = await this.request(config, EvalInfoResponseHandler);
return response;
}
}
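// Example usage of the Evaluator class. This is a sketch, not part of the original
// module: the API keys, input keys, and model name below are placeholders, and
// "Factual Accuracy" is used as an example evaluation name (see the evaluate() docs below).
//
//   const evaluator = new Evaluator({ fiApiKey: "...", fiSecretKey: "..." });
//   const result = await evaluator.evaluate(
//       "Factual Accuracy",
//       { input: ["Question 1", "Question 2"], output: ["Answer 1", "Answer 2"] },
//       { modelName: "gpt-4o", timeout: 60 }
//   );
//   // result.eval_results is a flat array of evaluation objects (see EvalResponseHandler above).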
/**
* Convenience function to run a single or batch of evaluations.
* @param evalTemplates - Evaluation name string (e.g., "Factual Accuracy") or list of templates.
 * @param inputs - Test case inputs as an object mapping each key to a string or an array of strings (for batch runs).
 * @param options - Evaluation options; `modelName` is required.
* @returns BatchRunResult containing evaluation results.
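 * @example
 * // A minimal sketch; the input keys and model name here are placeholders.
 * const result = await evaluate(
 *     "Factual Accuracy",
 *     { input: "What is the capital of France?", output: "Paris" },
 *     { modelName: "gpt-4o" }
 * );
 * console.log(result.eval_results);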
*/
export const evaluate = (evalTemplates, inputs, options = {}) => {
const { fiApiKey, fiSecretKey, fiBaseUrl, ...restOptions } = options;
return new Evaluator({ fiApiKey, fiSecretKey, fiBaseUrl }).evaluate(evalTemplates, inputs, restOptions);
};
/**
* Convenience function to fetch information about all available evaluation templates.
 * @returns A list of evaluation template information objects.
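 * @example
 * // Sketch assuming FI_API_KEY and FI_SECRET_KEY are set in the environment.
 * const templates = await list_evaluations();
 * console.log(templates.map(t => t.name));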
*/
export const list_evaluations = (options = {}) => {
const { fiApiKey, fiSecretKey, fiBaseUrl } = options;
return new Evaluator({ fiApiKey, fiSecretKey, fiBaseUrl }).list_evaluations();
};
//# sourceMappingURL=evaluator.js.map