UNPKG

@future-agi/ai-evaluation

Version:

We help GenAI teams maintain high accuracy for their models in production.

256 lines 11.7 kB
import { APIKeyAuth, ResponseHandler, HttpMethod, Routes, InvalidAuthError } from '@future-agi/sdk';

/**
 * Renders a response payload for inclusion in an error message.
 *
 * Plain string interpolation turns object payloads into "[object Object]",
 * which hides the server's explanation; objects are serialized as JSON instead.
 *
 * @param {*} data - The `response.data` value to render.
 * @returns {string} A human-readable representation of `data`.
 */
function formatResponseData(data) {
    if (typeof data === 'string') {
        return data;
    }
    try {
        // JSON.stringify returns undefined for undefined/functions; fall back to String().
        return JSON.stringify(data) ?? String(data);
    }
    catch {
        // Circular structures or BigInt values cannot be serialized.
        return String(data);
    }
}

/**
 * Reduces an evaluation's raw metadata to the `usage`, `cost` and `explanation` keys.
 *
 * @param {*} rawMetadata - Metadata as an object or a JSON-encoded string.
 * @returns {object} An object with the three keys (each defaulting to `{}`), or an
 *   empty object when the metadata is missing or cannot be read as an object.
 */
function normalizeMetadata(rawMetadata) {
    const normalized = {};
    if (!rawMetadata) {
        return normalized;
    }
    let metadata = rawMetadata;
    if (typeof metadata === 'string') {
        try {
            metadata = JSON.parse(metadata);
        }
        catch {
            // Deliberate best effort: an unparsable string stays a string and is
            // rejected by the object check below, yielding empty metadata.
        }
    }
    if (metadata && typeof metadata === 'object') {
        normalized.usage = metadata.usage ?? {};
        normalized.cost = metadata.cost ?? {};
        normalized.explanation = metadata.explanation ?? {};
    }
    return normalized;
}

/**
 * Handles responses for evaluation requests
 */
export class EvalResponseHandler extends ResponseHandler {
    /**
     * Flattens every `result[].evaluations[]` entry of a successful response
     * into a single list.
     *
     * @param {object} response - Response whose `data.result` is expected to be an array.
     * @returns {{eval_results: object[]}} The flattened evaluation results; empty when
     *   the payload does not have the expected shape.
     */
    static _parseSuccess(response) {
        const data = response.data || {};
        const evalResults = [];
        if (!Array.isArray(data.result)) {
            return { eval_results: evalResults };
        }
        for (const result of data.result) {
            if (!result || !Array.isArray(result.evaluations)) {
                continue;
            }
            for (const evaluation of result.evaluations) {
                evalResults.push({
                    data: evaluation?.data,
                    failure: evaluation?.failure,
                    reason: evaluation?.reason ?? "",
                    runtime: evaluation?.runtime ?? 0,
                    metadata: normalizeMetadata(evaluation?.metadata),
                    metrics: Array.isArray(evaluation?.metrics)
                        // Optional chaining keeps a null metric entry from aborting the whole parse.
                        ? evaluation.metrics.map((m) => ({ id: m?.id, value: m?.value }))
                        : [],
                });
            }
        }
        return { eval_results: evalResults };
    }

    /**
     * Converts a failed evaluation response into a thrown error.
     *
     * @param {object} response - Response carrying `status` and `data`.
     * @throws {InvalidAuthError} On HTTP 403.
     * @throws {Error} On any other status, including the response payload in the message.
     */
    static _handleError(response) {
        const details = formatResponseData(response.data);
        if (response.status === 400) {
            throw new Error(`Evaluation failed with a 400 Bad Request. Please check your input data and evaluation configuration. Response: ${details}`);
        }
        if (response.status === 403) {
            throw new InvalidAuthError();
        }
        throw new Error(`Error in evaluation: ${response.status}, response: ${details}`);
    }
}

/**
 * Handles responses for evaluation info requests
 */
export class EvalInfoResponseHandler extends ResponseHandler {
    /**
     * Extracts the `result` field of a successful evaluation-info response.
     *
     * @param {object} response - Response whose `data.result` holds the payload.
     * @returns {*} The value of `response.data.result`.
     * @throws {Error} When `result` is missing or falsy.
     */
    static _parseSuccess(response) {
        const data = response.data;
        // Optional chaining: a response without a body should produce the
        // descriptive error below rather than a property-access TypeError.
        if (data?.result) {
            return data.result;
        }
        throw new Error(`Failed to get evaluation info: ${formatResponseData(data)}`);
    }

    /**
     * Converts a failed evaluation-info response into a thrown error.
     *
     * @param {object} response - Response carrying `status` and `data`.
     * @throws {InvalidAuthError} On HTTP 403.
     * @throws {Error} On any other status.
     */
    static _handleError(response) {
        if (response.status === 400) {
            throw new Error(`Bad request: ${formatResponseData(response.data)}`);
        }
        if (response.status === 403) {
            throw new InvalidAuthError();
        }
        throw new Error(`Failed to get evaluation info: ${response.status}`);
    }
}

/**
 * Client for evaluating LLM test cases
 */
export class Evaluator extends APIKeyAuth {
    /**
     * @param {object} [options] - Client options, forwarded to the base class.
     * @param {string} [options.fiApiKey] - API key; used only when FI_API_KEY is unset.
     * @param {string} [options.fiSecretKey] - Secret key; used only when FI_SECRET_KEY is unset.
     * @param {string} [options.fiBaseUrl] - Base URL; used only when FI_BASE_URL is unset.
     * @param {number} [options.maxWorkers] - Stored on the instance; defaults to 8.
     */
    constructor(options = {}) {
        // Environment variables take precedence over explicitly passed options.
        const fiApiKey = process.env.FI_API_KEY || options.fiApiKey;
        const fiSecretKey = process.env.FI_SECRET_KEY || options.fiSecretKey;
        const fiBaseUrl = process.env.FI_BASE_URL || options.fiBaseUrl;
        super({ ...options, fiApiKey, fiSecretKey, fiBaseUrl });
        this.evalInfoCache = new Map();
        this.maxWorkers = options.maxWorkers || 8;
    }

    /**
     * Runs one evaluation template against the given inputs.
     *
     * Only the first entry is used when `evalTemplates` is an array.
     *
     * @param {string|object|Array<string|object>} evalTemplates - Template name, or an
     *   object exposing `eval_name`, or an array of those.
     * @param {Object<string, string|string[]>} inputs - Input values keyed by name;
     *   a lone string is wrapped in a one-element array.
     * @param {object} options - Evaluation options.
     * @param {string} options.modelName - Required, non-empty model name.
     * @param {number} [options.timeout] - Timeout in seconds; falls back to `this.defaultTimeout`.
     * @param {string} [options.customEvalName] - Required for tracing to take effect.
     * @param {boolean} [options.traceEval] - Whether to link the evaluation to the active span.
     * @returns {Promise<{eval_results: object[]}>} The parsed evaluation results.
     * @throws {TypeError} When `modelName`, `evalTemplates` or `inputs` is invalid.
     */
    async evaluate(evalTemplates, inputs, options = {}) {
        // Tolerate an explicit null so the modelName check reports the real problem.
        const opts = options ?? {};
        const { timeout, modelName, customEvalName } = opts;
        if (typeof modelName !== 'string' || modelName.trim() === "") {
            throw new TypeError("'modelName' is a required option and must be a non-empty string.");
        }
        let traceEval = opts.traceEval || false;
        let spanId = undefined;

        const extractName = (t) => {
            if (typeof t === 'string') {
                return t;
            }
            // `typeof null` is 'object', so guard the property access.
            if (typeof t === 'object' && t?.eval_name) {
                return t.eval_name;
            }
            return undefined;
        };
        const firstTemplate = Array.isArray(evalTemplates) ? evalTemplates[0] : evalTemplates;
        const evalName = extractName(firstTemplate);
        if (!evalName) {
            throw new TypeError("Unsupported eval_templates argument. Expect eval template class/obj or name str.");
        }

        // OpenTelemetry logic
        if (traceEval) {
            if (!customEvalName) {
                traceEval = false;
                console.warn("Failed to trace the evaluation. Please set the customEvalName.");
            }
            else {
                try {
                    // Dynamically import to avoid making OTEL a hard dependency
                    const otel = await import('@opentelemetry/api');
                    const { checkCustomEvalConfigExists } = await import('@traceai/fi-core');
                    const currentSpan = otel.trace.getSpan(otel.context.active());
                    if (currentSpan && currentSpan.isRecording()) {
                        const spanContext = currentSpan.spanContext();
                        if (otel.isSpanContextValid(spanContext)) {
                            spanId = spanContext.spanId;
                            // Accessing the resource is not part of the public API interface,
                            // but is available on SDK implementations.
                            const tracerProvider = otel.trace.getTracerProvider();
                            const resource = tracerProvider.resource || currentSpan.resource;
                            // `attributes` may be absent on a non-SDK resource; without the
                            // second `?.` that case would be misreported by the catch below.
                            const projectName = resource?.attributes?.['project_name']
                                // Fallback to standard OTEL service.name if custom attribute is absent
                                || resource?.attributes?.['service.name'];
                            if (projectName) {
                                const evalTags = [{
                                        custom_eval_name: customEvalName,
                                        eval_name: evalName,
                                        mapping: {},
                                        config: {},
                                    }];
                                const customEvalExists = await checkCustomEvalConfigExists(projectName, evalTags);
                                if (customEvalExists) {
                                    traceEval = false;
                                    console.warn("Failed to trace the evaluation. Custom eval configuration with the same name already exists for this project");
                                }
                            }
                            else {
                                traceEval = false;
                                console.warn("Could not determine project_name from OpenTelemetry context. " +
                                    "Skipping check for existing custom eval configuration.");
                            }
                        }
                    }
                }
                catch (error) {
                    // Any failure in the block above (missing packages or a failed
                    // config lookup) disables tracing instead of failing the evaluation.
                    console.warn("OpenTelemetry API not found. Please install '@opentelemetry/api' to enable tracing. " +
                        "Skipping trace for this evaluation.", error);
                    traceEval = false;
                }
            }
        }

        if (Array.isArray(inputs)) {
            // Explicitly disallow array-of-dicts per spec
            throw new TypeError("'inputs' must be a dictionary, array-of-dicts is not supported.");
        }
        if (inputs === null || typeof inputs !== 'object') {
            // Object.entries would throw an opaque error on null/undefined and would
            // silently split a bare string into per-character keys.
            throw new TypeError("'inputs' must be a dictionary mapping each key to a string or string[].");
        }
        const transformedApiInputs = {};
        for (const [key, value] of Object.entries(inputs)) {
            if (Array.isArray(value)) {
                if (!value.every((v) => typeof v === "string")) {
                    throw new TypeError(`All values in array for key '${key}' must be strings.`);
                }
                transformedApiInputs[key] = value;
            }
            else if (typeof value === "string") {
                transformedApiInputs[key] = [value];
            }
            else {
                throw new TypeError(`Invalid input type for key '${key}'. Expected string or string[].`);
            }
        }

        const finalApiPayload = {
            eval_name: evalName,
            inputs: transformedApiInputs,
            model: modelName,
            span_id: spanId,
            custom_eval_name: customEvalName,
            trace_eval: traceEval,
        };
        // Convert timeout (seconds) to milliseconds. `??` makes an explicit null fall
        // back to the instance default instead of becoming a 0 ms timeout.
        const timeoutMs = (timeout ?? this.defaultTimeout) * 1000;
        try {
            return await this.request({
                method: HttpMethod.POST,
                url: `${this.baseUrl}/${Routes.evaluatev2}`,
                json: finalApiPayload,
                timeout: timeoutMs,
            }, EvalResponseHandler);
        }
        catch (error) {
            console.error("Evaluation failed:", error);
            throw error;
        }
    }

    /**
     * Looks up a single evaluation template by name, caching hits per instance.
     *
     * @param {string} evalName - Template name to find.
     * @returns {Promise<object>} The matching template entry.
     * @throws {Error} When no template has that name.
     */
    async _get_eval_info(evalName) {
        if (this.evalInfoCache.has(evalName)) {
            return this.evalInfoCache.get(evalName);
        }
        const response = await this.request({
            method: HttpMethod.GET,
            url: `${this.baseUrl}/${Routes.get_eval_templates}`,
        }, EvalInfoResponseHandler);
        const evalInfo = response.find((item) => item.name === evalName);
        if (!evalInfo) {
            throw new Error(`Evaluation template with name '${evalName}' not found`);
        }
        this.evalInfoCache.set(evalName, evalInfo);
        return evalInfo;
    }

    /**
     * Fetches the evaluation templates exposed by the API.
     *
     * @returns {Promise<*>} The `result` payload of the templates endpoint.
     */
    async list_evaluations() {
        const config = {
            method: HttpMethod.GET,
            url: `${this.baseUrl}/${Routes.get_eval_templates}`,
        };
        return this.request(config, EvalInfoResponseHandler);
    }
}

/**
 * Convenience function that builds a one-off Evaluator and runs an evaluation.
 * @param evalTemplates - Evaluation name string (e.g., "Factual Accuracy") or list of templates.
 * @param inputs - Dictionary mapping input names to a string or an array of strings.
 * @param options - Credentials (`fiApiKey`, `fiSecretKey`, `fiBaseUrl`) plus the
 *   evaluation options accepted by `Evaluator.evaluate`; `modelName` is required.
 * @returns Promise resolving to the parsed evaluation results.
 */
export const evaluate = (evalTemplates, inputs, options = {}) => {
    // Defaulting `options` lets a missing argument surface as the descriptive
    // modelName TypeError from Evaluator.evaluate rather than a destructuring error.
    const { fiApiKey, fiSecretKey, fiBaseUrl, ...restOptions } = options ?? {};
    return new Evaluator({ fiApiKey, fiSecretKey, fiBaseUrl }).evaluate(evalTemplates, inputs, restOptions);
};

/**
 * Convenience function to fetch information about all available evaluation templates.
 * @param options - Optional credentials (`fiApiKey`, `fiSecretKey`, `fiBaseUrl`).
 * @returns Promise resolving to the evaluation template information returned by the API.
 */
export const list_evaluations = (options = {}) => {
    const { fiApiKey, fiSecretKey, fiBaseUrl } = options;
    return new Evaluator({ fiApiKey, fiSecretKey, fiBaseUrl }).list_evaluations();
};
//# sourceMappingURL=evaluator.js.map