@future-agi/ai-evaluation

We help GenAI teams maintain high accuracy for their models in production.

import { APIKeyAuth, ResponseHandler, HttpMethod, Routes, InvalidAuthError } from '@future-agi/sdk';

/**
 * Handles responses for evaluation requests
 */
export class EvalResponseHandler extends ResponseHandler {
    static _parseSuccess(response) {
        const data = response.data || {};
        const evalResults = [];
        if (Array.isArray(data.result)) {
            for (const result of data.result) {
                if (result && Array.isArray(result.evaluations)) {
                    for (const evaluation of result.evaluations) {
                        const newMetadata = {};
                        if (evaluation?.metadata) {
                            let metadata = evaluation.metadata;
                            if (typeof metadata === "string") {
                                try {
                                    metadata = JSON.parse(metadata);
                                } catch {
                                    /* ignore parse errors */
                                }
                            }
                            if (metadata && typeof metadata === "object") {
                                newMetadata["usage"] = metadata.usage ?? {};
                                newMetadata["cost"] = metadata.cost ?? {};
                                newMetadata["explanation"] = metadata.explanation ?? {};
                            }
                        }
                        evalResults.push({
                            data: evaluation?.data,
                            failure: evaluation?.failure,
                            reason: evaluation?.reason ?? "",
                            runtime: evaluation?.runtime ?? 0,
                            metadata: newMetadata,
                            metrics: Array.isArray(evaluation?.metrics)
                                ? evaluation.metrics.map((m) => ({ id: m.id, value: m.value }))
                                : [],
                        });
                    }
                }
            }
        }
        return { eval_results: evalResults };
    }

    static _handleError(response) {
        if (response.status === 400) {
            throw new Error(`Evaluation failed with a 400 Bad Request. Please check your input data and evaluation configuration.
Response: ${response.data}`);
        }
        else if (response.status === 403) {
            throw new InvalidAuthError();
        }
        else {
            throw new Error(`Error in evaluation: ${response.status}, response: ${response.data}`);
        }
    }
}

/**
 * Handles responses for evaluation info requests
 */
export class EvalInfoResponseHandler extends ResponseHandler {
    static _parseSuccess(response) {
        const data = response.data;
        if (data.result) {
            return data.result;
        }
        else {
            throw new Error(`Failed to get evaluation info: ${data}`);
        }
    }

    static _handleError(response) {
        if (response.status === 400) {
            // In TypeScript with axios, it's more common to let the caller handle response.data
            throw new Error(`Bad request: ${response.data}`);
        }
        if (response.status === 403) {
            throw new InvalidAuthError();
        }
        throw new Error(`Failed to get evaluation info: ${response.status}`);
    }
}

/**
 * Client for evaluating LLM test cases
 */
export class Evaluator extends APIKeyAuth {
    constructor(options = {}) {
        // Environment variables take precedence over explicitly passed options.
        const fiApiKey = process.env.FI_API_KEY || options.fiApiKey;
        const fiSecretKey = process.env.FI_SECRET_KEY || options.fiSecretKey;
        const fiBaseUrl = process.env.FI_BASE_URL || options.fiBaseUrl;
        super({ ...options, fiApiKey, fiSecretKey, fiBaseUrl });
        this.evalInfoCache = new Map();
        this.maxWorkers = options.maxWorkers || 8;
    }

    async evaluate(evalTemplates, inputs, options) {
        const { timeout, modelName, customEvalName } = options;
        if (!modelName || modelName.trim() === "") {
            throw new TypeError("'modelName' is a required option and must be a non-empty string.");
        }
        let traceEval = options.traceEval || false;
        let spanId = undefined;
        const extractName = (t) => {
            if (typeof t === 'string') {
                return t;
            }
            if (typeof t === 'object' && t.eval_name) {
                return t.eval_name;
            }
            return undefined;
        };
        const firstTemplate = Array.isArray(evalTemplates) ? evalTemplates[0] : evalTemplates;
        const evalName = extractName(firstTemplate);
        if (!evalName) {
            throw new TypeError("Unsupported evalTemplates argument. Expected an eval template class/object or a name string.");
        }
        // OpenTelemetry logic
        if (traceEval) {
            if (!customEvalName) {
                traceEval = false;
                console.warn("Failed to trace the evaluation. Please set the customEvalName.");
            }
            else {
                try {
                    // Dynamically import to avoid making OTEL a hard dependency
                    const otel = await import('@opentelemetry/api');
                    const { checkCustomEvalConfigExists } = await import('@traceai/fi-core');
                    const currentSpan = otel.trace.getSpan(otel.context.active());
                    if (currentSpan && currentSpan.isRecording()) {
                        const spanContext = currentSpan.spanContext();
                        if (otel.isSpanContextValid(spanContext)) {
                            spanId = spanContext.spanId;
                            // Accessing the resource is not part of the public API interface,
                            // but is available on SDK implementations. This mirrors the Python SDK's approach.
                            const tracerProvider = otel.trace.getTracerProvider();
                            // @ts-ignore
                            const resource = tracerProvider.resource || (currentSpan && currentSpan.resource);
                            let projectName = resource?.attributes['project_name'];
                            if (!projectName) {
                                // Fall back to the standard OTEL service.name if the custom attribute is absent
                                projectName = resource?.attributes['service.name'];
                            }
                            if (projectName) {
                                const evalTags = [{
                                    custom_eval_name: customEvalName,
                                    eval_name: evalName,
                                    mapping: {},
                                    config: {},
                                }];
                                const customEvalExists = await checkCustomEvalConfigExists(projectName, evalTags);
                                if (customEvalExists) {
                                    traceEval = false;
                                    console.warn("Failed to trace the evaluation. A custom eval configuration with the same name already exists for this project.");
                                }
                            }
                            else {
                                traceEval = false;
                                console.warn("Could not determine project_name from OpenTelemetry context. " +
                                    "Skipping check for existing custom eval configuration.");
                            }
                        }
                    }
                }
                catch (error) {
                    console.warn("OpenTelemetry API not found. Please install '@opentelemetry/api' to enable tracing. " +
                        "Skipping trace for this evaluation.", error);
                    traceEval = false;
                }
            }
        }
        const transformedApiInputs = {};
        if (Array.isArray(inputs)) {
            // Explicitly disallow array-of-dicts per spec
            throw new TypeError("'inputs' must be a dictionary; an array of dicts is not supported.");
        }
        for (const [key, value] of Object.entries(inputs)) {
            if (Array.isArray(value)) {
                if (!value.every((v) => typeof v === "string")) {
                    throw new TypeError(`All values in array for key '${key}' must be strings.`);
                }
                transformedApiInputs[key] = value;
            }
            else if (typeof value === "string") {
                // Single strings are normalized into one-element arrays for the API.
                transformedApiInputs[key] = [value];
            }
            else {
                throw new TypeError(`Invalid input type for key '${key}'. Expected string or string[].`);
            }
        }
        const finalApiPayload = {
            eval_name: evalName,
            inputs: transformedApiInputs,
            model: modelName,
            span_id: spanId,
            custom_eval_name: customEvalName,
            trace_eval: traceEval,
        };
        // Convert timeout (seconds) to milliseconds for axios. Use a higher default (200s) if not provided.
        const timeoutMs = timeout !== undefined ? timeout * 1000 : this.defaultTimeout * 1000;
        try {
            const response = await this.request({
                method: HttpMethod.POST,
                url: `${this.baseUrl}/${Routes.evaluatev2}`,
                json: finalApiPayload,
                timeout: timeoutMs,
            }, EvalResponseHandler);
            return response;
        }
        catch (error) {
            console.error("Evaluation failed:", error);
            throw error;
        }
    }

    async _get_eval_info(evalName) {
        // Template info is fetched once and cached per Evaluator instance.
        if (this.evalInfoCache.has(evalName)) {
            return this.evalInfoCache.get(evalName);
        }
        const response = await this.request({
            method: HttpMethod.GET,
            url: `${this.baseUrl}/${Routes.get_eval_templates}`,
        }, EvalInfoResponseHandler);
        const evalInfo = response.find((item) => item.name === evalName);
        if (!evalInfo) {
            throw new Error(`Evaluation template with name '${evalName}' not found`);
        }
        this.evalInfoCache.set(evalName, evalInfo);
        return evalInfo;
    }

    async list_evaluations() {
        const config = {
            method: HttpMethod.GET,
            url: `${this.baseUrl}/${Routes.get_eval_templates}`,
        };
        const response = await this.request(config, EvalInfoResponseHandler);
        return response;
    }
}

/**
 * Convenience function to run a single evaluation or a batch of evaluations.
 * @param evalTemplates - Evaluation name string (e.g., "Factual Accuracy") or list of templates.
 * @param inputs - Single test case or list of test cases as dictionaries.
 * @param options - Optional parameters for the evaluation.
 * @returns BatchRunResult containing evaluation results.
 */
export const evaluate = (evalTemplates, inputs, options) => {
    const { fiApiKey, fiSecretKey, fiBaseUrl, ...restOptions } = options;
    return new Evaluator({ fiApiKey, fiSecretKey, fiBaseUrl }).evaluate(evalTemplates, inputs, restOptions);
};

/**
 * Convenience function to fetch information about all available evaluation templates.
 * @returns A list of evaluation template information dictionaries.
 */
export const list_evaluations = (options = {}) => {
    const { fiApiKey, fiSecretKey, fiBaseUrl } = options;
    return new Evaluator({ fiApiKey, fiSecretKey, fiBaseUrl }).list_evaluations();
};
//# sourceMappingURL=evaluator.js.map
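
A minimal usage sketch for the two convenience exports above. The template name "Factual Accuracy" is taken from the JSDoc example in this file; the input keys (input, output) and the model name are illustrative placeholders, not names defined by this module, so substitute whatever your chosen template and deployment expect. Credentials may be supplied via options or via the FI_API_KEY, FI_SECRET_KEY, and FI_BASE_URL environment variables.

// --- Usage sketch (illustrative; not shipped with the package) ---
import { evaluate, list_evaluations } from '@future-agi/ai-evaluation';

async function main() {
    // Run a single evaluation. 'inputs' must be an object whose values are
    // strings or string arrays; single strings are wrapped into one-element
    // arrays before being sent to the API.
    const result = await evaluate(
        "Factual Accuracy", // template name string, or an object with an eval_name field
        {
            input: "What is the capital of France?", // placeholder key
            output: "Paris",                         // placeholder key
        },
        {
            modelName: "gpt-4o", // required and non-empty; placeholder model name
            timeout: 60,         // seconds; converted to milliseconds internally
        }
    );

    // Each entry mirrors the shape built in EvalResponseHandler._parseSuccess.
    for (const r of result.eval_results) {
        console.log(r.reason, r.runtime, r.metadata.usage, r.metrics);
    }

    // Discover the available evaluation templates by name.
    const templates = await list_evaluations();
    console.log(templates.map((t) => t.name));
}

main().catch(console.error);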
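One behavior worth noting from the Evaluator constructor: process.env values win over explicitly passed options (FI_API_KEY || options.fiApiKey), so per-call keys only take effect when the corresponding environment variables are unset.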