axiom

Axiom AI SDK provides an API to wrap your AI calls with observability instrumentation.
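For orientation, here is a minimal sketch of how this evals entry point might be used. The import path (`axiom/evals`), the file name, and the scorer implementation are assumptions for illustration; only the `experimental_Eval` export, the `data`/`task`/`scorers`/`threshold` options, and the `AXIOM_TOKEN`/`AXIOM_DATASET` environment variables come from the source below.

// capitals.eval.ts — hypothetical example; run it with vitest so that the
// describe/it/afterAll hooks used internally by experimental_Eval are available.
// Traces are exported using the AXIOM_TOKEN and AXIOM_DATASET env vars (and optionally AXIOM_URL).
import { experimental_Eval as Eval } from "axiom/evals"; // import path is an assumption

const capitals: Record<string, string> = { France: "Paris", Japan: "Tokyo" };

// A scorer receives { input, output, expected } and returns { name, score }.
const exactMatch = async ({ output, expected }: { input: string; output: unknown; expected: string }) => ({
  name: "exact-match",
  score: output === expected ? 1 : 0,
});

Eval("capital-cities", {
  // Each case becomes a `case` span under the suite's `eval` span.
  data: async () => [
    { input: "France", expected: "Paris" },
    { input: "Japan", expected: "Tokyo" },
  ],
  // The task under test; it may return a value or an async iterable of chunks.
  task: async (input: string) => capitals[input] ?? "unknown",
  scorers: [exactMatch],
  threshold: 1, // scores below this mark the score span as ERROR
});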

evals.cjs — 642 lines (632 loc), 24 kB
"use strict"; var __defProp = Object.defineProperty; var __getOwnPropDesc = Object.getOwnPropertyDescriptor; var __getOwnPropNames = Object.getOwnPropertyNames; var __hasOwnProp = Object.prototype.hasOwnProperty; var __export = (target, all) => { for (var name in all) __defProp(target, name, { get: all[name], enumerable: true }); }; var __copyProps = (to, from, except, desc) => { if (from && typeof from === "object" || typeof from === "function") { for (let key of __getOwnPropNames(from)) if (!__hasOwnProp.call(to, key) && key !== except) __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable }); } return to; }; var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod); // src/evals.ts var evals_exports = {}; __export(evals_exports, { experimental_AxiomReporter: () => AxiomReporter, experimental_Eval: () => Eval }); module.exports = __toCommonJS(evals_exports); // src/evals/eval.ts var import_vitest = require("vitest"); var import_api2 = require("@opentelemetry/api"); var import_nanoid = require("nanoid"); // src/otel/semconv/attributes.ts var import_semantic_conventions = require("@opentelemetry/semantic-conventions"); // src/otel/semconv/eval_proposal.ts var ATTR_EVAL_ID = "eval.id"; var ATTR_EVAL_NAME = "eval.name"; var ATTR_EVAL_TYPE = "eval.type"; var ATTR_EVAL_TAGS = "eval.tags"; var ATTR_EVAL_VERSION = "eval.version"; var ATTR_EVAL_TRIALS = "eval.trials"; var ATTR_EVAL_BASE_ID = "eval.base_id"; var ATTR_EVAL_BASE_NAME = "eval.base_name"; var ATTR_EVAL_COLLECTION_ID = "eval.collection.id"; var ATTR_EVAL_COLLECTION_SIZE = "eval.collection.size"; var ATTR_EVAL_COLLECTION_NAME = "eval.collection.name"; var ATTR_EVAL_COLLECTION_SPLIT = "eval.collection.split"; var ATTR_EVAL_CASE_ID = "eval.case.id"; var ATTR_EVAL_CASE_INDEX = "eval.case.index"; var ATTR_EVAL_CASE_INPUT = "eval.case.input"; var ATTR_EVAL_CASE_OUTPUT = "eval.case.output"; var ATTR_EVAL_CASE_EXPECTED = "eval.case.expected"; var ATTR_EVAL_CASE_SCORES = "eval.case.scores"; var ATTR_EVAL_CASE_METADATA = "eval.case.metadata"; var ATTR_EVAL_TASK_OUTPUT = "eval.task.output"; var ATTR_EVAL_TASK_NAME = "eval.task.name"; var ATTR_EVAL_TASK_TYPE = "eval.task.type"; var ATTR_EVAL_TASK_TRIAL = "eval.task.trial"; var ATTR_EVAL_SCORE_NAME = "eval.score.name"; var ATTR_EVAL_SCORE_VALUE = "eval.score.value"; var ATTR_EVAL_SCORE_THRESHOLD = "eval.score.threshold"; var ATTR_EVAL_SCORE_PASSED = "eval.score.passed"; var ATTR_EVAL_SCORE_METADATA = "eval.score.metadata"; var ATTR_EVAL_USER_NAME = "eval.user.name"; var ATTR_EVAL_USER_EMAIL = "eval.user.email"; // src/otel/semconv/semconv_incubating.ts var ATTR_ERROR_MESSAGE = "error.message"; var ATTR_GEN_AI_AGENT_DESCRIPTION = "gen_ai.agent.description"; var ATTR_GEN_AI_AGENT_ID = "gen_ai.agent.id"; var ATTR_GEN_AI_AGENT_NAME = "gen_ai.agent.name"; var ATTR_GEN_AI_COMPLETION = "gen_ai.completion"; var ATTR_GEN_AI_CONVERSATION_ID = "gen_ai.conversation.id"; var ATTR_GEN_AI_DATA_SOURCE_ID = "gen_ai.data_source.id"; var ATTR_GEN_AI_OPERATION_NAME = "gen_ai.operation.name"; var GEN_AI_OPERATION_NAME_VALUE_CHAT = "chat"; var GEN_AI_OPERATION_NAME_VALUE_CREATE_AGENT = "create_agent"; var GEN_AI_OPERATION_NAME_VALUE_EMBEDDINGS = "embeddings"; var GEN_AI_OPERATION_NAME_VALUE_EXECUTE_TOOL = "execute_tool"; var GEN_AI_OPERATION_NAME_VALUE_GENERATE_CONTENT = "generate_content"; var GEN_AI_OPERATION_NAME_VALUE_INVOKE_AGENT = "invoke_agent"; var ATTR_GEN_AI_OUTPUT_TYPE = "gen_ai.output.type"; var 
GEN_AI_OUTPUT_TYPE_VALUE_IMAGE = "image"; var GEN_AI_OUTPUT_TYPE_VALUE_JSON = "json"; var GEN_AI_OUTPUT_TYPE_VALUE_SPEECH = "speech"; var GEN_AI_OUTPUT_TYPE_VALUE_TEXT = "text"; var ATTR_GEN_AI_PROMPT = "gen_ai.prompt"; var ATTR_GEN_AI_REQUEST_CHOICE_COUNT = "gen_ai.request.choice.count"; var ATTR_GEN_AI_REQUEST_ENCODING_FORMATS = "gen_ai.request.encoding_formats"; var ATTR_GEN_AI_REQUEST_FREQUENCY_PENALTY = "gen_ai.request.frequency_penalty"; var ATTR_GEN_AI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens"; var ATTR_GEN_AI_REQUEST_MODEL = "gen_ai.request.model"; var ATTR_GEN_AI_REQUEST_PRESENCE_PENALTY = "gen_ai.request.presence_penalty"; var ATTR_GEN_AI_REQUEST_SEED = "gen_ai.request.seed"; var ATTR_GEN_AI_REQUEST_STOP_SEQUENCES = "gen_ai.request.stop_sequences"; var ATTR_GEN_AI_REQUEST_TEMPERATURE = "gen_ai.request.temperature"; var ATTR_GEN_AI_REQUEST_TOP_K = "gen_ai.request.top_k"; var ATTR_GEN_AI_REQUEST_TOP_P = "gen_ai.request.top_p"; var ATTR_GEN_AI_RESPONSE_FINISH_REASONS = "gen_ai.response.finish_reasons"; var ATTR_GEN_AI_RESPONSE_ID = "gen_ai.response.id"; var ATTR_GEN_AI_RESPONSE_MODEL = "gen_ai.response.model"; var ATTR_GEN_AI_SYSTEM = "gen_ai.system"; var GEN_AI_SYSTEM_VALUE_ANTHROPIC = "anthropic"; var GEN_AI_SYSTEM_VALUE_AWS_BEDROCK = "aws.bedrock"; var GEN_AI_SYSTEM_VALUE_AZURE_AI_INFERENCE = "azure.ai.inference"; var GEN_AI_SYSTEM_VALUE_AZURE_AI_OPENAI = "azure.ai.openai"; var GEN_AI_SYSTEM_VALUE_COHERE = "cohere"; var GEN_AI_SYSTEM_VALUE_DEEPSEEK = "deepseek"; var GEN_AI_SYSTEM_VALUE_GCP_GEMINI = "gcp.gemini"; var GEN_AI_SYSTEM_VALUE_GCP_GEN_AI = "gcp.gen_ai"; var GEN_AI_SYSTEM_VALUE_GCP_VERTEX_AI = "gcp.vertex_ai"; var GEN_AI_SYSTEM_VALUE_GROQ = "groq"; var GEN_AI_SYSTEM_VALUE_IBM_WATSONX_AI = "ibm.watsonx.ai"; var GEN_AI_SYSTEM_VALUE_MISTRAL_AI = "mistral_ai"; var GEN_AI_SYSTEM_VALUE_OPENAI = "openai"; var GEN_AI_SYSTEM_VALUE_PERPLEXITY = "perplexity"; var GEN_AI_SYSTEM_VALUE_XAI = "xai"; var ATTR_GEN_AI_TOOL_CALL_ID = "gen_ai.tool.call.id"; var ATTR_GEN_AI_TOOL_DESCRIPTION = "gen_ai.tool.description"; var ATTR_GEN_AI_TOOL_NAME = "gen_ai.tool.name"; var ATTR_GEN_AI_TOOL_TYPE = "gen_ai.tool.type"; var ATTR_GEN_AI_USAGE_INPUT_TOKENS = "gen_ai.usage.input_tokens"; var ATTR_GEN_AI_USAGE_OUTPUT_TOKENS = "gen_ai.usage.output_tokens"; // src/otel/semconv/attributes.ts var Attr = { Axiom: { GenAI: { SchemaURL: "axiom.gen_ai.schema_url", SDK: { Name: "axiom.gen_ai.sdk.name", Version: "axiom.gen_ai.sdk.version" } } }, GenAI: { PromptMetadata: { ID: "axiom.gen_ai.prompt.id", Name: "axiom.gen_ai.prompt.name", Slug: "axiom.gen_ai.prompt.slug", Version: "axiom.gen_ai.prompt.version" }, /** * These two are used to identify the span */ Capability: { Name: "gen_ai.capability.name" // proprietary to axiom-ai }, Step: { Name: "gen_ai.step.name" // proprietary to axiom-ai }, /** * Regular attributes */ Agent: { Description: ATTR_GEN_AI_AGENT_DESCRIPTION, // not yet used by axiom-ai ID: ATTR_GEN_AI_AGENT_ID, // not yet used by axiom-ai Name: ATTR_GEN_AI_AGENT_NAME // not yet used by axiom-ai }, Completion: ATTR_GEN_AI_COMPLETION, // OTel suggests to use events API for this now Conversation: { ID: ATTR_GEN_AI_CONVERSATION_ID // not yet used by axiom-ai, anyway probably needs to be provided by user }, DataSource: { ID: ATTR_GEN_AI_DATA_SOURCE_ID // not used in axiom-ai yet }, Operation: { Name: ATTR_GEN_AI_OPERATION_NAME, Name_Values: { /** * Note that "text_completion" is deprecated in favor of "chat" for both OpenAI and Anthropic */ Chat: GEN_AI_OPERATION_NAME_VALUE_CHAT, CreateAgent: 
GEN_AI_OPERATION_NAME_VALUE_CREATE_AGENT, Embeddings: GEN_AI_OPERATION_NAME_VALUE_EMBEDDINGS, ExecuteTool: GEN_AI_OPERATION_NAME_VALUE_EXECUTE_TOOL, GenerateContent: GEN_AI_OPERATION_NAME_VALUE_GENERATE_CONTENT, InvokeAgent: GEN_AI_OPERATION_NAME_VALUE_INVOKE_AGENT } }, Output: { Type: ATTR_GEN_AI_OUTPUT_TYPE, Type_Values: { Text: GEN_AI_OUTPUT_TYPE_VALUE_TEXT, Json: GEN_AI_OUTPUT_TYPE_VALUE_JSON, Image: GEN_AI_OUTPUT_TYPE_VALUE_IMAGE, Speech: GEN_AI_OUTPUT_TYPE_VALUE_SPEECH } }, /** * The provider that is hosting the model, eg AWS Bedrock * There doesn't seem to be a semconv for this */ Prompt: ATTR_GEN_AI_PROMPT, // OTel suggests to use the events api for this Request: { ChoiceCount: ATTR_GEN_AI_REQUEST_CHOICE_COUNT, // not yet used by axiom-ai EncodingFormats: ATTR_GEN_AI_REQUEST_ENCODING_FORMATS, // not yet used by axiom-ai FrequencyPenalty: ATTR_GEN_AI_REQUEST_FREQUENCY_PENALTY, MaxTokens: ATTR_GEN_AI_REQUEST_MAX_TOKENS, /** * The model you asked for */ Model: ATTR_GEN_AI_REQUEST_MODEL, PresencePenalty: ATTR_GEN_AI_REQUEST_PRESENCE_PENALTY, Seed: ATTR_GEN_AI_REQUEST_SEED, StopSequences: ATTR_GEN_AI_REQUEST_STOP_SEQUENCES, Temperature: ATTR_GEN_AI_REQUEST_TEMPERATURE, TopK: ATTR_GEN_AI_REQUEST_TOP_K, TopP: ATTR_GEN_AI_REQUEST_TOP_P }, Response: { FinishReasons: ATTR_GEN_AI_RESPONSE_FINISH_REASONS, ID: ATTR_GEN_AI_RESPONSE_ID, /** * The model that was actually used (might be different bc routing) - only ever get this from the response, otherwise omit */ Model: ATTR_GEN_AI_RESPONSE_MODEL // somehow not landing on the span for google models? check up on this... }, /** * From OTel docs: * ``` * Multiple systems, including Azure OpenAI and Gemini, are accessible * by OpenAI client libraries. In such cases, the gen_ai.system is set * to openai based on the instrumentation's best knowledge, instead of * the actual system. 
* ``` */ System: ATTR_GEN_AI_SYSTEM, // not yet used by axiom-ai System_Values: { Anthropic: GEN_AI_SYSTEM_VALUE_ANTHROPIC, AWSBedrock: GEN_AI_SYSTEM_VALUE_AWS_BEDROCK, AzureAIInference: GEN_AI_SYSTEM_VALUE_AZURE_AI_INFERENCE, AzureAIOpenAI: GEN_AI_SYSTEM_VALUE_AZURE_AI_OPENAI, Cohere: GEN_AI_SYSTEM_VALUE_COHERE, Deepseek: GEN_AI_SYSTEM_VALUE_DEEPSEEK, GCPGemini: GEN_AI_SYSTEM_VALUE_GCP_GEMINI, GCPGenAI: GEN_AI_SYSTEM_VALUE_GCP_GEN_AI, GCPVertexAI: GEN_AI_SYSTEM_VALUE_GCP_VERTEX_AI, Groq: GEN_AI_SYSTEM_VALUE_GROQ, IBMWatsonxAI: GEN_AI_SYSTEM_VALUE_IBM_WATSONX_AI, MistralAI: GEN_AI_SYSTEM_VALUE_MISTRAL_AI, OpenAI: GEN_AI_SYSTEM_VALUE_OPENAI, Perplexity: GEN_AI_SYSTEM_VALUE_PERPLEXITY, XAI: GEN_AI_SYSTEM_VALUE_XAI }, Tool: { CallID: ATTR_GEN_AI_TOOL_CALL_ID, Description: ATTR_GEN_AI_TOOL_DESCRIPTION, Name: ATTR_GEN_AI_TOOL_NAME, Type: ATTR_GEN_AI_TOOL_TYPE, /** * Note, OTel Semantic Convention puts these on `gen_ai.choice` events * @see https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-events/#event-gen_aichoice */ Arguments: "gen_ai.tool.arguments", /** * Note, OTel Semantic Convention puts these on `gen_ai.tool.message` events */ Message: "gen_ai.tool.message" }, Usage: { InputTokens: ATTR_GEN_AI_USAGE_INPUT_TOKENS, OutputTokens: ATTR_GEN_AI_USAGE_OUTPUT_TOKENS } }, Eval: { ID: ATTR_EVAL_ID, Name: ATTR_EVAL_NAME, Type: ATTR_EVAL_TYPE, Version: ATTR_EVAL_VERSION, BaseID: ATTR_EVAL_BASE_ID, BaseName: ATTR_EVAL_BASE_NAME, Trials: ATTR_EVAL_TRIALS, Tags: ATTR_EVAL_TAGS, Collection: { ID: ATTR_EVAL_COLLECTION_ID, Name: ATTR_EVAL_COLLECTION_NAME, Split: ATTR_EVAL_COLLECTION_SPLIT, Size: ATTR_EVAL_COLLECTION_SIZE }, Case: { ID: ATTR_EVAL_CASE_ID, Index: ATTR_EVAL_CASE_INDEX, Input: ATTR_EVAL_CASE_INPUT, Output: ATTR_EVAL_CASE_OUTPUT, Expected: ATTR_EVAL_CASE_EXPECTED, Scores: ATTR_EVAL_CASE_SCORES, Metadata: ATTR_EVAL_CASE_METADATA }, Task: { Output: ATTR_EVAL_TASK_OUTPUT, Name: ATTR_EVAL_TASK_NAME, Type: ATTR_EVAL_TASK_TYPE, Trial: ATTR_EVAL_TASK_TRIAL }, Score: { Name: ATTR_EVAL_SCORE_NAME, Value: ATTR_EVAL_SCORE_VALUE, Threshold: ATTR_EVAL_SCORE_THRESHOLD, Passed: ATTR_EVAL_SCORE_PASSED, Metadata: ATTR_EVAL_SCORE_METADATA }, User: { Name: ATTR_EVAL_USER_NAME, Email: ATTR_EVAL_USER_EMAIL } }, Error: { Type: import_semantic_conventions.ATTR_ERROR_TYPE, Message: ATTR_ERROR_MESSAGE }, HTTP: { Response: { StatusCode: import_semantic_conventions.ATTR_HTTP_RESPONSE_STATUS_CODE } } }; // src/evals/instrument.ts var import_sdk_trace_node = require("@opentelemetry/sdk-trace-node"); var import_resources = require("@opentelemetry/resources"); var import_exporter_trace_otlp_http = require("@opentelemetry/exporter-trace-otlp-http"); var import_api = require("@opentelemetry/api"); var collectorOptions = { url: process.env.AXIOM_URL ? 
`${process.env.AXIOM_URL}/v1/traces` : "https://api.axiom.co/v1/traces", // Axiom API endpoint for trace data headers: { Authorization: `Bearer ${process.env.AXIOM_TOKEN}`, // Replace API_TOKEN with your actual API token "X-Axiom-Dataset": process.env.AXIOM_DATASET || "" // Replace DATASET_NAME with your dataset }, concurrencyLimit: 10 // an optional limit on pending requests }; var exporter = new import_exporter_trace_otlp_http.OTLPTraceExporter(collectorOptions); var processor = new import_sdk_trace_node.BatchSpanProcessor(exporter, { maxQueueSize: 2048, maxExportBatchSize: 512, scheduledDelayMillis: 5e3, exportTimeoutMillis: 3e4 }); var provider = new import_sdk_trace_node.NodeTracerProvider({ resource: (0, import_resources.resourceFromAttributes)({ ["service.name"]: "axiom-ai", ["service.version"]: "0.13.0" }), spanProcessors: [processor] }); provider.register(); var tracer = import_api.trace.getTracer("axiom-ai", "0.13.0"); var flush = async () => { await provider.forceFlush(); }; var startSpan = (name, opts, context2) => { return tracer.startSpan(name, opts, context2); }; // src/evals/git-info.ts var import_node_child_process = require("child_process"); function getGitUserInfo() { try { const name = (0, import_node_child_process.execSync)("git config --get user.name").toString().trim(); const email = (0, import_node_child_process.execSync)("git config --get user.email").toString().trim(); return { name, email }; } catch { return null; } } // src/evals/eval.ts var DEFAULT_TIMEOUT = 1e4; var nanoid = (0, import_nanoid.customAlphabet)("1234567890abcdefghijklmnopqrstuvwxyz", 10); var Eval = (name, params) => { registerEval(name, params).catch(console.error); }; async function registerEval(evalName, opts, vitestOpts = {}) { const describeFn = vitestOpts.modifier === "skip" ? import_vitest.describe.skip : import_vitest.describe; const datasetPromise = vitestOpts.modifier === "skip" ? Promise.resolve([]) : opts.data(); const user = getGitUserInfo(); const result = await describeFn( evalName, async () => { const dataset = await datasetPromise; const id = nanoid(); const suiteSpan = startSpan(`eval ${evalName}-${id}`, { attributes: { [Attr.GenAI.Operation.Name]: "eval", [Attr.Eval.ID]: id, [Attr.Eval.Name]: evalName, [Attr.Eval.Type]: "regression", // TODO: where to get experiment type value from? [Attr.Eval.Tags]: [], // TODO: where to get experiment tags from? [Attr.Eval.Trials]: 1, // TODO: implement trials [Attr.Eval.Collection.Name]: "unknown", // TODO: where to get dataset name from? [Attr.Eval.Collection.Split]: "unknown", // TODO: where to get dataset split value from? [Attr.Eval.Collection.Size]: dataset.length, // user info [Attr.Eval.User.Name]: user?.name, [Attr.Eval.User.Email]: user?.email } }); const suiteContext = import_api2.trace.setSpan(import_api2.context.active(), suiteSpan); (0, import_vitest.afterAll)(async () => { const tags = ["offline"]; suiteSpan.setAttribute(Attr.Eval.Tags, JSON.stringify(tags)); suiteSpan.setStatus({ code: import_api2.SpanStatusCode.OK }); suiteSpan.end(); await flush(); }); await import_vitest.it.concurrent.for(dataset.map((d, index) => ({ ...d, index })))( evalName, async (data, { task }) => { const caseName = data.name ?? `${evalName}_${data.index}`; const start = performance.now(); const caseSpan = startSpan( `case ${caseName}`, { attributes: { [Attr.GenAI.Operation.Name]: "eval.case", [Attr.Eval.Case.ID]: caseName, [Attr.Eval.Case.Index]: data.index, [Attr.Eval.Case.Input]: typeof data.input === "string" ? 
data.input : JSON.stringify(data.input), [Attr.Eval.Case.Expected]: typeof data.expected === "string" ? data.expected : JSON.stringify(data.expected), // user info ["eval.user.name"]: user?.name, ["eval.user.email"]: user?.email } }, suiteContext ); const caseContext = import_api2.trace.setSpan(import_api2.context.active(), caseSpan); try { const { output, duration } = await runTask(caseContext, { index: data.index, expected: data.expected, input: data.input, scorers: opts.scorers, task: opts.task, threshold: opts.threshold }); const scoreList = await Promise.all( opts.scorers.map(async (scorer) => { const scorerSpan = startSpan( `score ${scorer.name}`, { attributes: { [Attr.GenAI.Operation.Name]: "eval.score" } }, caseContext ); const start2 = performance.now(); const result2 = await scorer({ input: data.input, output, expected: data.expected }); const duration2 = Math.round(performance.now() - start2); const scoreValue = result2.score; const passed = scoreValue >= opts.threshold; let hasError = false; scorerSpan.setAttributes({ [Attr.Eval.Score.Name]: scorer.name, [Attr.Eval.Score.Value]: scoreValue, [Attr.Eval.Score.Threshold]: opts.threshold, [Attr.Eval.Score.Passed]: passed }); if (!passed) { hasError = `Score didn't pass`; scorerSpan.setStatus({ code: import_api2.SpanStatusCode.ERROR, message: hasError }); } else { scorerSpan.setStatus({ code: import_api2.SpanStatusCode.OK }); } scorerSpan.end(); return { ...result2, metadata: { duration: duration2, startedAt: start2, error: hasError || null } }; }) ); const scores = Object.fromEntries(scoreList.map((s) => [s.name, s])); caseSpan.setAttributes({ [Attr.Eval.Case.Output]: typeof output === "string" ? output : JSON.stringify(output), [Attr.Eval.Case.Scores]: JSON.stringify(scores) }); caseSpan.setStatus({ code: import_api2.SpanStatusCode.OK }); task.meta.eval = { index: data.index, name: evalName, expected: data.expected, input: data.input, output, scores, status: "success", errors: [], duration, startedAt: start, threshold: opts.threshold }; } catch (e) { caseSpan.recordException(e); caseSpan.setStatus({ code: import_api2.SpanStatusCode.ERROR }); task.meta.eval = { name: evalName, index: data.index, expected: data.expected, input: data.input, output: e, scores: {}, status: "fail", errors: [e], startedAt: start, duration: Math.round(performance.now() - start), threshold: opts.threshold }; throw e; } finally { caseSpan.end(); } } ); }, DEFAULT_TIMEOUT ); return result; } var joinArrayOfUnknownResults = (results) => { return results.reduce((acc, result) => { if (typeof result === "string" || typeof result === "number" || typeof result === "boolean") { return `${acc}${result}`; } throw new Error( `Cannot display results of stream: stream contains non-string, non-number, non-boolean chunks.` ); }, ""); }; var executeTask = async (task, input, expected) => { const taskResultOrStream = await task(input, expected); if (typeof taskResultOrStream === "object" && taskResultOrStream && Symbol.asyncIterator in taskResultOrStream) { const chunks = []; for await (const chunk of taskResultOrStream) { chunks.push(chunk); } return joinArrayOfUnknownResults(chunks); } return taskResultOrStream; }; var runTask = async (caseContext, opts) => { const taskName = opts.task.name ?? "anonymous"; const taskSpan = startSpan( `task`, { attributes: { [Attr.GenAI.Operation.Name]: "eval.task", [Attr.Eval.Task.Name]: taskName, [Attr.Eval.Task.Type]: "llm_completion", // TODO: How to determine task type? 
[Attr.Eval.Task.Trial]: 1 } }, caseContext ); const { output, duration } = await import_api2.context.with( import_api2.trace.setSpan(import_api2.context.active(), taskSpan), async () => { const start = performance.now(); const output2 = await executeTask(opts.task, opts.input, opts.expected); const duration2 = Math.round(performance.now() - start); taskSpan.setAttributes({ [Attr.Eval.Task.Output]: JSON.stringify(output2) }); taskSpan.setStatus({ code: import_api2.SpanStatusCode.OK }); taskSpan.end(); return { output: output2, duration: duration2 }; } ); return { output, duration }; }; // src/evals/reporter.ts var import_console_table_printer = require("console-table-printer"); var prRed = (s) => `\x1B[91m ${s}\x1B[00m`; var AxiomReporter = class { onTestSuiteReady(_testSuite) { } onTestSuiteResult(testSuite) { const scoreboard = new import_console_table_printer.Table({ title: testSuite.name }); for (const test of testSuite.children.array()) { if (test.type !== "test") continue; const testMeta = test.meta(); if (!testMeta.eval) { return; } const scores = Object.keys(testMeta.eval.scores).map((k) => { const v = testMeta.eval.scores[k].score ? testMeta.eval.scores[k].score : 0; const scoreValue = Number(v * 100).toFixed(2); const score = testMeta.eval.threshold && v < testMeta.eval.threshold ? prRed(scoreValue + "%") : scoreValue + "%"; return [k, score]; }); scoreboard.addRow({ case: testMeta.eval.index.toString(), ...Object.fromEntries(scores) }); } scoreboard.printTable(); } async onTestRunEnd(_testModules, _errors, _reason) { } }; // Annotate the CommonJS export names for ESM import in node: 0 && (module.exports = { experimental_AxiomReporter, experimental_Eval }); //# sourceMappingURL=evals.cjs.map
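The `experimental_AxiomReporter` export above is a vitest reporter that reads the `task.meta.eval` data written by `experimental_Eval` and prints a per-case score table, coloring scores below the threshold red. A hedged sketch of wiring it up follows; the import path and include pattern are assumptions, not taken from this file.

// vitest.config.ts — sketch only; adjust the import path and include pattern to your project.
import { defineConfig } from "vitest/config";
import { experimental_AxiomReporter as AxiomReporter } from "axiom/evals"; // path is an assumption

export default defineConfig({
  test: {
    include: ["**/*.eval.ts"], // hypothetical naming convention for eval files
    // Vitest accepts reporter instances; this one prints the per-case scoreboard.
    reporters: ["default", new AxiomReporter()],
  },
});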