@axiomhq/ai
Version: 0.8.0
The Axiom AI SDK provides an API to wrap your AI calls with observability instrumentation.
609 lines (600 loc) • 22.9 kB
JavaScript
"use strict";
// esbuild-generated CommonJS interop helpers (machine-written; do not hand-edit).
// Local aliases for Object reflection primitives used below.
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;
// Define each entry of `all` on `target` as an enumerable lazy getter,
// so exports can reference bindings declared later in the file.
var __export = (target, all) => {
  for (var name in all)
    __defProp(target, name, { get: all[name], enumerable: true });
};
// Copy own properties of `from` onto `to` as getters, skipping `except`
// and anything `to` already owns; preserves each source descriptor's
// enumerability (note: `desc` is assigned inside the defineProperty call).
var __copyProps = (to, from, except, desc) => {
  if (from && typeof from === "object" || typeof from === "function") {
    for (let key of __getOwnPropNames(from))
      if (!__hasOwnProp.call(to, key) && key !== except)
        __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
  }
  return to;
};
// Wrap a module namespace object for CommonJS consumption (__esModule marker).
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
// src/evals.ts
// Public (experimental) surface of the evals entry point: the vitest
// reporter and the Eval() suite-registration helper, exported lazily via
// getters so they can be defined later in this bundle.
var evals_exports = {};
__export(evals_exports, {
  experimental_AxiomReporter: () => AxiomReporter,
  experimental_Eval: () => Eval
});
module.exports = __toCommonJS(evals_exports);
// src/evals/eval.ts
var import_api2 = require("@opentelemetry/api");
var import_vitest = require("vitest");
// src/otel/semconv/attributes.ts
var import_semantic_conventions = require("@opentelemetry/semantic-conventions");
// src/otel/semconv/eval_proposal.ts
// Proposed (non-standard) semantic-convention attribute keys for eval
// telemetry: run/experiment/dataset/case/task/score namespaces.
var ATTR_EVAL_RUN_ID = "eval.run.id";
var ATTR_EVAL_EXPERIMENT_ID = "eval.experiment.id";
var ATTR_EVAL_EXPERIMENT_NAME = "eval.experiment.name";
var ATTR_EVAL_EXPERIMENT_TYPE = "eval.experiment.type";
var ATTR_EVAL_EXPERIMENT_TAGS = "eval.experiment.tags";
var ATTR_EVAL_EXPERIMENT_VERSION = "eval.experiment.version";
var ATTR_EVAL_EXPERIMENT_GROUP = "eval.experiment.group";
var ATTR_EVAL_EXPERIMENT_BASE_ID = "eval.experiment.base_id";
var ATTR_EVAL_EXPERIMENT_BASE_NAME = "eval.experiment.base_name";
var ATTR_EVAL_EXPERIMENT_TRIALS = "eval.experiment.trials";
var ATTR_EVAL_DATASET_SIZE = "eval.dataset.size";
var ATTR_EVAL_DATASET_NAME = "eval.dataset.name";
var ATTR_EVAL_DATASET_SPLIT = "eval.dataset.split";
var ATTR_EVAL_CASE_ID = "eval.case.id";
var ATTR_EVAL_CASE_INDEX = "eval.case.index";
var ATTR_EVAL_CASE_INPUT = "eval.case.input";
// Fix: this key was previously misspelled "eval.case.ouput"; emit the
// intended spelling. NOTE(review): coordinate with any stored queries or
// dashboards that may still reference the misspelled key.
var ATTR_EVAL_CASE_OUTPUT = "eval.case.output";
var ATTR_EVAL_CASE_EXPECTED = "eval.case.expected";
var ATTR_EVAL_CASE_METADATA = "eval.case.metadata";
var ATTR_EVAL_TASK_OUTPUT = "eval.task.output";
var ATTR_EVAL_TASK_NAME = "eval.task.name";
var ATTR_EVAL_TASK_TYPE = "eval.task.type";
var ATTR_EVAL_TASK_TRIAL = "eval.task.trial";
var ATTR_EVAL_SCORE_NAME = "eval.score.name";
var ATTR_EVAL_SCORE_VALUE = "eval.score.value";
var ATTR_EVAL_SCORE_THRESHOLD = "eval.score.threshold";
var ATTR_EVAL_SCORE_PASSED = "eval.score.passed";
var ATTR_EVAL_SCORE_SCORER = "eval.score.scorer";
var ATTR_EVAL_SCORE_METADATA = "eval.score.metadata";
// src/otel/semconv/semconv_incubating.ts
// Incubating OTel GenAI semantic-convention keys and well-known values,
// inlined here so the SDK does not depend on the incubating entry point.

// -- error --
const ATTR_ERROR_MESSAGE = "error.message";

// -- gen_ai.agent --
const ATTR_GEN_AI_AGENT_DESCRIPTION = "gen_ai.agent.description";
const ATTR_GEN_AI_AGENT_ID = "gen_ai.agent.id";
const ATTR_GEN_AI_AGENT_NAME = "gen_ai.agent.name";

// -- gen_ai (misc) --
const ATTR_GEN_AI_COMPLETION = "gen_ai.completion";
const ATTR_GEN_AI_CONVERSATION_ID = "gen_ai.conversation.id";
const ATTR_GEN_AI_DATA_SOURCE_ID = "gen_ai.data_source.id";

// -- gen_ai.operation --
const ATTR_GEN_AI_OPERATION_NAME = "gen_ai.operation.name";
const GEN_AI_OPERATION_NAME_VALUE_CHAT = "chat";
const GEN_AI_OPERATION_NAME_VALUE_CREATE_AGENT = "create_agent";
const GEN_AI_OPERATION_NAME_VALUE_EMBEDDINGS = "embeddings";
const GEN_AI_OPERATION_NAME_VALUE_EXECUTE_TOOL = "execute_tool";
const GEN_AI_OPERATION_NAME_VALUE_GENERATE_CONTENT = "generate_content";
const GEN_AI_OPERATION_NAME_VALUE_INVOKE_AGENT = "invoke_agent";

// -- gen_ai.output --
const ATTR_GEN_AI_OUTPUT_TYPE = "gen_ai.output.type";
const GEN_AI_OUTPUT_TYPE_VALUE_IMAGE = "image";
const GEN_AI_OUTPUT_TYPE_VALUE_JSON = "json";
const GEN_AI_OUTPUT_TYPE_VALUE_SPEECH = "speech";
const GEN_AI_OUTPUT_TYPE_VALUE_TEXT = "text";

// -- gen_ai.prompt / gen_ai.request --
const ATTR_GEN_AI_PROMPT = "gen_ai.prompt";
const ATTR_GEN_AI_REQUEST_CHOICE_COUNT = "gen_ai.request.choice.count";
const ATTR_GEN_AI_REQUEST_ENCODING_FORMATS = "gen_ai.request.encoding_formats";
const ATTR_GEN_AI_REQUEST_FREQUENCY_PENALTY = "gen_ai.request.frequency_penalty";
const ATTR_GEN_AI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens";
const ATTR_GEN_AI_REQUEST_MODEL = "gen_ai.request.model";
const ATTR_GEN_AI_REQUEST_PRESENCE_PENALTY = "gen_ai.request.presence_penalty";
const ATTR_GEN_AI_REQUEST_SEED = "gen_ai.request.seed";
const ATTR_GEN_AI_REQUEST_STOP_SEQUENCES = "gen_ai.request.stop_sequences";
const ATTR_GEN_AI_REQUEST_TEMPERATURE = "gen_ai.request.temperature";
const ATTR_GEN_AI_REQUEST_TOP_K = "gen_ai.request.top_k";
const ATTR_GEN_AI_REQUEST_TOP_P = "gen_ai.request.top_p";

// -- gen_ai.response --
const ATTR_GEN_AI_RESPONSE_FINISH_REASONS = "gen_ai.response.finish_reasons";
const ATTR_GEN_AI_RESPONSE_ID = "gen_ai.response.id";
const ATTR_GEN_AI_RESPONSE_MODEL = "gen_ai.response.model";

// -- gen_ai.system --
const ATTR_GEN_AI_SYSTEM = "gen_ai.system";
const GEN_AI_SYSTEM_VALUE_ANTHROPIC = "anthropic";
const GEN_AI_SYSTEM_VALUE_AWS_BEDROCK = "aws.bedrock";
const GEN_AI_SYSTEM_VALUE_AZURE_AI_INFERENCE = "azure.ai.inference";
const GEN_AI_SYSTEM_VALUE_AZURE_AI_OPENAI = "azure.ai.openai";
const GEN_AI_SYSTEM_VALUE_COHERE = "cohere";
const GEN_AI_SYSTEM_VALUE_DEEPSEEK = "deepseek";
const GEN_AI_SYSTEM_VALUE_GCP_GEMINI = "gcp.gemini";
const GEN_AI_SYSTEM_VALUE_GCP_GEN_AI = "gcp.gen_ai";
const GEN_AI_SYSTEM_VALUE_GCP_VERTEX_AI = "gcp.vertex_ai";
const GEN_AI_SYSTEM_VALUE_GROQ = "groq";
const GEN_AI_SYSTEM_VALUE_IBM_WATSONX_AI = "ibm.watsonx.ai";
const GEN_AI_SYSTEM_VALUE_MISTRAL_AI = "mistral_ai";
const GEN_AI_SYSTEM_VALUE_OPENAI = "openai";
const GEN_AI_SYSTEM_VALUE_PERPLEXITY = "perplexity";
const GEN_AI_SYSTEM_VALUE_XAI = "xai";

// -- gen_ai.tool / gen_ai.usage --
const ATTR_GEN_AI_TOOL_CALL_ID = "gen_ai.tool.call.id";
const ATTR_GEN_AI_TOOL_DESCRIPTION = "gen_ai.tool.description";
const ATTR_GEN_AI_TOOL_NAME = "gen_ai.tool.name";
const ATTR_GEN_AI_TOOL_TYPE = "gen_ai.tool.type";
const ATTR_GEN_AI_USAGE_INPUT_TOKENS = "gen_ai.usage.input_tokens";
const ATTR_GEN_AI_USAGE_OUTPUT_TOKENS = "gen_ai.usage.output_tokens";
// src/otel/semconv/attributes.ts
/**
 * Central registry of span attribute keys used by this SDK, grouped by
 * namespace. Values come from OTel semantic conventions where one exists
 * (semconv package or the inlined incubating keys above), otherwise from
 * Axiom-proprietary `axiom.*` keys.
 */
var Attr = {
  Axiom: {
    GenAI: {
      SchemaURL: "axiom.gen_ai.schema_url",
      SDK: {
        Name: "axiom.gen_ai.sdk.name",
        Version: "axiom.gen_ai.sdk.version"
      }
    }
  },
  GenAI: {
    PromptMetadata: {
      ID: "axiom.gen_ai.prompt.id",
      Name: "axiom.gen_ai.prompt.name",
      Slug: "axiom.gen_ai.prompt.slug",
      Version: "axiom.gen_ai.prompt.version"
    },
    /**
     * These two are used to identify the span
     */
    Capability: {
      Name: "gen_ai.capability.name"
      // proprietary to axiom-ai
    },
    Step: {
      Name: "gen_ai.step.name"
      // proprietary to axiom-ai
    },
    /**
     * Regular attributes
     */
    Agent: {
      Description: ATTR_GEN_AI_AGENT_DESCRIPTION,
      // not yet used by axiom-ai
      ID: ATTR_GEN_AI_AGENT_ID,
      // not yet used by axiom-ai
      Name: ATTR_GEN_AI_AGENT_NAME
      // not yet used by axiom-ai
    },
    Completion: ATTR_GEN_AI_COMPLETION,
    // OTel suggests to use events API for this now
    Conversation: {
      ID: ATTR_GEN_AI_CONVERSATION_ID
      // not yet used by axiom-ai, anyway probably needs to be provided by user
    },
    DataSource: {
      ID: ATTR_GEN_AI_DATA_SOURCE_ID
      // not used in axiom-ai yet
    },
    Operation: {
      Name: ATTR_GEN_AI_OPERATION_NAME,
      Name_Values: {
        /**
         * Note that "text_completion" is deprecated in favor of "chat" for both OpenAI and Anthropic
         */
        Chat: GEN_AI_OPERATION_NAME_VALUE_CHAT,
        CreateAgent: GEN_AI_OPERATION_NAME_VALUE_CREATE_AGENT,
        Embeddings: GEN_AI_OPERATION_NAME_VALUE_EMBEDDINGS,
        ExecuteTool: GEN_AI_OPERATION_NAME_VALUE_EXECUTE_TOOL,
        GenerateContent: GEN_AI_OPERATION_NAME_VALUE_GENERATE_CONTENT,
        InvokeAgent: GEN_AI_OPERATION_NAME_VALUE_INVOKE_AGENT
      }
    },
    Output: {
      Type: ATTR_GEN_AI_OUTPUT_TYPE,
      Type_Values: {
        Text: GEN_AI_OUTPUT_TYPE_VALUE_TEXT,
        Json: GEN_AI_OUTPUT_TYPE_VALUE_JSON,
        Image: GEN_AI_OUTPUT_TYPE_VALUE_IMAGE,
        Speech: GEN_AI_OUTPUT_TYPE_VALUE_SPEECH
      }
    },
    /**
     * NOTE(review): this comment originally described a "provider that is
     * hosting the model, eg AWS Bedrock" attribute with no semconv — but no
     * such key is defined here. It looks left over from a removed/planned
     * `Provider` entry; confirm and either add the key or drop the note.
     */
    Prompt: ATTR_GEN_AI_PROMPT,
    // OTel suggests to use the events api for this
    Request: {
      ChoiceCount: ATTR_GEN_AI_REQUEST_CHOICE_COUNT,
      // not yet used by axiom-ai
      EncodingFormats: ATTR_GEN_AI_REQUEST_ENCODING_FORMATS,
      // not yet used by axiom-ai
      FrequencyPenalty: ATTR_GEN_AI_REQUEST_FREQUENCY_PENALTY,
      MaxTokens: ATTR_GEN_AI_REQUEST_MAX_TOKENS,
      /**
       * The model you asked for
       */
      Model: ATTR_GEN_AI_REQUEST_MODEL,
      PresencePenalty: ATTR_GEN_AI_REQUEST_PRESENCE_PENALTY,
      Seed: ATTR_GEN_AI_REQUEST_SEED,
      StopSequences: ATTR_GEN_AI_REQUEST_STOP_SEQUENCES,
      Temperature: ATTR_GEN_AI_REQUEST_TEMPERATURE,
      TopK: ATTR_GEN_AI_REQUEST_TOP_K,
      TopP: ATTR_GEN_AI_REQUEST_TOP_P
    },
    Response: {
      FinishReasons: ATTR_GEN_AI_RESPONSE_FINISH_REASONS,
      ID: ATTR_GEN_AI_RESPONSE_ID,
      /**
       * The model that was actually used (might be different bc routing) - only ever get this from the response, otherwise omit
       */
      Model: ATTR_GEN_AI_RESPONSE_MODEL
      // somehow not landing on the span for google models? check up on this...
    },
    /**
     * From OTel docs:
     * ```
     * Multiple systems, including Azure OpenAI and Gemini, are accessible
     * by OpenAI client libraries. In such cases, the gen_ai.system is set
     * to openai based on the instrumentation's best knowledge, instead of
     * the actual system.
     * ```
     */
    System: ATTR_GEN_AI_SYSTEM,
    // not yet used by axiom-ai
    System_Values: {
      Anthropic: GEN_AI_SYSTEM_VALUE_ANTHROPIC,
      AWSBedrock: GEN_AI_SYSTEM_VALUE_AWS_BEDROCK,
      AzureAIInference: GEN_AI_SYSTEM_VALUE_AZURE_AI_INFERENCE,
      AzureAIOpenAI: GEN_AI_SYSTEM_VALUE_AZURE_AI_OPENAI,
      Cohere: GEN_AI_SYSTEM_VALUE_COHERE,
      Deepseek: GEN_AI_SYSTEM_VALUE_DEEPSEEK,
      GCPGemini: GEN_AI_SYSTEM_VALUE_GCP_GEMINI,
      GCPGenAI: GEN_AI_SYSTEM_VALUE_GCP_GEN_AI,
      GCPVertexAI: GEN_AI_SYSTEM_VALUE_GCP_VERTEX_AI,
      Groq: GEN_AI_SYSTEM_VALUE_GROQ,
      IBMWatsonxAI: GEN_AI_SYSTEM_VALUE_IBM_WATSONX_AI,
      MistralAI: GEN_AI_SYSTEM_VALUE_MISTRAL_AI,
      OpenAI: GEN_AI_SYSTEM_VALUE_OPENAI,
      Perplexity: GEN_AI_SYSTEM_VALUE_PERPLEXITY,
      XAI: GEN_AI_SYSTEM_VALUE_XAI
    },
    Tool: {
      CallID: ATTR_GEN_AI_TOOL_CALL_ID,
      Description: ATTR_GEN_AI_TOOL_DESCRIPTION,
      Name: ATTR_GEN_AI_TOOL_NAME,
      Type: ATTR_GEN_AI_TOOL_TYPE,
      /**
       * Note, OTel Semantic Convention puts these on `gen_ai.choice` events
       * @see https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-events/#event-gen_aichoice
       */
      Arguments: "gen_ai.tool.arguments",
      /**
       * Note, OTel Semantic Convention puts these on `gen_ai.tool.message` events
       */
      Message: "gen_ai.tool.message"
    },
    Usage: {
      InputTokens: ATTR_GEN_AI_USAGE_INPUT_TOKENS,
      OutputTokens: ATTR_GEN_AI_USAGE_OUTPUT_TOKENS
    }
  },
  Eval: {
    Run: {
      ID: ATTR_EVAL_RUN_ID
    },
    Experiment: {
      ID: ATTR_EVAL_EXPERIMENT_ID,
      Name: ATTR_EVAL_EXPERIMENT_NAME,
      Type: ATTR_EVAL_EXPERIMENT_TYPE,
      Version: ATTR_EVAL_EXPERIMENT_VERSION,
      Group: ATTR_EVAL_EXPERIMENT_GROUP,
      BaseID: ATTR_EVAL_EXPERIMENT_BASE_ID,
      BaseName: ATTR_EVAL_EXPERIMENT_BASE_NAME,
      Trials: ATTR_EVAL_EXPERIMENT_TRIALS,
      Tags: ATTR_EVAL_EXPERIMENT_TAGS
    },
    Dataset: {
      Name: ATTR_EVAL_DATASET_NAME,
      Split: ATTR_EVAL_DATASET_SPLIT,
      Size: ATTR_EVAL_DATASET_SIZE
    },
    Case: {
      ID: ATTR_EVAL_CASE_ID,
      Index: ATTR_EVAL_CASE_INDEX,
      Input: ATTR_EVAL_CASE_INPUT,
      Output: ATTR_EVAL_CASE_OUTPUT,
      Expected: ATTR_EVAL_CASE_EXPECTED,
      Metadata: ATTR_EVAL_CASE_METADATA
    },
    Task: {
      Output: ATTR_EVAL_TASK_OUTPUT,
      Name: ATTR_EVAL_TASK_NAME,
      Type: ATTR_EVAL_TASK_TYPE,
      Trial: ATTR_EVAL_TASK_TRIAL
    },
    Score: {
      Name: ATTR_EVAL_SCORE_NAME,
      Value: ATTR_EVAL_SCORE_VALUE,
      Threshold: ATTR_EVAL_SCORE_THRESHOLD,
      Passed: ATTR_EVAL_SCORE_PASSED,
      Scorer: ATTR_EVAL_SCORE_SCORER,
      Metadata: ATTR_EVAL_SCORE_METADATA
    }
  },
  Error: {
    Type: import_semantic_conventions.ATTR_ERROR_TYPE,
    Message: ATTR_ERROR_MESSAGE
  },
  HTTP: {
    Response: {
      StatusCode: import_semantic_conventions.ATTR_HTTP_RESPONSE_STATUS_CODE
    }
  }
};
// src/evals/instrument.ts
var import_sdk_trace_node = require("@opentelemetry/sdk-trace-node");
var import_resources = require("@opentelemetry/resources");
var import_exporter_trace_otlp_http = require("@opentelemetry/exporter-trace-otlp-http");
var import_api = require("@opentelemetry/api");
// OTLP/HTTP exporter configuration for shipping eval traces to Axiom.
var collectorOptions = {
  // Endpoint: AXIOM_URL override (self-hosted/regional) or the Axiom cloud API.
  url: process.env.AXIOM_URL ? `${process.env.AXIOM_URL}/v1/traces` : "https://api.axiom.co/v1/traces",
  headers: {
    // NOTE(review): if AXIOM_TOKEN is unset this literally sends
    // "Bearer undefined" — consider failing fast instead; confirm intent.
    Authorization: `Bearer ${process.env.AXIOM_TOKEN}`,
    // Dataset to ingest into; empty string when AXIOM_DATASET is unset.
    "X-Axiom-Dataset": process.env.AXIOM_DATASET || ""
  },
  // Optional cap on pending export requests.
  concurrencyLimit: 10
};
// Exporter that sends finished spans to the collector endpoint above.
var exporter = new import_exporter_trace_otlp_http.OTLPTraceExporter(collectorOptions);
// Batch spans before exporting: queue up to 2048, send up to 512 per
// request, flush on a 5s schedule, abandon an export after 30s.
var processor = new import_sdk_trace_node.BatchSpanProcessor(exporter, {
  maxQueueSize: 2048,
  maxExportBatchSize: 512,
  scheduledDelayMillis: 5e3,
  exportTimeoutMillis: 3e4
});
// Tracer provider identifying this SDK as service "axiom-ai" v0.8.0.
var provider = new import_sdk_trace_node.NodeTracerProvider({
  resource: (0, import_resources.resourceFromAttributes)({
    ["service.name"]: "axiom-ai",
    ["service.version"]: "0.8.0"
  }),
  spanProcessors: [processor]
});
// Registers the provider globally — a module-load side effect.
provider.register();
var tracer = import_api.trace.getTracer("axiom-ai", "0.8.0");
// Force-flush buffered spans; awaited at the end of each eval suite.
var flush = async () => {
  await provider.forceFlush();
};
// Thin wrapper so callers don't need a handle on the tracer itself.
var startSpan = (name, opts, context2) => {
  return tracer.startSpan(name, opts, context2);
};
// src/evals/eval.ts
// Per-suite timeout handed to vitest's describe(), in milliseconds (10s).
var DEFAULT_TIMEOUT = 1e4;
var generateExperimentId = () => {
return crypto.randomUUID();
};
/**
 * Public entry point: define an eval suite by name.
 * Registration is fire-and-forget — failures are logged rather than
 * thrown, so importing an eval file never crashes the test runner.
 */
var Eval = (name, params) => {
  registerEval(name, params).catch((err) => console.error(err));
};
/**
 * Register an eval suite with vitest.
 *
 * Produces one `describe` block per eval; each dataset case becomes a
 * concurrent `it`. Telemetry is recorded as a span tree:
 * suite span (`eval <name>`) > case spans > task and score spans.
 *
 * @param evalName suite name; also used in case ids and span names
 * @param opts eval definition: `data()` loads the dataset, `task` produces
 *   an output per case, `scorers` grade it, `threshold` is the pass cutoff
 * @param vitestOpts pass `{ modifier: "skip" }` to register a skipped suite
 * @returns whatever vitest's describe() returns
 */
async function registerEval(evalName, opts, vitestOpts = {}) {
  const describeFn = vitestOpts.modifier === "skip" ? import_vitest.describe.skip : import_vitest.describe;
  // Skipped suites never load the dataset.
  const datasetPromise = vitestOpts.modifier === "skip" ? Promise.resolve([]) : opts.data();
  const result = await describeFn(
    evalName,
    async () => {
      const dataset = await datasetPromise;
      // Root span for the whole suite; ended in afterAll below.
      const suiteSpan = startSpan(`eval ${evalName}`, {
        attributes: {
          [Attr.GenAI.Operation.Name]: "eval",
          [Attr.Eval.Experiment.ID]: generateExperimentId(),
          [Attr.Eval.Experiment.Name]: evalName,
          [Attr.Eval.Experiment.Type]: "regression",
          // TODO: where to get experiment type value from?
          [Attr.Eval.Experiment.Tags]: [],
          // TODO: where to get experiment tags from?
          [Attr.Eval.Experiment.Version]: "1.0.0",
          // TODO: where to get experiment version from?
          // [Attr.Eval.Experiment.Group]: "default", // TODO: where to get experiment group from?
          // [Attr.Eval.Experiment.BaseID]: "default", // TODO: where to get experiment base id from?
          // [Attr.Eval.Experiment.BaseName]: "default", // TODO: where to get experiment base name from?
          [Attr.Eval.Experiment.Trials]: 1,
          // TODO: implement trials
          [Attr.Eval.Dataset.Name]: "test",
          // TODO: where to get dataset name from?
          [Attr.Eval.Dataset.Split]: "test",
          // TODO: where to get dataset split value from?
          [Attr.Eval.Dataset.Size]: dataset.length
        }
      });
      const suiteContext = import_api2.trace.setSpan(import_api2.context.active(), suiteSpan);
      (0, import_vitest.afterAll)(async () => {
        suiteSpan.setStatus({ code: import_api2.SpanStatusCode.OK });
        suiteSpan.end();
        // Push buffered spans to the collector before vitest exits.
        await flush();
      });
      await import_vitest.it.concurrent.for(dataset.map((d, index) => ({ ...d, index })))(
        evalName,
        async (data, { task }) => {
          const start = performance.now();
          // One span per dataset case, parented to the suite span.
          const caseSpan = startSpan(
            `case ${evalName}_${data.index}`,
            {
              attributes: {
                [Attr.GenAI.Operation.Name]: "eval.case",
                [Attr.Eval.Case.ID]: `${evalName}_${data.index}`,
                [Attr.Eval.Case.Index]: data.index,
                [Attr.Eval.Case.Input]: data.input,
                [Attr.Eval.Case.Expected]: data.expected
              }
            },
            suiteContext
          );
          const caseContext = import_api2.trace.setSpan(import_api2.context.active(), caseSpan);
          try {
            const { output, duration } = await runTask(caseContext, {
              index: data.index,
              expected: data.expected,
              input: data.input,
              scorers: opts.scorers,
              task: opts.task,
              threshold: opts.threshold
            });
            // All scorers run in parallel, each inside its own span.
            const scores = await Promise.all(
              opts.scorers.map(async (scorer) => {
                const scorerSpan = startSpan(
                  `score ${scorer.name}`,
                  {
                    attributes: {
                      [Attr.GenAI.Operation.Name]: "eval.score"
                    }
                  },
                  caseContext
                );
                if (typeof scorer !== "function") {
                  // Fix: a non-function scorer previously hit an empty else
                  // branch and left its span open forever; end it with an
                  // error status and yield no score (undefined, as before).
                  scorerSpan.recordException(new Error(`Scorer is not a function`));
                  scorerSpan.setStatus({ code: import_api2.SpanStatusCode.ERROR });
                  scorerSpan.end();
                  return void 0;
                }
                const start2 = performance.now();
                const result2 = await scorer({
                  input: data.input,
                  output,
                  expected: data.expected
                });
                const duration2 = Math.round(performance.now() - start2);
                const scoreValue = result2.score;
                const passed = scoreValue >= opts.threshold;
                scorerSpan.setAttributes({
                  [Attr.Eval.Score.Name]: scorer.name,
                  [Attr.Eval.Score.Value]: scoreValue,
                  [Attr.Eval.Score.Threshold]: opts.threshold,
                  [Attr.Eval.Score.Passed]: passed
                });
                if (!passed) {
                  scorerSpan.recordException(new Error(`Score did not pass`));
                  scorerSpan.setStatus({ code: import_api2.SpanStatusCode.ERROR });
                } else {
                  scorerSpan.setStatus({ code: import_api2.SpanStatusCode.OK });
                }
                scorerSpan.end();
                return { ...result2, duration: duration2, startedAt: start2 };
              })
            );
            caseSpan.setAttributes({
              [Attr.Eval.Case.Output]: output
              // TODO: what if output is other than a string?,
            });
            caseSpan.setStatus({ code: import_api2.SpanStatusCode.OK });
            // Stash results on vitest task meta so AxiomReporter can read them.
            task.meta.eval = {
              order: data.index,
              name: evalName,
              expected: data.expected,
              input: data.input,
              output,
              scores,
              status: "success",
              errors: [],
              duration,
              startedAt: start,
              threshold: opts.threshold
            };
          } catch (e) {
            caseSpan.recordException(e);
            caseSpan.setStatus({ code: import_api2.SpanStatusCode.ERROR });
            task.meta.eval = {
              name: evalName,
              order: data.index,
              expected: data.expected,
              input: data.input,
              output: e,
              scores: [],
              status: "fail",
              errors: [e],
              startedAt: start,
              duration: Math.round(performance.now() - start),
              threshold: opts.threshold
            };
            // Rethrow so vitest marks the case failed.
            throw e;
          } finally {
            caseSpan.end();
          }
        }
      );
    },
    DEFAULT_TIMEOUT
  );
  return result;
}
/**
 * Concatenate stream chunks into a single string.
 * Only string/number/boolean chunks are renderable; anything else fails
 * loudly instead of producing "[object Object]"-style output.
 */
var joinArrayOfUnknownResults = (results) => {
  let joined = "";
  for (const chunk of results) {
    const kind = typeof chunk;
    if (kind !== "string" && kind !== "number" && kind !== "boolean") {
      throw new Error(
        `Cannot display results of stream: stream contains non-string, non-number, non-boolean chunks.`
      );
    }
    joined += `${chunk}`;
  }
  return joined;
};
/**
 * Run a task and normalize its result: a plain (or promised) value is
 * returned as-is, while an async-iterable stream is fully drained and its
 * chunks joined into one string.
 */
var executeTask = async (task, input, expected) => {
  const result = await task(input, expected);
  const isAsyncIterable = typeof result === "object" && result !== null && Symbol.asyncIterator in result;
  if (!isAsyncIterable) {
    return result;
  }
  const chunks = [];
  for await (const chunk of result) {
    chunks.push(chunk);
  }
  return joinArrayOfUnknownResults(chunks);
};
/**
 * Execute the eval task inside a dedicated `eval.task` span, parented to
 * the case span via `caseContext`, and time it.
 *
 * @param caseContext OTel context carrying the case span
 * @param opts per-case options; `task`, `input`, `expected` are used here
 * @returns `{ output, duration }` — task result and elapsed ms (rounded)
 * @throws rethrows any task failure after recording it on the span
 */
var runTask = async (caseContext, opts) => {
  const taskName = opts.task.name ?? "anonymous";
  const taskSpan = startSpan(
    `task`,
    {
      attributes: {
        [Attr.GenAI.Operation.Name]: "eval.task",
        [Attr.Eval.Task.Name]: taskName,
        [Attr.Eval.Task.Type]: "llm_completion",
        // TODO: How to determine task type?
        [Attr.Eval.Task.Trial]: 1
      }
    },
    caseContext
  );
  // Run with the task span active so nested AI-call spans parent correctly.
  return import_api2.context.with(
    import_api2.trace.setSpan(import_api2.context.active(), taskSpan),
    async () => {
      const start = performance.now();
      try {
        const output = await executeTask(opts.task, opts.input, opts.expected);
        const duration = Math.round(performance.now() - start);
        taskSpan.setAttributes({
          [Attr.Eval.Task.Output]: output
          // TODO: what if output is other than a string?,
        });
        taskSpan.setStatus({ code: import_api2.SpanStatusCode.OK });
        return { output, duration };
      } catch (e) {
        // Fix: a throwing task previously left the span open and unreported;
        // record the failure and rethrow so the case handler sees it.
        taskSpan.recordException(e);
        taskSpan.setStatus({ code: import_api2.SpanStatusCode.ERROR });
        throw e;
      } finally {
        taskSpan.end();
      }
    }
  );
};
// src/evals/reporter.ts
/**
 * Vitest reporter that collects per-test eval metadata (written to
 * `task.meta.eval` by registerEval) once a suite finishes.
 */
var AxiomReporter = class {
  /** Vitest hook: suite collected — currently a no-op. */
  onTestSuiteReady(_testSuite) {
  }
  /** Vitest hook: gather scores from each finished test in the suite. */
  onTestSuiteResult(testSuite) {
    for (const test of testSuite.children.array()) {
      if (test.type !== "test") continue;
      const testMeta = test.meta();
      if (!testMeta.eval) {
        // Fix: this was `return`, which aborted the whole suite on the
        // first non-eval test; skip just this test instead.
        continue;
      }
      const scores = [];
      for (const [, score] of Object.entries(testMeta.eval.scores)) {
        scores.push({ name: score.name, score: score.score });
      }
      // TODO: `scores` is collected but not yet submitted anywhere —
      // presumably destined for the Axiom API; confirm intended sink.
    }
  }
  /** Vitest hook: run finished — currently a no-op. */
  async onTestRunEnd(_testModules, _errors, _reason) {
  }
};
// Annotate the CommonJS export names for ESM import in node:
// (dead code by design — `0 &&` never executes; it only lets Node's CJS
// named-export static analysis discover these identifiers)
0 && (module.exports = {
  experimental_AxiomReporter,
  experimental_Eval
});
//# sourceMappingURL=evals.cjs.map