/**
 * axiom: Axiom AI SDK
 *
 * Provides an API to wrap your AI calls with observability instrumentation,
 * plus offline and online evals.
 */
import { R as ResolvedAxiomConfig } from './config-6PyyriW8.js';
import { c as ScoreWithName, d as ScorerLike, V as ValidateName } from './name-validation.d-BKPGh6r3.js';
import { SerializedError } from 'vitest';
import { Reporter, TestSuite, TestCase, TestModule, TestRunEndReason } from 'vitest/node';
import { c as createAppScope } from './app-scope-BgNUnFZY.js';
import '@opentelemetry/api';
import 'zod';
import './scorers/aggregations.js';
/**
* Function type for evaluation tasks that process input data and produce output.
*
* Used with {@link EvalParams} to define the task that will be evaluated against a dataset.
* The task output will be scored by functions defined in {@link EvalParams.scorers}.
 *
* @param input - The input data to process
* @param expected - The expected output for comparison/validation
 * @returns The task output, a Promise resolving to it, or an AsyncIterable for streaming output
*
* @example
* ```typescript
* const textGenerationTask: EvalTask<string, string, string> = async ({ input, expected }) => {
* const result = await generateText({
* model: myModel,
* prompt: input
* });
* return result.text;
* };
* ```
*/
type EvalTask<TInput, TExpected, TOutput> = (args: {
input: TInput;
expected: TExpected;
}) => TOutput | Promise<TOutput> | AsyncIterable<TOutput>;
/**
* Record type representing a single data point in an evaluation dataset.
*
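 * @example
 * ```typescript
 * // Illustrative values only.
 * const record: CollectionRecord<string, string> = {
 *   input: 'What is the capital of France?',
 *   expected: 'Paris',
 *   metadata: { category: 'geography' },
 * };
 * ```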
*/
type CollectionRecord<TInput, TExpected> = {
/** The input data for the evaluation case */
input: TInput;
/** The expected output for comparison/validation */
expected: TExpected;
/** Optional metadata for the record */
metadata?: Record<string, unknown>;
};
/**
* Configuration parameters for running an evaluation.
*
* Used with {@link Eval} to define how an evaluation should be executed.
* Results are captured in {@link EvalReport} format.
*
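 * @example
 * ```typescript
 * // Minimal sketch; `summarize` and `similarityScorer` are illustrative placeholders.
 * const params: EvalParams<string, string, string> = {
 *   capability: 'summarization',
 *   data: [{ input: 'Long article text...', expected: 'Short summary...' }],
 *   task: async ({ input }) => summarize(input),
 *   scorers: [similarityScorer],
 *   trials: 3,
 * };
 * ```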
*/
type EvalParams<TInput, TExpected, TOutput> = {
/** Dataset with input/expected pairs for evaluation, or a function that returns one */
data: readonly CollectionRecord<TInput, TExpected>[] | Promise<readonly CollectionRecord<TInput, TExpected>[]> | (() => readonly CollectionRecord<TInput, TExpected>[] | Promise<readonly CollectionRecord<TInput, TExpected>[]>);
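    /** Name of the capability this evaluation targets */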
capability: string;
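    /** Optional step within the capability */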
step?: string | undefined;
/** The task function to evaluate */
task: EvalTask<TInput, TExpected, TOutput>;
/** Array of scoring functions to evaluate the task output */
scorers: ReadonlyArray<ScorerLike<TInput, TExpected, TOutput>>;
/** Optional metadata for the evaluation */
metadata?: Record<string, unknown>;
/** Optional timeout in milliseconds for task execution */
timeout?: number;
    /** Optional list of flag keys in scope for this evaluation, narrowing the flag namespace */
configFlags?: string[];
/**
* Number of times to run each case. Defaults to 1.
* Each trial runs the task independently, and scores are aggregated per scorer.
*/
trials?: number;
};
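/**
 * Log entry describing how a runtime flag value was resolved:
 * either introduced at runtime, or replaced relative to its default.
 */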
type RuntimeFlagLog = {
kind: 'introduced';
value: unknown;
} | {
kind: 'replaced';
value: unknown;
default: unknown;
};
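/** Map of flag keys to their runtime resolution log entries. */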
type RuntimeFlagMap = Record<string, RuntimeFlagLog>;
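/**
 * Summary of a completed evaluation run, including baseline reference,
 * collection stats, prompt configuration, and per-case results.
 */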
type Evaluation = {
id: string;
name: string;
type: string;
version: string;
baseline: {
id: string | undefined;
name: string | undefined;
};
collection: {
name: string;
size: number;
};
prompt: {
model: string;
params: Record<string, unknown>;
};
duration: number;
status: string;
traceId: string;
runAt: string;
tags: string[];
user: {
name: string | undefined;
email: string | undefined;
};
cases: Case[];
flagConfig?: Record<string, any>;
};
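/** Result data for a single evaluation case, including scores and trace identifiers. */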
type Case = {
index: number;
input: string;
output: string;
expected: string;
duration: string;
status: string;
scores: Record<string, {
name: string;
value: number;
metadata: Record<string, any>;
/** Per-trial scores when running multiple trials */
trials?: number[];
/** Aggregation type used (e.g., 'mean', 'pass@k') */
aggregation?: string;
/** Threshold for pass-based aggregations */
threshold?: number;
}>;
/** Number of trials run for this case */
trials?: number;
runAt: string;
spanId: string;
traceId: string;
task?: Task;
runtimeFlags?: RuntimeFlagMap;
};
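/** Model-call telemetry for a task: request parameters, response finish reasons, and token usage. */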
type Chat = {
operation: string;
capability: string;
step: string;
request: {
max_token: string;
model: string;
temperature: number;
};
response: {
finish_reasons: string;
};
usage: {
input_tokens: number;
output_tokens: number;
};
};
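/** Execution details for a single task trial, including its output and the associated model call. */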
type Task = {
name: string;
output: string;
trial: number;
type: string;
error?: string;
chat: Chat;
};
/**
* Complete report for a single evaluation case including results and metadata.
*
* Generated for each test case when running {@link Eval} with {@link EvalParams}.
* Contains all {@link Score} results and execution metadata.
*
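 * @example
 * ```typescript
 * // Sketch: pick out failed cases from collected reports
 * // (`reports` is assumed to be an EvalCaseReport[]).
 * const failures = reports.filter((report) => report.status === 'fail');
 * ```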
*/
type EvalCaseReport = {
/** Order/index of this case in the evaluation suite */
index: number;
/** Name of the evaluation */
name: string;
/** Input data that was provided to the {@link EvalTask} */
input: string | Record<string, any>;
/** Output produced by the {@link EvalTask}; undefined when all trials fail before producing output */
output: string | Record<string, any> | undefined;
/** Expected output for comparison */
expected: string | Record<string, any>;
/** Optional metadata for the case */
metadata?: Record<string, any>;
/** Array of {@link Score} results from all scorers that were run */
scores: Record<string, ScoreWithName>;
/** Any errors that occurred during evaluation */
errors: Error[] | null;
/** Status of the evaluation case */
status: 'success' | 'fail' | 'pending';
/** Per-trial errors in order (null for successful trials) */
trialErrors?: Array<string | null>;
/** Trial summary stats for reporting */
trialSummary?: {
total: number;
succeeded: number;
failed: number;
};
/** Duration in milliseconds for the entire case */
duration: number | undefined;
/** Timestamp when the case started */
startedAt: number | undefined;
/** Flags accessed outside of the picked flags scope for this case */
outOfScopeFlags?: OutOfScopeFlagAccess[];
/** Flags that are in scope for this evaluation */
pickedFlags?: string[];
/** Runtime flags actually used during this case */
runtimeFlags?: RuntimeFlagMap;
};
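/** A single access to a flag outside the picked-flags scope. */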
type OutOfScopeFlagAccess = {
flagPath: string;
accessedAt: number;
stackTrace: string[];
};
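/** Aggregated accesses to a single out-of-scope flag across a run. */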
type OutOfScopeFlag = {
flagPath: string;
count: number;
firstAccessedAt: number;
lastAccessedAt: number;
stackTrace: string[];
};
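/** Outcome of registering the evaluation run: success, or failure with an error message. */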
type RegistrationStatus = {
status: 'success';
} | {
status: 'failed';
error: string;
};
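/**
 * Suite-level report for an evaluation run, including flag configuration
 * and a summary of out-of-scope flag usage.
 */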
type EvaluationReport = {
id: string;
name: string;
version: string;
runId: string;
orgId?: string;
baseline: Evaluation | undefined;
/** Flags that are in scope for this evaluation */
configFlags?: string[];
/** Full flag configuration for this evaluation run */
flagConfig?: Record<string, any>;
/** Summary of all flags accessed outside of picked flags scope across all cases */
outOfScopeFlags?: OutOfScopeFlag[];
/** End-of-suite config snapshot for console printing only */
configEnd?: {
flags?: Record<string, any>;
pickedFlags?: string[];
overrides?: Record<string, any>;
};
registrationStatus?: RegistrationStatus;
/** Number of trials per case (only shown if > 1) */
trials?: number;
};
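/**
 * Vitest module augmentation: attaches evaluation reports to suite and task
 * metadata, and declares the context provided to evaluation runs.
 */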
declare module 'vitest' {
interface TestSuiteMeta {
evaluation: EvaluationReport;
}
interface TaskMeta {
case: EvalCaseReport;
evaluation: EvaluationReport;
}
interface ProvidedContext {
baseline?: string;
debug?: boolean;
list?: boolean;
overrides?: Record<string, any>;
axiomConfig?: ResolvedAxiomConfig;
runId: string;
consoleUrl?: string;
}
}
/**
* Creates and registers an evaluation suite with the given name and parameters.
*
* This function sets up a complete evaluation pipeline that will run your {@link EvalTask}
* against a collection, score the results, and provide detailed {@link EvalCaseReport} reporting.
 *
* @param name - Human-readable name for the evaluation suite
* @param params - {@link EvalParams} configuration parameters for the evaluation
*
* @example
* ```typescript
* import { Eval } from 'axiom/ai/evals';
*
* Eval('Text Generation Quality', {
* capability: 'capability-name',
* data: async () => [
* { input: 'Explain photosynthesis', expected: 'Plants convert light to energy...' },
* { input: 'What is gravity?', expected: 'Gravity is a fundamental force...' }
* ],
* task: async ({ input }) => {
* const result = await generateText({
* model: yourModel,
* prompt: input
* });
* return result.text;
* },
* scorers: [similarityScorer, factualAccuracyScorer],
* });
* ```
*/
declare function Eval<TInput, TExpected, TOutput, Name extends string = string, Capability extends string = string, Step extends string = string>(name: ValidateName<Name>, params: Omit<EvalParams<TInput, TExpected, TOutput>, 'capability' | 'step' | 'scorers'> & {
capability: ValidateName<Capability>;
step?: ValidateName<Step> | undefined;
scorers: ReadonlyArray<ScorerLike<NoInfer<TInput>, NoInfer<TExpected>, TOutput>>;
}): void;
/**
* Custom Vitest reporter for Axiom AI evaluations.
*
* This reporter collects evaluation results and scores from tests
* and processes them for further analysis and reporting.
*
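 * @example
 * ```typescript
 * // One possible wiring; assumes a standard Vitest setup rather than
 * // any Axiom-specific runner configuration.
 * import { defineConfig } from 'vitest/config';
 * import { AxiomReporter } from 'axiom/ai/evals';
 *
 * export default defineConfig({
 *   test: {
 *     reporters: ['default', new AxiomReporter()],
 *   },
 * });
 * ```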
*/
declare class AxiomReporter implements Reporter {
startTime: number;
start: number;
private _endOfRunConfigEnd;
private _suiteData;
private _printedFlagOverrides;
private _config;
onTestRunStart(): void;
onTestSuiteReady(_testSuite: TestSuite): Promise<void>;
onTestCaseReady(test: TestCase): void;
onTestSuiteResult(testSuite: TestSuite): Promise<void>;
onTestRunEnd(_testModules: ReadonlyArray<TestModule>, _errors: ReadonlyArray<SerializedError>, _reason: TestRunEndReason): Promise<void>;
}
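/**
 * Context threaded through an evaluation run: flags, facts, config scope,
 * and flag-access tracking. Contexts may nest via `parent`.
 */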
interface EvalContextData<Flags = any, Facts = any> {
flags: Partial<Flags>;
facts: Partial<Facts>;
configScope?: ReturnType<typeof createAppScope>;
pickedFlags?: string[];
outOfScopeFlags?: OutOfScopeFlagAccess[];
parent?: EvalContextData<Flags, Facts>;
overrides?: Record<string, any>;
accessedFlagKeys?: string[];
}
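/**
 * Fluent builder for configuring and running an evaluation.
 *
 * @example
 * ```typescript
 * // Illustrative chaining only; how a builder instance is obtained
 * // is outside the scope of this file.
 * builder
 *   .withFlags({ useNewPrompt: true })
 *   .withModel('gpt-4o-mini')
 *   .withTrials(3)
 *   .run('new-prompt-variant');
 * ```
 */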
interface EvalBuilder<AllowedFlags extends Record<string, any> = {}, TInput extends string | Record<string, any> = string, TExpected extends string | Record<string, any> = string, TOutput extends string | Record<string, any> = string> {
withFlags<F extends Partial<AllowedFlags>>(flags: F): EvalBuilder<AllowedFlags, TInput, TExpected, TOutput>;
withModel(model: string): EvalBuilder<AllowedFlags, TInput, TExpected, TOutput>;
withTimeout(timeout: number): EvalBuilder<AllowedFlags, TInput, TExpected, TOutput>;
/**
* Set the number of times to run each case.
* Each trial runs the task independently, and scores are aggregated per scorer.
*/
withTrials(trials: number): EvalBuilder<AllowedFlags, TInput, TExpected, TOutput>;
run(suffix?: string): void;
}
export { AxiomReporter, type Case, type Chat, Eval, type EvalBuilder, type EvalContextData, type EvalParams, type EvalTask, type Evaluation, type Task };