
axiom

Axiom AI SDK provides:
- an API to wrap your AI calls with observability instrumentation
- offline evals
- online evals

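Before the evals declarations below, a minimal sketch of the instrumentation side that the description refers to. This assumes the SDK's `wrapAISDKModel` and `withSpan` helpers from `axiom/ai` together with the Vercel AI SDK's `generateText`; those names are not part of the declaration file shown here and are included only as an illustration.

```typescript
// Sketch only: wrapAISDKModel / withSpan are assumed from the axiom/ai
// instrumentation API; they are not declared in this evals .d.ts file.
import { generateText } from 'ai';
import { openai } from '@ai-sdk/openai';
import { wrapAISDKModel, withSpan } from 'axiom/ai';

// Wrap the model so calls made through it are instrumented with OpenTelemetry spans.
const model = wrapAISDKModel(openai('gpt-4o-mini'));

// Attribute the call to a capability/step, mirroring EvalParams.capability and step below.
const result = await withSpan({ capability: 'support-bot', step: 'answer' }, () =>
  generateText({ model, prompt: 'Explain photosynthesis' }),
);
```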
import { R as ResolvedAxiomConfig } from './config-6PyyriW8.js';
import { c as ScoreWithName, d as ScorerLike, V as ValidateName } from './name-validation.d-BKPGh6r3.js';
import { SerializedError } from 'vitest';
import { Reporter, TestSuite, TestCase, TestModule, TestRunEndReason } from 'vitest/node';
import { c as createAppScope } from './app-scope-BgNUnFZY.js';
import '@opentelemetry/api';
import 'zod';
import './scorers/aggregations.js';

/**
 * Function type for evaluation tasks that process input data and produce output.
 *
 * Used with {@link EvalParams} to define the task that will be evaluated against a dataset.
 * The task output will be scored by functions defined in {@link EvalParams.scorers}.
 *
 * @param input - The input data to process
 * @param expected - The expected output for comparison/validation
 * @returns The task output, Promise, or AsyncIterable for streaming
 *
 * @example
 * ```typescript
 * const textGenerationTask: EvalTask<string, string, string> = async ({ input, expected }) => {
 *   const result = await generateText({
 *     model: myModel,
 *     prompt: input
 *   });
 *   return result.text;
 * };
 * ```
 */
type EvalTask<TInput, TExpected, TOutput> = (args: {
    input: TInput;
    expected: TExpected;
}) => TOutput | Promise<TOutput> | AsyncIterable<TOutput>;
/**
 * Record type representing a single data point in an evaluation dataset.
 */
type CollectionRecord<TInput, TExpected> = {
    /** The input data for the evaluation case */
    input: TInput;
    /** The expected output for comparison/validation */
    expected: TExpected;
    /** Optional metadata for the record */
    metadata?: Record<string, unknown>;
};
/**
 * Configuration parameters for running an evaluation.
 *
 * Used with {@link Eval} to define how an evaluation should be executed.
 * Results are captured in {@link EvalReport} format.
 */
type EvalParams<TInput, TExpected, TOutput> = {
    /** Dataset with input/expected pairs for evaluation, or a function that returns one */
    data: readonly CollectionRecord<TInput, TExpected>[] | Promise<readonly CollectionRecord<TInput, TExpected>[]> | (() => readonly CollectionRecord<TInput, TExpected>[] | Promise<readonly CollectionRecord<TInput, TExpected>[]>);
    capability: string;
    step?: string | undefined;
    /** The task function to evaluate */
    task: EvalTask<TInput, TExpected, TOutput>;
    /** Array of scoring functions to evaluate the task output */
    scorers: ReadonlyArray<ScorerLike<TInput, TExpected, TOutput>>;
    /** Optional metadata for the evaluation */
    metadata?: Record<string, unknown>;
    /** Optional timeout in milliseconds for task execution */
    timeout?: number;
    /** Optional reduction of flag namespace */
    configFlags?: string[];
    /**
     * Number of times to run each case. Defaults to 1.
     * Each trial runs the task independently, and scores are aggregated per scorer.
     */
    trials?: number;
};
type RuntimeFlagLog = {
    kind: 'introduced';
    value: unknown;
} | {
    kind: 'replaced';
    value: unknown;
    default: unknown;
};
type RuntimeFlagMap = Record<string, RuntimeFlagLog>;
type Evaluation = {
    id: string;
    name: string;
    type: string;
    version: string;
    baseline: {
        id: string | undefined;
        name: string | undefined;
    };
    collection: {
        name: string;
        size: number;
    };
    prompt: {
        model: string;
        params: Record<string, unknown>;
    };
    duration: number;
    status: string;
    traceId: string;
    runAt: string;
    tags: string[];
    user: {
        name: string | undefined;
        email: string | undefined;
    };
    cases: Case[];
    flagConfig?: Record<string, any>;
};
type Case = {
    index: number;
    input: string;
    output: string;
    expected: string;
    duration: string;
    status: string;
    scores: Record<string, {
        name: string;
        value: number;
        metadata: Record<string, any>;
        /** Per-trial scores when running multiple trials */
        trials?: number[];
        /** Aggregation type used (e.g., 'mean', 'pass@k') */
        aggregation?: string;
        /** Threshold for pass-based aggregations */
        threshold?: number;
    }>;
    /** Number of trials run for this case */
    trials?: number;
    runAt: string;
    spanId: string;
    traceId: string;
    task?: Task;
    runtimeFlags?: RuntimeFlagMap;
};
type Chat = {
    operation: string;
    capability: string;
    step: string;
    request: {
        max_token: string;
        model: string;
        temperature: number;
    };
    response: {
        finish_reasons: string;
    };
    usage: {
        input_tokens: number;
        output_tokens: number;
    };
};
type Task = {
    name: string;
    output: string;
    trial: number;
    type: string;
    error?: string;
    chat: Chat;
};
/**
 * Complete report for a single evaluation case including results and metadata.
 *
 * Generated for each test case when running {@link Eval} with {@link EvalParams}.
 * Contains all {@link Score} results and execution metadata.
 */
type EvalCaseReport = {
    /** Order/index of this case in the evaluation suite */
    index: number;
    /** Name of the evaluation */
    name: string;
    /** Input data that was provided to the {@link EvalTask} */
    input: string | Record<string, any>;
    /** Output produced by the {@link EvalTask}; undefined when all trials fail before producing output */
    output: string | Record<string, any> | undefined;
    /** Expected output for comparison */
    expected: string | Record<string, any>;
    /** Optional metadata for the case */
    metadata?: Record<string, any>;
    /** Array of {@link Score} results from all scorers that were run */
    scores: Record<string, ScoreWithName>;
    /** Any errors that occurred during evaluation */
    errors: Error[] | null;
    /** Status of the evaluation case */
    status: 'success' | 'fail' | 'pending';
    /** Per-trial errors in order (null for successful trials) */
    trialErrors?: Array<string | null>;
    /** Trial summary stats for reporting */
    trialSummary?: {
        total: number;
        succeeded: number;
        failed: number;
    };
    /** Duration in milliseconds for the entire case */
    duration: number | undefined;
    /** Timestamp when the case started */
    startedAt: number | undefined;
    /** Flags accessed outside of the picked flags scope for this case */
    outOfScopeFlags?: OutOfScopeFlagAccess[];
    /** Flags that are in scope for this evaluation */
    pickedFlags?: string[];
    /** Runtime flags actually used during this case */
    runtimeFlags?: RuntimeFlagMap;
};
type OutOfScopeFlagAccess = {
    flagPath: string;
    accessedAt: number;
    stackTrace: string[];
};
type OutOfScopeFlag = {
    flagPath: string;
    count: number;
    firstAccessedAt: number;
    lastAccessedAt: number;
    stackTrace: string[];
};
type RegistrationStatus = {
    status: 'success';
} | {
    status: 'failed';
    error: string;
};
type EvaluationReport = {
    id: string;
    name: string;
    version: string;
    runId: string;
    orgId?: string;
    baseline: Evaluation | undefined;
    /** Flags that are in scope for this evaluation */
    configFlags?: string[];
    /** Full flag configuration for this evaluation run */
    flagConfig?: Record<string, any>;
    /** Summary of all flags accessed outside of picked flags scope across all cases */
    outOfScopeFlags?: OutOfScopeFlag[];
    /** End-of-suite config snapshot for console printing only */
    configEnd?: {
        flags?: Record<string, any>;
        pickedFlags?: string[];
        overrides?: Record<string, any>;
    };
    registrationStatus?: RegistrationStatus;
    /** Number of trials per case (only shown if > 1) */
    trials?: number;
};
declare module 'vitest' {
    interface TestSuiteMeta {
        evaluation: EvaluationReport;
    }
    interface TaskMeta {
        case: EvalCaseReport;
        evaluation: EvaluationReport;
    }
    interface ProvidedContext {
        baseline?: string;
        debug?: boolean;
        list?: boolean;
        overrides?: Record<string, any>;
        axiomConfig?: ResolvedAxiomConfig;
        runId: string;
        consoleUrl?: string;
    }
}
/**
 * Creates and registers an evaluation suite with the given name and parameters.
 *
 * This function sets up a complete evaluation pipeline that will run your {@link EvalTask}
 * against a collection, score the results, and provide detailed {@link EvalCaseReport} reporting.
 *
 * @param name - Human-readable name for the evaluation suite
 * @param params - {@link EvalParams} configuration parameters for the evaluation
 *
 * @example
 * ```typescript
 * import { Eval } from 'axiom/ai/evals';
 *
 * Eval('Text Generation Quality', {
 *   capability: 'capability-name',
 *   data: async () => [
 *     { input: 'Explain photosynthesis', expected: 'Plants convert light to energy...' },
 *     { input: 'What is gravity?', expected: 'Gravity is a fundamental force...' }
 *   ],
 *   task: async ({ input }) => {
 *     const result = await generateText({
 *       model: yourModel,
 *       prompt: input
 *     });
 *     return result.text;
 *   },
 *   scorers: [similarityScorer, factualAccuracyScorer],
 * });
 * ```
 */
declare function Eval<TInput, TExpected, TOutput, Name extends string = string, Capability extends string = string, Step extends string = string>(
    name: ValidateName<Name>,
    params: Omit<EvalParams<TInput, TExpected, TOutput>, 'capability' | 'step' | 'scorers'> & {
        capability: ValidateName<Capability>;
        step?: ValidateName<Step> | undefined;
        scorers: ReadonlyArray<ScorerLike<NoInfer<TInput>, NoInfer<TExpected>, TOutput>>;
    }
): void;
/**
 * Custom Vitest reporter for Axiom AI evaluations.
 *
 * This reporter collects evaluation results and scores from tests
 * and processes them for further analysis and reporting.
 */
declare class AxiomReporter implements Reporter {
    startTime: number;
    start: number;
    private _endOfRunConfigEnd;
    private _suiteData;
    private _printedFlagOverrides;
    private _config;
    onTestRunStart(): void;
    onTestSuiteReady(_testSuite: TestSuite): Promise<void>;
    onTestCaseReady(test: TestCase): void;
    onTestSuiteResult(testSuite: TestSuite): Promise<void>;
    onTestRunEnd(_testModules: ReadonlyArray<TestModule>, _errors: ReadonlyArray<SerializedError>, _reason: TestRunEndReason): Promise<void>;
}
interface EvalContextData<Flags = any, Facts = any> {
    flags: Partial<Flags>;
    facts: Partial<Facts>;
    configScope?: ReturnType<typeof createAppScope>;
    pickedFlags?: string[];
    outOfScopeFlags?: OutOfScopeFlagAccess[];
    parent?: EvalContextData<Flags, Facts>;
    overrides?: Record<string, any>;
    accessedFlagKeys?: string[];
}
interface EvalBuilder<AllowedFlags extends Record<string, any> = {}, TInput extends string | Record<string, any> = string, TExpected extends string | Record<string, any> = string, TOutput extends string | Record<string, any> = string> {
    withFlags<F extends Partial<AllowedFlags>>(flags: F): EvalBuilder<AllowedFlags, TInput, TExpected, TOutput>;
    withModel(model: string): EvalBuilder<AllowedFlags, TInput, TExpected, TOutput>;
    withTimeout(timeout: number): EvalBuilder<AllowedFlags, TInput, TExpected, TOutput>;
    /**
     * Set the number of times to run each case.
     * Each trial runs the task independently, and scores are aggregated per scorer.
     */
    withTrials(trials: number): EvalBuilder<AllowedFlags, TInput, TExpected, TOutput>;
    run(suffix?: string): void;
}

export { AxiomReporter, type Case, type Chat, Eval, type EvalBuilder, type EvalContextData, type EvalParams, type EvalTask, type Evaluation, type Task };
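Putting the declarations together, here is a minimal usage sketch based on the `EvalParams` and `Eval` signatures above. The model setup (`@ai-sdk/openai`, `generateText`) and the `exactMatch` scorer are illustrative placeholders; the exact `ScorerLike` shape is declared in './name-validation', not in this file.

```typescript
import { Eval } from 'axiom/ai/evals';
import { generateText } from 'ai';
import { openai } from '@ai-sdk/openai';

const model = openai('gpt-4o-mini');

// Placeholder scorer: assumes the common ({ output, expected }) => { name, score }
// convention; check ScorerLike for the actual expected shape.
const exactMatch = ({ output, expected }: { output: string; expected: string }) => ({
  name: 'exact-match',
  score: output.trim() === expected.trim() ? 1 : 0,
});

Eval('Capital Cities', {
  capability: 'geography-qa',
  step: 'answer',
  data: [
    { input: 'Capital of France?', expected: 'Paris' },
    { input: 'Capital of Japan?', expected: 'Tokyo' },
  ],
  task: async ({ input }) => {
    const result = await generateText({ model, prompt: input });
    return result.text;
  },
  scorers: [exactMatch],
  trials: 3,       // run each case 3 times; scores are aggregated per scorer
  timeout: 30_000, // per-task timeout in milliseconds
});
```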