UNPKG

axiom

Version:

Axiom AI SDK provides: an API to wrap your AI calls with observability instrumentation; offline evals; online evals.

135 lines (131 loc) 7.58 kB
import { SpanContext } from '@opentelemetry/api';
import { S as Score, V as ValidateName } from '../name-validation.d-BKPGh6r3.js';
import '../scorers/aggregations.js';

/**
 * Sampling decision for an individual scorer.
 *
 * Either a number, or a predicate that receives the same `input`/`output`
 * pair as the scorer and returns (or resolves to) a boolean.
 * NOTE(review): the numeric form is presumably a sampling rate — confirm the
 * accepted range against the implementation.
 */
type ScorerSampling<TInput = unknown, TOutput = unknown> = number | ((args: {
    input?: TInput;
    output: TOutput;
}) => boolean | Promise<boolean>);

/**
 * Online scorer result payload.
 * - For scorer functions, `name` is inferred from the function name/property.
 * - For precomputed results passed into `onlineEval`, `name` is required.
 */
type ScorerResult<TMetadata extends Record<string, unknown> = Record<string, unknown>> = {
    name: string;
    score: Score['score'];
    metadata?: TMetadata;
    error?: string;
};

/**
 * Online scorer function.
 *
 * Receives the optional `input` and the `output` under evaluation and returns
 * a `ScorerResult` without `name`, either synchronously or as a promise.
 */
type Scorer<TInput = unknown, TOutput = unknown, TMetadata extends Record<string, unknown> = Record<string, unknown>> = (args: {
    input?: TInput;
    output: TOutput;
}) => Omit<ScorerResult<TMetadata>, 'name'> | Promise<Omit<ScorerResult<TMetadata>, 'name'>>;

/**
 * What can be supplied as a scorer: a scorer function, or an already
 * computed `ScorerResult` (which carries its own `name`).
 */
type OnlineEvalScorerInput<TInput = unknown, TOutput = unknown, TMetadata extends Record<string, unknown> = Record<string, unknown>> = Scorer<TInput, TOutput, TMetadata> | ScorerResult<TMetadata>;

/**
 * A scorer input wrapped together with an optional per-scorer sampling
 * decision.
 */
type SampledOnlineEvalScorer<TInput = unknown, TOutput = unknown, TMetadata extends Record<string, unknown> = Record<string, unknown>> = {
    scorer: OnlineEvalScorerInput<TInput, TOutput, TMetadata>;
    sampling?: ScorerSampling<TInput, TOutput>;
};

/**
 * One element of the `scorers` array: a bare scorer input, or a
 * `{ scorer, sampling? }` wrapper.
 */
type OnlineEvalScorerEntry<TInput = unknown, TOutput = unknown, TMetadata extends Record<string, unknown> = Record<string, unknown>> = OnlineEvalScorerInput<TInput, TOutput, TMetadata> | SampledOnlineEvalScorer<TInput, TOutput, TMetadata>;

/**
 * Internal constraint type for scorer arrays. Metadata is left as `any`
 * (presumably so entries with differing metadata types satisfy one array
 * constraint; the concrete metadata is recovered by `InferScorerMetadata`).
 */
type ScorerEntry<TInput, TOutput> = OnlineEvalScorerEntry<TInput, TOutput, any>;

/**
 * Extracts the metadata type from a scorer entry: unwraps a
 * `{ scorer, sampling? }` wrapper first, then reads the metadata parameter of
 * a scorer function or of a precomputed result. Resolves to `never` for
 * anything else.
 */
type InferScorerMetadata<TScorerEntry> =
    TScorerEntry extends SampledOnlineEvalScorer<any, any, any>
        ? InferScorerMetadata<TScorerEntry['scorer']>
        : TScorerEntry extends Scorer<any, any, infer TMetadata>
            ? TMetadata
            : TScorerEntry extends ScorerResult<infer TMetadata>
                ? TMetadata
                : never;

/**
 * Extracts the scorer name type from a scorer entry: unwraps a
 * `{ scorer, sampling? }` wrapper first; a precomputed result yields the type
 * of its `name` property; a scorer function yields plain `string`. Resolves
 * to `never` for anything else.
 */
type InferScorerName<TScorerEntry> =
    TScorerEntry extends SampledOnlineEvalScorer<any, any, any>
        ? InferScorerName<TScorerEntry['scorer']>
        : TScorerEntry extends ScorerResult<any>
            ? TScorerEntry['name']
            : TScorerEntry extends Scorer<any, any, any>
                ? string
                : never;

/** `true` when `T` is the wide `string` type rather than a string literal (or union of literals). */
type IsBroadString<T extends string> = string extends T ? true : false;

/**
 * Walks a tuple of scorer entries from head to tail and collects, as a union,
 * every literal scorer name that occurs more than once.
 *
 * - `Seen` accumulates the literal names encountered so far.
 * - `Duplicates` accumulates names that were already in `Seen` when reached.
 * - Entries whose name is the wide `string` type, or is not a string at all,
 *   are skipped.
 *
 * When `TEntries` is not a non-empty tuple (end of recursion, or a plain
 * array type), the accumulated `Duplicates` union is the result.
 */
type DuplicateScorerNames<TEntries extends readonly unknown[], Seen extends string = never, Duplicates extends string = never> =
    TEntries extends readonly [infer Head, ...infer Tail]
        ? InferScorerName<Head> extends infer ScorerName
            ? ScorerName extends string
                ? IsBroadString<ScorerName> extends true
                    // Wide `string` name: skip this entry.
                    ? DuplicateScorerNames<Tail extends readonly unknown[] ? Tail : never, Seen, Duplicates>
                    : ScorerName extends Seen
                        // Literal name seen before: record it as a duplicate.
                        ? DuplicateScorerNames<Tail extends readonly unknown[] ? Tail : never, Seen, Duplicates | ScorerName>
                        // First occurrence of this literal name: remember it.
                        : DuplicateScorerNames<Tail extends readonly unknown[] ? Tail : never, Seen | ScorerName, Duplicates>
                // Name is not a string type: skip this entry.
                : DuplicateScorerNames<Tail extends readonly unknown[] ? Tail : never, Seen, Duplicates>
            : never
        : Duplicates;

/**
 * Resolves to `TEntries` unchanged when `DuplicateScorerNames` finds no
 * duplicates. Otherwise resolves to `never` intersected with a marker object
 * whose `__axiomDuplicateScorerNames__` property holds the duplicated names
 * (presumably so the offending names show up in the compiler diagnostic).
 */
type EnsureUniqueScorerNames<TEntries extends readonly unknown[]> = [
    DuplicateScorerNames<TEntries>
] extends [never] ? TEntries : never & {
    __axiomDuplicateScorerNames__: DuplicateScorerNames<TEntries>;
};

/** Result type for a scorer array: a `ScorerResult` whose metadata is the union of all entries' metadata types. */
type InferOnlineEvalResult<TScorers extends readonly unknown[]> = ScorerResult<InferScorerMetadata<TScorers[number]>>;

/**
 * Record of scorer results keyed by inferred scorer name. Every key is
 * optional (`Partial`).
 * NOTE(review): presumably keys are optional because a sampled-out scorer
 * produces no entry — confirm against the implementation.
 */
type InferOnlineEvalResultRecord<TScorers extends readonly unknown[]> = Partial<Record<Extract<InferScorerName<TScorers[number]>, string>, InferOnlineEvalResult<TScorers>>>;

/**
 * Options for online evaluation.
 */
type OnlineEvalParams<TInput, TOutput, TScorers extends readonly ScorerEntry<TInput, TOutput>[] = readonly ScorerEntry<TInput, TOutput>[]> = {
    /** High-level capability being evaluated (e.g., 'qa', 'summarization') */
    capability: string;
    /** Specific step within the capability (e.g., 'answer', 'extract') */
    step?: string;
    /**
     * Explicit SpanContext(s) to link the eval span to originating generation span(s).
     * When omitted, the active span's context is used automatically.
     * Use this for deferred evaluation when onlineEval is called after the
     * originating span has completed.
     * Supports both single context and multiple contexts for multi-span linking.
     */
    links?: SpanContext | SpanContext[];
    /** Input to pass to scorers (optional - only needed for input+output scorers) */
    input?: TInput;
    /** Output to evaluate */
    output: TOutput;
    /** Scorers or precomputed scores. Supports optional per-scorer sampling. */
    scorers: EnsureUniqueScorerNames<TScorers>;
};

/**
 * Run online evaluation scorers against production outputs.
 *
 * Returns a promise that resolves to scorer results. Use `void onlineEval(...)`
 * for fire-and-forget, or `await onlineEval(...)` when you need to wait for
 * completion (e.g., before flushing telemetry in short-lived processes).
 *
 * Each eval span links back to the originating generation span via an
 * OpenTelemetry span link. Parent/child hierarchy follows natural context
 * propagation — inside `withSpan` the eval is a child, outside it depends
 * on the active context.
 *
 * ## Usage Patterns
 *
 * **Inside withSpan (recommended):**
 * Active span is automatically detected and linked.
 * ```ts
 * await withSpan({ capability: 'qa', step: 'answer' }, async () => {
 *   const response = await generateText({ ... });
 *   void onlineEval('my-eval', {
 *     capability: 'qa',
 *     step: 'answer',
 *     output: response.text,
 *     scorers: [formatScorer],
 *   });
 *   return response.text;
 * });
 * ```
 *
 * **Deferred evaluation with explicit link:**
 * Pass the originating span's context for linking when evaluating later.
 * Supports single or multiple span contexts.
 * ```ts
 * let spanCtx: SpanContext;
 * const result = await withSpan({ ... }, async (span) => {
 *   spanCtx = span.spanContext();
 *   return await generateText({ ... });
 * });
 * void onlineEval('my-eval', { capability: 'qa', links: spanCtx, output: result, scorers });
 *
 * // Or link to multiple spans:
 * void onlineEval('my-eval', { capability: 'qa', links: [spanCtx1, spanCtx2], output, scorers });
 * ```
 *
 * **Awaiting for flush (short-lived processes):**
 * ```ts
 * await onlineEval('my-eval', { capability: 'qa', output, scorers });
 * await flushTelemetry();
 * ```
 *
 * @param name - Eval name (A-Z, a-z, 0-9, -, _ only). Used as the span name and `eval.name` attribute.
 * @param params - Evaluation metadata and configuration
 * @param params.capability - High-level capability being evaluated
 * @param params.step - Optional step within the capability
 * @param params.links - Optional SpanContext(s) to link to (auto-detected if omitted)
 * @param params.input - Input to pass to scorers
 * @param params.output - Output to evaluate
 * @param params.scorers - Scorer entries with optional per-scorer sampling
 * @returns Promise resolving to scorer results keyed by scorer name
 */
declare function onlineEval<TInput, TOutput, Name extends string, const TScorers extends readonly ScorerEntry<TInput, TOutput>[]>(name: ValidateName<Name>, params: OnlineEvalParams<TInput, TOutput, TScorers>): Promise<InferOnlineEvalResultRecord<TScorers>>;

export { type OnlineEvalParams, type OnlineEvalScorerEntry, type OnlineEvalScorerInput, type SampledOnlineEvalScorer, type ScorerSampling, onlineEval };