axiom
Version:
Axiom AI SDK provides: an API to wrap your AI calls with observability instrumentation, offline evals, and online evals.
135 lines (131 loc) • 7.58 kB
TypeScript
import { SpanContext } from '@opentelemetry/api';
import { S as Score, V as ValidateName } from '../name-validation.d-BKPGh6r3.js';
import '../scorers/aggregations.js';
/**
 * How a single scorer decides whether it should run for a given call.
 *
 * Two forms are accepted:
 * - a `number` (presumably a sampling rate; the accepted range is not
 *   visible in this declaration, so confirm against the implementation), or
 * - a predicate that receives the optional `input` and the `output` and
 *   answers with a boolean, either synchronously or via a promise.
 */
type ScorerSampling<TInput = unknown, TOutput = unknown> =
  | number
  | ((args: { input?: TInput; output: TOutput }) => boolean | Promise<boolean>);
/**
 * Payload describing the outcome of one online scorer.
 *
 * - When produced by a scorer function, `name` is inferred from the
 *   function name/property.
 * - When passed to `onlineEval` as a precomputed result, `name` must be
 *   supplied by the caller.
 */
type ScorerResult<
  TMetadata extends Record<string, unknown> = Record<string, unknown>
> = {
  /** Scorer name this result belongs to. */
  name: string;
  /** Score value; shares its type with the `score` field of `Score`. */
  score: Score['score'];
  /** Optional scorer-specific metadata. */
  metadata?: TMetadata;
  /** Optional error message. */
  error?: string;
};
/**
 * Online scorer function.
 *
 * Receives the optional `input` and the `output` under evaluation and
 * returns a `ScorerResult` without its `name` field, either directly or
 * wrapped in a promise.
 */
type Scorer<
  TInput = unknown,
  TOutput = unknown,
  TMetadata extends Record<string, unknown> = Record<string, unknown>
> = (args: {
  input?: TInput;
  output: TOutput;
}) =>
  | Omit<ScorerResult<TMetadata>, 'name'>
  | Promise<Omit<ScorerResult<TMetadata>, 'name'>>;
/**
 * Something `onlineEval` can score with: either a scorer function to run
 * or an already-computed `ScorerResult`.
 */
type OnlineEvalScorerInput<
  TInput = unknown,
  TOutput = unknown,
  TMetadata extends Record<string, unknown> = Record<string, unknown>
> =
  | Scorer<TInput, TOutput, TMetadata>
  | ScorerResult<TMetadata>;
/**
 * Wrapper that pairs a scorer input with an optional per-scorer sampling
 * decision.
 */
type SampledOnlineEvalScorer<
  TInput = unknown,
  TOutput = unknown,
  TMetadata extends Record<string, unknown> = Record<string, unknown>
> = {
  /** Scorer function or precomputed result being wrapped. */
  scorer: OnlineEvalScorerInput<TInput, TOutput, TMetadata>;
  /** Optional sampling decision for this scorer. */
  sampling?: ScorerSampling<TInput, TOutput>;
};
/**
 * One element of a `scorers` list: a bare scorer input, or the same input
 * wrapped together with a sampling decision.
 */
type OnlineEvalScorerEntry<
  TInput = unknown,
  TOutput = unknown,
  TMetadata extends Record<string, unknown> = Record<string, unknown>
> =
  | OnlineEvalScorerInput<TInput, TOutput, TMetadata>
  | SampledOnlineEvalScorer<TInput, TOutput, TMetadata>;
/**
 * Scorer entry with the metadata type argument fixed to `any`, used as the
 * element constraint for scorer lists so that entries are accepted whatever
 * their metadata shape is.
 */
type ScorerEntry<TInput, TOutput> = OnlineEvalScorerEntry<
  TInput,
  TOutput,
  any
>;
/**
 * Extracts the metadata type carried by a scorer entry.
 *
 * A sampled wrapper is unwrapped first (recursing into its `scorer`), then
 * the metadata parameter is read from a scorer function or from a
 * precomputed result. Anything else resolves to `never`.
 */
type InferScorerMetadata<TEntry> =
  TEntry extends SampledOnlineEvalScorer<any, any, any>
    ? InferScorerMetadata<TEntry['scorer']>
    : TEntry extends Scorer<any, any, infer TMeta>
      ? TMeta
      : TEntry extends ScorerResult<infer TMeta>
        ? TMeta
        : never;
/**
 * Extracts the name type of a scorer entry.
 *
 * A sampled wrapper is unwrapped first (recursing into its `scorer`). A
 * precomputed result contributes the type of its `name` property, a scorer
 * function contributes plain `string`, and anything else resolves to
 * `never`.
 */
type InferScorerName<TEntry> =
  TEntry extends SampledOnlineEvalScorer<any, any, any>
    ? InferScorerName<TEntry['scorer']>
    : TEntry extends ScorerResult<any>
      ? TEntry['name']
      : TEntry extends Scorer<any, any, any>
        ? string
        : never;
/**
 * Resolves to `true` when `string` itself is assignable to `T` (i.e. `T` is
 * the wide `string` type rather than a string literal), otherwise `false`.
 */
type IsBroadString<T extends string> =
  string extends T
    ? true
    : false;
/**
 * Walks a tuple of scorer entries from head to tail and collects, as a
 * union, every literal scorer name that occurs more than once.
 *
 * - `TSeen` accumulates the literal names encountered so far.
 * - `TDuplicates` accumulates the names that were already in `TSeen` when
 *   they were encountered again.
 *
 * Entries whose name is the wide `string` type, or is not a string at all,
 * are skipped without touching either accumulator. When `TEntries` no
 * longer matches the head/tail tuple pattern, the accumulated
 * `TDuplicates` is the result.
 */
type DuplicateScorerNames<
  TEntries extends readonly unknown[],
  TSeen extends string = never,
  TDuplicates extends string = never
> = TEntries extends readonly [infer THead, ...infer TRest]
  ? InferScorerName<THead> extends infer TName
    ? TName extends string
      ? IsBroadString<TName> extends true
        ? // Wide `string` name: cannot be compared, so skip it.
          DuplicateScorerNames<TRest extends readonly unknown[] ? TRest : never, TSeen, TDuplicates>
        : TName extends TSeen
          ? // Literal name seen before: record it as a duplicate.
            DuplicateScorerNames<TRest extends readonly unknown[] ? TRest : never, TSeen, TDuplicates | TName>
          : // First occurrence of this literal name: remember it.
            DuplicateScorerNames<TRest extends readonly unknown[] ? TRest : never, TSeen | TName, TDuplicates>
      : // Name is not a string: skip the entry.
        DuplicateScorerNames<TRest extends readonly unknown[] ? TRest : never, TSeen, TDuplicates>
    : never
  : TDuplicates;
/**
 * Compile-time guard over a scorer list.
 *
 * Resolves to `TEntries` unchanged when `DuplicateScorerNames` finds no
 * duplicates. Otherwise it resolves to `never` intersected with a marker
 * object whose `__axiomDuplicateScorerNames__` property is typed as the
 * duplicated names, so a list with duplicate literal names is not
 * assignable.
 */
type EnsureUniqueScorerNames<TEntries extends readonly unknown[]> =
  // Tuple-wrapped on both sides so the comparison with `never` is not distributive.
  [DuplicateScorerNames<TEntries>] extends [never]
    ? TEntries
    : never & {
        __axiomDuplicateScorerNames__: DuplicateScorerNames<TEntries>;
      };
/**
 * Result type for a scorer list: a `ScorerResult` whose metadata is the
 * metadata inferred from the elements of `TScorers`.
 */
type InferOnlineEvalResult<TScorers extends readonly unknown[]> = ScorerResult<
  InferScorerMetadata<TScorers[number]>
>;
/**
 * Record of scorer results keyed by the string names inferred from
 * `TScorers`. Every key is optional (`Partial`).
 */
type InferOnlineEvalResultRecord<TScorers extends readonly unknown[]> = Partial<
  Record<
    Extract<InferScorerName<TScorers[number]>, string>,
    InferOnlineEvalResult<TScorers>
  >
>;
/**
 * Configuration object for an online evaluation: what is being evaluated,
 * which span(s) to link to, the data to score, and the scorers to apply.
 */
type OnlineEvalParams<
  TInput,
  TOutput,
  TScorers extends readonly ScorerEntry<TInput, TOutput>[] = readonly ScorerEntry<TInput, TOutput>[]
> = {
  /** High-level capability under evaluation, e.g. 'qa' or 'summarization'. */
  capability: string;

  /** Specific step inside the capability, e.g. 'answer' or 'extract'. */
  step?: string;

  /**
   * Explicit SpanContext(s) used to link the eval span to the originating
   * generation span(s).
   *
   * If omitted, the active span's context is used automatically. Provide
   * this for deferred evaluation, i.e. when onlineEval runs after the
   * originating span has already completed. Accepts a single context or
   * an array of contexts for multi-span linking.
   */
  links?: SpanContext | SpanContext[];

  /** Input handed to scorers (optional; only input+output scorers need it). */
  input?: TInput;

  /** Output to evaluate. */
  output: TOutput;

  /** Scorers or precomputed scores, each with optional per-scorer sampling. */
  scorers: EnsureUniqueScorerNames<TScorers>;
};
/**
 * Run online evaluation scorers against production outputs.
 *
 * Returns a promise that resolves to scorer results. Use `void onlineEval(...)`
 * for fire-and-forget, or `await onlineEval(...)` when you need to wait for
 * completion (e.g., before flushing telemetry in short-lived processes).
 *
 * Each eval span links back to the originating generation span via an
 * OpenTelemetry span link. Parent/child hierarchy follows natural context
 * propagation — inside `withSpan` the eval is a child, outside it depends
 * on the active context.
 *
 * The function takes exactly two arguments: the eval name and a single
 * `params` object that carries both the categorization fields
 * (`capability`, `step`, `links`) and the evaluation data
 * (`input`, `output`, `scorers`).
 *
 * ## Usage Patterns
 *
 * **Inside withSpan (recommended):**
 * Active span is automatically detected and linked.
 * ```ts
 * await withSpan({ capability: 'qa', step: 'answer' }, async () => {
 *   const response = await generateText({ ... });
 *   void onlineEval('my-eval', {
 *     capability: 'qa',
 *     step: 'answer',
 *     output: response.text,
 *     scorers: [formatScorer],
 *   });
 *   return response.text;
 * });
 * ```
 *
 * **Deferred evaluation with explicit link:**
 * Pass the originating span's context for linking when evaluating later.
 * Supports single or multiple span contexts.
 * ```ts
 * let spanCtx: SpanContext | undefined;
 * const result = await withSpan({ ... }, async (span) => {
 *   spanCtx = span.spanContext();
 *   return await generateText({ ... });
 * });
 * void onlineEval('my-eval', { capability: 'qa', links: spanCtx, output: result, scorers });
 *
 * // Or link to multiple spans:
 * void onlineEval('my-eval', { capability: 'qa', links: [spanCtx1, spanCtx2], output, scorers });
 * ```
 *
 * **Awaiting for flush (short-lived processes):**
 * ```ts
 * await onlineEval('my-eval', { capability: 'qa', output, scorers });
 * await flushTelemetry();
 * ```
 *
 * @typeParam TScorers - Declared as a `const` type parameter so literal
 *   scorer names in the `scorers` list are preserved for the duplicate-name
 *   check and for the keys of the returned record.
 * @param name - Eval name (A-Z, a-z, 0-9, -, _ only). Used as the span name and `eval.name` attribute.
 * @param params - Evaluation metadata and configuration
 * @param params.capability - High-level capability being evaluated
 * @param params.step - Optional step within the capability
 * @param params.links - Optional SpanContext(s) to link to (auto-detected if omitted)
 * @param params.input - Optional input to pass to scorers
 * @param params.output - Output to evaluate
 * @param params.scorers - Scorer entries with optional per-scorer sampling
 * @returns Promise resolving to scorer results keyed by scorer name; every
 *   key is optional in the resulting record type
 */
declare function onlineEval<
  TInput,
  TOutput,
  Name extends string,
  const TScorers extends readonly ScorerEntry<TInput, TOutput>[]
>(
  name: ValidateName<Name>,
  params: OnlineEvalParams<TInput, TOutput, TScorers>
): Promise<InferOnlineEvalResultRecord<TScorers>>;
export { type OnlineEvalParams, type OnlineEvalScorerEntry, type OnlineEvalScorerInput, type SampledOnlineEvalScorer, type ScorerSampling, onlineEval };