/**
 * `@axiomhq/ai`
 *
 * Axiom AI SDK provides an API to wrap your AI calls with observability instrumentation.
 */
import { TestError } from 'vitest';
export { A as experimental_AxiomReporter } from './reporter-B9Ly79ok.js';
import 'vitest/node.js';
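/**
 * Module augmentation that adds an optional `eval` field to vitest's `TaskMeta`,
 * so an {@link EvalReport} can be attached to each test case's metadata.
 *
 * @experimental This API is experimental and may change in future versions.
 */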
declare module 'vitest' {
interface TaskMeta {
eval?: EvalReport;
}
}
/**
* Function type for evaluation tasks that process input data and produce output.
*
* Used with {@link EvalParams} to define the task that will be evaluated against a dataset.
* The task output will be scored by functions defined in {@link EvalParams.scorers}.
*
* @experimental This API is experimental and may change in future versions.
*
* @param input - The input data to process
* @param expected - The expected output for comparison/validation
* @returns Promise that resolves to the task output, or the output directly
*
* @example
* ```typescript
* const textGenerationTask: EvalTask<string, string> = async (input, expected) => {
* const result = await generateText({
* model: myModel,
* prompt: input
* });
* return result.text;
* };
* ```
*/
type EvalTask<TInput, TExpected> = (input: TInput, expected: TExpected) => Promise<any> | any;
/**
* Configuration parameters for running an evaluation.
*
* Used with {@link Eval} to define how an evaluation should be executed.
* Results are captured in {@link EvalReport} format.
*
* @experimental This API is experimental and may change in future versions.
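 *
 * @example
 * A minimal configuration sketch; as in the other examples in this file, `generateText`,
 * `yourModel`, and `similarityScorer` stand in for your own model call and scoring function.
 * ```typescript
 * import type { experimental_EvalParams as EvalParams } from '@axiomhq/ai';
 *
 * const params: EvalParams = {
 *   // Dataset of input/expected pairs to evaluate against
 *   data: async () => [
 *     { input: 'What is 2 + 2?', expected: '4' }
 *   ],
 *   // Task under evaluation: call the model and return its text output
 *   task: async (input) => {
 *     const result = await generateText({ model: yourModel, prompt: input });
 *     return result.text;
 *   },
 *   scorers: [similarityScorer],
 *   // Cases scoring below this value are treated as failing
 *   threshold: 0.8,
 *   timeout: 30_000
 * };
 * ```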
*/
type EvalParams = {
/** Function that returns the dataset with input/expected pairs for evaluation */
data: () => Promise<{
input: any;
expected: any;
}[]>;
/** The {@link EvalTask} function to execute for each data item */
task: EvalTask<any, any>;
/** Array of scoring functions to evaluate the task output, producing {@link Score} results */
scorers: any[];
/** Minimum score threshold for passing (0.0 to 1.0) */
threshold: number;
/** Optional timeout in milliseconds for task execution */
timeout?: number;
/** Optional function to conditionally skip the evaluation */
skipIf?: () => boolean;
};
/**
* Represents a score result from an evaluation scorer.
*
* Produced by scorer functions defined in {@link EvalParams.scorers} and
* included in the {@link EvalReport} for each evaluation case.
*
* @experimental This API is experimental and may change in future versions.
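 *
 * @example
 * An illustrative Score entry (the values are placeholders):
 * ```typescript
 * const score: Score = {
 *   name: 'similarity',       // scorer that produced the result
 *   score: 0.92,              // typically normalized to 0.0–1.0
 *   duration: 134,            // scoring time in milliseconds
 *   startedAt: 1718000000000  // timestamp when scoring started (assumed epoch milliseconds)
 * };
 * ```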
*/
type Score = {
/** Name of the scorer that produced this score */
name: string;
/** Numerical score value (typically 0.0 to 1.0) */
score: number;
/** Duration in milliseconds that the scoring took */
duration: number;
/** Timestamp when scoring started */
startedAt: number;
};
/**
* Complete report for a single evaluation case including results and metadata.
*
* Generated for each test case when running {@link Eval} with {@link EvalParams}.
* Contains all {@link Score} results and execution metadata.
*
* @experimental This API is experimental and may change in future versions.
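 *
 * @example
 * An illustrative report for a single case (field values are placeholders):
 * ```typescript
 * const report: EvalReport = {
 *   order: 0,
 *   name: 'Text Generation Quality',
 *   input: 'What is gravity?',
 *   output: 'Gravity is a fundamental force of attraction...',
 *   expected: 'Gravity is a fundamental force...',
 *   scores: [{ name: 'similarity', score: 0.95, duration: 120, startedAt: 1718000000000 }],
 *   errors: null,
 *   status: 'success',
 *   duration: 840,
 *   startedAt: 1718000000000,
 *   threshold: 0.7
 * };
 * ```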
*/
type EvalReport = {
/** Order/index of this case in the evaluation suite */
order: number;
/** Name of the evaluation */
name: string;
/** Input data that was provided to the {@link EvalTask} */
input: string;
/** Output produced by the {@link EvalTask} */
output: string;
/** Expected output for comparison */
expected: string;
/** Array of {@link Score} results from all scorers that were run */
scores: Score[];
/** Any errors that occurred during evaluation */
errors: TestError[] | null;
/** Status of the evaluation case */
status: 'success' | 'fail' | 'pending';
/** Duration in milliseconds for the entire case */
duration: number | undefined;
/** Timestamp when the case started */
startedAt: number | undefined;
/** Score threshold from {@link EvalParams.threshold} that was used for pass/fail determination */
threshold: number | undefined;
};
/**
* Creates and registers an evaluation suite with the given name and parameters.
*
* This function sets up a complete evaluation pipeline that will run your {@link EvalTask}
* against a dataset, score the results, and provide detailed {@link EvalReport} reporting.
*
* @experimental This API is experimental and may change in future versions.
*
* @param name - Human-readable name for the evaluation suite
* @param params - {@link EvalParams} configuration parameters for the evaluation
*
* @example
* ```typescript
* import { experimental_Eval as Eval } from '@axiomhq/ai';
*
* Eval('Text Generation Quality', {
* data: async () => [
* { input: 'Explain photosynthesis', expected: 'Plants convert light to energy...' },
* { input: 'What is gravity?', expected: 'Gravity is a fundamental force...' }
* ],
* task: async (input) => {
* const result = await generateText({
* model: yourModel,
* prompt: input
* });
* return result.text;
* },
* scorers: [similarityScorer, factualAccuracyScorer],
* threshold: 0.7
* });
* ```
*/
declare const Eval: (name: string, params: EvalParams) => void;
export { Eval as experimental_Eval, type EvalParams as experimental_EvalParams, type EvalReport as experimental_EvalReport, type EvalTask as experimental_EvalTask, type Score as experimental_Score };