@axiomhq/ai

The Axiom AI SDK provides an API to wrap your AI calls with observability instrumentation.
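
For context on that instrumentation surface, here is a hedged sketch of wrapping a Vercel AI SDK model. `wrapAISDKModel` comes from the package's documentation rather than from the declarations below, so treat the export name and signature as assumptions and verify them against the @axiomhq/ai docs.

```typescript
// Hedged sketch: `wrapAISDKModel` is assumed from the Axiom AI SDK docs and is
// not declared in this file. Verify the export before relying on it.
import { wrapAISDKModel } from '@axiomhq/ai';
import { openai } from '@ai-sdk/openai';
import { generateText } from 'ai';

// Wrapping the model is what adds observability instrumentation to each call.
const model = wrapAISDKModel(openai('gpt-4o-mini'));

const { text } = await generateText({
  model,
  prompt: 'Summarize the last deploy in one sentence.',
});
console.log(text);
```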

import { TestError } from 'vitest';
export { A as experimental_AxiomReporter } from './reporter-B9Ly79ok.cjs';
import 'vitest/node.js';

declare module 'vitest' {
  interface TaskMeta {
    eval?: EvalReport;
  }
}

/**
 * Function type for evaluation tasks that process input data and produce output.
 *
 * Used with {@link EvalParams} to define the task that will be evaluated against a dataset.
 * The task output will be scored by functions defined in {@link EvalParams.scorers}.
 *
 * @experimental This API is experimental and may change in future versions.
 *
 * @param input - The input data to process
 * @param expected - The expected output for comparison/validation
 * @returns Promise that resolves to the task output, or the output directly
 *
 * @example
 * ```typescript
 * const textGenerationTask: EvalTask<string, string> = async (input, expected) => {
 *   const result = await generateText({
 *     model: myModel,
 *     prompt: input
 *   });
 *   return result.text;
 * };
 * ```
 */
type EvalTask<TInput, TExpected> = (input: TInput, expected: TExpected) => Promise<any> | any;

/**
 * Configuration parameters for running an evaluation.
 *
 * Used with {@link Eval} to define how an evaluation should be executed.
 * Results are captured in {@link EvalReport} format.
 *
 * @experimental This API is experimental and may change in future versions.
 */
type EvalParams = {
    /** Function that returns the dataset with input/expected pairs for evaluation */
    data: () => Promise<{
        input: any;
        expected: any;
    }[]>;
    /** The {@link EvalTask} function to execute for each data item */
    task: EvalTask<any, any>;
    /** Array of scoring functions to evaluate the task output, producing {@link Score} results */
    scorers: any[];
    /** Minimum score threshold for passing (0.0 to 1.0) */
    threshold: number;
    /** Optional timeout in milliseconds for task execution */
    timeout?: number;
    /** Optional function to conditionally skip the evaluation */
    skipIf?: () => boolean;
};

/**
 * Represents a score result from an evaluation scorer.
 *
 * Produced by scorer functions defined in {@link EvalParams.scorers} and
 * included in the {@link EvalReport} for each evaluation case.
 *
 * @experimental This API is experimental and may change in future versions.
 */
type Score = {
    /** Name of the scorer that produced this score */
    name: string;
    /** Numerical score value (typically 0.0 to 1.0) */
    score: number;
    /** Duration in milliseconds that the scoring took */
    duration: number;
    /** Timestamp when scoring started */
    startedAt: number;
};

/**
 * Complete report for a single evaluation case including results and metadata.
 *
 * Generated for each test case when running {@link Eval} with {@link EvalParams}.
 * Contains all {@link Score} results and execution metadata.
 *
 * @experimental This API is experimental and may change in future versions.
 */
type EvalReport = {
    /** Order/index of this case in the evaluation suite */
    order: number;
    /** Name of the evaluation */
    name: string;
    /** Input data that was provided to the {@link EvalTask} */
    input: string;
    /** Output produced by the {@link EvalTask} */
    output: string;
    /** Expected output for comparison */
    expected: string;
    /** Array of {@link Score} results from all scorers that were run */
    scores: Score[];
    /** Any errors that occurred during evaluation */
    errors: TestError[] | null;
    /** Status of the evaluation case */
    status: 'success' | 'fail' | 'pending';
    /** Duration in milliseconds for the entire case */
    duration: number | undefined;
    /** Timestamp when the case started */
    startedAt: number | undefined;
    /** Score threshold from {@link EvalParams.threshold} that was used for pass/fail determination */
    threshold: number | undefined;
};

/**
 * Creates and registers an evaluation suite with the given name and parameters.
 *
 * This function sets up a complete evaluation pipeline that will run your {@link EvalTask}
 * against a dataset, score the results, and provide detailed {@link EvalReport} reporting.
 *
 * @experimental This API is experimental and may change in future versions.
 *
 * @param name - Human-readable name for the evaluation suite
 * @param params - {@link EvalParams} configuration parameters for the evaluation
 *
 * @example
 * ```typescript
 * import { experimental_Eval as Eval } from '@axiomhq/ai';
 *
 * Eval('Text Generation Quality', {
 *   data: async () => [
 *     { input: 'Explain photosynthesis', expected: 'Plants convert light to energy...' },
 *     { input: 'What is gravity?', expected: 'Gravity is a fundamental force...' }
 *   ],
 *   task: async (input) => {
 *     const result = await generateText({
 *       model: yourModel,
 *       prompt: input
 *     });
 *     return result.text;
 *   },
 *   scorers: [similarityScorer, factualAccuracyScorer],
 *   threshold: 0.7
 * });
 * ```
 */
declare const Eval: (name: string, params: EvalParams) => void;

export { Eval as experimental_Eval, type EvalParams as experimental_EvalParams, type EvalReport as experimental_EvalReport, type EvalTask as experimental_EvalTask, type Score as experimental_Score };
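
The declarations above leave the scorer contract open (`scorers` is typed as `any[]`, with each scorer producing a Score at runtime), so the argument shape used for the scorer below is an assumption; check the Axiom documentation for the exact contract. A minimal end-to-end sketch using the exported `experimental_Eval`:

```typescript
import { experimental_Eval as Eval } from '@axiomhq/ai';

// Hypothetical exact-match scorer. The ({ output, expected }) argument shape is
// an assumption; this file only types `scorers` as any[].
const exactMatchScorer = ({ output, expected }: { output: string; expected: string }) => ({
  name: 'exact-match',
  score: output.trim() === expected.trim() ? 1 : 0,
});

Eval('Capital Cities', {
  data: async () => [
    { input: 'Capital of France?', expected: 'Paris' },
    { input: 'Capital of Japan?', expected: 'Tokyo' },
  ],
  // Placeholder task: swap in a real model call (e.g. generateText) in practice.
  task: async (input: string) => (input.includes('France') ? 'Paris' : 'Tokyo'),
  scorers: [exactMatchScorer],
  threshold: 1,
});
```

Under this assumption the runtime fills in the `duration` and `startedAt` fields of each Score, so a scorer only needs to supply a name and a numeric value between 0.0 and 1.0.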