evalite

Test your LLM-powered apps with a TypeScript-native, Vitest-based eval runner. No API key required.

types.d.ts — 636 lines, 22.6 kB
export declare namespace Evalite {
    /**
     * Configuration options for Evalite
     */
    interface Config {
        /**
         * Factory function to create a custom storage backend.
         * Can be async if the storage requires async initialization.
         *
         * @example
         * ```ts
         * import { createSqliteStorage } from "evalite/sqlite-storage"
         *
         * export default defineConfig({
         *   storage: () => createSqliteStorage("./custom.db")
         * })
         * ```
         */
        storage?: () => Evalite.Storage | Promise<Evalite.Storage>;
        /**
         * Server configuration options
         */
        server?: {
            /**
             * Port for the Evalite UI server
             * @default 3006
             */
            port?: number;
        };
        /**
         * Minimum average score threshold (0-100).
         * If the average score falls below this threshold, the process will exit with code 1.
         *
         * @example
         * ```ts
         * export default defineConfig({
         *   scoreThreshold: 80 // Fail if average score < 80
         * })
         * ```
         */
        scoreThreshold?: number;
        /**
         * Hide the results table in terminal output
         * @default false
         */
        hideTable?: boolean;
        /**
         * Maximum time (in milliseconds) a test can run before timing out
         * @default 30000
         * @example
         * ```ts
         * export default defineConfig({
         *   testTimeout: 60000 // 60 seconds
         * })
         * ```
         */
        testTimeout?: number;
        /**
         * Maximum number of test cases to run in parallel
         * @default 5
         * @example
         * ```ts
         * export default defineConfig({
         *   maxConcurrency: 100 // Run up to 100 tests in parallel
         * })
         * ```
         */
        maxConcurrency?: number;
        /**
         * Number of times to run each test case for non-deterministic evaluations
         * @default 1
         * @example
         * ```ts
         * export default defineConfig({
         *   trialCount: 3 // Run each test case 3 times
         * })
         * ```
         */
        trialCount?: number;
        /**
         * Setup files to run before tests (e.g., for loading environment variables)
         * @example
         * ```ts
         * export default defineConfig({
         *   setupFiles: ["dotenv/config"]
         * })
         * ```
         */
        setupFiles?: string[];
    }
    type RunType = "full" | "partial";
    type RunningServerState = {
        type: "running";
        runType: RunType;
        filepaths: string[];
        runId: number | bigint | undefined;
        evalNamesRunning: string[];
        resultIdsRunning: (number | bigint)[];
    };
    type ServerState = RunningServerState | {
        type: "idle";
    };
    type MaybePromise<T> = T | Promise<T>;
    interface InitialResult {
        evalName: string;
        filepath: string;
        order: number;
        status: ResultStatus;
        variantName: string | undefined;
        variantGroup: string | undefined;
        trialIndex: number | undefined;
    }
    type ResultStatus = "success" | "fail" | "running";
    type RenderedColumn = {
        label: string;
        value: unknown;
    };
    interface Result {
        evalName: string;
        filepath: string;
        order: number;
        status: ResultStatus;
        variantName: string | undefined;
        variantGroup: string | undefined;
        trialIndex: number | undefined;
        /**
         * Technically, input and expected are known at the start
         * of the evaluation. But because they may be files, they
         * need to be saved asynchronously.
         *
         * This is why they are only included in the final result.
         */
        input: unknown;
        expected?: unknown;
        output: unknown;
        scores: Score[];
        duration: number;
        traces: Trace[];
        renderedColumns: RenderedColumn[];
    }
    type Score = {
        /**
         * A number between 0 and 1.
         *
         * Added null for compatibility with {@link https://github.com/braintrustdata/autoevals | autoevals}.
         * null scores will be reported as 0.
         */
        score: number | null;
        name: string;
        description?: string;
        metadata?: unknown;
    };
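    /*
     * A minimal configuration sketch, pulling together the `Config` options
     * above. `defineConfig` is the helper shown in the JSDoc examples in
     * this file; its import path is an assumption here, not something this
     * file confirms.
     *
     * ```ts
     * // evalite.config.ts
     * import { defineConfig } from "evalite/config"; // assumed import path
     *
     * export default defineConfig({
     *   scoreThreshold: 80,            // exit with code 1 if average score < 80
     *   testTimeout: 60_000,           // allow each test case 60 seconds
     *   maxConcurrency: 10,            // run up to 10 test cases in parallel
     *   trialCount: 3,                 // repeat each case for non-deterministic tasks
     *   setupFiles: ["dotenv/config"], // load env vars before tests run
     * });
     * ```
     */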
    type UserProvidedScoreWithMetadata = {
        score: number;
        metadata?: unknown;
    };
    type ScoreInput<TInput, TOutput, TExpected> = {
        input: TInput;
        output: TOutput;
        expected?: TExpected;
    };
    type ColumnInput<TInput, TOutput, TExpected> = {
        input: TInput;
        output: TOutput;
        expected?: TExpected;
        scores: Score[];
        traces: Trace[];
    };
    type Task<TInput, TOutput, TVariant = undefined> = (input: TInput, variant: TVariant) => MaybePromise<TOutput | AsyncIterable<TOutput>>;
    type Scorer<TInput, TOutput, TExpected> = (opts: ScoreInput<TInput, TOutput, TExpected>) => MaybePromise<Score>;
    type DataShape<TInput, TExpected> = {
        input: TInput;
        expected?: TExpected;
        only?: boolean;
    };
    type DataShapeAsyncResolver<TInput, TExpected> = () => MaybePromise<DataShape<TInput, TExpected>[]>;
    type RunnerOpts<TInput, TOutput, TExpected, TVariant = undefined> = {
        data: DataShape<TInput, TExpected>[] | DataShapeAsyncResolver<TInput, TExpected>;
        task: Task<TInput, TOutput, TVariant>;
        scorers?: Array<Scorer<TInput, TOutput, TExpected> | ScorerOpts<TInput, TOutput, TExpected>>;
        /**
         * @deprecated Use `columns` instead.
         */
        experimental_customColumns?: (opts: ColumnInput<TInput, TOutput, TExpected>) => MaybePromise<RenderedColumn[]>;
        columns?: (opts: ColumnInput<TInput, TOutput, TExpected>) => MaybePromise<RenderedColumn[]>;
        /**
         * Number of times to run each test case for non-deterministic evaluations
         * @default 1
         * @example
         * ```ts
         * evalite("My Eval", {
         *   data: [...],
         *   task: ...,
         *   trialCount: 5 // Run each data point 5 times
         * })
         * ```
         */
        trialCount?: number;
    };
    type ScorerOpts<TInput, TOutput, TExpected> = {
        name: string;
        description?: string;
        scorer: (input: Evalite.ScoreInput<TInput, TOutput, TExpected>) => Evalite.MaybePromise<number | Evalite.UserProvidedScoreWithMetadata>;
    };
    interface Trace {
        input: unknown;
        usage?: {
            inputTokens: number;
            outputTokens: number;
            totalTokens: number;
        };
        output: unknown;
        start: number;
        end: number;
    }
    type TracePrompt = {
        role: string;
        content: TracePromptTextContent[] | string;
    };
    type TracePromptTextContent = {
        type: "text";
        text: string;
    };
    type File = {
        __EvaliteFile: true;
        path: string;
    };
    namespace SDK {
        type GetEvalByNameResult = {
            history: {
                score: number;
                date: string;
            }[];
            evaluation: Evalite.Storage.Entities.Eval & {
                results: (Evalite.Storage.Entities.Result & {
                    scores: Evalite.Storage.Entities.Score[];
                })[];
            };
            prevEvaluation: (Evalite.Storage.Entities.Eval & {
                results: (Evalite.Storage.Entities.Result & {
                    scores: Evalite.Storage.Entities.Score[];
                })[];
            }) | undefined;
        };
        type GetMenuItemsResultEval = {
            filepath: string;
            score: number;
            name: string;
            prevScore: number | undefined;
            evalStatus: Evalite.Storage.Entities.EvalStatus;
            variantName: string | undefined;
            variantGroup: string | undefined;
            hasScores: boolean;
        };
        type GetMenuItemsResult = {
            evals: GetMenuItemsResultEval[];
            score: number;
            prevScore: number | undefined;
            evalStatus: Evalite.Storage.Entities.EvalStatus;
        };
        type GetResultResult = {
            result: Evalite.Storage.Entities.Result & {
                traces: Evalite.Storage.Entities.Trace[];
                score: number;
                scores: Evalite.Storage.Entities.Score[];
            };
            prevResult: (Evalite.Storage.Entities.Result & {
                score: number;
                scores: Evalite.Storage.Entities.Score[];
            }) | undefined;
            evaluation: Evalite.Storage.Entities.Eval;
        };
    }
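    /*
     * A minimal sketch of an eval file built from `RunnerOpts`, `Task`, and
     * the two scorer shapes above: a bare `Scorer` function (which must
     * return a full `Score`) and a `ScorerOpts` object (whose `scorer`
     * returns a number or `UserProvidedScoreWithMetadata`). The top-level
     * `evalite` import and the `.eval.ts` filename convention are
     * assumptions here.
     *
     * ```ts
     * // greeting.eval.ts
     * import { evalite } from "evalite";
     *
     * evalite("Greeting", {
     *   // DataShape<string, string>[]
     *   data: [{ input: "Ada", expected: "Hello, Ada!" }],
     *   // Task<string, string>: may be async, may return an AsyncIterable
     *   task: async (input) => `Hello, ${input}!`,
     *   scorers: [
     *     // Bare Scorer: returns a complete Score ({ name, score })
     *     ({ output, expected }) => ({
     *       name: "Exact match",
     *       score: output === expected ? 1 : 0,
     *     }),
     *     // ScorerOpts: the name lives beside a scorer returning a number
     *     {
     *       name: "Mentions name",
     *       scorer: ({ input, output }) => (output.includes(input) ? 1 : 0),
     *     },
     *   ],
     * });
     * ```
     */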
    /**
     * Storage interface for storage backends in Evalite.
     * Implement this interface to create custom storage backends (e.g., Postgres, Turso, in-memory).
     */
    interface Storage {
        /**
         * Operations for managing test runs.
         */
        runs: {
            /**
             * Create a new run and return the complete run entity.
             */
            create(opts: Evalite.Storage.Runs.CreateOpts): Promise<Evalite.Storage.Entities.Run>;
            /**
             * Get runs matching the specified criteria.
             */
            getMany(opts?: Evalite.Storage.Runs.GetManyOpts): Promise<Evalite.Storage.Entities.Run[]>;
        };
        /**
         * Operations for managing evaluations.
         */
        evals: {
            /**
             * Create a new eval and return the complete eval entity.
             */
            create(opts: Evalite.Storage.Evals.CreateOpts): Promise<Evalite.Storage.Entities.Eval>;
            /**
             * Update an eval and return the updated entity.
             */
            update(opts: Evalite.Storage.Evals.UpdateOpts): Promise<Evalite.Storage.Entities.Eval>;
            /**
             * Get evals matching the specified criteria.
             */
            getMany(opts?: Evalite.Storage.Evals.GetManyOpts): Promise<Evalite.Storage.Entities.Eval[]>;
        };
        /**
         * Operations for managing test results.
         */
        results: {
            /**
             * Create a new result and return the complete result entity.
             */
            create(opts: Evalite.Storage.Results.CreateOpts): Promise<Evalite.Storage.Entities.Result>;
            /**
             * Update a result and return the updated entity.
             */
            update(opts: Evalite.Storage.Results.UpdateOpts): Promise<Evalite.Storage.Entities.Result>;
            /**
             * Get results matching the specified criteria.
             */
            getMany(opts?: Evalite.Storage.Results.GetManyOpts): Promise<Evalite.Storage.Entities.Result[]>;
        };
        /**
         * Operations for managing scores.
         */
        scores: {
            /**
             * Create a new score and return the complete score entity.
             */
            create(opts: Evalite.Storage.Scores.CreateOpts): Promise<Evalite.Storage.Entities.Score>;
            /**
             * Get scores matching the specified criteria.
             */
            getMany(opts?: Evalite.Storage.Scores.GetManyOpts): Promise<Evalite.Storage.Entities.Score[]>;
        };
        /**
         * Operations for managing traces.
         */
        traces: {
            /**
             * Create a new trace and return the complete trace entity.
             */
            create(opts: Evalite.Storage.Traces.CreateOpts): Promise<Evalite.Storage.Entities.Trace>;
            /**
             * Get traces matching the specified criteria.
             */
            getMany(opts?: Evalite.Storage.Traces.GetManyOpts): Promise<Evalite.Storage.Entities.Trace[]>;
        };
        /**
         * Close/cleanup the storage (e.g., close database connection).
         */
        close(): Promise<void>;
        /**
         * Symbol.asyncDispose for use with `await using` syntax.
         */
        [Symbol.asyncDispose](): Promise<void>;
    }
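    /*
     * A partial sketch of a custom in-memory backend for the `Storage`
     * interface above — enough to show the shape, not a production
     * implementation. Only `runs` is implemented; `evals`, `results`,
     * `scores`, and `traces` follow the same create/getMany pattern over
     * their own arrays, which is why the final cast is needed. Assumes the
     * `Evalite` namespace is importable from the package root.
     *
     * ```ts
     * import type { Evalite } from "evalite";
     *
     * export function createMemoryStorage(): Evalite.Storage {
     *   const runs: Evalite.Storage.Entities.Run[] = [];
     *   let nextId = 1;
     *   const storage = {
     *     runs: {
     *       async create(opts: Evalite.Storage.Runs.CreateOpts) {
     *         const run = {
     *           id: nextId++,
     *           runType: opts.runType,
     *           created_at: new Date().toISOString(),
     *         };
     *         runs.push(run);
     *         return run;
     *       },
     *       async getMany(opts?: Evalite.Storage.Runs.GetManyOpts) {
     *         return runs
     *           .filter((r) => !opts?.runType || r.runType === opts.runType)
     *           .slice(0, opts?.limit ?? runs.length);
     *       },
     *     },
     *     // evals, results, scores, traces: same pattern, omitted here
     *     async close() {},
     *     async [Symbol.asyncDispose]() {},
     *   };
     *   return storage as Evalite.Storage;
     * }
     * ```
     */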
    /**
     * Types for the Storage API.
     * These types define the interface for pluggable storage backends.
     */
    namespace Storage {
        /**
         * Database entity types that storage backends must return.
         * These are the canonical types for the storage contract.
         */
        namespace Entities {
            type Run = {
                id: number;
                runType: RunType;
                created_at: string;
            };
            type EvalStatus = "fail" | "success" | "running";
            type Eval = {
                id: number;
                run_id: number;
                name: string;
                status: EvalStatus;
                filepath: string;
                duration: number;
                created_at: string;
                variant_name?: string;
                variant_group?: string;
            };
            type Result = {
                id: number;
                eval_id: number;
                duration: number;
                input: unknown;
                output: unknown;
                expected?: unknown;
                created_at: string;
                col_order: number;
                status: ResultStatus;
                rendered_columns?: unknown;
                trial_index?: number | null;
            };
            type Score = {
                id: number;
                result_id: number;
                name: string;
                score: number;
                description?: string;
                metadata?: unknown;
                created_at: string;
            };
            type Trace = {
                id: number;
                result_id: number;
                input: unknown;
                output: unknown;
                start_time: number;
                end_time: number;
                input_tokens?: number;
                output_tokens?: number;
                total_tokens?: number;
                col_order: number;
            };
        }
        namespace Runs {
            interface CreateOpts {
                runType: RunType;
            }
            interface GetManyOpts {
                ids?: number[];
                runType?: RunType;
                createdAfter?: string;
                createdBefore?: string;
                createdAt?: string;
                limit?: number;
                orderBy?: "created_at" | "id";
                orderDirection?: "asc" | "desc";
            }
        }
        namespace Evals {
            interface CreateOpts {
                runId: number;
                name: string;
                filepath: string;
                variantName?: string;
                variantGroup?: string;
            }
            interface UpdateOpts {
                id: number;
                status: Entities.EvalStatus;
            }
            interface GetManyOpts {
                ids?: number[];
                runIds?: number[];
                name?: string;
                statuses?: Entities.EvalStatus[];
                createdAt?: string;
                createdAfter?: string;
                createdBefore?: string;
                limit?: number;
                orderBy?: "created_at" | "name" | "id";
                orderDirection?: "asc" | "desc";
            }
        }
        namespace Results {
            interface CreateOpts {
                evalId: number;
                order: number;
                input: unknown;
                expected: unknown;
                output: unknown;
                duration: number;
                status: ResultStatus;
                renderedColumns: unknown;
                trialIndex?: number;
            }
            interface UpdateOpts {
                id: number;
                output: unknown;
                duration: number;
                input: unknown;
                expected: unknown;
                status: ResultStatus;
                renderedColumns: unknown;
                trialIndex?: number;
            }
            interface GetManyOpts {
                ids?: number[];
                evalIds?: number[];
                order?: number;
                statuses?: ResultStatus[];
            }
        }
        namespace Scores {
            interface CreateOpts {
                resultId: number;
                name: string;
                score: number;
                description?: string;
                metadata: unknown;
            }
            interface GetManyOpts {
                ids?: number[];
                resultIds?: number[];
            }
        }
        namespace Traces {
            interface CreateOpts {
                resultId: number;
                input: unknown;
                output: unknown;
                start: number;
                end: number;
                inputTokens?: number;
                outputTokens?: number;
                totalTokens?: number;
                order: number;
            }
            interface GetManyOpts {
                ids?: number[];
                resultIds?: number[];
            }
        }
    }
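    /*
     * A minimal sketch of querying a backend through the `Storage` API,
     * using the `GetManyOpts` filters above. `createSqliteStorage` is the
     * factory shown in the `Config.storage` JSDoc; the database path is an
     * assumption, and the extra `await` covers factories that return either
     * `Storage` or `Promise<Storage>`.
     *
     * ```ts
     * import { createSqliteStorage } from "evalite/sqlite-storage";
     *
     * // `await using` works because Storage implements Symbol.asyncDispose.
     * await using storage = await createSqliteStorage("./evalite.db");
     *
     * // Most recent full run...
     * const [latest] = await storage.runs.getMany({
     *   runType: "full",
     *   orderBy: "created_at",
     *   orderDirection: "desc",
     *   limit: 1,
     * });
     *
     * // ...and any evals within it that failed.
     * const failed = await storage.evals.getMany({
     *   runIds: [latest.id],
     *   statuses: ["fail"],
     * });
     * ```
     */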
    /**
     * Types for the exported evaluation output format.
     * These types represent the structure of JSON files created by the --outputPath flag.
     */
    namespace Exported {
        /** Metadata about a test run */
        type Run = {
            /** Unique identifier for this run */
            id: number;
            /** Type of run: "full" runs all tests, "partial" runs only changed tests */
            runType: "full" | "partial";
            /** ISO 8601 timestamp when the run was created */
            createdAt: string;
        };
        /** Score from a scorer function */
        type Score = {
            /** Unique identifier for this score */
            id: number;
            /** Name of the scorer that produced this score */
            name: string;
            /** The score value (0-1 scale, where 1 is best) */
            score: number;
            /** Optional human-readable description of what this score measures */
            description?: string;
            /** Optional additional data attached to this score by the scorer */
            metadata?: unknown;
            /** ISO 8601 timestamp when the score was created */
            createdAt: string;
        };
        /** Trace of an LLM call (for debugging and cost tracking) */
        type Trace = {
            /** Unique identifier for this trace */
            id: number;
            /** The input/prompt sent to the LLM */
            input: unknown;
            /** The response received from the LLM */
            output: unknown;
            /** Unix timestamp in milliseconds when the LLM call started */
            startTime: number;
            /** Unix timestamp in milliseconds when the LLM call completed */
            endTime: number;
            /** Number of tokens in the input/prompt (if available from LLM provider) */
            inputTokens?: number;
            /** Number of tokens in the output/completion (if available from LLM provider) */
            outputTokens?: number;
            /** Total tokens used (input + output, if available from LLM provider) */
            totalTokens?: number;
            /** Zero-based order of this trace within the result */
            colOrder: number;
        };
        /** Individual test result for a single data point */
        type Result = {
            /** Unique identifier for this result */
            id: number;
            /** Duration of this specific test case in milliseconds */
            duration: number;
            /** The input data that was passed to the task function */
            input: unknown;
            /** The output produced by the task function */
            output: unknown;
            /** The expected output for comparison (optional) */
            expected?: unknown;
            /** Status of this specific test: "success" or "fail" */
            status: "success" | "fail" | "running";
            /** Zero-based order of this result within the evaluation */
            colOrder: number;
            /** Custom columns rendered for display in the UI (if any) */
            renderedColumns?: unknown;
            /** ISO 8601 timestamp when the result was created */
            createdAt: string;
            /** Average score for this result across all scorers (0-1 scale) */
            averageScore: number;
            /** Scores from all scorer functions applied to this result */
            scores: Score[];
            /** Traces of LLM calls made during this test */
            traces: Trace[];
        };
        /** Evaluation containing multiple test results */
        type Eval = {
            /** Unique identifier for this evaluation */
            id: number;
            /** The name of the evaluation as defined in the evalite() call */
            name: string;
            /** Absolute path to the .eval.ts file containing this evaluation */
            filepath: string;
            /** Total duration of the evaluation in milliseconds */
            duration: number;
            /** Overall status of the evaluation: "success" means all tests passed, "fail" means at least one failed */
            status: "fail" | "success" | "running";
            /** Optional variant name if using A/B testing or experimentation features */
            variantName?: string;
            /** Optional variant group name for organizing related variants */
            variantGroup?: string;
            /** ISO 8601 timestamp when the evaluation was created */
            createdAt: string;
            /** Average score across all results in this evaluation (0-1 scale) */
            averageScore: number;
            /** Individual test results for each data point in the evaluation */
            results: Result[];
        };
        /**
         * The complete output structure for exporting evaluation results.
         * This format is designed to be a comprehensive snapshot of a test run,
         * suitable for archiving, analysis, or importing into other systems.
         */
        type Output = {
            /** Metadata about the test run */
            run: Run;
            /** Array of evaluations that were executed in this run */
            evals: Eval[];
        };
    }
}
//# sourceMappingURL=types.d.ts.map
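/*
 * A minimal sketch of consuming the `Exported.Output` JSON written by the
 * `--outputPath` flag: load the file, then report each eval's average score
 * and total token usage from its traces. The output filename is an
 * assumption; only the types come from this file.
 *
 * ```ts
 * import { readFile } from "node:fs/promises";
 * import type { Evalite } from "evalite";
 *
 * const output: Evalite.Exported.Output = JSON.parse(
 *   await readFile("./evalite-output.json", "utf8"),
 * );
 *
 * for (const evaluation of output.evals) {
 *   const tokens = evaluation.results
 *     .flatMap((result) => result.traces)
 *     .reduce((sum, trace) => sum + (trace.totalTokens ?? 0), 0);
 *   console.log(
 *     `${evaluation.name}: score ${(evaluation.averageScore * 100).toFixed(1)}%, ` +
 *       `${tokens} tokens, status ${evaluation.status}`,
 *   );
 * }
 * ```
 */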