UNPKG

evalz

Version:

Model-graded evals with TypeScript

171 lines (164 loc) 6.07 kB
import { z } from 'zod';
import OpenAI from 'openai';

/**
 * Zod schema for a single evaluation data item.
 * Only `completion` is required; `prompt`, `expectedCompletion`,
 * `contexts`, and `groundTruth` are optional depending on the evaluator type.
 */
declare const BaseEvaluationDataItemSchema: z.ZodObject<{
    prompt: z.ZodOptional<z.ZodString>;
    completion: z.ZodString;
    expectedCompletion: z.ZodOptional<z.ZodString>;
    contexts: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
    groundTruth: z.ZodOptional<z.ZodString>;
}, "strip", z.ZodTypeAny, {
    completion: string;
    prompt?: string | undefined;
    expectedCompletion?: string | undefined;
    contexts?: string[] | undefined;
    groundTruth?: string | undefined;
}, {
    completion: string;
    prompt?: string | undefined;
    expectedCompletion?: string | undefined;
    contexts?: string[] | undefined;
    groundTruth?: string | undefined;
}>;

/**
 * Schema for the items passed to an evaluator's `data` array.
 * Structurally identical to {@link BaseEvaluationDataItemSchema}
 * (kept as a separate declaration by the generator).
 */
declare const EvaluationDataItemSchema: z.ZodObject<{
    prompt: z.ZodOptional<z.ZodString>;
    completion: z.ZodString;
    expectedCompletion: z.ZodOptional<z.ZodString>;
    contexts: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
    groundTruth: z.ZodOptional<z.ZodString>;
}, "strip", z.ZodTypeAny, {
    completion: string;
    prompt?: string | undefined;
    expectedCompletion?: string | undefined;
    contexts?: string[] | undefined;
    groundTruth?: string | undefined;
}, {
    completion: string;
    prompt?: string | undefined;
    expectedCompletion?: string | undefined;
    contexts?: string[] | undefined;
    groundTruth?: string | undefined;
}>;

/**
 * Schema for a per-item evaluation result: an aggregate `score`,
 * optional per-evaluator `scores` (used by weighted evaluators),
 * and the original `item` that was evaluated.
 */
declare const EvaluationDataItemResultSchema: z.ZodObject<{
    score: z.ZodNumber;
    scores: z.ZodOptional<z.ZodArray<z.ZodObject<{
        score: z.ZodNumber;
        evaluator: z.ZodString;
        evaluatorType: z.ZodString;
    }, "strip", z.ZodTypeAny, {
        score: number;
        evaluator: string;
        evaluatorType: string;
    }, {
        score: number;
        evaluator: string;
        evaluatorType: string;
    }>, "many">>;
    item: z.ZodObject<{
        prompt: z.ZodOptional<z.ZodString>;
        completion: z.ZodString;
        expectedCompletion: z.ZodOptional<z.ZodString>;
        contexts: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
        groundTruth: z.ZodOptional<z.ZodString>;
    }, "strip", z.ZodTypeAny, {
        completion: string;
        prompt?: string | undefined;
        expectedCompletion?: string | undefined;
        contexts?: string[] | undefined;
        groundTruth?: string | undefined;
    }, {
        completion: string;
        prompt?: string | undefined;
        expectedCompletion?: string | undefined;
        contexts?: string[] | undefined;
        groundTruth?: string | undefined;
    }>;
}, "strip", z.ZodTypeAny, {
    score: number;
    item: {
        completion: string;
        prompt?: string | undefined;
        expectedCompletion?: string | undefined;
        contexts?: string[] | undefined;
        groundTruth?: string | undefined;
    };
    scores?: {
        score: number;
        evaluator: string;
        evaluatorType: string;
    }[] | undefined;
}, {
    score: number;
    item: {
        completion: string;
        prompt?: string | undefined;
        expectedCompletion?: string | undefined;
        contexts?: string[] | undefined;
        groundTruth?: string | undefined;
    };
    scores?: {
        score: number;
        evaluator: string;
        evaluatorType: string;
    }[] | undefined;
}>;

/** How an evaluator reports its results: a numeric score or binary counts. */
type ResultsType = "score" | "binary";

/** Aggregate counts for a binary (`true`/`false`) evaluation run. */
type BinaryResults = {
    trueCount: number;
    falseCount: number;
};

/**
 * Aggregate numeric score for a run; `individual` holds per-evaluator
 * averages (present for weighted evaluators).
 */
type AvgScoreResults = {
    value: number;
    individual?: Record<string, number>;
};

/** A single input item, inferred from {@link EvaluationDataItemSchema}. */
type EvaluationDataItem = z.infer<typeof EvaluationDataItemSchema>;

/** A single result, inferred from {@link EvaluationDataItemResultSchema}. */
type EvaluationDataItemResult = z.infer<typeof EvaluationDataItemResultSchema>;

/**
 * Response from running an evaluator: per-item `results` plus either
 * `scoreResults` or `binaryResults`, selected by the `T` type parameter.
 */
type EvaluationResponse<T extends ResultsType> = {
    results: EvaluationDataItemResult[];
} & (T extends "score" ? {
    scoreResults: AvgScoreResults;
} : {
    binaryResults: BinaryResults;
});

/** Parameters accepted when executing an evaluator. */
type ExecuteEvalParams = {
    data: EvaluationDataItem[];
};

/** Marker interface tagging an evaluator function with its kind. */
interface EvalFunction extends Function {
    evalType: "model-graded" | "accuracy" | `context-${ContextEvaluatorType}` | "weighted";
}

/** Callable shape of an evaluator: takes `data`, resolves to a response. */
type _Evaluator<T extends ResultsType> = ({ data }: ExecuteEvalParams) => Promise<EvaluationResponse<T>>;

/** An evaluator: the callable shape plus its `evalType` tag. */
interface Evaluator<T extends ResultsType> extends _Evaluator<T>, EvalFunction {
}

/** Supported context-evaluation strategies. */
type ContextEvaluatorType = "entities-recall" | "precision" | "recall" | "relevance";

/** A context evaluator always reports numeric scores. */
type ContextEvaluator = Evaluator<"score">;

/** An accuracy evaluator always reports numeric scores. */
type AccuracyEvaluator = Evaluator<"score">;

/**
 * Create a model-graded evaluator backed by an OpenAI client.
 *
 * @param resultsType - "score" (default behavior per `T`) or "binary".
 * @param evaluationDescription - Instructions describing what to grade.
 * @param model - OpenAI model id to grade with.
 * @param messages - Extra chat messages prepended to the grading prompt.
 * @param client - Configured OpenAI client instance.
 * @returns An evaluator producing results of the requested type.
 */
declare function createEvaluator<T extends ResultsType>({ resultsType, evaluationDescription, model, messages, client }: {
    resultsType?: T;
    evaluationDescription: string;
    model?: OpenAI.Model["id"];
    messages?: OpenAI.ChatCompletionMessageParam[];
    client: OpenAI;
}): Evaluator<T>;

/**
 * @name createWeightedEvaluator
 * @description
 * Create a weighted evaluator that combines the results of multiple evaluators
 * @param evaluators - A record of evaluators to combine
 * @param weights - A record of weights for each evaluator
 * @returns A weighted evaluator
 */
declare function createWeightedEvaluator({ evaluators, weights }: {
    evaluators: Record<string, Evaluator<"score"> | AccuracyEvaluator>;
    weights: Record<string, number>;
}): Evaluator<"score">;

/**
 * Create an accuracy evaluator comparing completions against expected values.
 *
 * @param model - Embedding model used for the semantic comparison.
 * @param weights - Relative weights of the factual and semantic components.
 * @returns A score-producing evaluator.
 */
declare function createAccuracyEvaluator({ model, weights }: {
    model?: OpenAI.Embeddings.EmbeddingCreateParams["model"];
    weights?: {
        factual: number;
        semantic: number;
    };
}): Evaluator<"score">;

/**
 * Create a context evaluator for the given strategy.
 *
 * @param type - Which context metric to compute.
 * @param model - Embedding model used by the metric.
 * @returns A score-producing context evaluator.
 */
declare function createContextEvaluator({ type, model }: {
    type: ContextEvaluatorType;
    model?: OpenAI.Embeddings.EmbeddingCreateParams["model"];
}): ContextEvaluator;

export { type AccuracyEvaluator, type AvgScoreResults, BaseEvaluationDataItemSchema, type BinaryResults, type ContextEvaluator, type ContextEvaluatorType, type EvaluationDataItem, type EvaluationDataItemResult, EvaluationDataItemResultSchema, EvaluationDataItemSchema, type EvaluationResponse, type Evaluator, type ExecuteEvalParams, type ResultsType, type _Evaluator, createAccuracyEvaluator, createContextEvaluator, createEvaluator, createWeightedEvaluator };