// evalz — model-graded evals with TypeScript (generated type declarations).
import { z } from 'zod';
import OpenAI from 'openai';
/**
 * Zod schema for one evaluation data item.
 *
 * Only `completion` is required; all other fields are optional:
 * - `prompt` — presumably the prompt that produced the completion (TODO confirm against implementation)
 * - `expectedCompletion` — a reference completion to compare against
 * - `contexts` — context strings, consumed by context evaluators
 * - `groundTruth` — reference answer text
 *
 * NOTE(review): the two object-shape type arguments after `"strip"` are the
 * parsed (output) and raw (input) shapes; they are identical here because the
 * schema applies no transforms.
 */
declare const BaseEvaluationDataItemSchema: z.ZodObject<{
prompt: z.ZodOptional<z.ZodString>;
completion: z.ZodString;
expectedCompletion: z.ZodOptional<z.ZodString>;
contexts: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
groundTruth: z.ZodOptional<z.ZodString>;
}, "strip", z.ZodTypeAny, {
completion: string;
prompt?: string | undefined;
expectedCompletion?: string | undefined;
contexts?: string[] | undefined;
groundTruth?: string | undefined;
}, {
completion: string;
prompt?: string | undefined;
expectedCompletion?: string | undefined;
contexts?: string[] | undefined;
groundTruth?: string | undefined;
}>;
/**
 * Zod schema for the items passed to evaluators (see `ExecuteEvalParams`).
 *
 * NOTE(review): structurally identical to `BaseEvaluationDataItemSchema` in
 * this declaration output — the distinction (if any) lives in the
 * implementation, not in the types visible here.
 */
declare const EvaluationDataItemSchema: z.ZodObject<{
prompt: z.ZodOptional<z.ZodString>;
completion: z.ZodString;
expectedCompletion: z.ZodOptional<z.ZodString>;
contexts: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
groundTruth: z.ZodOptional<z.ZodString>;
}, "strip", z.ZodTypeAny, {
completion: string;
prompt?: string | undefined;
expectedCompletion?: string | undefined;
contexts?: string[] | undefined;
groundTruth?: string | undefined;
}, {
completion: string;
prompt?: string | undefined;
expectedCompletion?: string | undefined;
contexts?: string[] | undefined;
groundTruth?: string | undefined;
}>;
/**
 * Zod schema for the per-item result of an evaluation run.
 *
 * - `score` — the item's overall numeric score
 * - `scores` — optional per-evaluator breakdown; each entry carries the score
 *   plus the evaluator's name and its `evaluatorType` tag (used by weighted
 *   evaluators, which combine several sub-evaluators)
 * - `item` — the original data item that was graded (same shape as
 *   `EvaluationDataItemSchema`)
 */
declare const EvaluationDataItemResultSchema: z.ZodObject<{
score: z.ZodNumber;
scores: z.ZodOptional<z.ZodArray<z.ZodObject<{
score: z.ZodNumber;
evaluator: z.ZodString;
evaluatorType: z.ZodString;
}, "strip", z.ZodTypeAny, {
score: number;
evaluator: string;
evaluatorType: string;
}, {
score: number;
evaluator: string;
evaluatorType: string;
}>, "many">>;
item: z.ZodObject<{
prompt: z.ZodOptional<z.ZodString>;
completion: z.ZodString;
expectedCompletion: z.ZodOptional<z.ZodString>;
contexts: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
groundTruth: z.ZodOptional<z.ZodString>;
}, "strip", z.ZodTypeAny, {
completion: string;
prompt?: string | undefined;
expectedCompletion?: string | undefined;
contexts?: string[] | undefined;
groundTruth?: string | undefined;
}, {
completion: string;
prompt?: string | undefined;
expectedCompletion?: string | undefined;
contexts?: string[] | undefined;
groundTruth?: string | undefined;
}>;
}, "strip", z.ZodTypeAny, {
score: number;
item: {
completion: string;
prompt?: string | undefined;
expectedCompletion?: string | undefined;
contexts?: string[] | undefined;
groundTruth?: string | undefined;
};
scores?: {
score: number;
evaluator: string;
evaluatorType: string;
}[] | undefined;
}, {
score: number;
item: {
completion: string;
prompt?: string | undefined;
expectedCompletion?: string | undefined;
contexts?: string[] | undefined;
groundTruth?: string | undefined;
};
scores?: {
score: number;
evaluator: string;
evaluatorType: string;
}[] | undefined;
}>;
/** Evaluator output mode: a numeric average ("score") or pass/fail counts ("binary"). */
type ResultsType = "score" | "binary";
/** Aggregate for binary-mode runs: how many items graded true vs. false. */
type BinaryResults = {
trueCount: number;
falseCount: number;
};
/**
 * Aggregate for score-mode runs: the overall `value`, plus an optional
 * per-evaluator breakdown keyed by evaluator name (present for weighted runs).
 */
type AvgScoreResults = {
value: number;
individual?: Record<string, number>;
};
/** Parsed form of a single evaluation data item (inferred from its zod schema). */
type EvaluationDataItem = z.infer<typeof EvaluationDataItemSchema>;
/** Parsed form of one graded item (inferred from the result zod schema). */
type EvaluationDataItemResult = z.infer<typeof EvaluationDataItemResultSchema>;
/**
 * Resolved value of an evaluator: per-item `results`, plus a mode-dependent
 * aggregate — `scoreResults` when `T` is "score", otherwise `binaryResults`.
 */
type EvaluationResponse<T extends ResultsType> = {
results: EvaluationDataItemResult[];
} & (T extends "score" ? {
scoreResults: AvgScoreResults;
} : {
binaryResults: BinaryResults;
});
/** Argument object accepted by every evaluator call: the dataset to grade. */
type ExecuteEvalParams = {
data: EvaluationDataItem[];
};
/**
 * Metadata carried on every evaluator function: a tag identifying how it grades.
 * NOTE(review): extending the broad built-in `Function` interface is normally
 * discouraged in hand-written TS; left untouched here because this is
 * generated declaration output and removing it could narrow the public type.
 */
interface EvalFunction extends Function {
evalType: "model-graded" | "accuracy" | `context-${ContextEvaluatorType}` | "weighted";
}
/** Call signature shared by all evaluators: grade `data`, resolve with the results. */
type _Evaluator<T extends ResultsType> = ({ data }: ExecuteEvalParams) => Promise<EvaluationResponse<T>>;
/** A callable evaluator: the call signature combined with its `evalType` tag. */
interface Evaluator<T extends ResultsType> extends _Evaluator<T>, EvalFunction {
}
/** Context-grading strategies accepted by {@link createContextEvaluator}. */
type ContextEvaluatorType = "entities-recall" | "precision" | "recall" | "relevance";
/** Context evaluators always produce numeric ("score") results. */
type ContextEvaluator = Evaluator<"score">;
/** Accuracy evaluators always produce numeric ("score") results. */
type AccuracyEvaluator = Evaluator<"score">;
/**
 * @name createEvaluator
 * @description
 * Create a model-graded evaluator that uses an OpenAI chat model to grade
 * completions according to `evaluationDescription`.
 * @param resultsType - Optional output mode, "score" or "binary"; determines
 *   which aggregate the response carries (default not visible here — TODO confirm)
 * @param evaluationDescription - Instructions telling the grading model what to judge
 * @param model - Optional OpenAI model id used for grading
 * @param messages - Optional extra chat messages included in the grading request
 * @param client - Configured OpenAI client used to issue requests
 * @returns An evaluator typed to the chosen results mode
 */
declare function createEvaluator<T extends ResultsType>({ resultsType, evaluationDescription, model, messages, client }: {
resultsType?: T;
evaluationDescription: string;
model?: OpenAI.Model["id"];
messages?: OpenAI.ChatCompletionMessageParam[];
client: OpenAI;
}): Evaluator<T>;
/**
 * @name createWeightedEvaluator
 * @description
 * Create a weighted evaluator that combines the results of multiple
 * score-producing evaluators into a single score.
 * @param evaluators - A record of evaluators to combine, keyed by name
 * @param weights - A record of weights for each evaluator
 *   (NOTE(review): presumably keyed identically to `evaluators` — the types do
 *   not enforce matching keys; verify against the implementation)
 * @returns A weighted evaluator producing a combined "score" result
 */
declare function createWeightedEvaluator({ evaluators, weights }: {
evaluators: Record<string, Evaluator<"score"> | AccuracyEvaluator>;
weights: Record<string, number>;
}): Evaluator<"score">;
/**
 * @name createAccuracyEvaluator
 * @description
 * Create an accuracy evaluator. Note that `model` is an OpenAI *embedding*
 * model id, so grading presumably compares completions via embeddings rather
 * than chat (TODO confirm in implementation).
 * @param model - Optional embedding model id
 * @param weights - Optional relative weights for the factual and semantic components
 * @returns A score-producing evaluator
 */
declare function createAccuracyEvaluator({ model, weights }: {
model?: OpenAI.Embeddings.EmbeddingCreateParams["model"];
weights?: {
factual: number;
semantic: number;
};
}): Evaluator<"score">;
/**
 * @name createContextEvaluator
 * @description
 * Create an evaluator that grades the `contexts` of each data item using the
 * chosen strategy (entities-recall, precision, recall, or relevance).
 * @param type - Which context metric to compute
 * @param model - Optional embedding model id
 * @returns A score-producing context evaluator
 */
declare function createContextEvaluator({ type, model }: {
type: ContextEvaluatorType;
model?: OpenAI.Embeddings.EmbeddingCreateParams["model"];
}): ContextEvaluator;
// Public API: zod schemas exported as values; everything else is type-only.
export { type AccuracyEvaluator, type AvgScoreResults, BaseEvaluationDataItemSchema, type BinaryResults, type ContextEvaluator, type ContextEvaluatorType, type EvaluationDataItem, type EvaluationDataItemResult, EvaluationDataItemResultSchema, EvaluationDataItemSchema, type EvaluationResponse, type Evaluator, type ExecuteEvalParams, type ResultsType, type _Evaluator, createAccuracyEvaluator, createContextEvaluator, createEvaluator, createWeightedEvaluator };