@genkit-ai/ai
Version:
Genkit AI framework generative AI APIs.
1 lines • 17.4 kB
Source Map (JSON)
{"version":3,"sources":["../src/evaluator.ts"],"sourcesContent":["/**\n * Copyright 2024 Google LLC\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\nimport { action, z, type Action } from '@genkit-ai/core';\nimport { logger } from '@genkit-ai/core/logging';\nimport type { Registry } from '@genkit-ai/core/registry';\nimport { toJsonSchema } from '@genkit-ai/core/schema';\nimport { SPAN_TYPE_ATTR, runInNewSpan } from '@genkit-ai/core/tracing';\nimport { randomUUID } from 'crypto';\n\nexport const ATTR_PREFIX = 'genkit';\nexport const SPAN_STATE_ATTR = ATTR_PREFIX + ':state';\n\nexport const BaseDataPointSchema = z.object({\n input: z.unknown(),\n output: z.unknown().optional(),\n context: z.array(z.unknown()).optional(),\n reference: z.unknown().optional(),\n testCaseId: z.string().optional(),\n traceIds: z.array(z.string()).optional(),\n});\n\n// DataPoint that is to be used for actions. This needs testCaseId to be present.\nexport const BaseEvalDataPointSchema = BaseDataPointSchema.extend({\n testCaseId: z.string(),\n});\nexport type BaseEvalDataPoint = z.infer<typeof BaseEvalDataPointSchema>;\n\nconst EvalStatusEnumSchema = z.enum(['UNKNOWN', 'PASS', 'FAIL']);\n\n/** Enum that indicates if an evaluation has passed or failed */\nexport enum EvalStatusEnum {\n UNKNOWN = 'UNKNOWN',\n PASS = 'PASS',\n FAIL = 'FAIL',\n}\n\nexport const ScoreSchema = z.object({\n id: z\n .string()\n .describe(\n 'Optional ID to differentiate different scores if applying in a single evaluation'\n )\n .optional(),\n score: z.union([z.number(), z.string(), z.boolean()]).optional(),\n status: EvalStatusEnumSchema.optional(),\n error: z.string().optional(),\n details: z\n .object({\n reasoning: z.string().optional(),\n })\n .passthrough()\n .optional(),\n});\n\n// Update genkit-tools/src/utils/evals.ts if you change this value\nexport const EVALUATOR_METADATA_KEY_DISPLAY_NAME = 'evaluatorDisplayName';\nexport const EVALUATOR_METADATA_KEY_DEFINITION = 'evaluatorDefinition';\nexport const EVALUATOR_METADATA_KEY_IS_BILLED = 'evaluatorIsBilled';\n\nexport type Score = z.infer<typeof ScoreSchema>;\nexport type BaseDataPoint = z.infer<typeof BaseDataPointSchema>;\nexport type Dataset<\n DataPoint extends typeof BaseDataPointSchema = typeof BaseDataPointSchema,\n> = Array<z.infer<DataPoint>>;\n\nexport const EvalResponseSchema = z.object({\n sampleIndex: z.number().optional(),\n testCaseId: z.string(),\n traceId: z.string().optional(),\n spanId: z.string().optional(),\n evaluation: z.union([ScoreSchema, z.array(ScoreSchema)]),\n});\nexport type EvalResponse = z.infer<typeof EvalResponseSchema>;\n\nexport const EvalResponsesSchema = z.array(EvalResponseSchema);\nexport type EvalResponses = z.infer<typeof EvalResponsesSchema>;\n\nexport type EvaluatorFn<\n EvalDataPoint extends\n typeof BaseEvalDataPointSchema = typeof BaseEvalDataPointSchema,\n CustomOptions extends z.ZodTypeAny = z.ZodTypeAny,\n> = (\n input: z.infer<EvalDataPoint>,\n evaluatorOptions?: z.infer<CustomOptions>\n) => Promise<EvalResponse>;\n\nexport type EvaluatorAction<\n DataPoint extends typeof BaseDataPointSchema = typeof BaseDataPointSchema,\n CustomOptions extends z.ZodTypeAny = z.ZodTypeAny,\n> = Action<typeof EvalRequestSchema, typeof EvalResponsesSchema> & {\n __dataPointType?: DataPoint;\n __configSchema?: CustomOptions;\n};\n\nfunction withMetadata<\n DataPoint extends typeof BaseDataPointSchema = typeof BaseDataPointSchema,\n CustomOptions extends z.ZodTypeAny = z.ZodTypeAny,\n>(\n evaluator: Action<typeof EvalRequestSchema, typeof EvalResponsesSchema>,\n dataPointType?: DataPoint,\n configSchema?: CustomOptions\n): EvaluatorAction<DataPoint, CustomOptions> {\n const withMeta = evaluator as EvaluatorAction<DataPoint, CustomOptions>;\n withMeta.__dataPointType = dataPointType;\n withMeta.__configSchema = configSchema;\n return withMeta;\n}\n\nconst EvalRequestSchema = z.object({\n dataset: z.array(BaseDataPointSchema),\n evalRunId: z.string(),\n options: z.unknown(),\n});\n\nexport interface EvaluatorParams<\n DataPoint extends typeof BaseDataPointSchema = typeof BaseDataPointSchema,\n CustomOptions extends z.ZodTypeAny = z.ZodTypeAny,\n> {\n evaluator: EvaluatorArgument<DataPoint, CustomOptions>;\n dataset: Dataset<DataPoint>;\n evalRunId?: string;\n options?: z.infer<CustomOptions>;\n}\n\nexport interface EvaluatorOptions<\n DataPoint extends typeof BaseDataPointSchema,\n EvaluatorOpts extends z.ZodTypeAny,\n> {\n name: string;\n displayName: string;\n definition: string;\n dataPointType?: DataPoint;\n configSchema?: EvaluatorOpts;\n isBilled?: boolean;\n}\n\n/**\n * Creates evaluator action for the provided {@link EvaluatorFn} implementation.\n */\nexport function defineEvaluator<\n DataPoint extends typeof BaseDataPointSchema = typeof BaseDataPointSchema,\n EvalDataPoint extends\n typeof BaseEvalDataPointSchema = typeof BaseEvalDataPointSchema,\n EvaluatorOpts extends z.ZodTypeAny = z.ZodTypeAny,\n>(\n registry: Registry,\n options: EvaluatorOptions<DataPoint, EvaluatorOpts>,\n runner: EvaluatorFn<EvalDataPoint, EvaluatorOpts>\n): EvaluatorAction {\n const e = evaluator(options, runner);\n\n registry.registerAction('evaluator', e);\n\n return e;\n}\n\n/**\n * Creates evaluator action for the provided {@link EvaluatorFn} implementation.\n */\nexport function evaluator<\n DataPoint extends typeof BaseDataPointSchema = typeof BaseDataPointSchema,\n EvalDataPoint extends\n typeof BaseEvalDataPointSchema = typeof BaseEvalDataPointSchema,\n EvaluatorOpts extends z.ZodTypeAny = z.ZodTypeAny,\n>(\n options: EvaluatorOptions<DataPoint, EvaluatorOpts>,\n runner: EvaluatorFn<EvalDataPoint, EvaluatorOpts>\n): EvaluatorAction {\n const evalMetadata = {};\n evalMetadata[EVALUATOR_METADATA_KEY_IS_BILLED] =\n options.isBilled == undefined ? true : options.isBilled;\n evalMetadata[EVALUATOR_METADATA_KEY_DISPLAY_NAME] = options.displayName;\n evalMetadata[EVALUATOR_METADATA_KEY_DEFINITION] = options.definition;\n if (options.configSchema) {\n evalMetadata['customOptions'] = toJsonSchema({\n schema: options.configSchema,\n });\n }\n const evaluator = action(\n {\n actionType: 'evaluator',\n name: options.name,\n inputSchema: EvalRequestSchema.extend({\n dataset: options.dataPointType\n ? z.array(options.dataPointType)\n : z.array(BaseDataPointSchema),\n options: options.configSchema ?? z.unknown(),\n evalRunId: z.string(),\n batchSize: z.number().optional(),\n }),\n outputSchema: EvalResponsesSchema,\n metadata: {\n type: 'evaluator',\n evaluator: evalMetadata,\n },\n },\n async (i) => {\n const evalResponses: EvalResponses = [];\n // This also populates missing testCaseIds\n const batches = getBatchedArray(i.dataset, i.batchSize);\n\n for (let batchIndex = 0; batchIndex < batches.length; batchIndex++) {\n const batch = batches[batchIndex];\n try {\n await runInNewSpan(\n {\n metadata: {\n name: i.batchSize\n ? `Batch ${batchIndex}`\n : `Test Case ${batch[0].testCaseId}`,\n metadata: { 'evaluator:evalRunId': i.evalRunId },\n },\n labels: {\n [SPAN_TYPE_ATTR]: 'evaluator',\n },\n },\n async (metadata, otSpan) => {\n const spanId = otSpan.spanContext().spanId;\n const traceId = otSpan.spanContext().traceId;\n const evalRunPromises = batch.map((d, index) => {\n const sampleIndex = i.batchSize\n ? i.batchSize * batchIndex + index\n : batchIndex;\n const datapoint = d as BaseEvalDataPoint;\n metadata.input = {\n input: datapoint.input,\n output: datapoint.output,\n context: datapoint.context,\n };\n const evalOutputPromise = runner(datapoint, i.options)\n .then((result) => ({\n ...result,\n traceId,\n spanId,\n sampleIndex,\n }))\n .catch((error) => {\n return {\n sampleIndex,\n spanId,\n traceId,\n testCaseId: datapoint.testCaseId,\n evaluation: {\n error: `Evaluation of test case ${datapoint.testCaseId} failed: \\n${error}`,\n },\n };\n });\n return evalOutputPromise;\n });\n\n const allResults = await Promise.all(evalRunPromises);\n metadata.output =\n allResults.length === 1 ? allResults[0] : allResults;\n allResults.map((result) => {\n evalResponses.push(result);\n });\n }\n );\n } catch (e) {\n logger.error(\n `Evaluation of batch ${batchIndex} failed: \\n${(e as Error).stack}`\n );\n continue;\n }\n }\n return evalResponses;\n }\n );\n const ewm = withMetadata(\n evaluator as any as Action<\n typeof EvalRequestSchema,\n typeof EvalResponsesSchema\n >,\n options.dataPointType,\n options.configSchema\n );\n return ewm;\n}\n\nexport type EvaluatorArgument<\n DataPoint extends typeof BaseDataPointSchema = typeof BaseDataPointSchema,\n CustomOptions extends z.ZodTypeAny = z.ZodTypeAny,\n> =\n | string\n | EvaluatorAction<DataPoint, CustomOptions>\n | EvaluatorReference<CustomOptions>;\n\n/**\n * A veneer for interacting with evaluators.\n */\nexport async function evaluate<\n DataPoint extends typeof BaseDataPointSchema = typeof BaseDataPointSchema,\n CustomOptions extends z.ZodTypeAny = z.ZodTypeAny,\n>(\n registry: Registry,\n params: EvaluatorParams<DataPoint, CustomOptions>\n): Promise<EvalResponses> {\n let evaluator: EvaluatorAction<DataPoint, CustomOptions>;\n if (typeof params.evaluator === 'string') {\n evaluator = await registry.lookupAction(`/evaluator/${params.evaluator}`);\n } else if (Object.hasOwnProperty.call(params.evaluator, 'info')) {\n evaluator = await registry.lookupAction(\n `/evaluator/${params.evaluator.name}`\n );\n } else {\n evaluator = params.evaluator as EvaluatorAction<DataPoint, CustomOptions>;\n }\n if (!evaluator) {\n throw new Error('Unable to utilize the provided evaluator');\n }\n return (await evaluator({\n dataset: params.dataset,\n options: params.options,\n evalRunId: params.evalRunId ?? randomUUID(),\n })) as EvalResponses;\n}\n\nexport const EvaluatorInfoSchema = z.object({\n /** Friendly label for this evaluator */\n label: z.string().optional(),\n metrics: z.array(z.string()),\n});\nexport type EvaluatorInfo = z.infer<typeof EvaluatorInfoSchema>;\n\nexport interface EvaluatorReference<CustomOptions extends z.ZodTypeAny> {\n name: string;\n configSchema?: CustomOptions;\n info?: EvaluatorInfo;\n}\n\n/**\n * Helper method to configure a {@link EvaluatorReference} to a plugin.\n */\nexport function evaluatorRef<\n CustomOptionsSchema extends z.ZodTypeAny = z.ZodTypeAny,\n>(\n options: EvaluatorReference<CustomOptionsSchema>\n): EvaluatorReference<CustomOptionsSchema> {\n return { ...options };\n}\n\n/**\n * Helper method to generated batched array. Also ensures each testCase has a\n * testCaseId\n */\nfunction getBatchedArray<T extends { testCaseId?: string }>(\n arr: T[],\n batchSize?: number\n): T[][] {\n let size: number;\n if (!batchSize) {\n size = 1;\n } else {\n size = batchSize;\n }\n\n const batches: T[][] = [];\n for (var i = 0; i < arr.length; i += size) {\n batches.push(\n arr.slice(i, i + size).map((d) => ({\n ...d,\n testCaseId: d.testCaseId ?? randomUUID(),\n }))\n );\n }\n\n return batches;\n}\n"],"mappings":"AAgBA,SAAS,QAAQ,SAAsB;AACvC,SAAS,cAAc;AAEvB,SAAS,oBAAoB;AAC7B,SAAS,gBAAgB,oBAAoB;AAC7C,SAAS,kBAAkB;AAEpB,MAAM,cAAc;AACpB,MAAM,kBAAkB,cAAc;AAEtC,MAAM,sBAAsB,EAAE,OAAO;AAAA,EAC1C,OAAO,EAAE,QAAQ;AAAA,EACjB,QAAQ,EAAE,QAAQ,EAAE,SAAS;AAAA,EAC7B,SAAS,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,SAAS;AAAA,EACvC,WAAW,EAAE,QAAQ,EAAE,SAAS;AAAA,EAChC,YAAY,EAAE,OAAO,EAAE,SAAS;AAAA,EAChC,UAAU,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,SAAS;AACzC,CAAC;AAGM,MAAM,0BAA0B,oBAAoB,OAAO;AAAA,EAChE,YAAY,EAAE,OAAO;AACvB,CAAC;AAGD,MAAM,uBAAuB,EAAE,KAAK,CAAC,WAAW,QAAQ,MAAM,CAAC;AAGxD,IAAK,iBAAL,kBAAKA,oBAAL;AACL,EAAAA,gBAAA,aAAU;AACV,EAAAA,gBAAA,UAAO;AACP,EAAAA,gBAAA,UAAO;AAHG,SAAAA;AAAA,GAAA;AAML,MAAM,cAAc,EAAE,OAAO;AAAA,EAClC,IAAI,EACD,OAAO,EACP;AAAA,IACC;AAAA,EACF,EACC,SAAS;AAAA,EACZ,OAAO,EAAE,MAAM,CAAC,EAAE,OAAO,GAAG,EAAE,OAAO,GAAG,EAAE,QAAQ,CAAC,CAAC,EAAE,SAAS;AAAA,EAC/D,QAAQ,qBAAqB,SAAS;AAAA,EACtC,OAAO,EAAE,OAAO,EAAE,SAAS;AAAA,EAC3B,SAAS,EACN,OAAO;AAAA,IACN,WAAW,EAAE,OAAO,EAAE,SAAS;AAAA,EACjC,CAAC,EACA,YAAY,EACZ,SAAS;AACd,CAAC;AAGM,MAAM,sCAAsC;AAC5C,MAAM,oCAAoC;AAC1C,MAAM,mCAAmC;AAQzC,MAAM,qBAAqB,EAAE,OAAO;AAAA,EACzC,aAAa,EAAE,OAAO,EAAE,SAAS;AAAA,EACjC,YAAY,EAAE,OAAO;AAAA,EACrB,SAAS,EAAE,OAAO,EAAE,SAAS;AAAA,EAC7B,QAAQ,EAAE,OAAO,EAAE,SAAS;AAAA,EAC5B,YAAY,EAAE,MAAM,CAAC,aAAa,EAAE,MAAM,WAAW,CAAC,CAAC;AACzD,CAAC;AAGM,MAAM,sBAAsB,EAAE,MAAM,kBAAkB;AAoB7D,SAAS,aAIPC,YACA,eACA,cAC2C;AAC3C,QAAM,WAAWA;AACjB,WAAS,kBAAkB;AAC3B,WAAS,iBAAiB;AAC1B,SAAO;AACT;AAEA,MAAM,oBAAoB,EAAE,OAAO;AAAA,EACjC,SAAS,EAAE,MAAM,mBAAmB;AAAA,EACpC,WAAW,EAAE,OAAO;AAAA,EACpB,SAAS,EAAE,QAAQ;AACrB,CAAC;AA2BM,SAAS,gBAMd,UACA,SACA,QACiB;AACjB,QAAM,IAAI,UAAU,SAAS,MAAM;AAEnC,WAAS,eAAe,aAAa,CAAC;AAEtC,SAAO;AACT;AAKO,SAAS,UAMd,SACA,QACiB;AACjB,QAAM,eAAe,CAAC;AACtB,eAAa,gCAAgC,IAC3C,QAAQ,YAAY,SAAY,OAAO,QAAQ;AACjD,eAAa,mCAAmC,IAAI,QAAQ;AAC5D,eAAa,iCAAiC,IAAI,QAAQ;AAC1D,MAAI,QAAQ,cAAc;AACxB,iBAAa,eAAe,IAAI,aAAa;AAAA,MAC3C,QAAQ,QAAQ;AAAA,IAClB,CAAC;AAAA,EACH;AACA,QAAMA,aAAY;AAAA,IAChB;AAAA,MACE,YAAY;AAAA,MACZ,MAAM,QAAQ;AAAA,MACd,aAAa,kBAAkB,OAAO;AAAA,QACpC,SAAS,QAAQ,gBACb,EAAE,MAAM,QAAQ,aAAa,IAC7B,EAAE,MAAM,mBAAmB;AAAA,QAC/B,SAAS,QAAQ,gBAAgB,EAAE,QAAQ;AAAA,QAC3C,WAAW,EAAE,OAAO;AAAA,QACpB,WAAW,EAAE,OAAO,EAAE,SAAS;AAAA,MACjC,CAAC;AAAA,MACD,cAAc;AAAA,MACd,UAAU;AAAA,QACR,MAAM;AAAA,QACN,WAAW;AAAA,MACb;AAAA,IACF;AAAA,IACA,OAAO,MAAM;AACX,YAAM,gBAA+B,CAAC;AAEtC,YAAM,UAAU,gBAAgB,EAAE,SAAS,EAAE,SAAS;AAEtD,eAAS,aAAa,GAAG,aAAa,QAAQ,QAAQ,cAAc;AAClE,cAAM,QAAQ,QAAQ,UAAU;AAChC,YAAI;AACF,gBAAM;AAAA,YACJ;AAAA,cACE,UAAU;AAAA,gBACR,MAAM,EAAE,YACJ,SAAS,UAAU,KACnB,aAAa,MAAM,CAAC,EAAE,UAAU;AAAA,gBACpC,UAAU,EAAE,uBAAuB,EAAE,UAAU;AAAA,cACjD;AAAA,cACA,QAAQ;AAAA,gBACN,CAAC,cAAc,GAAG;AAAA,cACpB;AAAA,YACF;AAAA,YACA,OAAO,UAAU,WAAW;AAC1B,oBAAM,SAAS,OAAO,YAAY,EAAE;AACpC,oBAAM,UAAU,OAAO,YAAY,EAAE;AACrC,oBAAM,kBAAkB,MAAM,IAAI,CAAC,GAAG,UAAU;AAC9C,sBAAM,cAAc,EAAE,YAClB,EAAE,YAAY,aAAa,QAC3B;AACJ,sBAAM,YAAY;AAClB,yBAAS,QAAQ;AAAA,kBACf,OAAO,UAAU;AAAA,kBACjB,QAAQ,UAAU;AAAA,kBAClB,SAAS,UAAU;AAAA,gBACrB;AACA,sBAAM,oBAAoB,OAAO,WAAW,EAAE,OAAO,EAClD,KAAK,CAAC,YAAY;AAAA,kBACjB,GAAG;AAAA,kBACH;AAAA,kBACA;AAAA,kBACA;AAAA,gBACF,EAAE,EACD,MAAM,CAAC,UAAU;AAChB,yBAAO;AAAA,oBACL;AAAA,oBACA;AAAA,oBACA;AAAA,oBACA,YAAY,UAAU;AAAA,oBACtB,YAAY;AAAA,sBACV,OAAO,2BAA2B,UAAU,UAAU;AAAA,EAAc,KAAK;AAAA,oBAC3E;AAAA,kBACF;AAAA,gBACF,CAAC;AACH,uBAAO;AAAA,cACT,CAAC;AAED,oBAAM,aAAa,MAAM,QAAQ,IAAI,eAAe;AACpD,uBAAS,SACP,WAAW,WAAW,IAAI,WAAW,CAAC,IAAI;AAC5C,yBAAW,IAAI,CAAC,WAAW;AACzB,8BAAc,KAAK,MAAM;AAAA,cAC3B,CAAC;AAAA,YACH;AAAA,UACF;AAAA,QACF,SAAS,GAAG;AACV,iBAAO;AAAA,YACL,uBAAuB,UAAU;AAAA,EAAe,EAAY,KAAK;AAAA,UACnE;AACA;AAAA,QACF;AAAA,MACF;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACA,QAAM,MAAM;AAAA,IACVA;AAAA,IAIA,QAAQ;AAAA,IACR,QAAQ;AAAA,EACV;AACA,SAAO;AACT;AAaA,eAAsB,SAIpB,UACA,QACwB;AACxB,MAAIA;AACJ,MAAI,OAAO,OAAO,cAAc,UAAU;AACxC,IAAAA,aAAY,MAAM,SAAS,aAAa,cAAc,OAAO,SAAS,EAAE;AAAA,EAC1E,WAAW,OAAO,eAAe,KAAK,OAAO,WAAW,MAAM,GAAG;AAC/D,IAAAA,aAAY,MAAM,SAAS;AAAA,MACzB,cAAc,OAAO,UAAU,IAAI;AAAA,IACrC;AAAA,EACF,OAAO;AACL,IAAAA,aAAY,OAAO;AAAA,EACrB;AACA,MAAI,CAACA,YAAW;AACd,UAAM,IAAI,MAAM,0CAA0C;AAAA,EAC5D;AACA,SAAQ,MAAMA,WAAU;AAAA,IACtB,SAAS,OAAO;AAAA,IAChB,SAAS,OAAO;AAAA,IAChB,WAAW,OAAO,aAAa,WAAW;AAAA,EAC5C,CAAC;AACH;AAEO,MAAM,sBAAsB,EAAE,OAAO;AAAA;AAAA,EAE1C,OAAO,EAAE,OAAO,EAAE,SAAS;AAAA,EAC3B,SAAS,EAAE,MAAM,EAAE,OAAO,CAAC;AAC7B,CAAC;AAYM,SAAS,aAGd,SACyC;AACzC,SAAO,EAAE,GAAG,QAAQ;AACtB;AAMA,SAAS,gBACP,KACA,WACO;AACP,MAAI;AACJ,MAAI,CAAC,WAAW;AACd,WAAO;AAAA,EACT,OAAO;AACL,WAAO;AAAA,EACT;AAEA,QAAM,UAAiB,CAAC;AACxB,WAAS,IAAI,GAAG,IAAI,IAAI,QAAQ,KAAK,MAAM;AACzC,YAAQ;AAAA,MACN,IAAI,MAAM,GAAG,IAAI,IAAI,EAAE,IAAI,CAAC,OAAO;AAAA,QACjC,GAAG;AAAA,QACH,YAAY,EAAE,cAAc,WAAW;AAAA,MACzC,EAAE;AAAA,IACJ;AAAA,EACF;AAEA,SAAO;AACT;","names":["EvalStatusEnum","evaluator"]}