UNPKG

@genkit-ai/ai

Version:

Genkit AI framework generative AI APIs.

181 lines 5.89 kB
import { defineAction, z } from "@genkit-ai/core";
import { logger } from "@genkit-ai/core/logging";
import { SPAN_TYPE_ATTR, runInNewSpan } from "@genkit-ai/core/tracing";
import { randomUUID } from "crypto";

/** Prefix used when building attribute names in this module. */
const ATTR_PREFIX = "genkit";
/** Attribute name built from the prefix above: "genkit:state". */
const SPAN_STATE_ATTR = `${ATTR_PREFIX}:state`;

/**
 * Schema for one dataset entry handed to an evaluator. Every field other
 * than `input` is explicitly marked optional.
 */
const BaseDataPointSchema = z.object({
  input: z.unknown(),
  output: z.unknown().optional(),
  context: z.array(z.unknown()).optional(),
  reference: z.unknown().optional(),
  testCaseId: z.string().optional(),
  traceIds: z.array(z.string()).optional()
});

/** Same as BaseDataPointSchema, except `testCaseId` is a non-optional string. */
const BaseEvalDataPointSchema = BaseDataPointSchema.extend({
  testCaseId: z.string()
});

/** Schema for a single score produced by an evaluator; all fields are optional. */
const ScoreSchema = z.object({
  id: z.string().describe(
    "Optional ID to differentiate different scores if applying in a single evaluation"
  ).optional(),
  score: z.union([z.number(), z.string(), z.boolean()]).optional(),
  // TODO: use StatusSchema
  error: z.string().optional(),
  // `passthrough()` lets `details` carry extra keys besides `reasoning`.
  details: z.object({ reasoning: z.string().optional() }).passthrough().optional()
});

// Keys under which defineEvaluator stores its options in the action metadata.
const EVALUATOR_METADATA_KEY_DISPLAY_NAME = "evaluatorDisplayName";
const EVALUATOR_METADATA_KEY_DEFINITION = "evaluatorDefinition";
const EVALUATOR_METADATA_KEY_IS_BILLED = "evaluatorIsBilled";

/**
 * Schema for the result of evaluating one test case. `evaluation` is either
 * a single score or an array of scores.
 */
const EvalResponseSchema = z.object({
  sampleIndex: z.number().optional(),
  testCaseId: z.string(),
  traceId: z.string().optional(),
  spanId: z.string().optional(),
  evaluation: z.union([ScoreSchema, z.array(ScoreSchema)])
});

/** Schema for the full output of an evaluator action: one entry per response. */
const EvalResponsesSchema = z.array(EvalResponseSchema);

/**
 * Attaches the data point type and config schema to an evaluator action.
 *
 * The evaluator is mutated in place (`__dataPointType`, `__configSchema`)
 * and the same object is returned.
 *
 * @param evaluator - The action to annotate.
 * @param dataPointType - Stored as `__dataPointType`.
 * @param configSchema - Stored as `__configSchema`.
 * @returns The same `evaluator` object.
 */
function withMetadata(evaluator, dataPointType, configSchema) {
  const withMeta = evaluator;
  withMeta.__dataPointType = dataPointType;
  withMeta.__configSchema = configSchema;
  return withMeta;
}

/** Base input schema of an evaluator action; defineEvaluator extends it. */
const EvalRequestSchema = z.object({
  dataset: z.array(BaseDataPointSchema),
  evalRunId: z.string(),
  options: z.unknown()
});

/**
 * Renders a caught value for inclusion in an error message.
 *
 * `throw` accepts any value, so the caught value may lack a `stack` or may
 * even be null/undefined; in those cases fall back to its string form
 * instead of printing "undefined" or failing on the property access.
 *
 * @param e - The caught value.
 * @returns `e.stack` when it is set, otherwise `String(e)`.
 */
function describeError(e) {
  return e?.stack ?? String(e);
}

/**
 * Defines an "evaluator" action on the registry.
 *
 * When invoked, the action walks the dataset sequentially, in order. For
 * each entry it opens a new span, calls `runner(datapoint, options)` and
 * collects the response. A failing entry is recorded as an error response
 * and logged; the remaining entries are still evaluated.
 *
 * @param registry - Passed through to `defineAction` and `runInNewSpan`.
 * @param options - Reads `name`, `displayName`, `definition`, `isBilled`
 *   (treated as `true` when null/undefined), `dataPointType` and
 *   `configSchema`.
 * @param runner - Async function called as `runner(datapoint, options)`;
 *   its result is annotated with `sampleIndex`, `spanId` and `traceId`.
 * @returns The defined action, annotated via `withMetadata`.
 */
function defineEvaluator(registry, options, runner) {
  const metadata = {
    [EVALUATOR_METADATA_KEY_IS_BILLED]: options.isBilled ?? true,
    [EVALUATOR_METADATA_KEY_DISPLAY_NAME]: options.displayName,
    [EVALUATOR_METADATA_KEY_DEFINITION]: options.definition
  };
  const evaluator = defineAction(
    registry,
    {
      actionType: "evaluator",
      name: options.name,
      inputSchema: EvalRequestSchema.extend({
        dataset: options.dataPointType
          ? z.array(options.dataPointType)
          : z.array(BaseDataPointSchema),
        options: options.configSchema ?? z.unknown(),
        evalRunId: z.string()
      }),
      outputSchema: EvalResponsesSchema,
      metadata
    },
    async (request) => {
      const evalResponses = [];
      for (let index = 0; index < request.dataset.length; index++) {
        // Every test case needs an id; generate one when the caller gave none.
        const datapoint = {
          ...request.dataset[index],
          testCaseId: request.dataset[index].testCaseId ?? randomUUID()
        };
        try {
          await runInNewSpan(
            registry,
            {
              metadata: {
                name: `Test Case ${datapoint.testCaseId}`,
                metadata: { "evaluator:evalRunId": request.evalRunId }
              },
              labels: { [SPAN_TYPE_ATTR]: "evaluator" }
            },
            async (spanMetadata, otSpan) => {
              const { spanId, traceId } = otSpan.spanContext();
              try {
                spanMetadata.input = {
                  input: datapoint.input,
                  output: datapoint.output,
                  context: datapoint.context
                };
                const testCaseOutput = await runner(datapoint, request.options);
                testCaseOutput.sampleIndex = index;
                testCaseOutput.spanId = spanId;
                testCaseOutput.traceId = traceId;
                spanMetadata.output = testCaseOutput;
                evalResponses.push(testCaseOutput);
                return testCaseOutput;
              } catch (e) {
                // Record the failure as this test case's response, then
                // rethrow so the error also reaches runInNewSpan and the
                // outer handler below.
                evalResponses.push({
                  sampleIndex: index,
                  spanId,
                  traceId,
                  testCaseId: datapoint.testCaseId,
                  evaluation: {
                    error: `Evaluation of test case ${datapoint.testCaseId} failed: ${describeError(e)}`
                  }
                });
                throw e;
              }
            }
          );
        } catch (e) {
          // Log and move on: one failing test case must not abort the run.
          logger.error(
            `Evaluation of test case ${datapoint.testCaseId} failed: ${describeError(e)}`
          );
        }
      }
      return evalResponses;
    }
  );
  return withMetadata(evaluator, options.dataPointType, options.configSchema);
}

/**
 * Runs an evaluator over a dataset.
 *
 * `params.evaluator` may be a name string, an object with an own `info`
 * property (looked up in the registry by its `name`), or anything else,
 * which is then called directly as the evaluator.
 *
 * @param registry - Used to look up `/evaluator/<name>` actions.
 * @param params - Reads `evaluator`, `dataset`, `options` and `evalRunId`
 *   (a random UUID is generated when `evalRunId` is null/undefined).
 * @returns Whatever the resolved evaluator returns.
 * @throws {Error} When no evaluator could be resolved.
 */
async function evaluate(registry, params) {
  let evaluator;
  if (typeof params.evaluator === "string") {
    evaluator = await registry.lookupAction(`/evaluator/${params.evaluator}`);
  } else if (Object.hasOwnProperty.call(params.evaluator, "info")) {
    evaluator = await registry.lookupAction(
      `/evaluator/${params.evaluator.name}`
    );
  } else {
    evaluator = params.evaluator;
  }
  if (!evaluator) {
    throw new Error("Unable to utilize the provided evaluator");
  }
  return await evaluator({
    dataset: params.dataset,
    options: params.options,
    evalRunId: params.evalRunId ?? randomUUID()
  });
}

/** Schema describing an evaluator: an optional label plus its metric names. */
const EvaluatorInfoSchema = z.object({
  /** Friendly label for this evaluator */
  label: z.string().optional(),
  metrics: z.array(z.string())
});

/**
 * Builds an evaluator reference.
 *
 * @param options - The reference fields.
 * @returns A shallow copy of `options`.
 */
function evaluatorRef(options) {
  return { ...options };
}

export {
  ATTR_PREFIX,
  BaseDataPointSchema,
  BaseEvalDataPointSchema,
  EVALUATOR_METADATA_KEY_DEFINITION,
  EVALUATOR_METADATA_KEY_DISPLAY_NAME,
  EVALUATOR_METADATA_KEY_IS_BILLED,
  EvalResponseSchema,
  EvalResponsesSchema,
  EvaluatorInfoSchema,
  SPAN_STATE_ATTR,
  ScoreSchema,
  defineEvaluator,
  evaluate,
  evaluatorRef
};
//# sourceMappingURL=evaluator.mjs.map