@genkit-ai/ai
Version:
Genkit AI framework generative AI APIs.
181 lines • 5.89 kB
JavaScript
import { defineAction, z } from "@genkit-ai/core";
import { logger } from "@genkit-ai/core/logging";
import { SPAN_TYPE_ATTR, runInNewSpan } from "@genkit-ai/core/tracing";
import { randomUUID } from "crypto";
const ATTR_PREFIX = "genkit";
const SPAN_STATE_ATTR = ATTR_PREFIX + ":state";
const BaseDataPointSchema = z.object({
input: z.unknown(),
output: z.unknown().optional(),
context: z.array(z.unknown()).optional(),
reference: z.unknown().optional(),
testCaseId: z.string().optional(),
traceIds: z.array(z.string()).optional()
});
const BaseEvalDataPointSchema = BaseDataPointSchema.extend({
testCaseId: z.string()
});
const ScoreSchema = z.object({
id: z.string().describe(
"Optional ID to differentiate different scores if applying in a single evaluation"
).optional(),
score: z.union([z.number(), z.string(), z.boolean()]).optional(),
// TODO: use StatusSchema
error: z.string().optional(),
details: z.object({
reasoning: z.string().optional()
}).passthrough().optional()
});
const EVALUATOR_METADATA_KEY_DISPLAY_NAME = "evaluatorDisplayName";
const EVALUATOR_METADATA_KEY_DEFINITION = "evaluatorDefinition";
const EVALUATOR_METADATA_KEY_IS_BILLED = "evaluatorIsBilled";
const EvalResponseSchema = z.object({
sampleIndex: z.number().optional(),
testCaseId: z.string(),
traceId: z.string().optional(),
spanId: z.string().optional(),
evaluation: z.union([ScoreSchema, z.array(ScoreSchema)])
});
const EvalResponsesSchema = z.array(EvalResponseSchema);
function withMetadata(evaluator, dataPointType, configSchema) {
const withMeta = evaluator;
withMeta.__dataPointType = dataPointType;
withMeta.__configSchema = configSchema;
return withMeta;
}
const EvalRequestSchema = z.object({
dataset: z.array(BaseDataPointSchema),
evalRunId: z.string(),
options: z.unknown()
});
function defineEvaluator(registry, options, runner) {
const metadata = {};
metadata[EVALUATOR_METADATA_KEY_IS_BILLED] = options.isBilled == void 0 ? true : options.isBilled;
metadata[EVALUATOR_METADATA_KEY_DISPLAY_NAME] = options.displayName;
metadata[EVALUATOR_METADATA_KEY_DEFINITION] = options.definition;
const evaluator = defineAction(
registry,
{
actionType: "evaluator",
name: options.name,
inputSchema: EvalRequestSchema.extend({
dataset: options.dataPointType ? z.array(options.dataPointType) : z.array(BaseDataPointSchema),
options: options.configSchema ?? z.unknown(),
evalRunId: z.string()
}),
outputSchema: EvalResponsesSchema,
metadata
},
async (i) => {
let evalResponses = [];
for (let index = 0; index < i.dataset.length; index++) {
const datapoint = {
...i.dataset[index],
testCaseId: i.dataset[index].testCaseId ?? randomUUID()
};
try {
await runInNewSpan(
registry,
{
metadata: {
name: `Test Case ${datapoint.testCaseId}`,
metadata: { "evaluator:evalRunId": i.evalRunId }
},
labels: {
[SPAN_TYPE_ATTR]: "evaluator"
}
},
async (metadata2, otSpan) => {
const spanId = otSpan.spanContext().spanId;
const traceId = otSpan.spanContext().traceId;
try {
metadata2.input = {
input: datapoint.input,
output: datapoint.output,
context: datapoint.context
};
const testCaseOutput = await runner(datapoint, i.options);
testCaseOutput.sampleIndex = index;
testCaseOutput.spanId = spanId;
testCaseOutput.traceId = traceId;
metadata2.output = testCaseOutput;
evalResponses.push(testCaseOutput);
return testCaseOutput;
} catch (e) {
evalResponses.push({
sampleIndex: index,
spanId,
traceId,
testCaseId: datapoint.testCaseId,
evaluation: {
error: `Evaluation of test case ${datapoint.testCaseId} failed:
${e.stack}`
}
});
throw e;
}
}
);
} catch (e) {
logger.error(
`Evaluation of test case ${datapoint.testCaseId} failed:
${e.stack}`
);
continue;
}
}
return evalResponses;
}
);
const ewm = withMetadata(
evaluator,
options.dataPointType,
options.configSchema
);
return ewm;
}
async function evaluate(registry, params) {
let evaluator;
if (typeof params.evaluator === "string") {
evaluator = await registry.lookupAction(`/evaluator/${params.evaluator}`);
} else if (Object.hasOwnProperty.call(params.evaluator, "info")) {
evaluator = await registry.lookupAction(
`/evaluator/${params.evaluator.name}`
);
} else {
evaluator = params.evaluator;
}
if (!evaluator) {
throw new Error("Unable to utilize the provided evaluator");
}
return await evaluator({
dataset: params.dataset,
options: params.options,
evalRunId: params.evalRunId ?? randomUUID()
});
}
const EvaluatorInfoSchema = z.object({
/** Friendly label for this evaluator */
label: z.string().optional(),
metrics: z.array(z.string())
});
function evaluatorRef(options) {
return { ...options };
}
export {
ATTR_PREFIX,
BaseDataPointSchema,
BaseEvalDataPointSchema,
EVALUATOR_METADATA_KEY_DEFINITION,
EVALUATOR_METADATA_KEY_DISPLAY_NAME,
EVALUATOR_METADATA_KEY_IS_BILLED,
EvalResponseSchema,
EvalResponsesSchema,
EvaluatorInfoSchema,
SPAN_STATE_ATTR,
ScoreSchema,
defineEvaluator,
evaluate,
evaluatorRef
};
//# sourceMappingURL=evaluator.mjs.map