mcp-evals
GitHub Action for evaluating MCP server tool calls using LLM-based scoring
JavaScript
import { Experimental_StdioMCPTransport } from "ai/mcp-stdio";
import { experimental_createMCPClient, streamText } from "ai";
import { openai } from "@ai-sdk/openai";

// Default model used when callers do not supply one.
const defaultModel = openai("gpt-4o");

export async function runEvals(model = defaultModel, prompt, serverPath) {
    // Spawn the MCP server as a child process over stdio, forwarding only the
    // defined entries of the current environment.
    const transport = new Experimental_StdioMCPTransport({
        command: "tsx",
        args: [serverPath],
        env: Object.fromEntries(Object.entries(process.env).filter(([_, v]) => v !== undefined)),
    });
    const client = await experimental_createMCPClient({
        transport,
    });
    // Expose the server's tools to the model.
    const tools = await client.tools();
    try {
        const result = streamText({
            model,
            tools,
            system: "You are an assistant responsible for evaluating the results of calling various tools. Given the user's query, use the tools available to you to answer the question.",
            prompt,
            maxRetries: 1,
            maxSteps: 10,
            onError: ({ error }) => {
                console.error(error);
            },
        });
        // Accumulate the streamed text deltas into the final answer.
        let fullText = '';
        for await (const chunk of result.fullStream) {
            if (chunk.type === 'text-delta') {
                fullText += chunk.textDelta;
            }
        }
        return fullText;
    }
    catch (error) {
        console.error('Error in runEvals:', error);
        throw error;
    }
}
export async function grade(model = defaultModel, prompt, serverPath) {
    // Use the provided serverPath, falling back to the CLI argument.
    const finalServerPath = serverPath || process.argv[3];
    if (!finalServerPath) {
        throw new Error('Server path not provided');
    }
    // First, have the model answer the prompt using the MCP server's tools.
    const result = await runEvals(model, prompt, finalServerPath);
    const evalSystemPrompt = `You are an expert evaluator assessing how well an LLM answers a given question. Review the provided answer and score it from 1 to 5 in each of the following categories:
Accuracy – Does the answer contain factual errors or hallucinations?
Completeness – Does the answer fully address all parts of the question?
Relevance – Is the information directly related to the question?
Clarity – Is the explanation easy to understand and well-structured?
Reasoning – Does the answer show logical thinking or provide evidence or rationale?
Return your evaluation as a JSON object in the format:
{
"accuracy": 1-5,
"completeness": 1-5,
"relevance": 1-5,
"clarity": 1-5,
"reasoning": 1-5,
"overall_comments": "A short paragraph summarizing the strengths and weaknesses of the answer."
}`;
    const evalPrompt = `Here is the user input: ${prompt}
Here is the LLM's answer: ${result}`;
    // Then score the answer with the same model, this time without tools.
    const evalResult = streamText({
        model,
        maxRetries: 1,
        maxSteps: 10,
        system: evalSystemPrompt,
        prompt: evalPrompt,
        onError: ({ error }) => {
            console.error(error);
        },
    });
    // Drain the stream so the aggregated text promise resolves.
    for await (const _ of evalResult.fullStream) {
    }
    return await evalResult.text;
}
export async function runAllEvals(config, serverPath) {
    const results = new Map();
    let transport;
    try {
        // Spawn the MCP server as a child process over stdio.
        transport = new Experimental_StdioMCPTransport({
            command: "tsx",
            args: [serverPath],
            env: Object.fromEntries(Object.entries(process.env).filter(([_, v]) => v !== undefined)),
        });
        const client = await experimental_createMCPClient({
            transport,
        });
        // Run each configured evaluation, recording its result (or error) under its name.
        for (const evaluation of config.evals) {
            console.log(`Running ${evaluation.name}...`);
            try {
                const result = await evaluation.run(config.model);
                results.set(evaluation.name, result);
            }
            catch (error) {
                console.error(`Error running ${evaluation.name}:`, error);
                results.set(evaluation.name, { error: error instanceof Error ? error.message : String(error) });
            }
        }
        return results;
    }
    finally {
        // Clean up the transport once all evaluations have finished.
        if (transport) {
            await transport.close?.();
        }
    }
}
// Export everything needed by consumers
export * from './types.js';
export { metrics } from './metrics.js';
//# sourceMappingURL=index.js.map
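
Usage (illustrative sketch, not part of the file above): the import specifier "mcp-evals", the model choice, the prompt, and "./weather-server.ts" are placeholders for whatever your project actually uses.

// usage-sketch.mjs
import { openai } from "@ai-sdk/openai";
import { grade, runAllEvals } from "mcp-evals";

// Grade a single prompt: runEvals answers it using the server's tools,
// then a second, tool-free pass scores that answer.
const report = await grade(openai("gpt-4o"), "What is the weather in London?", "./weather-server.ts");
console.log(report); // raw grader output, expected to be the JSON scores object

// Or run a named batch: each entry supplies run(model), and results are keyed by name.
const results = await runAllEvals({
    model: openai("gpt-4o"),
    evals: [
        {
            name: "weather-eval",
            run: (model) => grade(model, "What is the weather in London?", "./weather-server.ts"),
        },
    ],
}, "./weather-server.ts");
for (const [name, result] of results) {
    console.log(name, result); // either the grader output or { error: "..." }
}

The two-pass design keeps scoring separate from answering: the first pass has the MCP server's tools attached, while the grading pass sees only the question and the finished answer.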