agentrails
Version:
Safeguard your AI agents - keep them grounded and on the rails
472 lines (458 loc) • 18.6 kB
HTML
<html lang="en">
<head>
<title>Code coverage report for evaluator.ts</title>
<meta charset="utf-8" />
<link rel="stylesheet" href="prettify.css" />
<link rel="stylesheet" href="base.css" />
<link rel="shortcut icon" type="image/x-icon" href="favicon.png" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<style type='text/css'>
.coverage-summary .sorter {
background-image: url(sort-arrow-sprite.png);
}
</style>
</head>
<body>
<div class='wrapper'>
<div class='pad1'>
<h1><a href="index.html">All files</a> evaluator.ts</h1>
<div class='clearfix'>
<div class='fl pad1y space-right2'>
<span class="strong">100% </span>
<span class="quiet">Statements</span>
<span class='fraction'>28/28</span>
</div>
<div class='fl pad1y space-right2'>
<span class="strong">82.35% </span>
<span class="quiet">Branches</span>
<span class='fraction'>14/17</span>
</div>
<div class='fl pad1y space-right2'>
<span class="strong">100% </span>
<span class="quiet">Functions</span>
<span class='fraction'>5/5</span>
</div>
<div class='fl pad1y space-right2'>
<span class="strong">100% </span>
<span class="quiet">Lines</span>
<span class='fraction'>28/28</span>
</div>
</div>
<p class="quiet">
Press <em>n</em> or <em>j</em> to go to the next uncovered block, <em>b</em>, <em>p</em> or <em>k</em> for the previous block.
</p>
<template id="filterTemplate">
<div class="quiet">
Filter:
<input type="search" id="fileSearch">
</div>
</template>
</div>
<div class='status-line high'></div>
<pre><table class="coverage">
<tr><td class="line-count quiet"><a name='L1'></a><a href='#L1'>1</a>
<a name='L2'></a><a href='#L2'>2</a>
<a name='L3'></a><a href='#L3'>3</a>
<a name='L4'></a><a href='#L4'>4</a>
<a name='L5'></a><a href='#L5'>5</a>
<a name='L6'></a><a href='#L6'>6</a>
<a name='L7'></a><a href='#L7'>7</a>
<a name='L8'></a><a href='#L8'>8</a>
<a name='L9'></a><a href='#L9'>9</a>
<a name='L10'></a><a href='#L10'>10</a>
<a name='L11'></a><a href='#L11'>11</a>
<a name='L12'></a><a href='#L12'>12</a>
<a name='L13'></a><a href='#L13'>13</a>
<a name='L14'></a><a href='#L14'>14</a>
<a name='L15'></a><a href='#L15'>15</a>
<a name='L16'></a><a href='#L16'>16</a>
<a name='L17'></a><a href='#L17'>17</a>
<a name='L18'></a><a href='#L18'>18</a>
<a name='L19'></a><a href='#L19'>19</a>
<a name='L20'></a><a href='#L20'>20</a>
<a name='L21'></a><a href='#L21'>21</a>
<a name='L22'></a><a href='#L22'>22</a>
<a name='L23'></a><a href='#L23'>23</a>
<a name='L24'></a><a href='#L24'>24</a>
<a name='L25'></a><a href='#L25'>25</a>
<a name='L26'></a><a href='#L26'>26</a>
<a name='L27'></a><a href='#L27'>27</a>
<a name='L28'></a><a href='#L28'>28</a>
<a name='L29'></a><a href='#L29'>29</a>
<a name='L30'></a><a href='#L30'>30</a>
<a name='L31'></a><a href='#L31'>31</a>
<a name='L32'></a><a href='#L32'>32</a>
<a name='L33'></a><a href='#L33'>33</a>
<a name='L34'></a><a href='#L34'>34</a>
<a name='L35'></a><a href='#L35'>35</a>
<a name='L36'></a><a href='#L36'>36</a>
<a name='L37'></a><a href='#L37'>37</a>
<a name='L38'></a><a href='#L38'>38</a>
<a name='L39'></a><a href='#L39'>39</a>
<a name='L40'></a><a href='#L40'>40</a>
<a name='L41'></a><a href='#L41'>41</a>
<a name='L42'></a><a href='#L42'>42</a>
<a name='L43'></a><a href='#L43'>43</a>
<a name='L44'></a><a href='#L44'>44</a>
<a name='L45'></a><a href='#L45'>45</a>
<a name='L46'></a><a href='#L46'>46</a>
<a name='L47'></a><a href='#L47'>47</a>
<a name='L48'></a><a href='#L48'>48</a>
<a name='L49'></a><a href='#L49'>49</a>
<a name='L50'></a><a href='#L50'>50</a>
<a name='L51'></a><a href='#L51'>51</a>
<a name='L52'></a><a href='#L52'>52</a>
<a name='L53'></a><a href='#L53'>53</a>
<a name='L54'></a><a href='#L54'>54</a>
<a name='L55'></a><a href='#L55'>55</a>
<a name='L56'></a><a href='#L56'>56</a>
<a name='L57'></a><a href='#L57'>57</a>
<a name='L58'></a><a href='#L58'>58</a>
<a name='L59'></a><a href='#L59'>59</a>
<a name='L60'></a><a href='#L60'>60</a>
<a name='L61'></a><a href='#L61'>61</a>
<a name='L62'></a><a href='#L62'>62</a>
<a name='L63'></a><a href='#L63'>63</a>
<a name='L64'></a><a href='#L64'>64</a>
<a name='L65'></a><a href='#L65'>65</a>
<a name='L66'></a><a href='#L66'>66</a>
<a name='L67'></a><a href='#L67'>67</a>
<a name='L68'></a><a href='#L68'>68</a>
<a name='L69'></a><a href='#L69'>69</a>
<a name='L70'></a><a href='#L70'>70</a>
<a name='L71'></a><a href='#L71'>71</a>
<a name='L72'></a><a href='#L72'>72</a>
<a name='L73'></a><a href='#L73'>73</a>
<a name='L74'></a><a href='#L74'>74</a>
<a name='L75'></a><a href='#L75'>75</a>
<a name='L76'></a><a href='#L76'>76</a>
<a name='L77'></a><a href='#L77'>77</a>
<a name='L78'></a><a href='#L78'>78</a>
<a name='L79'></a><a href='#L79'>79</a>
<a name='L80'></a><a href='#L80'>80</a>
<a name='L81'></a><a href='#L81'>81</a>
<a name='L82'></a><a href='#L82'>82</a>
<a name='L83'></a><a href='#L83'>83</a>
<a name='L84'></a><a href='#L84'>84</a>
<a name='L85'></a><a href='#L85'>85</a>
<a name='L86'></a><a href='#L86'>86</a>
<a name='L87'></a><a href='#L87'>87</a>
<a name='L88'></a><a href='#L88'>88</a>
<a name='L89'></a><a href='#L89'>89</a>
<a name='L90'></a><a href='#L90'>90</a>
<a name='L91'></a><a href='#L91'>91</a>
<a name='L92'></a><a href='#L92'>92</a>
<a name='L93'></a><a href='#L93'>93</a>
<a name='L94'></a><a href='#L94'>94</a>
<a name='L95'></a><a href='#L95'>95</a>
<a name='L96'></a><a href='#L96'>96</a>
<a name='L97'></a><a href='#L97'>97</a>
<a name='L98'></a><a href='#L98'>98</a>
<a name='L99'></a><a href='#L99'>99</a>
<a name='L100'></a><a href='#L100'>100</a>
<a name='L101'></a><a href='#L101'>101</a>
<a name='L102'></a><a href='#L102'>102</a>
<a name='L103'></a><a href='#L103'>103</a>
<a name='L104'></a><a href='#L104'>104</a>
<a name='L105'></a><a href='#L105'>105</a>
<a name='L106'></a><a href='#L106'>106</a>
<a name='L107'></a><a href='#L107'>107</a>
<a name='L108'></a><a href='#L108'>108</a>
<a name='L109'></a><a href='#L109'>109</a>
<a name='L110'></a><a href='#L110'>110</a>
<a name='L111'></a><a href='#L111'>111</a>
<a name='L112'></a><a href='#L112'>112</a>
<a name='L113'></a><a href='#L113'>113</a>
<a name='L114'></a><a href='#L114'>114</a>
<a name='L115'></a><a href='#L115'>115</a>
<a name='L116'></a><a href='#L116'>116</a>
<a name='L117'></a><a href='#L117'>117</a>
<a name='L118'></a><a href='#L118'>118</a>
<a name='L119'></a><a href='#L119'>119</a>
<a name='L120'></a><a href='#L120'>120</a>
<a name='L121'></a><a href='#L121'>121</a>
<a name='L122'></a><a href='#L122'>122</a>
<a name='L123'></a><a href='#L123'>123</a>
<a name='L124'></a><a href='#L124'>124</a>
<a name='L125'></a><a href='#L125'>125</a>
<a name='L126'></a><a href='#L126'>126</a>
<a name='L127'></a><a href='#L127'>127</a>
<a name='L128'></a><a href='#L128'>128</a>
<a name='L129'></a><a href='#L129'>129</a>
<a name='L130'></a><a href='#L130'>130</a></td><td class="line-coverage quiet"><span class="cline-any cline-yes">1x</span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-yes">1x</span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-yes">9x</span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-yes">9x</span>
<span class="cline-any cline-yes">9x</span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-yes">8x</span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-yes">8x</span>
<span class="cline-any cline-yes">8x</span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-yes">7x</span>
<span class="cline-any cline-yes">7x</span>
<span class="cline-any cline-yes">1x</span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-yes">6x</span>
<span class="cline-any cline-yes">6x</span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-yes">2x</span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-yes">8x</span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-yes">8x</span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-yes">8x</span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-yes">8x</span>
<span class="cline-any cline-yes">3x</span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-yes">8x</span>
<span class="cline-any cline-yes">2x</span>
<span class="cline-any cline-yes">4x</span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-yes">8x</span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-yes">8x</span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-yes">1x</span>
<span class="cline-any cline-yes">2x</span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-yes">1x</span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-yes">1x</span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span>
<span class="cline-any cline-neutral"> </span></td><td class="text"><pre class="prettyprint lang-js">import OpenAI from "openai";
import { LLMEvaluator, AgentLabConfig } from "./types";
/**
* OpenAI-based evaluator for agent responses
*/
export class OpenAIEvaluator implements LLMEvaluator {
private client: OpenAI;
private model: string;
private temperature: number;
constructor(config: AgentLabConfig) {
this.client = new OpenAI({
apiKey: config.llm.apiKey,
});
this.model = config.llm.model || <span class="branch-1 cbranch-no" title="branch not covered" >"gpt-4-turbo-preview";</span>
this.temperature = config.llm.temperature || <span class="branch-1 cbranch-no" title="branch not covered" >0.3;</span>
}
async evaluate(
input: string | Record<string, any>,
actualResponse: string | Record<string, any>,
expectedBehavior?: string,
exampleResponses?: string[]
): Promise<{ passed: boolean; reasoning: string }> {
const prompt = this.buildEvaluationPrompt(
input,
actualResponse,
expectedBehavior,
exampleResponses
);
try {
const response = await this.client.chat.completions.create({
model: this.model,
temperature: this.temperature,
messages: [
{
role: "system",
content: `You are an expert evaluator for AI agent responses. Your job is to determine if an agent's response is appropriate given the input and expected behavior.
You should evaluate based on:
1. Correctness: Does the response address the input appropriately?
2. Relevance: Is the response relevant to what was asked?
3. Quality: Is the response well-formed and useful?
4. Alignment: Does it match the expected behavior and example responses (if provided)?
Respond in the following JSON format:
{
"passed": true/false,
"reasoning": "Brief explanation of your evaluation"
}`,
},
{
role: "user",
content: prompt,
},
],
});
const content = response.choices[0]?.message?.content;
if (!content) {
throw new Error("No response from LLM evaluator");
}
// Parse JSON response
const result = JSON.parse(content);
return {
passed: Boolean(result.passed),
reasoning: result.reasoning || <span class="branch-1 cbranch-no" title="branch not covered" >"No reasoning provided",</span>
};
} catch (error: any) {
throw new Error(`LLM evaluation failed: ${error.message}`);
}
}
private buildEvaluationPrompt(
input: string | Record<string, any>,
actualResponse: string | Record<string, any>,
expectedBehavior?: string,
exampleResponses?: string[]
): string {
const inputStr =
typeof input === "string" ? input : JSON.stringify(input, null, 2);
const responseStr =
typeof actualResponse === "string"
? actualResponse
: JSON.stringify(actualResponse, null, 2);
let prompt = `**Input:**
\`\`\`
${inputStr}
\`\`\`
**Actual Response:**
\`\`\`
${responseStr}
\`\`\`
`;
if (expectedBehavior) {
prompt += `\n**Expected Behavior:**
${expectedBehavior}
`;
}
if (exampleResponses && exampleResponses.length > 0) {
prompt += `\n**Example Appropriate Responses:**
${exampleResponses.map((ex, i) => `${i + 1}. ${ex}`).join("\n")}
`;
}
prompt += `\n\nPlease evaluate if the actual response is appropriate.`;
return prompt;
}
}
/**
* Factory function to create evaluator based on config
*/
export function createEvaluator(config: AgentLabConfig): LLMEvaluator {
switch (config.llm.provider) {
case "openai":
return new OpenAIEvaluator(config);
default:
throw new Error(`Unsupported LLM provider: ${config.llm.provider}`);
}
}
</pre></td></tr></table></pre>
<div class='push'></div><!-- for sticky footer -->
</div><!-- /wrapper -->
<div class='footer quiet pad2 space-top1 center small'>
Code coverage generated by
<a href="https://istanbul.js.org/" target="_blank" rel="noopener noreferrer">istanbul</a>
at 2025-10-06T16:35:20.912Z
</div>
<script src="prettify.js"></script>
<script>
window.onload = function () {
prettyPrint();
};
</script>
<script src="sorter.js"></script>
<script src="block-navigation.js"></script>
</body>
</html>