evalz
Model-graded evals with TypeScript
import D from"@instructor-ai/instructor";import I from"zod";var A="You are an AI evaluator tasked with scoring a language model's responses. You'll be presented with a 'prompt:' and 'response:' pair (and optionally an 'expectedResponse') and should evaluate based on the criteria provided in the subsequent system prompts. Provide only a numerical score in the range defined, not a descriptive response and no other prose.",_="Your task is to provide a numerical score ranging from 0 to 1 based on the criteria in the subsequent system prompts. The score should precisely reflect the performance of the language model's response. Do not provide any text explanation or feedback, only the numerical score.",N="Your task is to provide a binary score of either 0 or 1 based on the criteria in the subsequent system prompts. This should precisely reflect the language model's performance. Do not provide any text explanation or feedback, only a singular digit: 1 or 0.",C={score:_,binary:N};var $=I.object({score:I.number()});function W({resultsType:o="score",evaluationDescription:r,model:E,messages:g,client:a}){if(!r||typeof r!="string")throw new Error("Evaluation description was not provided.");let h=D({client:a,mode:"TOOLS"}),p=async({data:y})=>{let c=await Promise.all(y.map(async n=>{let{prompt:t,completion:l,expectedCompletion:s}=n;return{score:(await h.chat.completions.create({max_retries:3,model:E??"gpt-4-turbo",response_model:{schema:$,name:"Scoring"},messages:[{role:"system",content:A},{role:"system",content:C[o]},{role:"system",content:r},...g??[],{role:"system",content:`prompt: ${t}
completion: ${l}
${s?.length?`expectedCompletion: ${s}
`:" "}Please provide your score now:`}]})).score,item:n}})),e;if(o==="score"){let n=c.reduce((t,{score:l=0})=>t+l,0)/c.length;e={results:c,scoreResults:{value:n}}}if(o==="binary"){let n=c.reduce((t,{score:l})=>(l>=0?t.trueCount++:t.falseCount++,t),{trueCount:0,falseCount:0});e={results:c,binaryResults:n}}if(!e)throw new Error("No result object was created");return e};return p.evalType="model-graded",p}function G({evaluators:o,weights:r}){if(Object.values(r).reduce((a,h)=>a+h,0)!==1)throw new Error("The sum of weights must be 1");if(Object.keys(r).length!==Object.keys(o).length||!Object.keys(r).every(a=>a in o))throw new Error("Each evaluator must have a corresponding weight and vice versa.");let g=async({data:a})=>{let p=(await Promise.all(a.map(async e=>{let{prompt:n="",completion:t,expectedCompletion:l="",contexts:s=[],groundTruth:m=""}=e,i=(await Promise.all(Object.keys(o).map(async u=>{let x=o[u],f=x.evalType==="accuracy",T=x.evalType==="model-graded",P=x.evalType?.startsWith("context-");f?console.log(`Evaluating ${u} with accuracy`):T?console.log(`Evaluating ${u} with model-graded`):P&&console.log(`Evaluating ${u} with ${x.evalType}`);try{let w=f?await x({data:[{completion:t,expectedCompletion:l}]}):await x({data:[{prompt:n,completion:t,expectedCompletion:l,contexts:s,groundTruth:m}]});return w?.scoreResults?.value!==void 0?{score:w?.scoreResults?.value,evaluator:u,evaluatorType:x.evalType}:void 0}catch(w){console.error(`Error evaluating ${u}:`,w);return}}))).filter(u=>u!==void 0);return i.length===0?(console.warn("No valid results for",e),{score:NaN,scores:[],item:e}):{score:Object.keys(r).reduce((u,x,f)=>u+r[x]*(i?.[f]?.score??0),0),scores:i,item:e}}))).filter(e=>!isNaN(e.score)),y=p.length>0?p.reduce((e,{score:n=0})=>e+n,0)/p.length:0,c=Object.keys(o).reduce((e,n)=>{let t=p.map(s=>s.scores.find(m=>m.evaluator===n)?.score??0),l=t.reduce((s,m)=>s+m,0)/t.length;return e[n]=l,e},{});return{results:p.map(e=>({...e,score:e.score})),scoreResults:{value:y,individual:c}}};return g.evalType="weighted",g}import{z as v}from"zod";var M=v.object({prompt:v.string().optional(),completion:v.string(),expectedCompletion:v.string().optional(),contexts:v.array(v.string()).optional(),groundTruth:v.string().optional()}),j=M,J=v.object({score:v.number(),scores:v.array(v.object({score:v.number(),evaluator:v.string(),evaluatorType:v.string()})).optional(),item:j});import{distance as Y}from"fastest-levenshtein";import k from"openai";function S(o,r){return o.reduce((E,g,a)=>E+g*r[a],0)}function R(o,r){let E=S(o,r),g=Math.sqrt(S(o,o)),a=Math.sqrt(S(r,r));return g&&a?E/(g*a):0}function re({model:o,weights:r={factual:.5,semantic:.5}}){let E=async({data:g})=>{let a=new k({apiKey:process.env.OPENAI_API_KEY}),p=(await Promise.all(g.map(async c=>{let{completion:e,expectedCompletion:n}=c;if(!e||!n){console.warn("Completion or expected completion is missing.");return}try{let l=1-Y(e,n)/Math.max(e.length,n.length),[s,m]=await Promise.all([a.embeddings.create({input:[e],model:o??"text-embedding-ada-002"}),a.embeddings.create({input:[n],model:o??"text-embedding-ada-002"})]),b=R(s.data[0].embedding,m.data[0].embedding),i=r.factual*l+r.semantic*b;return{item:{completion:e,expectedCompletion:n},score:i}}catch(t){console.error("Error in accuracy evaluation:",t);return}}))).filter(c=>c!==void 0),y=p.length>0?p.reduce((c,{score:e})=>c+e,0)/p.length:0;return{results:p,scoreResults:{value:y}}};return E.evalType="accuracy",E}import{distance as q}from"fastest-levenshtein";import U from"openai";function O(o){return 
o.match(/\b[A-Z][a-z]*\b/g)||[]}function ce({type:o,model:r="text-embedding-3-small"}){let E=async({data:g})=>{let a=new U({apiKey:process.env.OPENAI_API_KEY}),h=await Promise.all(g.map(async y=>{let{prompt:c,contexts:e=[],groundTruth:n="",completion:t}=y,l=0;switch(o){case"entities-recall":{if(!t)throw new Error("Completion is required for entities-recall evaluation.");let s=O(t);l=e.flatMap(i=>O(i)).filter(i=>s.includes(i)).length/s.length;break}case"precision":{if(!t)throw new Error("Completion is required for precision evaluation.");let s=c?`${c} ${t}`:t,m=await a.embeddings.create({input:[s],model:r}),b=n?await a.embeddings.create({input:[n],model:r}):null,i=await Promise.all(e.map(f=>a.embeddings.create({input:[f],model:r}))),d=0,u=0;l=i.map(f=>{let T=R(m.data[0].embedding,f.data[0].embedding),P=b?R(b.data[0].embedding,f.data[0].embedding):0;return Math.max(T,P)>.5?d+=1:u+=1,d/(d+u)}).reduce((f,T)=>f+T,0)/e.length;break}case"recall":{if(!t)throw new Error("Completion is required for recall evaluation.");let s=t.split(".").map(i=>i.trim()).filter(Boolean),m=e.flatMap(i=>i.split(".").map(d=>d.trim())),b=s.filter(i=>m.some(d=>q(i,d)<Math.max(i.length,d.length)*.5));l=s.length>0?b.length/s.length:0;break}case"relevance":{if(!t)throw new Error("Completion is required for relevance evaluation.");let s=c&&t?`${c} ${t}`:t,m=await a.embeddings.create({input:[s],model:r}),i=(await Promise.all(e.map(d=>a.embeddings.create({input:[d],model:r})))).map(d=>R(m.data[0].embedding,d.data[0].embedding));l=i.reduce((d,u)=>d+u,0)/i.length;break}default:throw new Error(`Unsupported evaluation type: ${o}`)}return{item:y,score:l}})),p=h.reduce((y,{score:c})=>y+c,0)/h.length;return{results:h,scoreResults:{value:p}}};return E.evalType=`context-${o}`,E}export{M as BaseEvaluationDataItemSchema,J as EvaluationDataItemResultSchema,j as EvaluationDataItemSchema,re as createAccuracyEvaluator,ce as createContextEvaluator,W as createEvaluator,G as createWeightedEvaluator};
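The exported factories can be used as below. A minimal model-graded sketch, assuming the package is imported under its npm name and OPENAI_API_KEY is set; the evaluation description, model choice, and data are illustrative, not defaults:

import OpenAI from "openai";
import { createEvaluator } from "evalz";

const oai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

// Model-graded scoring: an LLM returns a 0-1 score per item (resultsType defaults to "score").
const relevanceEval = createEvaluator({
  client: oai,
  model: "gpt-4-turbo",
  evaluationDescription: "Rate the relevance of the completion to the prompt from 0 to 1."
});

const { scoreResults } = await relevanceEval({
  data: [{ prompt: "What is the capital of France?", completion: "Paris is the capital of France." }]
});

console.log(scoreResults.value); // mean score across all items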
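Evaluators compose via createWeightedEvaluator; a sketch with illustrative evaluator names, weights, and data. The weights must sum to 1 or the factory throws, and createAccuracyEvaluator reads OPENAI_API_KEY from the environment:

import { createAccuracyEvaluator, createContextEvaluator, createWeightedEvaluator } from "evalz";

// Distance-based: blends Levenshtein ("factual") and embedding ("semantic") similarity.
const accuracyEval = createAccuracyEvaluator({ weights: { factual: 0.5, semantic: 0.5 } });

// RAG-style: mean similarity of the retrieved contexts to prompt + completion.
const contextRelevanceEval = createContextEvaluator({ type: "relevance" });

const combined = createWeightedEvaluator({
  evaluators: { accuracy: accuracyEval, relevance: contextRelevanceEval },
  weights: { accuracy: 0.6, relevance: 0.4 } // must sum to 1
});

const report = await combined({
  data: [
    {
      prompt: "Define HTTP.",
      completion: "HTTP is the protocol used to transfer hypertext documents on the web.",
      expectedCompletion: "HTTP is an application-layer protocol for transmitting hypermedia documents.",
      contexts: ["HTTP (Hypertext Transfer Protocol) is an application-layer protocol for the web."]
    }
  ]
});

console.log(report.scoreResults.value);      // weighted overall score
console.log(report.scoreResults.individual); // per-evaluator averages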
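The context evaluators cover the four types handled by the switch above: "entities-recall", "precision", "recall", and "relevance". A sketch of "recall" (the share of completion sentences attributable to the contexts, matched by Levenshtein distance) with illustrative data:

import { createContextEvaluator } from "evalz";

const recallEval = createContextEvaluator({ type: "recall" });

const { scoreResults } = await recallEval({
  data: [
    {
      prompt: "Where is the Eiffel Tower?",
      completion: "The Eiffel Tower is in Paris. It stands on the Champ de Mars.",
      contexts: ["The Eiffel Tower is located on the Champ de Mars in Paris, France."],
      groundTruth: "The Eiffel Tower is in Paris, France."
    }
  ]
});

console.log(scoreResults.value);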