UNPKG

evalz

Version:

Model graded evals with typescript

5 lines 9.07 kB
"use strict";Object.defineProperty(exports, "__esModule", {value: true}); function _interopRequireDefault(obj) { return obj && obj.__esModule ? obj : { default: obj }; } function _nullishCoalesce(lhs, rhsFn) { if (lhs != null) { return lhs; } else { return rhsFn(); } } function _optionalChain(ops) { let lastAccessLHS = undefined; let value = ops[0]; let i = 1; while (i < ops.length) { const op = ops[i]; const fn = ops[i + 1]; i += 2; if ((op === 'optionalAccess' || op === 'optionalCall') && value == null) { return undefined; } if (op === 'access' || op === 'optionalAccess') { lastAccessLHS = value; value = fn(value); } else if (op === 'call' || op === 'optionalCall') { value = fn((...args) => value.call(lastAccessLHS, ...args)); lastAccessLHS = undefined; } } return value; }var _instructor = require('@instructor-ai/instructor'); var _instructor2 = _interopRequireDefault(_instructor);var _zod = require('zod'); var _zod2 = _interopRequireDefault(_zod);var A="You are an AI evaluator tasked with scoring a language model's responses. You'll be presented with a 'prompt:' and 'response:' pair (and optionally an 'expectedResponse') and should evaluate based on the criteria provided in the subsequent system prompts. Provide only a numerical score in the range defined, not a descriptive response and no other prose.",_="Your task is to provide a numerical score ranging from 0 to 1 based on the criteria in the subsequent system prompts. The score should precisely reflect the performance of the language model's response. Do not provide any text explanation or feedback, only the numerical score.",N="Your task is to provide a binary score of either 0 or 1 based on the criteria in the subsequent system prompts. This should precisely reflect the language model's performance. Do not provide any text explanation or feedback, only a singular digit: 1 or 0.",C={score:_,binary:N};var $=_zod2.default.object({score:_zod2.default.number()});function W({resultsType:o="score",evaluationDescription:r,model:E,messages:g,client:a}){if(!r||typeof r!="string")throw new Error("Evaluation description was not provided.");let h=_instructor2.default.call(void 0, {client:a,mode:"TOOLS"}),p=async({data:y})=>{let c=await Promise.all(y.map(async n=>{let{prompt:t,completion:l,expectedCompletion:s}=n;return{score:(await h.chat.completions.create({max_retries:3,model:_nullishCoalesce(E, () => ("gpt-4-turbo")),response_model:{schema:$,name:"Scoring"},messages:[{role:"system",content:A},{role:"system",content:C[o]},{role:"system",content:r},..._nullishCoalesce(g, () => ([])),{role:"system",content:`prompt: ${t} completion: ${l} ${_optionalChain([s, 'optionalAccess', _2 => _2.length])?`expectedCompletion: ${s} `:" "}Please provide your score now:`}]})).score,item:n}})),e;if(o==="score"){let n=c.reduce((t,{score:l=0})=>t+l,0)/c.length;e={results:c,scoreResults:{value:n}}}if(o==="binary"){let n=c.reduce((t,{score:l})=>(l>=0?t.trueCount++:t.falseCount++,t),{trueCount:0,falseCount:0});e={results:c,binaryResults:n}}if(!e)throw new Error("No result object was created");return e};return p.evalType="model-graded",p}function G({evaluators:o,weights:r}){if(Object.values(r).reduce((a,h)=>a+h,0)!==1)throw new Error("The sum of weights must be 1");if(Object.keys(r).length!==Object.keys(o).length||!Object.keys(r).every(a=>a in o))throw new Error("Each evaluator must have a corresponding weight and vice versa.");let g=async({data:a})=>{let p=(await Promise.all(a.map(async e=>{let{prompt:n="",completion:t,expectedCompletion:l="",contexts:s=[],groundTruth:m=""}=e,i=(await Promise.all(Object.keys(o).map(async u=>{let x=o[u],f=x.evalType==="accuracy",T=x.evalType==="model-graded",P=_optionalChain([x, 'access', _3 => _3.evalType, 'optionalAccess', _4 => _4.startsWith, 'call', _5 => _5("context-")]);f?console.log(`Evaluating ${u} with accuracy`):T?console.log(`Evaluating ${u} with model-graded`):P&&console.log(`Evaluating ${u} with ${x.evalType}`);try{let w=f?await x({data:[{completion:t,expectedCompletion:l}]}):await x({data:[{prompt:n,completion:t,expectedCompletion:l,contexts:s,groundTruth:m}]});return _optionalChain([w, 'optionalAccess', _6 => _6.scoreResults, 'optionalAccess', _7 => _7.value])!==void 0?{score:_optionalChain([w, 'optionalAccess', _8 => _8.scoreResults, 'optionalAccess', _9 => _9.value]),evaluator:u,evaluatorType:x.evalType}:void 0}catch(w){console.error(`Error evaluating ${u}:`,w);return}}))).filter(u=>u!==void 0);return i.length===0?(console.warn("No valid results for",e),{score:NaN,scores:[],item:e}):{score:Object.keys(r).reduce((u,x,f)=>u+r[x]*(_nullishCoalesce(_optionalChain([i, 'optionalAccess', _10 => _10[f], 'optionalAccess', _11 => _11.score]), () => (0))),0),scores:i,item:e}}))).filter(e=>!isNaN(e.score)),y=p.length>0?p.reduce((e,{score:n=0})=>e+n,0)/p.length:0,c=Object.keys(o).reduce((e,n)=>{let t=p.map(s=>_nullishCoalesce(_optionalChain([s, 'access', _12 => _12.scores, 'access', _13 => _13.find, 'call', _14 => _14(m=>m.evaluator===n), 'optionalAccess', _15 => _15.score]), () => (0))),l=t.reduce((s,m)=>s+m,0)/t.length;return e[n]=l,e},{});return{results:p.map(e=>({...e,score:e.score})),scoreResults:{value:y,individual:c}}};return g.evalType="weighted",g}var M=_zod.z.object({prompt:_zod.z.string().optional(),completion:_zod.z.string(),expectedCompletion:_zod.z.string().optional(),contexts:_zod.z.array(_zod.z.string()).optional(),groundTruth:_zod.z.string().optional()}),j= exports.EvaluationDataItemSchema =M,J= exports.EvaluationDataItemResultSchema =_zod.z.object({score:_zod.z.number(),scores:_zod.z.array(_zod.z.object({score:_zod.z.number(),evaluator:_zod.z.string(),evaluatorType:_zod.z.string()})).optional(),item:j});var _fastestlevenshtein = require('fastest-levenshtein');var _openai = require('openai'); var _openai2 = _interopRequireDefault(_openai);function S(o,r){return o.reduce((E,g,a)=>E+g*r[a],0)}function R(o,r){let E=S(o,r),g=Math.sqrt(S(o,o)),a=Math.sqrt(S(r,r));return g&&a?E/(g*a):0}function re({model:o,weights:r={factual:.5,semantic:.5}}){let E=async({data:g})=>{let a=new (0, _openai2.default)({apiKey:process.env.OPENAI_API_KEY}),p=(await Promise.all(g.map(async c=>{let{completion:e,expectedCompletion:n}=c;if(!e||!n){console.warn("Completion or expected completion is missing.");return}try{let l=1-_fastestlevenshtein.distance.call(void 0, e,n)/Math.max(e.length,n.length),[s,m]=await Promise.all([a.embeddings.create({input:[e],model:_nullishCoalesce(o, () => ("text-embedding-ada-002"))}),a.embeddings.create({input:[n],model:_nullishCoalesce(o, () => ("text-embedding-ada-002"))})]),b=R(s.data[0].embedding,m.data[0].embedding),i=r.factual*l+r.semantic*b;return{item:{completion:e,expectedCompletion:n},score:i}}catch(t){console.error("Error in accuracy evaluation:",t);return}}))).filter(c=>c!==void 0),y=p.length>0?p.reduce((c,{score:e})=>c+e,0)/p.length:0;return{results:p,scoreResults:{value:y}}};return E.evalType="accuracy",E}function O(o){return o.match(/\b[A-Z][a-z]*\b/g)||[]}function ce({type:o,model:r="text-embedding-3-small"}){let E=async({data:g})=>{let a=new (0, _openai2.default)({apiKey:process.env.OPENAI_API_KEY}),h=await Promise.all(g.map(async y=>{let{prompt:c,contexts:e=[],groundTruth:n="",completion:t}=y,l=0;switch(o){case"entities-recall":{if(!t)throw new Error("Completion is required for entities-recall evaluation.");let s=O(t);l=e.flatMap(i=>O(i)).filter(i=>s.includes(i)).length/s.length;break}case"precision":{if(!t)throw new Error("Completion is required for precision evaluation.");let s=c?`${c} ${t}`:t,m=await a.embeddings.create({input:[s],model:r}),b=n?await a.embeddings.create({input:[n],model:r}):null,i=await Promise.all(e.map(f=>a.embeddings.create({input:[f],model:r}))),d=0,u=0;l=i.map(f=>{let T=R(m.data[0].embedding,f.data[0].embedding),P=b?R(b.data[0].embedding,f.data[0].embedding):0;return Math.max(T,P)>.5?d+=1:u+=1,d/(d+u)}).reduce((f,T)=>f+T,0)/e.length;break}case"recall":{if(!t)throw new Error("Completion is required for recall evaluation.");let s=t.split(".").map(i=>i.trim()).filter(Boolean),m=e.flatMap(i=>i.split(".").map(d=>d.trim())),b=s.filter(i=>m.some(d=>_fastestlevenshtein.distance.call(void 0, i,d)<Math.max(i.length,d.length)*.5));l=s.length>0?b.length/s.length:0;break}case"relevance":{if(!t)throw new Error("Completion is required for relevance evaluation.");let s=c&&t?`${c} ${t}`:t,m=await a.embeddings.create({input:[s],model:r}),i=(await Promise.all(e.map(d=>a.embeddings.create({input:[d],model:r})))).map(d=>R(m.data[0].embedding,d.data[0].embedding));l=i.reduce((d,u)=>d+u,0)/i.length;break}default:throw new Error(`Unsupported evaluation type: ${o}`)}return{item:y,score:l}})),p=h.reduce((y,{score:c})=>y+c,0)/h.length;return{results:h,scoreResults:{value:p}}};return E.evalType=`context-${o}`,E}exports.BaseEvaluationDataItemSchema = M; exports.EvaluationDataItemResultSchema = J; exports.EvaluationDataItemSchema = j; exports.createAccuracyEvaluator = re; exports.createContextEvaluator = ce; exports.createEvaluator = W; exports.createWeightedEvaluator = G; //# sourceMappingURL=index.cjs.map