@dooor-ai/toolkit

Guards, Evals & Observability for AI applications - works seamlessly with LangChain/LangGraph

import { Eval } from "./base";
import { EvalResult, EvalConfig } from "../core/types";
import { getCortexDBClient, getGlobalProviderName } from "../observability/cortexdb-client";

export interface SummarizationConfig extends EvalConfig {
  /** Original text that was summarized */
  originalText?: string;
}

/**
 * SummarizationEval - Evaluates quality of text summarization
 *
 * Checks if summary captures key points, maintains accuracy, and is concise.
 *
 * Example:
 * ```typescript
 * const summarizationEval = new SummarizationEval({
 *   threshold: 0.8,
 *   originalText: "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France. It is named after the engineer Gustave Eiffel, whose company designed and built the tower from 1887 to 1889."
 * });
 * const result = await summarizationEval.evaluate(
 *   "Summarize this text.",
 *   "The Eiffel Tower in Paris was designed by Gustave Eiffel and built 1887-1889."
 * );
 * // result.score = 0.9 (good summary), result.passed = true
 * ```
 */
export class SummarizationEval extends Eval {
  private originalText?: string;

  constructor(config: SummarizationConfig = {}) {
    super(config);
    this.originalText = config.originalText;
  }

  get name(): string {
    return "SummarizationEval";
  }

  /**
   * Set original text dynamically
   */
  setOriginalText(text: string): void {
    this.originalText = text;
  }

  async evaluate(
    input: string,
    output: string,
    metadata?: Record<string, any>
  ): Promise<EvalResult> {
    const startTime = Date.now();

    // Resolve the source text: config takes precedence, then metadata.
    const original = this.originalText || metadata?.originalText || metadata?.sourceText;

    if (!original) {
      return {
        name: this.name,
        score: 0.5,
        passed: false,
        details: "No original text provided for summarization evaluation. Pass 'originalText' via config or metadata.",
        metadata: {
          latency: Date.now() - startTime,
        },
        timestamp: new Date(),
      };
    }

    try {
      const cortexClient = getCortexDBClient();
      const providerName = getGlobalProviderName();

      const prompt = this.buildPrompt(original, output);

      const response = await cortexClient.invokeAI({
        prompt,
        usage: "evaluation",
        providerName: providerName || undefined,
        temperature: 0.0,
        maxTokens: 300,
      });

      const score = this.parseScore(response.text);
      const passed = score >= this.getThreshold();

      return {
        name: this.name,
        score,
        passed,
        details: `Summarization quality score: ${score.toFixed(2)}. ${passed ? "PASSED" : "FAILED"} (threshold: ${this.getThreshold()})`,
        metadata: {
          latency: Date.now() - startTime,
          judgeResponse: response.text,
          originalLength: original.length,
          summaryLength: output.length,
          compressionRatio: (output.length / original.length).toFixed(2),
        },
        timestamp: new Date(),
      };
    } catch (error) {
      console.error("SummarizationEval failed:", error);
      return {
        name: this.name,
        score: 0.5,
        passed: false,
        details: `Eval failed: ${error instanceof Error ? error.message : "Unknown error"}`,
        metadata: {
          error: String(error),
          latency: Date.now() - startTime,
        },
        timestamp: new Date(),
      };
    }
  }

  private buildPrompt(originalText: string, summary: string): string {
    return `You are an expert evaluator. Your task is to evaluate the quality of a SUMMARY.

Original Text:
"""
${originalText}
"""

Summary:
"""
${summary}
"""

Evaluate summarization quality based on:
1. **Coverage**: Does it capture the main points?
2. **Accuracy**: Is it factually correct?
3. **Conciseness**: Is it appropriately brief?
4. **Coherence**: Is it well-structured and readable?

Score:
- 1.0 = Excellent summary, captures all key points accurately and concisely
- 0.7-0.9 = Good summary, minor omissions or verbosity
- 0.4-0.6 = Mediocre, misses important points or too verbose
- 0.0-0.3 = Poor, inaccurate or misses most key information

Output ONLY a JSON object in this exact format:
{
  "score": 0.85,
  "reasoning": "Assessment of coverage, accuracy, conciseness, and coherence"
}`;
  }

  private parseScore(response: string): number {
    try {
      // Preferred path: extract the JSON object from the judge response.
      const jsonMatch = response.match(/\{[\s\S]*\}/);
      if (jsonMatch) {
        const parsed = JSON.parse(jsonMatch[0]);
        if (typeof parsed.score === "number") {
          return Math.max(0, Math.min(1, parsed.score));
        }
      }

      // Fallback: grab a bare number in [0, 1] from the text.
      const numberMatch = response.match(/\b0?\.\d+\b|\b1\.0\b|\b[01]\b/);
      if (numberMatch) {
        return Math.max(0, Math.min(1, parseFloat(numberMatch[0])));
      }

      console.warn("Could not parse score from response:", response);
      return 0.5;
    } catch (error) {
      console.error("Error parsing score:", error);
      return 0.5;
    }
  }
}
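
For reference, here is a minimal usage sketch of the class above. It assumes that SummarizationEval is re-exported from the package root (the import path is a guess) and that the CortexDB client invoked internally by evaluate() has already been configured elsewhere in the application; neither is confirmed by this file alone.

```typescript
import { SummarizationEval } from "@dooor-ai/toolkit"; // assumed export path

async function checkSummary(): Promise<void> {
  // threshold and originalText come from SummarizationConfig (see above).
  const summarizationEval = new SummarizationEval({ threshold: 0.8 });

  // The source text can be set after construction...
  summarizationEval.setOriginalText(
    "The Eiffel Tower is a wrought-iron lattice tower in Paris, designed and built by Gustave Eiffel's company from 1887 to 1889."
  );

  const result = await summarizationEval.evaluate(
    "Summarize this text.",
    "The Eiffel Tower in Paris was built by Gustave Eiffel's company between 1887 and 1889."
  );

  // ...or supplied per call via metadata ({ originalText } or { sourceText }).
  console.log(result.score, result.passed, result.details);
  console.log(result.metadata?.compressionRatio);
}
```

Note that when no original text reaches evaluate(), the eval returns score 0.5 with passed: false rather than throwing, so callers should inspect result.details as well as result.passed.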