UNPKG

promptforge


Adaptive Prompt Intelligence & Orchestration SDK: manage, optimize, and serve prompts for LLMs with versioning, feedback loops, and multi-provider support.

evaluation.js (144 lines, 5.5 kB)
import { v4 as uuidv4 } from 'uuid';
import { LLMProvider, EvaluationResultSchema } from '../types.js';

export class EvaluationEngine {
    /**
     * Evaluate a prompt against a dataset or examples
     */
    async evaluate(request, forge) {
        const promptVersion = await forge.getPromptVersion(request.promptId, request.version);
        if (!promptVersion) {
            throw new Error('Prompt version not found');
        }
        // Get examples from dataset or request
        const examples = request.examples || [];
        if (examples.length === 0) {
            throw new Error('No examples provided for evaluation');
        }
        // Execute prompt for each example
        const exampleResults = [];
        let totalScore = 0;
        for (const example of examples) {
            try {
                const result = await forge.executePrompt({
                    promptName: request.promptId,
                    version: request.version,
                    input: example.input,
                    llmConfig: request.llmConfig,
                    useCache: false,
                    trackMetrics: true,
                });
                // Calculate score for this example
                const score = example.expectedOutput
                    ? this.calculateSimilarity(result.output, example.expectedOutput)
                    : 1.0;
                exampleResults.push({
                    input: example.input,
                    output: result.output,
                    expectedOutput: example.expectedOutput,
                    passed: score >= 0.7, // Threshold for passing
                    score,
                });
                totalScore += score;
            }
            catch (error) {
                exampleResults.push({
                    input: example.input,
                    output: '',
                    expectedOutput: example.expectedOutput,
                    passed: false,
                    score: 0,
                });
            }
        }
        const overallScore = exampleResults.length > 0 ? totalScore / exampleResults.length : 0;
        // Calculate metrics
        const metrics = this.calculateMetrics(exampleResults);
        const result = {
            id: uuidv4(),
            promptId: request.promptId,
            version: promptVersion.version,
            datasetId: request.datasetId || uuidv4(),
            provider: request.llmConfig?.provider || LLMProvider.OPENAI,
            model: request.llmConfig?.model || 'gpt-4o-mini',
            metrics,
            exampleResults,
            overallScore,
            timestamp: new Date(),
        };
        return EvaluationResultSchema.parse(result);
    }
    /**
     * Calculate evaluation metrics
     */
    calculateMetrics(results) {
        const passedCount = results.filter(r => r.passed).length;
        const accuracy = results.length > 0 ? passedCount / results.length : 0;
        // Calculate average semantic similarity
        const scoresWithExpected = results.filter(r => r.expectedOutput);
        const semanticSimilarity = scoresWithExpected.length > 0
            ? scoresWithExpected.reduce((sum, r) => sum + r.score, 0) / scoresWithExpected.length
            : undefined;
        // Calculate consistency (variance in scores)
        const scores = results.map(r => r.score);
        const avgScore = scores.reduce((sum, s) => sum + s, 0) / scores.length;
        const variance = scores.reduce((sum, s) => sum + Math.pow(s - avgScore, 2), 0) / scores.length;
        const consistency = 1 - Math.min(variance, 1); // Higher is more consistent
        return {
            accuracy,
            consistency,
            coherence: avgScore, // Simplified
            relevance: avgScore, // Simplified
            semanticSimilarity,
            // Other metrics would be calculated with actual NLP tools
        };
    }
    /**
     * Calculate similarity between two strings
     * Simple implementation - would use embeddings in production
     */
    calculateSimilarity(text1, text2) {
        const words1 = new Set(text1.toLowerCase().split(/\s+/));
        const words2 = new Set(text2.toLowerCase().split(/\s+/));
        const intersection = new Set([...words1].filter(w => words2.has(w)));
        const union = new Set([...words1, ...words2]);
        return union.size > 0 ? intersection.size / union.size : 0;
    }
    /**
     * Create evaluation dataset
     */
    async createDataset(name, examples) {
        return {
            id: uuidv4(),
            name,
            description: `Evaluation dataset: ${name}`,
            examples,
            createdAt: new Date(),
        };
    }
    /**
     * Compare two prompt versions
     */
    async compareVersions(promptId, version1, version2, examples, forge) {
        const eval1 = await this.evaluate({
            promptId,
            version: version1,
            examples,
        }, forge);
        const eval2 = await this.evaluate({
            promptId,
            version: version2,
            examples,
        }, forge);
        const improvement = eval2.overallScore - eval1.overallScore;
        const winner = eval2.overallScore > eval1.overallScore ? version2 : version1;
        return {
            version1Score: eval1.overallScore,
            version2Score: eval2.overallScore,
            winner,
            improvement,
        };
    }
}
//# sourceMappingURL=evaluation.js.map
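
For context, here is a minimal usage sketch of the class above. It is illustrative only: the import path, prompt id, versions, example data, and the setUpForge() helper are assumptions rather than part of this package's documented API, and `forge` simply stands in for a configured instance that exposes the getPromptVersion() and executePrompt() methods called by EvaluationEngine.

// Hypothetical usage sketch; names marked below are assumptions, not part of this file.
import { EvaluationEngine } from './evaluation.js'; // import path is illustrative

// `forge` must be a configured instance exposing getPromptVersion() and executePrompt();
// setUpForge() is a hypothetical stand-in for the SDK's own setup, which is not shown here.
const forge = await setUpForge();

const engine = new EvaluationEngine();

// Hand-written examples; each output is scored against expectedOutput with the
// Jaccard word-overlap similarity above, and scores >= 0.7 count as passed.
const examples = [
    { input: { topic: 'refunds' }, expectedOutput: 'You can request a refund within 30 days.' },
    { input: { topic: 'shipping' }, expectedOutput: 'Orders ship within 2 business days.' },
];

// Evaluate a single prompt version (prompt id and version are made up for this sketch).
const result = await engine.evaluate(
    { promptId: 'support-reply', version: '1.2.0', examples },
    forge,
);
console.log(result.overallScore, result.metrics.accuracy);

// Or compare two versions on the same examples; `winner` is the version with the
// higher overall score and `improvement` is the score delta (version2 minus version1).
const comparison = await engine.compareVersions('support-reply', '1.1.0', '1.2.0', examples, forge);
console.log(comparison.winner, comparison.improvement);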