promptforge
Adaptive Prompt Intelligence & Orchestration SDK - Manage, optimize, and serve prompts for LLMs with versioning, feedback loops, and multi-provider support
JavaScript
import { v4 as uuidv4 } from 'uuid';
import { LLMProvider, EvaluationResultSchema, } from '../types.js';
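/**
 * Evaluates prompt versions: runs a prompt against example inputs, scores each
 * output against its expected output, and aggregates the results into
 * accuracy, consistency, and similarity metrics.
 */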
export class EvaluationEngine {
    /**
     * Evaluate a prompt version against the provided examples
     */
    async evaluate(request, forge) {
        const promptVersion = await forge.getPromptVersion(request.promptId, request.version);
        if (!promptVersion) {
            throw new Error('Prompt version not found');
        }
        // Use the examples supplied on the request
        const examples = request.examples || [];
        if (examples.length === 0) {
            throw new Error('No examples provided for evaluation');
        }
        // Execute the prompt once per example and score each output
        const exampleResults = [];
        let totalScore = 0;
        for (const example of examples) {
            try {
                const result = await forge.executePrompt({
                    promptName: request.promptId,
                    version: request.version,
                    input: example.input,
                    llmConfig: request.llmConfig,
                    useCache: false,
                    trackMetrics: true,
                });
                // Score against the expected output when one is provided;
                // examples without an expected output count as a full pass.
                const score = example.expectedOutput
                    ? this.calculateSimilarity(result.output, example.expectedOutput)
                    : 1.0;
                exampleResults.push({
                    input: example.input,
                    output: result.output,
                    expectedOutput: example.expectedOutput,
                    passed: score >= 0.7, // Threshold for passing
                    score,
                });
                totalScore += score;
            }
            catch (error) {
                // A failed execution is recorded as a failed example with a score of 0
                exampleResults.push({
                    input: example.input,
                    output: '',
                    expectedOutput: example.expectedOutput,
                    passed: false,
                    score: 0,
                });
            }
        }
        const overallScore = exampleResults.length > 0
            ? totalScore / exampleResults.length
            : 0;
        // Calculate metrics
        const metrics = this.calculateMetrics(exampleResults);
        const result = {
            id: uuidv4(),
            promptId: request.promptId,
            version: promptVersion.version,
            datasetId: request.datasetId || uuidv4(),
            provider: request.llmConfig?.provider || LLMProvider.OPENAI,
            model: request.llmConfig?.model || 'gpt-4o-mini',
            metrics,
            exampleResults,
            overallScore,
            timestamp: new Date(),
        };
        return EvaluationResultSchema.parse(result);
    }
    /**
     * Calculate evaluation metrics
     */
    calculateMetrics(results) {
        const passedCount = results.filter(r => r.passed).length;
        const accuracy = results.length > 0 ? passedCount / results.length : 0;
        // Average similarity across examples that have an expected output
        const scoresWithExpected = results.filter(r => r.expectedOutput);
        const semanticSimilarity = scoresWithExpected.length > 0
            ? scoresWithExpected.reduce((sum, r) => sum + r.score, 0) / scoresWithExpected.length
            : undefined;
        // Consistency: 1 minus the variance of per-example scores (higher is more consistent)
        const scores = results.map(r => r.score);
        const avgScore = scores.reduce((sum, s) => sum + s, 0) / scores.length;
        const variance = scores.reduce((sum, s) => sum + Math.pow(s - avgScore, 2), 0) / scores.length;
        const consistency = 1 - Math.min(variance, 1);
        return {
            accuracy,
            consistency,
            coherence: avgScore, // Simplified placeholder
            relevance: avgScore, // Simplified placeholder
            semanticSimilarity,
            // Other metrics would be calculated with actual NLP tools
        };
    }
    /**
     * Calculate similarity between two strings as word-level Jaccard overlap,
     * e.g. 'the cat sat' vs 'the cat ran' gives intersection 2 / union 4 = 0.5.
     * Simple implementation - would use embeddings in production.
     */
    calculateSimilarity(text1, text2) {
        const words1 = new Set(text1.toLowerCase().split(/\s+/));
        const words2 = new Set(text2.toLowerCase().split(/\s+/));
        const intersection = new Set([...words1].filter(w => words2.has(w)));
        const union = new Set([...words1, ...words2]);
        return union.size > 0 ? intersection.size / union.size : 0;
    }
    /**
     * Create evaluation dataset
     */
    async createDataset(name, examples) {
        return {
            id: uuidv4(),
            name,
            description: `Evaluation dataset: ${name}`,
            examples,
            createdAt: new Date(),
        };
    }
    /**
     * Compare two prompt versions by evaluating both against the same examples
     */
    async compareVersions(promptId, version1, version2, examples, forge) {
        const eval1 = await this.evaluate({
            promptId,
            version: version1,
            examples,
        }, forge);
        const eval2 = await this.evaluate({
            promptId,
            version: version2,
            examples,
        }, forge);
        // Positive improvement means version2 scored higher; ties favor version1
        const improvement = eval2.overallScore - eval1.overallScore;
        const winner = eval2.overallScore > eval1.overallScore ? version2 : version1;
        return {
            version1Score: eval1.overallScore,
            version2Score: eval2.overallScore,
            winner,
            improvement,
        };
    }
}
//# sourceMappingURL=evaluation.js.map
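A minimal usage sketch follows, assuming the SDK exposes a PromptForge class whose instances provide the getPromptVersion and executePrompt methods called above; the constructor call, prompt name, version strings, and input shape are illustrative assumptions rather than confirmed API.

// Hypothetical usage sketch - PromptForge construction and the prompt/input details are assumptions
import { PromptForge } from 'promptforge';
import { EvaluationEngine } from './evaluation.js';

const forge = new PromptForge(); // assumed default construction
const engine = new EvaluationEngine();

// Evaluate one version against inline examples (input shape depends on the prompt template)
const result = await engine.evaluate({
    promptId: 'summarize-article',
    version: '1.0.0',
    examples: [
        { input: { text: 'Long article text...' }, expectedOutput: 'Short summary.' },
    ],
}, forge);
console.log(result.overallScore, result.metrics.accuracy);

// Or compare two versions on the same examples
const comparison = await engine.compareVersions('summarize-article', '1.0.0', '1.1.0', [
    { input: { text: 'Long article text...' }, expectedOutput: 'Short summary.' },
], forge);
console.log(comparison.winner, comparison.improvement);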