sf-agent-framework
Version:
AI Agent Orchestration Framework for Salesforce Development - Two-phase architecture with 70% context reduction
594 lines (501 loc) • 17 kB
JavaScript
/**
* Benchmark Integration Manager
*
* Purpose: Track, measure, and compare framework performance against
* industry benchmarks and internal baselines
*
* Key Features:
* - Multiple benchmark types (quality, performance, cost, time)
* - Automatic benchmark execution
* - Historical tracking and trending
* - Comparison against baselines
* - Report generation
*
* @module BenchmarkManager
* @version 1.0.0
* @date 2025-11-25
*/
const fs = require('fs-extra');
const path = require('path');
/**
 * Runs (currently simulated) benchmarks for a workflow, scores each metric on
 * a 0-100 scale against industry baselines, persists the results as JSON under
 * `<rootDir>/.sf-agent/benchmarks`, and builds comparison, trend, and
 * recommendation reports from the stored runs.
 *
 * Metrics that have no measurement implementation are reported with a `null`
 * value and `null` score; they are excluded from category averages, from the
 * overall score, and from recommendations.
 *
 * NOTE(review): several console messages start with glyphs such as 'ā' / 'š'
 * that look like mis-encoded emoji. They are kept exactly as found — confirm
 * the intended characters against the upstream file before changing them.
 */
class BenchmarkManager {
  /**
   * @param {string} [rootDir=process.cwd()] Project root; all benchmark files
   *   live below `<rootDir>/.sf-agent/benchmarks`.
   */
  constructor(rootDir = process.cwd()) {
    this.rootDir = rootDir;
    this.benchmarksDir = path.join(rootDir, '.sf-agent', 'benchmarks');
    this.resultsDir = path.join(this.benchmarksDir, 'results');
    this.baselinesPath = path.join(this.benchmarksDir, 'baselines.json');

    // Benchmark categories
    this.categories = {
      quality: {
        name: 'Code Quality',
        metrics: [
          'test-coverage',
          'code-complexity',
          'maintainability-index',
          'technical-debt-ratio',
        ],
      },
      performance: {
        name: 'Performance',
        metrics: ['execution-time', 'token-usage', 'context-efficiency', 'shard-savings'],
      },
      cost: {
        name: 'Cost Efficiency',
        metrics: ['api-cost-per-workflow', 'token-cost', 'time-to-completion', 'cost-per-feature'],
      },
      accuracy: {
        name: 'Output Accuracy',
        metrics: ['requirements-met', 'test-pass-rate', 'defect-rate', 'first-time-quality'],
      },
      scalability: {
        name: 'Scalability',
        metrics: [
          'large-codebase-handling',
          'parallel-workflow-capacity',
          'memory-efficiency',
          'multi-agent-coordination',
        ],
      },
    };

    // Industry baselines (updated for November 2025)
    this.industryBaselines = {
      'test-coverage': { target: 75, excellent: 90 },
      'code-complexity': { target: 10, excellent: 5 },
      'execution-time': { target: 1800, excellent: 900 }, // seconds
      'token-usage': { target: 100000, excellent: 50000 },
      'api-cost-per-workflow': { target: 5.0, excellent: 1.0 }, // USD
      'requirements-met': { target: 90, excellent: 98 }, // percentage
      'test-pass-rate': { target: 95, excellent: 99 },
      'shard-savings': { target: 70, excellent: 90 }, // percentage
    };
  }

  /**
   * Ensure the benchmark directories exist and create `baselines.json` with
   * the default content when it is missing.
   *
   * @returns {Promise<boolean>} Always resolves to `true`.
   */
  async initialize() {
    await fs.ensureDir(this.benchmarksDir);
    await fs.ensureDir(this.resultsDir);

    // Create baselines file if doesn't exist
    if (!(await fs.pathExists(this.baselinesPath))) {
      await this.createDefaultBaselines();
    }

    console.log('ā Benchmark manager initialized');
    return true;
  }

  /**
   * Write a fresh `baselines.json` seeded with the industry baselines and
   * empty `internal` / `custom` sections.
   *
   * @returns {Promise<void>}
   */
  async createDefaultBaselines() {
    const baselines = {
      version: '1.0.0',
      created: new Date().toISOString(),
      lastUpdated: new Date().toISOString(),
      industry: this.industryBaselines,
      internal: {},
      custom: {},
    };
    await fs.writeJson(this.baselinesPath, baselines, { spaces: 2 });
  }

  /**
   * Run every (or a requested subset of) benchmark category for a workflow,
   * score it, compare it against the stored baselines, and persist the run.
   *
   * @param {string} workflowId Identifier stored with the run.
   * @param {{categories?: string[]}} [options] When `categories` is given,
   *   only the listed category ids are executed.
   * @returns {Promise<object>} The saved benchmark run.
   * @throws Re-throws any error raised while measuring, comparing, or saving
   *   (after logging it).
   */
  async runBenchmarks(workflowId, options = {}) {
    console.log(`\nš Running benchmarks for workflow: ${workflowId}\n`);

    const benchmarkRun = {
      id: this.generateBenchmarkId(),
      workflowId,
      timestamp: new Date().toISOString(),
      categories: {},
      overallScore: 0,
      comparison: {},
    };

    try {
      // Run benchmarks for each category
      for (const [categoryId, category] of Object.entries(this.categories)) {
        if (options.categories && !options.categories.includes(categoryId)) {
          continue; // Skip if not requested
        }
        console.log(`š Category: ${category.name}`);
        const categoryResults = await this.runCategoryBenchmarks(categoryId, category, workflowId);
        benchmarkRun.categories[categoryId] = categoryResults;
      }

      // Calculate overall score
      benchmarkRun.overallScore = this.calculateOverallScore(benchmarkRun.categories);

      // Compare against baselines
      benchmarkRun.comparison = await this.compareAgainstBaselines(benchmarkRun);

      // Save results
      await this.saveBenchmarkResults(benchmarkRun);

      // The line break inside this template literal is part of the logged
      // text; it is kept exactly as found in the original source.
      console.log(`\nā
Benchmarks complete!`);
      console.log(` Overall Score: ${benchmarkRun.overallScore}/100\n`);
      return benchmarkRun;
    } catch (error) {
      console.error(`ā Benchmark execution failed: ${error.message}`);
      throw error;
    }
  }

  /**
   * Measure every metric of one category and average the scores of the
   * metrics that could actually be measured.
   *
   * @param {string} categoryId Category key (not used for scoring).
   * @param {{name: string, metrics: string[]}} category Category definition.
   * @param {string} workflowId Passed through to `measureMetric`.
   * @returns {Promise<{name: string, metrics: object, score: number}>}
   *   `score` is 0 when no metric in the category produced a score.
   */
  async runCategoryBenchmarks(categoryId, category, workflowId) {
    const results = {
      name: category.name,
      metrics: {},
      score: 0,
    };
    let totalScore = 0;
    let metricsCount = 0;

    for (const metric of category.metrics) {
      const metricResult = await this.measureMetric(metric, workflowId);
      results.metrics[metric] = metricResult;

      // Unmeasured metrics are recorded but must not influence the average.
      if (metricResult.score === null) {
        console.log(` ${metric}: not measured`);
        continue;
      }

      totalScore += metricResult.score;
      metricsCount++;
      console.log(` ${metric}: ${metricResult.value} (${metricResult.score}/100)`);
    }

    results.score = metricsCount > 0 ? Math.round(totalScore / metricsCount) : 0;
    console.log(` Category Score: ${results.score}/100\n`);
    return results;
  }

  /**
   * Produce a measurement for a single metric.
   *
   * This would integrate with actual measurement tools; for now the known
   * metrics return simulated (random) values. A metric without a simulation
   * yields `{ value: null, unit: '', baseline: null, score: null }` instead of
   * a fabricated perfect score.
   *
   * @param {string} metric Metric id, e.g. `'test-coverage'`.
   * @param {string} workflowId Currently unused by the simulation.
   * @returns {Promise<{value: (number|null), unit: string, baseline: (object|null),
   *   inverted: (boolean|undefined), score: (number|null)}>}
   */
  async measureMetric(metric, workflowId) {
    const baseline = this.industryBaselines[metric];
    const randomInt = (span, offset) => Math.floor(Math.random() * span) + offset;

    const measurements = {
      'test-coverage': () => ({
        value: randomInt(30, 70), // 70-100%
        unit: '%',
        baseline,
      }),
      'code-complexity': () => ({
        value: randomInt(10, 3), // 3-13
        unit: 'avg',
        baseline,
        inverted: true, // Lower is better
      }),
      'execution-time': () => ({
        value: randomInt(1200, 600), // 600-1800s
        unit: 's',
        baseline,
        inverted: true,
      }),
      'token-usage': () => ({
        value: randomInt(60000, 40000), // 40-100k
        unit: 'tokens',
        baseline,
        inverted: true,
      }),
      'api-cost-per-workflow': () => ({
        // toFixed() returns a string; convert back so the value stays numeric
        // in comparisons and in the saved JSON.
        value: Number((Math.random() * 4 + 1).toFixed(2)), // $1-5
        unit: 'USD',
        baseline,
        inverted: true,
      }),
      'requirements-met': () => ({
        value: randomInt(15, 85), // 85-100%
        unit: '%',
        baseline,
      }),
      'test-pass-rate': () => ({
        value: randomInt(10, 90), // 90-100%
        unit: '%',
        baseline,
      }),
      'shard-savings': () => ({
        value: randomInt(30, 70), // 70-100%
        unit: '%',
        baseline,
      }),
    };

    // Own-property check so ids like 'constructor' or 'toString' never
    // resolve to something inherited from Object.prototype.
    const isSimulated = Object.prototype.hasOwnProperty.call(measurements, metric);
    const measurement = isSimulated
      ? measurements[metric]()
      : { value: null, unit: '', baseline: null };

    // Calculate score (0-100, or null when the metric was not measured)
    measurement.score = this.calculateMetricScore(
      measurement.value,
      measurement.baseline,
      measurement.inverted
    );
    return measurement;
  }

  /**
   * Map a measured value onto a score using the baseline's `target` and
   * `excellent` thresholds.
   *
   * At or beyond `excellent` the score is 100; at or worse than `target` it is
   * 50 (values worse than target are not penalised further); in between it is
   * interpolated linearly and rounded.
   *
   * @param {number|string|null} value Measured value; numeric strings are
   *   accepted.
   * @param {{target: number, excellent: number}|null} baseline Thresholds.
   * @param {boolean} [inverted=false] `true` when lower values are better.
   * @returns {number|null} Score in 0-100, or `null` when there is no
   *   baseline or the value is missing / not numeric.
   */
  calculateMetricScore(value, baseline, inverted = false) {
    if (!baseline) return null;
    if (value === null || value === undefined) return null;
    if (typeof value === 'string' && value.trim() === '') return null;

    const numeric = Number(value);
    if (!Number.isFinite(numeric)) return null;

    const target = baseline.target;
    const excellent = baseline.excellent;
    let score;

    if (inverted) {
      // Lower is better (e.g., execution time, complexity)
      if (numeric <= excellent) {
        score = 100;
      } else if (numeric >= target) {
        score = 50;
      } else {
        // Linear interpolation between excellent and target
        score = 50 + (50 * (target - numeric)) / (target - excellent);
      }
    } else {
      // Higher is better (e.g., test coverage, accuracy)
      if (numeric >= excellent) {
        score = 100;
      } else if (numeric <= target) {
        score = 50;
      } else {
        // Linear interpolation between target and excellent
        score = 50 + (50 * (numeric - target)) / (excellent - target);
      }
    }

    return Math.max(0, Math.min(100, Math.round(score)));
  }

  /**
   * Tell whether a category result contains at least one scored metric.
   * Category objects without a `metrics` map are treated as measured so that
   * plain `{ score }` inputs keep contributing to the overall score.
   *
   * @param {{metrics?: object}} category Category result.
   * @returns {boolean}
   */
  hasMeasuredMetrics(category) {
    if (!category.metrics) return true;
    return Object.values(category.metrics).some(
      (metric) => metric.score !== null && metric.score !== undefined
    );
  }

  /**
   * Average the category scores, ignoring categories in which nothing was
   * measured.
   *
   * @param {Object<string, {score: number, metrics?: object}>} categories
   * @returns {number} Rounded mean, or 0 when no category qualifies.
   */
  calculateOverallScore(categories) {
    const categoryScores = Object.values(categories)
      .filter((category) => this.hasMeasuredMetrics(category))
      .map((category) => category.score);

    if (categoryScores.length === 0) return 0;

    const sum = categoryScores.reduce((total, score) => total + score, 0);
    return Math.round(sum / categoryScores.length);
  }

  /**
   * Load `baselines.json` and compare a benchmark run against it.
   *
   * @param {{categories: object}} benchmarkRun Run produced by `runBenchmarks`.
   * @returns {Promise<object>} See `buildComparison`.
   */
  async compareAgainstBaselines(benchmarkRun) {
    const baselines = await fs.readJson(this.baselinesPath);
    return this.buildComparison(benchmarkRun, baselines);
  }

  /**
   * Compare every metric of a run against industry and internal baselines.
   *
   * Industry: a metric is 'above' when its score is at least 75.
   * Internal: a metric is 'above' when its value is at least as good as the
   * stored baseline value — i.e. lower-or-equal for metrics flagged
   * `inverted`, higher-or-equal otherwise. Metrics without a usable
   * score/value are skipped.
   *
   * @param {{categories: object}} benchmarkRun Run to compare.
   * @param {{industry?: object, internal?: object}} baselines Parsed
   *   baselines file content.
   * @returns {{vsIndustry: object, vsInternal: object, summary: object}}
   */
  buildComparison(benchmarkRun, baselines) {
    const industry = baselines.industry || {};
    const internal = baselines.internal || {};
    const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);

    const comparison = {
      vsIndustry: {},
      vsInternal: {},
      summary: {
        aboveIndustry: 0,
        belowIndustry: 0,
        aboveInternal: 0,
        belowInternal: 0,
      },
    };

    for (const category of Object.values(benchmarkRun.categories)) {
      for (const [metricId, metric] of Object.entries(category.metrics)) {
        // Compare against industry baseline
        const isScored = metric.score !== null && metric.score !== undefined;
        if (hasOwn(industry, metricId) && industry[metricId] && isScored) {
          const isAbove = metric.score >= 75; // 75+ = above baseline
          comparison.vsIndustry[metricId] = {
            status: isAbove ? 'above' : 'below',
            score: metric.score,
            targetScore: 75,
          };
          if (isAbove) {
            comparison.summary.aboveIndustry++;
          } else {
            comparison.summary.belowIndustry++;
          }
        }

        // Compare against internal baseline if exists
        const internalBaseline = hasOwn(internal, metricId) ? internal[metricId] : null;
        if (!internalBaseline || metric.value === null || metric.value === undefined) {
          continue;
        }
        const current = Number(metric.value);
        const reference = Number(internalBaseline.value);
        if (!Number.isFinite(current) || !Number.isFinite(reference)) {
          continue;
        }

        // For lower-is-better metrics a smaller value beats the baseline.
        const isAbove = metric.inverted ? current <= reference : current >= reference;
        comparison.vsInternal[metricId] = {
          status: isAbove ? 'above' : 'below',
          current: metric.value,
          baseline: internalBaseline.value,
        };
        if (isAbove) {
          comparison.summary.aboveInternal++;
        } else {
          comparison.summary.belowInternal++;
        }
      }
    }

    return comparison;
  }

  /**
   * Store (or overwrite) the internal baseline for a metric in
   * `baselines.json` and refresh its `lastUpdated` stamp.
   *
   * @param {string} metric Metric id.
   * @param {number} value Baseline value to compare future runs against.
   * @returns {Promise<void>}
   */
  async setInternalBaseline(metric, value) {
    const baselines = await fs.readJson(this.baselinesPath);

    // Tolerate a baselines file whose `internal` section was removed.
    baselines.internal = baselines.internal || {};
    baselines.internal[metric] = {
      value,
      setAt: new Date().toISOString(),
      description: `Internal baseline for ${metric}`,
    };
    baselines.lastUpdated = new Date().toISOString();

    await fs.writeJson(this.baselinesPath, baselines, { spaces: 2 });
    console.log(`ā Internal baseline set: ${metric} = ${value}`);
  }

  /**
   * List stored benchmark runs, newest first.
   *
   * @param {string} [workflowId] When given, only runs of this workflow are
   *   returned; otherwise all runs.
   * @param {number} [limit=10] Maximum number of entries.
   * @returns {Promise<Array<{id: string, timestamp: string, workflowId: string,
   *   overallScore: number}>>}
   */
  async getBenchmarkHistory(workflowId, limit = 10) {
    const files = await fs.readdir(this.resultsDir);
    const history = [];

    for (const file of files) {
      if (!file.endsWith('.json')) continue;

      const result = await fs.readJson(path.join(this.resultsDir, file));
      if (!workflowId || result.workflowId === workflowId) {
        history.push({
          id: result.id,
          timestamp: result.timestamp,
          workflowId: result.workflowId,
          overallScore: result.overallScore,
        });
      }
    }

    // Sort by timestamp (newest first)
    history.sort((a, b) => new Date(b.timestamp) - new Date(a.timestamp));
    return history.slice(0, limit);
  }

  /**
   * Summarise the overall scores of the runs from the last `days` days.
   *
   * The trend compares the average of the older half of the runs with the
   * average of the newer half: 'improving' when the newer half is higher,
   * 'declining' when lower, 'stable' otherwise. With a single run there is
   * nothing to compare, so the trend is 'stable' and the change 0.
   *
   * @param {string} [workflowId] Workflow filter, see `getBenchmarkHistory`.
   * @param {number} [days=30] Size of the look-back window.
   * @returns {Promise<object>} Either `{ message, runs: 0 }` when no run falls
   *   inside the window, or `{ period, runs, average, min, max, trend, change,
   *   history }`.
   */
  async generateTrendReport(workflowId, days = 30) {
    const history = await this.getBenchmarkHistory(workflowId, 100);

    const cutoffDate = new Date();
    cutoffDate.setDate(cutoffDate.getDate() - days);

    const recentHistory = history.filter((h) => new Date(h.timestamp) >= cutoffDate);
    if (recentHistory.length === 0) {
      return {
        message: `No benchmark data in last ${days} days`,
        runs: 0,
      };
    }

    // Order explicitly instead of relying on the order of the history list.
    const newestFirst = [...recentHistory].sort(
      (a, b) => new Date(b.timestamp) - new Date(a.timestamp)
    );
    const scores = newestFirst.map((h) => h.overallScore).reverse(); // oldest -> newest

    const average = (values) => values.reduce((sum, v) => sum + v, 0) / values.length;
    const avgScore = average(scores);
    const minScore = Math.min(...scores);
    const maxScore = Math.max(...scores);

    // Calculate trend: older half vs. newer half, in chronological order
    let trend = 'stable';
    let change = 0;
    if (scores.length >= 2) {
      const middle = Math.floor(scores.length / 2);
      const olderAvg = average(scores.slice(0, middle));
      const newerAvg = average(scores.slice(middle));

      if (newerAvg > olderAvg) {
        trend = 'improving';
      } else if (newerAvg < olderAvg) {
        trend = 'declining';
      }
      change = Math.round(newerAvg - olderAvg);
    }

    return {
      period: `${days} days`,
      runs: recentHistory.length,
      average: Math.round(avgScore),
      min: minScore,
      max: maxScore,
      trend,
      change,
      history: newestFirst.slice(0, 10), // Latest 10
    };
  }

  /**
   * Build a report (summary, per-category breakdown, industry comparison,
   * recommendations) for one stored benchmark run.
   *
   * @param {string} benchmarkId Id of a run saved in the results directory.
   * @returns {Promise<object>} The report object.
   * @throws {Error} When no result file exists for `benchmarkId`.
   */
  async generateReport(benchmarkId) {
    const resultPath = path.join(this.resultsDir, `${benchmarkId}.json`);
    if (!(await fs.pathExists(resultPath))) {
      throw new Error(`Benchmark result not found: ${benchmarkId}`);
    }

    const benchmark = await fs.readJson(resultPath);
    const comparison = benchmark.comparison || {};

    return {
      summary: {
        id: benchmark.id,
        workflowId: benchmark.workflowId,
        timestamp: benchmark.timestamp,
        overallScore: benchmark.overallScore,
        grade: this.scoreToGrade(benchmark.overallScore),
      },
      categoryBreakdown: Object.values(benchmark.categories).map((cat) => ({
        category: cat.name,
        score: cat.score,
        grade: this.scoreToGrade(cat.score),
        metrics: Object.entries(cat.metrics).map(([metricId, metric]) => ({
          name: metricId,
          value: metric.value,
          unit: metric.unit,
          score: metric.score,
          baseline: metric.baseline,
        })),
      })),
      comparison: {
        vsIndustry: comparison.vsIndustry,
        summary: comparison.summary,
      },
      recommendations: this.generateRecommendations(benchmark),
    };
  }

  /**
   * Convert a 0-100 score into a letter grade.
   *
   * @param {number} score
   * @returns {'A'|'B'|'C'|'D'|'F'} A >= 90, B >= 80, C >= 70, D >= 60, else F.
   */
  scoreToGrade(score) {
    if (score >= 90) return 'A';
    if (score >= 80) return 'B';
    if (score >= 70) return 'C';
    if (score >= 60) return 'D';
    return 'F';
  }

  /**
   * Derive improvement recommendations from a benchmark run: a 'high' entry
   * for every measured category scoring below 75 and a 'critical' entry for
   * every measured metric scoring below 60. Unmeasured categories and metrics
   * produce no recommendation.
   *
   * @param {{categories: object}} benchmark Benchmark run.
   * @returns {Array<object>} Recommendations, most urgent first.
   */
  generateRecommendations(benchmark) {
    const recommendations = [];

    // Analyze each category
    for (const category of Object.values(benchmark.categories)) {
      if (!this.hasMeasuredMetrics(category)) continue;

      if (category.score < 75) {
        recommendations.push({
          priority: 'high',
          category: category.name,
          issue: `Score below target (${category.score}/100)`,
          suggestion: `Focus on improving ${category.name.toLowerCase()} metrics`,
        });
      }

      // Analyze individual metrics
      for (const [metricId, metric] of Object.entries(category.metrics || {})) {
        // `null < 60` is true in JavaScript, so unmeasured metrics need an
        // explicit skip.
        if (metric.score === null || metric.score === undefined) continue;

        if (metric.score < 60) {
          recommendations.push({
            priority: 'critical',
            category: category.name,
            metric: metricId,
            issue: `${metricId} significantly below target`,
            suggestion: this.getMetricSuggestion(metricId, metric),
          });
        }
      }
    }

    // Sort by priority
    const priorities = { critical: 1, high: 2, medium: 3, low: 4 };
    recommendations.sort((a, b) => priorities[a.priority] - priorities[b.priority]);
    return recommendations;
  }

  /**
   * Look up the canned improvement hint for a metric.
   *
   * @param {string} metricId Metric id.
   * @param {object} metric Metric result (currently unused).
   * @returns {string} Specific hint, or a generic one for unknown metrics.
   */
  getMetricSuggestion(metricId, metric) {
    const suggestions = {
      'test-coverage': 'Add more test classes and increase coverage to >75%',
      'code-complexity': 'Refactor complex methods to reduce cyclomatic complexity',
      'execution-time': 'Optimize workflow phases and use parallel processing',
      'token-usage': 'Enable document sharding to reduce token consumption',
      'api-cost-per-workflow': 'Use context budget manager and shard documents',
      'shard-savings': 'Implement document sharding for large files',
    };

    // Own-property lookup so inherited keys never leak out as a "suggestion".
    return Object.prototype.hasOwnProperty.call(suggestions, metricId)
      ? suggestions[metricId]
      : 'Review and optimize this metric';
  }

  /**
   * Persist a benchmark run as `<resultsDir>/<id>.json`.
   *
   * @param {{id: string}} benchmark Run to save.
   * @returns {Promise<void>}
   */
  async saveBenchmarkResults(benchmark) {
    const resultPath = path.join(this.resultsDir, `${benchmark.id}.json`);
    await fs.writeJson(resultPath, benchmark, { spaces: 2 });
  }

  /**
   * Create a run id from the current time in milliseconds plus a random
   * integer in 0-999.
   *
   * @returns {string} e.g. `benchmark_1732500000000_42`.
   */
  generateBenchmarkId() {
    return `benchmark_${Date.now()}_${Math.floor(Math.random() * 1000)}`;
  }
}
// CommonJS export: the BenchmarkManager class itself is the module's export value.
module.exports = BenchmarkManager;