@vfarcic/dot-ai
Version:
AI-powered development productivity platform that enhances software development workflows through intelligent automation and AI-driven assistance
411 lines (410 loc) • 22 kB
JavaScript
;
// Compiler-emitted interop helper: exposes property `key` of `source` on
// `target` under the name `alias` (defaults to `key`). Reuses a helper already
// present on `this` when one exists.
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (target, source, key, alias) {
        if (alias === undefined) {
            alias = key;
        }
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        var needsGetter;
        if (!descriptor) {
            // Property is not an own property of the source: always forward through a getter.
            needsGetter = true;
        }
        else if ("get" in descriptor) {
            // Accessor property: reuse it as-is only when the source is flagged as an ES module.
            needsGetter = !source.__esModule;
        }
        else {
            // Data property: a mutable slot is wrapped in a getter so later changes stay visible.
            needsGetter = descriptor.writable || descriptor.configurable;
        }
        if (needsGetter) {
            descriptor = {
                enumerable: true,
                get: function () { return source[key]; }
            };
        }
        Object.defineProperty(target, alias, descriptor);
    }
    : function (target, source, key, alias) {
        // Fallback taken when Object.create is unavailable: plain value copy.
        if (alias === undefined) {
            alias = key;
        }
        target[alias] = source[key];
    });
// Compiler-emitted interop helper: stores `value` as the `default` member of
// `target`. Reuses a helper already present on `this` when one exists.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function (target, value) {
        // Defined via a descriptor, so the property is enumerable but not writable.
        Object.defineProperty(target, "default", { value: value, enumerable: true });
    }
    : function (target, value) {
        target["default"] = value;
    });
// Compiler-emitted interop helper: returns `mod` unchanged when it is flagged
// `__esModule`; otherwise builds a namespace-like object that re-exposes every
// own property of `mod` except "default" and sets `default` to `mod` itself.
var __importStar = (this && this.__importStar) || (function () {
    // Picks the key-listing strategy on first call and rebinds itself to it.
    var listOwnKeys = function (obj) {
        listOwnKeys = Object.getOwnPropertyNames || function (target) {
            var names = [];
            for (var name in target) {
                if (Object.prototype.hasOwnProperty.call(target, name)) {
                    names.push(name);
                }
            }
            return names;
        };
        return listOwnKeys(obj);
    };
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            var keys = listOwnKeys(mod);
            var index = 0;
            while (index < keys.length) {
                var key = keys[index];
                if (key !== "default") {
                    __createBinding(namespace, mod, key);
                }
                index += 1;
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.PlatformSynthesizer = void 0;
const fs = __importStar(require("fs"));
const path = __importStar(require("path"));
const graph_generator_js_1 = require("./graph-generator.js");
const metadata_loader_js_1 = require("./metadata-loader.js");
/**
 * Combines the per-tool evaluation result files found in `reportsDir` into a
 * platform-wide model comparison: per-model aggregates, decision matrices,
 * usage recommendations, an AI-written markdown report and generated graphs.
 */
class PlatformSynthesizer {
    aiProvider;
    reportsDir;

    /**
     * @param {object} aiProvider - Object exposing `sendMessage(prompt)`; the
     *   `content` of its resolved value is used as the report markdown.
     * @param {string} [reportsDir] - Directory scanned for `*-results.json` files.
     */
    constructor(aiProvider, reportsDir = './eval/analysis/individual') {
        this.aiProvider = aiProvider;
        this.reportsDir = reportsDir;
    }

    /**
     * Consistency of a list of scores, computed as `1 - stddev / mean` and
     * clamped at 0 from below.
     *
     * A zero mean is handled explicitly because the plain formula divides by
     * zero there: identical (all-zero) scores count as perfectly consistent
     * (1) instead of producing NaN, anything else yields 0.
     *
     * @param {number[]} scores
     * @returns {number} 0 for an empty list, otherwise the clamped score.
     */
    static consistencyScore(scores) {
        if (scores.length === 0) {
            return 0;
        }
        const mean = scores.reduce((a, b) => a + b, 0) / scores.length;
        const variance = scores.reduce((sum, score) => sum + Math.pow(score - mean, 2), 0) / scores.length;
        const standardDeviation = Math.sqrt(variance);
        if (mean === 0) {
            return standardDeviation === 0 ? 1 : 0;
        }
        // Lower relative spread = higher consistency.
        return Math.max(0, 1 - (standardDeviation / mean));
    }

    /**
     * Runs the whole pipeline: load reports, aggregate, optionally ask the AI
     * provider for the markdown report, then generate graphs and substitute
     * graph placeholders.
     *
     * @param {*} graphsToGenerate - Passed through to the graph generator.
     * @param {boolean} [skipReport=false] - When true no AI report is requested
     *   and graph substitution runs against an empty string.
     * @returns {Promise<string>} The markdown after placeholder substitution.
     */
    async generatePlatformWideAnalysis(graphsToGenerate, skipReport = false) {
        console.log('🔍 Loading all evaluation reports...');
        const allReports = await this.loadAllReports();
        console.log('🔧 Loading tool metadata...');
        const toolMetadata = this.loadToolMetadata();
        console.log('📊 Analyzing cross-tool performance patterns...');
        const crossToolAnalysis = await this.analyzeCrossToolPerformance(allReports);
        let markdownReport;
        if (skipReport) {
            console.log('⏭️ Skipping AI report generation...');
            // Only graphs are wanted, so there is no markdown to decorate.
            markdownReport = '';
        }
        else {
            console.log('🎯 Generating decision matrices...');
            const decisionMatrices = this.generateDecisionMatrices(crossToolAnalysis.modelPerformances);
            console.log('💡 Creating usage recommendations...');
            const usageRecommendations = this.generateUsageRecommendations(crossToolAnalysis, decisionMatrices);
            console.log('🚀 Generating comprehensive AI-powered report...');
            markdownReport = await this.generatePlatformInsights(crossToolAnalysis, decisionMatrices, usageRecommendations, toolMetadata);
        }
        console.log('📊 Generating data visualizations...');
        const reportWithGraphs = await this.addGraphsToReport(markdownReport, crossToolAnalysis.modelPerformances, graphsToGenerate);
        return reportWithGraphs;
    }

    /**
     * Loads evaluation metadata through the metadata loader and keeps only
     * its `tools` member.
     *
     * @returns {{tools: *}}
     */
    loadToolMetadata() {
        const metadata = (0, metadata_loader_js_1.loadEvaluationMetadata)();
        return { tools: metadata.tools };
    }

    /**
     * Reads and parses every `*-results.json` file in `reportsDir`.
     *
     * @returns {Promise<Object<string, object>>} Parsed reports keyed by the
     *   file-name prefix before `-results.json` (the tool type).
     * @throws {Error} When the directory contains no matching file. File-system
     *   and JSON parse errors propagate unchanged.
     */
    async loadAllReports() {
        const reports = {};
        const reportFiles = fs.readdirSync(this.reportsDir)
            .filter(file => file.endsWith('-results.json'));
        if (reportFiles.length === 0) {
            throw new Error(`No evaluation result files found in ${this.reportsDir}`);
        }
        for (const fileName of reportFiles) {
            const reportPath = path.join(this.reportsDir, fileName);
            const reportContent = JSON.parse(fs.readFileSync(reportPath, 'utf8'));
            // e.g. "capability-results.json" -> "capability"
            const toolType = fileName.split('-results.json')[0];
            reports[toolType] = reportContent;
            console.log(`✅ Loaded ${toolType} report: ${fileName}`);
        }
        console.log(`📊 Total reports loaded: ${Object.keys(reports).length}`);
        return reports;
    }

    /**
     * Derives cross-tool statistics from the loaded reports.
     *
     * @param {Object<string, object>} allReports - Reports keyed by tool type.
     * @returns {Promise<{modelPerformances: object[], crossToolConsistency: Object<string, number>, toolSpecificLeaders: Object<string, string>, universalPerformers: string[]}>}
     *   `crossToolConsistency` only contains models scored on more than one
     *   tool; `toolSpecificLeaders` maps each tool type to its top-scoring
     *   model; `universalPerformers` lists models scored on at least three
     *   tools that rank in the top three for at least 75% of them.
     */
    async analyzeCrossToolPerformance(allReports) {
        const modelPerformances = this.calculateModelPerformances(allReports);

        const crossToolConsistency = {};
        for (const model of modelPerformances) {
            const scores = Object.values(model.toolScores);
            if (scores.length > 1) {
                crossToolConsistency[model.modelId] = PlatformSynthesizer.consistencyScore(scores);
            }
        }

        // Rank the participants of each tool once (best first) instead of
        // re-sorting for every (model, tool) pair.
        const rankingByTool = new Map();
        const rankedModelIds = (toolType) => {
            let ranked = rankingByTool.get(toolType);
            if (ranked === undefined) {
                ranked = modelPerformances
                    .filter(m => m.toolScores[toolType] !== undefined)
                    .sort((a, b) => b.toolScores[toolType] - a.toolScores[toolType])
                    .map(m => m.modelId);
                rankingByTool.set(toolType, ranked);
            }
            return ranked;
        };

        const toolSpecificLeaders = {};
        for (const toolType of Object.keys(allReports)) {
            const ranked = rankedModelIds(toolType);
            if (ranked.length > 0) {
                toolSpecificLeaders[toolType] = ranked[0];
            }
        }

        const universalPerformers = [];
        for (const model of modelPerformances) {
            const participatingTools = Object.keys(model.toolScores);
            let topThreeCount = 0;
            for (const toolType of participatingTools) {
                if (rankedModelIds(toolType).indexOf(model.modelId) < 3) {
                    topThreeCount++;
                }
            }
            if (participatingTools.length >= 3 && topThreeCount >= participatingTools.length * 0.75) {
                universalPerformers.push(model.modelId);
            }
        }

        return {
            modelPerformances,
            crossToolConsistency,
            toolSpecificLeaders,
            universalPerformers
        };
    }

    /**
     * Aggregates `overallAssessment.detailed_analysis` of every report into
     * one record per model. Reports without that section are skipped, as are
     * models that never received a numeric `average_score`.
     *
     * Provider, pricing and capabilities come from `modelMetadata` of the
     * first report in which the model appears (looked up by base model id),
     * with 'Unknown' / zero / false defaults.
     *
     * @param {Object<string, object>} allReports - Reports keyed by tool type.
     * @returns {object[]} Model records sorted by `averageScore`, best first.
     */
    calculateModelPerformances(allReports) {
        const modelMap = new Map();
        for (const [toolType, report] of Object.entries(allReports)) {
            const detailedAnalysis = report.overallAssessment?.detailed_analysis;
            if (!detailedAnalysis) {
                continue;
            }
            for (const [modelId, assessment] of Object.entries(detailedAnalysis)) {
                if (!modelMap.has(modelId)) {
                    const metadata = report.modelMetadata?.[this.extractBaseModelId(modelId)] || {};
                    modelMap.set(modelId, {
                        modelId,
                        provider: metadata.provider || 'Unknown',
                        toolScores: {},
                        pricing: metadata.pricing || { input_cost_per_million_tokens: 0, output_cost_per_million_tokens: 0 },
                        capabilities: {
                            context_window: metadata.context_window || 0,
                            supports_function_calling: metadata.supports_function_calling || false
                        }
                    });
                }
                const modelData = modelMap.get(modelId);
                if (typeof assessment.average_score === 'number') {
                    modelData.toolScores[toolType] = assessment.average_score;
                }
                // Participation and reliability are summed here and averaged below.
                if (typeof assessment.participation_rate === 'number') {
                    modelData.participationRate = (modelData.participationRate || 0) + assessment.participation_rate;
                }
                if (typeof assessment.reliability_score === 'number') {
                    modelData.reliabilityScore = (modelData.reliabilityScore || 0) + assessment.reliability_score;
                }
            }
        }

        const modelPerformances = [];
        for (const [modelId, data] of modelMap.entries()) {
            const scores = Object.values(data.toolScores);
            const toolCount = scores.length;
            if (toolCount === 0) {
                continue;
            }
            modelPerformances.push({
                modelId,
                provider: data.provider,
                toolScores: data.toolScores,
                averageScore: scores.reduce((a, b) => a + b, 0) / toolCount,
                participationRate: (data.participationRate || 0) / toolCount,
                reliabilityScore: (data.reliabilityScore || 0) / toolCount,
                consistencyAcrossTools: PlatformSynthesizer.consistencyScore(scores),
                pricing: data.pricing,
                capabilities: data.capabilities
            });
        }
        return modelPerformances.sort((a, b) => b.averageScore - a.averageScore);
    }

    /**
     * Builds five top-5 rankings from the model records.
     *
     * - `qualityLeaders`: highest `averageScore`.
     * - `speedOptimized`: lowest input token price among models with pricing.
     *   NOTE(review): despite the name this ranks by price, not latency — confirm intent.
     * - `costEffective`: highest `averageScore` per mean of input/output price
     *   (adds `valueScore`); requires both prices to be positive.
     * - `balanced`: 0.4·score + 0.3·consistency + 0.3·reliability minus
     *   (input + output price) / 100 (adds `balancedScore`).
     * - `reliabilityFocused`: by reliability, ties broken by consistency.
     *
     * @param {object[]} modelPerformances
     * @returns {{qualityLeaders: object[], speedOptimized: object[], costEffective: object[], balanced: object[], reliabilityFocused: object[]}}
     */
    generateDecisionMatrices(modelPerformances) {
        const qualityLeaders = [...modelPerformances]
            .sort((a, b) => b.averageScore - a.averageScore)
            .slice(0, 5);
        const speedOptimized = [...modelPerformances]
            .filter(m => m.pricing.input_cost_per_million_tokens > 0) // drop models without pricing data
            .sort((a, b) => a.pricing.input_cost_per_million_tokens - b.pricing.input_cost_per_million_tokens)
            .slice(0, 5);
        const costEffective = [...modelPerformances]
            .filter(m => m.pricing.input_cost_per_million_tokens > 0 && m.pricing.output_cost_per_million_tokens > 0)
            .map(model => ({
                ...model,
                valueScore: model.averageScore / ((model.pricing.input_cost_per_million_tokens + model.pricing.output_cost_per_million_tokens) / 2)
            }))
            .sort((a, b) => b.valueScore - a.valueScore)
            .slice(0, 5);
        const balanced = [...modelPerformances]
            .filter(m => m.pricing.input_cost_per_million_tokens > 0)
            .map(model => ({
                ...model,
                balancedScore: (model.averageScore * 0.4) + (model.consistencyAcrossTools * 0.3) +
                    (model.reliabilityScore * 0.3) -
                    ((model.pricing.input_cost_per_million_tokens + model.pricing.output_cost_per_million_tokens) / 100)
            }))
            .sort((a, b) => b.balancedScore - a.balancedScore)
            .slice(0, 5);
        const reliabilityFocused = [...modelPerformances]
            .sort((a, b) => {
                if (b.reliabilityScore !== a.reliabilityScore) {
                    return b.reliabilityScore - a.reliabilityScore;
                }
                return b.consistencyAcrossTools - a.consistencyAcrossTools;
            })
            .slice(0, 5);
        return {
            qualityLeaders,
            speedOptimized,
            costEffective,
            balanced,
            reliabilityFocused
        };
    }

    /**
     * Turns the decision matrices into four recommendation records
     * (quality-first, cost-first, speed-first, balanced). Primary and fallback
     * are the first and second entry of the matching matrix, or '' when absent.
     *
     * @param {object} crossToolAnalysis - Accepted for interface compatibility; not read.
     * @param {object} decisionMatrices - Result of `generateDecisionMatrices`.
     * @returns {object[]}
     */
    generateUsageRecommendations(crossToolAnalysis, decisionMatrices) {
        const recommendations = [
            {
                priority: 'quality-first',
                primaryModel: decisionMatrices.qualityLeaders[0]?.modelId || '',
                fallbackModel: decisionMatrices.qualityLeaders[1]?.modelId || '',
                reasoning: 'Optimized for maximum accuracy and completeness across all MCP tools',
                costImplications: `Estimated cost: $${this.calculateCostEstimate(decisionMatrices.qualityLeaders[0])}/1M tokens`,
                useCases: ['Production deployments', 'Critical troubleshooting', 'Complex recommendations']
            },
            {
                priority: 'cost-first',
                primaryModel: decisionMatrices.costEffective[0]?.modelId || '',
                fallbackModel: decisionMatrices.costEffective[1]?.modelId || '',
                reasoning: 'Best value ratio of performance per dollar spent',
                costImplications: `Estimated cost: $${this.calculateCostEstimate(decisionMatrices.costEffective[0])}/1M tokens`,
                useCases: ['Budget-conscious deployments', 'Frequent operations', 'Cost-sensitive workflows']
            },
            {
                priority: 'speed-first',
                primaryModel: decisionMatrices.speedOptimized[0]?.modelId || '',
                fallbackModel: decisionMatrices.speedOptimized[1]?.modelId || '',
                reasoning: 'Optimized for fastest response times and lowest latency',
                costImplications: `Estimated cost: $${this.calculateCostEstimate(decisionMatrices.speedOptimized[0])}/1M tokens`,
                useCases: ['Time-sensitive troubleshooting', 'Interactive debugging', 'Rapid prototyping']
            },
            {
                priority: 'balanced',
                primaryModel: decisionMatrices.balanced[0]?.modelId || '',
                fallbackModel: decisionMatrices.balanced[1]?.modelId || '',
                reasoning: 'Optimal balance of quality, reliability, and cost considerations',
                costImplications: `Estimated cost: $${this.calculateCostEstimate(decisionMatrices.balanced[0])}/1M tokens`,
                useCases: ['General purpose usage', 'Mixed workloads', 'Default recommendation']
            }
        ];
        return recommendations;
    }

    /**
     * Fills the `platform-synthesis.md` prompt template (resolved relative to
     * the current working directory) with the pretty-printed JSON inputs and
     * sends it to the AI provider.
     *
     * @returns {Promise<string>} The `content` of the provider response.
     */
    async generatePlatformInsights(crossToolAnalysis, decisionMatrices, usageRecommendations, toolMetadata) {
        const promptPath = path.join(process.cwd(), 'src', 'evaluation', 'prompts', 'platform-synthesis.md');
        const promptTemplate = fs.readFileSync(promptPath, 'utf8');
        const toJson = (value) => JSON.stringify(value, null, 2);
        // Replacer functions are used so that "$&", "$'", "$$" etc. occurring
        // inside the JSON are inserted literally instead of being interpreted
        // as replacement patterns.
        const promptWithData = promptTemplate
            .replace('{crossToolAnalysisJson}', () => toJson(crossToolAnalysis))
            .replace('{decisionMatricesJson}', () => toJson(decisionMatrices))
            .replace('{usageRecommendationsJson}', () => toJson(usageRecommendations))
            .replace('{toolMetadataJson}', () => toJson(toolMetadata));
        const aiResponse = await this.aiProvider.sendMessage(promptWithData);
        return aiResponse.content; // the AI-generated markdown, unmodified
    }

    /**
     * Produces short headline sentences about the analysis.
     *
     * @param {object} crossToolAnalysis - Result of `analyzeCrossToolPerformance`.
     * @returns {string[]} Two count sentences, plus a performance-spread
     *   sentence when at least one model is present.
     */
    extractKeyFindings(crossToolAnalysis) {
        const findings = [];
        findings.push(`${crossToolAnalysis.modelPerformances.length} models evaluated across ${Object.keys(crossToolAnalysis.toolSpecificLeaders).length} tool types`);
        findings.push(`${crossToolAnalysis.universalPerformers.length} models demonstrated consistent cross-tool performance`);
        const scores = crossToolAnalysis.modelPerformances.map((m) => m.averageScore);
        // Math.max/min of an empty list are -Infinity/Infinity, so the spread
        // is only reported when there is something to measure.
        if (scores.length > 0) {
            const maxScore = Math.max(...scores);
            const minScore = Math.min(...scores);
            findings.push(`Performance spread: ${(maxScore - minScore).toFixed(3)} (${maxScore.toFixed(3)} - ${minScore.toFixed(3)})`);
        }
        return findings;
    }

    /**
     * Splits models into three tiers by reliability, consistency and price.
     *
     * - 'Production Ready': reliability >= 0.8 and consistency >= 0.7.
     * - 'Cost-Optimized': not production ready, reliability >= 0.7,
     *   consistency >= 0.6 and input + output price below 10.
     * - 'Avoid for Production': everything else.
     *
     * @param {object[]} modelPerformances
     * @returns {Object<string, string[]>} Model ids per tier, each ordered by
     *   `averageScore` descending.
     */
    categorizeModelTiers(modelPerformances) {
        const sorted = [...modelPerformances].sort((a, b) => b.averageScore - a.averageScore);
        const productionReady = sorted.filter(m => m.reliabilityScore >= 0.8 && m.consistencyAcrossTools >= 0.7);
        const productionReadySet = new Set(productionReady);
        const costOptimized = sorted.filter(m => m.reliabilityScore >= 0.7 &&
            m.consistencyAcrossTools >= 0.6 &&
            !productionReadySet.has(m) &&
            (m.pricing.input_cost_per_million_tokens + m.pricing.output_cost_per_million_tokens) < 10);
        const costOptimizedSet = new Set(costOptimized);
        const avoidForProduction = sorted.filter(m => !productionReadySet.has(m) && !costOptimizedSet.has(m));
        return {
            'Production Ready': productionReady.map(m => m.modelId),
            'Cost-Optimized': costOptimized.map(m => m.modelId),
            'Avoid for Production': avoidForProduction.map(m => m.modelId)
        };
    }

    /**
     * Summarises cross-tool patterns: the three models with the highest
     * cross-tool consistency plus the leaders and universal performers taken
     * straight from the analysis.
     *
     * @param {object} crossToolAnalysis - Result of `analyzeCrossToolPerformance`.
     * @returns {{consistencyLeaders: string[], toolSpecificLeaders: object, universalPerformers: string[]}}
     */
    identifyCrossToolPatterns(crossToolAnalysis) {
        return {
            consistencyLeaders: Object.entries(crossToolAnalysis.crossToolConsistency)
                .sort(([, a], [, b]) => b - a)
                .slice(0, 3)
                .map(([model]) => model),
            toolSpecificLeaders: crossToolAnalysis.toolSpecificLeaders,
            universalPerformers: crossToolAnalysis.universalPerformers
        };
    }

    /**
     * Picks the top entry of four decision matrices as labelled production
     * recommendations; 'None' when a matrix is empty.
     *
     * @param {object} decisionMatrices - Result of `generateDecisionMatrices`.
     * @returns {Object<string, string>}
     */
    generateProductionRecommendations(decisionMatrices) {
        return {
            'Primary Production Model': decisionMatrices.qualityLeaders[0]?.modelId || 'None',
            'Cost-Optimized Alternative': decisionMatrices.costEffective[0]?.modelId || 'None',
            'High-Reliability Option': decisionMatrices.reliabilityFocused[0]?.modelId || 'None',
            'Balanced General Use': decisionMatrices.balanced[0]?.modelId || 'None'
        };
    }

    /**
     * Estimated price per 1M tokens assuming a 50/50 input/output split.
     *
     * @param {object} [model] - Model record with a `pricing` member.
     * @returns {string} Price with two decimals; '0.00' when the model, its
     *   pricing, or its input price is missing or zero.
     */
    calculateCostEstimate(model) {
        if (!model?.pricing?.input_cost_per_million_tokens) {
            return '0.00';
        }
        const avgCost = (model.pricing.input_cost_per_million_tokens + model.pricing.output_cost_per_million_tokens) / 2;
        return avgCost.toFixed(2);
    }

    /**
     * Extracts the model name from an underscore-separated id such as
     * "vercel_claude-sonnet-4-5-20250929_2025-10-15".
     *
     * @param {string} fullModelId
     * @returns {string} The second underscore-separated segment, or the input
     *   unchanged when it contains no underscore.
     */
    extractBaseModelId(fullModelId) {
        const parts = fullModelId.split('_');
        if (parts.length >= 2) {
            return parts[1];
        }
        return fullModelId;
    }

    /**
     * Generates graphs and replaces `[GRAPH:<name>]` placeholders in the
     * markdown report.
     *
     * Placeholders of graphs that failed are replaced with a failure note
     * first; the remaining known placeholders are then substituted from the
     * mapping table. If graph generation throws, every placeholder becomes a
     * generic failure note and the error is logged rather than rethrown.
     *
     * @param {string} markdownContent
     * @param {object[]} modelPerformances - Forwarded to the graph generator.
     * @param {*} graphsToGenerate - Forwarded to the graph generator.
     * @returns {Promise<string>} The markdown with placeholders substituted.
     */
    async addGraphsToReport(markdownContent, modelPerformances, graphsToGenerate) {
        const graphGenerator = new graph_generator_js_1.GraphGenerator('./eval/analysis/platform/graphs');
        try {
            const graphResults = await graphGenerator.generateAllGraphs(modelPerformances, graphsToGenerate);
            let updatedMarkdown = markdownContent;

            // Failures are handled before the generic substitution below;
            // otherwise the placeholder would already be gone and the failure
            // note could never be inserted.
            for (const [graphName, result] of Object.entries(graphResults)) {
                if (result.success) {
                    console.log(` ✅ ${graphName}: ${result.graphPath}`);
                    continue;
                }
                console.warn(` ⚠️ ${graphName}: ${result.error}`);
                const failureNote = `*Graph generation failed: ${result.error}*`;
                // Replacer function keeps "$" sequences in the error text literal.
                updatedMarkdown = updatedMarkdown.replace(`[GRAPH:${graphName}]`, () => failureNote);
            }

            // NOTE(review): every replacement value is an empty string here,
            // so placeholders are simply removed; presumably image markdown
            // was intended — confirm against the original source.
            const graphMappings = {
                '[GRAPH:performance-tiers]': '',
                '[GRAPH:cost-vs-quality]': '',
                '[GRAPH:reliability-comparison]': '',
                '[GRAPH:tool-performance-heatmap]': '',
                '[GRAPH:context-window-correlation]': ''
            };
            for (const [placeholder, imageMarkdown] of Object.entries(graphMappings)) {
                updatedMarkdown = updatedMarkdown.replace(placeholder, () => imageMarkdown);
            }
            return updatedMarkdown;
        }
        catch (error) {
            console.error('⚠️ Failed to generate graphs, returning report without visualizations:', error);
            return markdownContent.replace(/\[GRAPH:[^\]]+\]/g, '*Graph generation failed*');
        }
    }

    /**
     * Writes the markdown report to `outputPath`, creating the parent
     * directory when it does not exist.
     *
     * @param {string} markdownContent
     * @param {string} [outputPath]
     * @returns {Promise<void>}
     */
    async saveSynthesisReport(markdownContent, outputPath = './eval/analysis/platform/synthesis-report.md') {
        const dir = path.dirname(outputPath);
        if (!fs.existsSync(dir)) {
            fs.mkdirSync(dir, { recursive: true });
        }
        fs.writeFileSync(outputPath, markdownContent);
        console.log(`✅ Platform synthesis report saved: ${outputPath}`);
    }
}
exports.PlatformSynthesizer = PlatformSynthesizer;