@vfarcic/dot-ai
Version:
AI-powered development productivity platform that enhances software development workflows through intelligent automation and AI-driven assistance
411 lines (387 loc) • 18 kB
JavaScript
;
/**
* Evaluation Runner for Multi-Model Comparative Analysis
*
* Runs comparative evaluation on available datasets from multiple models.
* Automatically detects and evaluates every dataset type registered in
* EVALUATOR_CONFIG (remediation, recommendation, capability, pattern, policy).
*/
// Interop helper (appears to be compiler-generated): exposes m[k] on o under
// the name k2 (k2 defaults to k). An implementation already present on `this`
// is reused; otherwise one of the two variants below is chosen depending on
// whether Object.create exists.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
// Fall back to a live getter when m has no own descriptor for k, when the
// descriptor is an accessor and m is not flagged __esModule, or when it is a
// data property that is writable or configurable. Otherwise the descriptor
// read from m is applied to o unchanged.
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
// Variant without Object.create: copy the current value by plain assignment.
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
// Interop helper (appears to be compiler-generated): stores v as the "default"
// member of o. Uses an enumerable defineProperty when Object.create exists,
// plain assignment otherwise. An implementation already present on `this` is
// reused.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
// Interop helper (appears to be compiler-generated): wraps a required module
// in a namespace-like object. Modules flagged __esModule are returned as-is;
// for anything else a new object is built that re-exposes every own key except
// "default" and carries the module itself as its "default" member.
var __importStar = (this && this.__importStar) || (function () {
// ownKeys replaces itself on first call with Object.getOwnPropertyNames, or
// with a for-in/hasOwnProperty fallback when that function is unavailable.
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
// null/undefined modules contribute no named members; "default" is still set below.
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
const remediation_comparative_js_1 = require("./evaluators/remediation-comparative.js");
const recommendation_comparative_js_1 = require("./evaluators/recommendation-comparative.js");
const capability_comparative_js_1 = require("./evaluators/capability-comparative.js");
const pattern_comparative_js_1 = require("./evaluators/pattern-comparative.js");
const policy_comparative_js_1 = require("./evaluators/policy-comparative.js");
const promises_1 = require("fs/promises");
const child_process_1 = require("child_process");
const util_1 = require("util");
const execAsync = (0, util_1.promisify)(child_process_1.exec);
// Registry of supported evaluation types, keyed by the type name accepted on
// the command line. For each type:
//   evaluator - class instantiated with the datasets directory in runEvaluation
//   prefix    - dataset filename prefix matched (startsWith) in detectAvailableDatasets
//   title     - heading of the Markdown report and `tool` field of the JSON report
// Iteration order of this object is the order in which evaluations are run.
const EVALUATOR_CONFIG = {
remediation: {
evaluator: remediation_comparative_js_1.RemediationComparativeEvaluator,
prefix: 'remediate_',
title: 'Remediation AI Model Comparison Report'
},
recommendation: {
evaluator: recommendation_comparative_js_1.RecommendationComparativeEvaluator,
prefix: 'recommend_',
title: 'Recommendation AI Model Comparison Report'
},
capability: {
evaluator: capability_comparative_js_1.CapabilityComparativeEvaluator,
prefix: 'capability_',
title: 'Capability AI Model Comparison Report'
},
pattern: {
evaluator: pattern_comparative_js_1.PatternComparativeEvaluator,
prefix: 'pattern_',
title: 'Pattern AI Model Comparison Report'
},
policy: {
evaluator: policy_comparative_js_1.PolicyComparativeEvaluator,
prefix: 'policy_',
title: 'Policy AI Model Comparison Report'
}
};
/**
 * Builds the Markdown comparison report for one evaluation type.
 *
 * @param {Array<object>} results - Per-scenario results. Fields read here:
 *   key, bestModel, score, modelCount, confidence, modelRankings
 *   ({ rank, model, score }[]) and comment.
 * @param {object} stats - Dataset statistics; `availableModels` (array) and
 *   `totalDatasets` are read.
 * @param {string} evaluationType - Key of EVALUATOR_CONFIG; selects the title.
 * @param {object} [finalAssessment] - Optional cross-scenario assessment; only
 *   its `overall_assessment` member is used.
 * @returns {string} The complete Markdown document.
 *
 * NOTE(review): the glyph following "###" in four section headings is
 * mis-encoded text whose original character cannot be determined from this
 * file; those headings are reproduced untouched.
 */
function generateMarkdownReport(results, stats, evaluationType, finalAssessment) {
    const timestamp = new Date().toISOString();
    const reportTitle = EVALUATOR_CONFIG[evaluationType].title;
    const overallAssessment = finalAssessment?.overall_assessment || null;

    // An assessment may arrive without every sub-section; normalise once so
    // the template never dereferences a missing object.
    const recommendations = overallAssessment?.production_recommendations ?? {};
    const primaryChoice = recommendations.primary ?? 'Not specified';
    const secondaryChoice = recommendations.secondary ?? 'Not specified';
    const avoidList = Array.isArray(recommendations.avoid) ? recommendations.avoid : [];
    const specializedUse = Object.entries(recommendations.specialized_use || {});

    // Average score per model across all scenarios (supplementary table only).
    const modelScores = new Map();
    for (const result of results) {
        for (const ranking of result.modelRankings || []) {
            if (!modelScores.has(ranking.model)) {
                modelScores.set(ranking.model, []);
            }
            modelScores.get(ranking.model).push(ranking.score);
        }
    }
    const averageRows = Array.from(modelScores, ([model, scores]) => {
        const avg = scores.reduce((a, b) => a + b, 0) / scores.length;
        return [model, Math.round(avg * 1000) / 1000];
    })
        .sort((a, b) => b[1] - a[1])
        .map(([model, avgScore]) => `| ${model} | ${avgScore} | See AI assessment above |`)
        .join('\n');

    const winnerSection = overallAssessment
        ? `\n**${overallAssessment.winner}**\n${overallAssessment.rationale}\n`
        : 'Overall assessment not available';

    const reliabilitySection = Array.isArray(overallAssessment?.reliability_ranking)
        ? overallAssessment.reliability_ranking
            .map((ranking, index) => `${index + 1}. **${ranking.model}** (${Math.round(ranking.reliability_score * 100)}%) - ${ranking.reliability_notes}`)
            .join('\n')
        : 'Reliability rankings not available';

    const specializedSection = specializedUse.length > 0
        ? '\n**Specialized Use Cases:**\n' + specializedUse
            .map(([useCase, model]) => `- **${useCase}**: ${model}`)
            .join('\n')
        : '';
    const recommendationsSection = overallAssessment
        ? [
            '',
            `- **Primary Choice**: ${primaryChoice}`,
            `- **Secondary Option**: ${secondaryChoice}`,
            `- **Avoid for Production**: ${avoidList.length ? avoidList.join(', ') : 'None'}`,
            specializedSection,
            '',
        ].join('\n')
        : 'Production recommendations not available';

    // Strip the "<type> comparative " prefix for every registered evaluation
    // type, not only remediation/recommendation. Built once, outside the loop.
    const typePrefixPattern = new RegExp(`(${Object.keys(EVALUATOR_CONFIG).join('|')}) comparative `);
    const scenarioSections = results.map((result, index) => {
        const scenarioTitle = (result.key || 'unknown')
            .replace(/_/g, ' ')
            .replace(typePrefixPattern, '')
            .toUpperCase();
        const rankingLines = result.modelRankings
            ? result.modelRankings.map((rank) => `${rank.rank}. **${rank.model}** - ${rank.score}`).join('\n')
            : 'No detailed rankings available';
        return [
            `### ${index + 1}. ${scenarioTitle}`,
            `**Winner**: ${result.bestModel} (Score: ${result.score})`,
            `**Models Compared**: ${result.modelCount}`,
            `**Confidence**: ${result.confidence ? Math.round(result.confidence * 100) : 0}%`,
            '#### Rankings',
            rankingLines,
            '#### Analysis',
            `${result.comment ?? 'No analysis available'}`,
            '---',
        ].join('\n');
    }).join('\n\n');

    const selectionGuide = overallAssessment
        ? [
            '',
            '### Key Insights',
            `${overallAssessment.key_insights}`,
            '### Recommended Selection Strategy',
            `- **For Production Use**: Choose ${primaryChoice}`,
            `- **For Secondary Option**: Consider ${secondaryChoice}`,
            avoidList.length ? `- **Avoid**: ${avoidList.join(', ')} (reliability concerns)` : '',
            '### Decision Framework',
            'The AI assessment prioritizes **reliability and consistency** over peak performance. Models that fail completely in any scenario are heavily penalized, ensuring production-ready recommendations.',
            '',
        ].join('\n')
        : 'AI model selection guide not available';

    return [
        `# ${reportTitle}`,
        `**Generated**: ${timestamp}`,
        `**Scenarios Analyzed**: ${results.length}`,
        `**Models Evaluated**: ${stats.availableModels.length}`,
        `**Total Datasets**: ${stats.totalDatasets}`,
        '## Executive Summary',
        '### ๐ Overall Winner (AI Assessment)',
        winnerSection,
        '### ๐ AI Reliability Rankings',
        reliabilitySection,
        '### ๐ Production Recommendations',
        recommendationsSection,
        '### ๐ Supplementary Statistics (Reference Only)',
        '| Model | Avg Score | Notes |',
        '|-------|-----------|-------|',
        averageRows,
        '## Detailed Scenario Results',
        scenarioSections,
        '## AI Model Selection Guide',
        selectionGuide,
        '---',
        '## Report Attribution',
        'Report generated by DevOps AI Toolkit Comparative Evaluation System',
        '',
    ].join('\n');
}
function loadModelMetadata() {
try {
// eslint-disable-next-line @typescript-eslint/no-require-imports -- Dynamic require for CLI script
const fs = require('fs');
// eslint-disable-next-line @typescript-eslint/no-require-imports -- Dynamic require for CLI script
const path = require('path');
const metadataPath = path.join(__dirname, 'model-metadata.json');
if (!fs.existsSync(metadataPath)) {
console.error('โ Model metadata file not found');
console.error('๐ Pricing and capabilities data required for cost analysis');
console.error('');
console.error('๐ To create model metadata, run:');
console.error(' /update-model-metadata');
console.error('');
process.exit(1);
}
const metadata = JSON.parse(fs.readFileSync(metadataPath, 'utf8'));
// Check if metadata is older than 30 days
const metadataAge = Date.now() - new Date(metadata.lastUpdated).getTime();
const thirtyDays = 30 * 24 * 60 * 60 * 1000;
if (metadataAge > thirtyDays) {
console.error('โ Model metadata is over 30 days old (last updated: ' + metadata.lastUpdated + ')');
console.error('๐ Pricing and capabilities data may be outdated, affecting cost analysis accuracy');
console.error('');
console.error('๐ To update model metadata, run:');
console.error(' /update-model-metadata');
console.error('');
process.exit(1);
}
console.log('โ
Model metadata loaded (updated: ' + metadata.lastUpdated + ')');
return metadata;
}
catch (error) {
console.error('โ Failed to load model metadata:', error instanceof Error ? error.message : String(error));
console.error('๐ To create model metadata, run: /update-model-metadata');
process.exit(1);
}
}
/**
 * Assembles the machine-readable counterpart of the Markdown report.
 *
 * @param {Array<object>} results - Per-scenario results, embedded as-is.
 * @param {object} stats - Dataset statistics, embedded as `summary`; its
 *   `availableModels.length` and `totalDatasets` also feed the metadata header.
 * @param {string} evaluationType - Key of EVALUATOR_CONFIG.
 * @param {object} modelMetadata - Loaded model metadata; only `.models` is embedded.
 * @param {object} [finalAssessment] - Cross-scenario assessment, or a falsy
 *   value, which is stored as null.
 * @returns {object} Plain object ready for JSON.stringify.
 */
function generateJsonReport(results, stats, evaluationType, modelMetadata, finalAssessment) {
    const generated = new Date().toISOString();
    const { availableModels, totalDatasets } = stats;
    const header = {
        reportType: 'comparative-evaluation',
        evaluationType,
        generated,
        scenariosAnalyzed: results.length,
        modelsEvaluated: availableModels.length,
        totalDatasets,
        tool: EVALUATOR_CONFIG[evaluationType].title
    };
    const report = { metadata: header };
    report.modelMetadata = modelMetadata.models;
    // A falsy assessment is normalised to null so the key is always present.
    report.overallAssessment = finalAssessment ? finalAssessment : null;
    report.results = results;
    report.summary = stats;
    return report;
}
/**
 * Reports, for every evaluation type in EVALUATOR_CONFIG, whether the datasets
 * directory contains at least one file whose name starts with that type's prefix.
 *
 * @param {string} datasetsDir - Directory to list.
 * @param {string} [filterType] - When given, every other type is reported as
 *   false without inspecting the file list.
 * @returns {Promise<Object<string, boolean>>} Availability flag per type. If
 *   anything fails while listing or matching, a warning is printed and every
 *   flag is false.
 */
async function detectAvailableDatasets(datasetsDir, filterType) {
    const evaluationTypes = Object.keys(EVALUATOR_CONFIG);
    try {
        const { readdir } = promises_1;
        const fileNames = await readdir(datasetsDir);
        const hasFileWithPrefix = (prefix) => fileNames.some((name) => name.startsWith(prefix));
        return Object.fromEntries(evaluationTypes.map((type) => {
            const excludedByFilter = Boolean(filterType) && type !== filterType;
            const present = excludedByFilter ? false : hasFileWithPrefix(EVALUATOR_CONFIG[type].prefix);
            return [type, present];
        }));
    }
    catch {
        console.warn('Could not read datasets directory, assuming no datasets available');
        return Object.fromEntries(evaluationTypes.map((type) => [type, false]));
    }
}
/**
 * Runs one comparative evaluation end to end: prints dataset statistics and
 * phases, evaluates all scenarios, obtains the final assessment, and writes a
 * Markdown and a JSON report into ./eval/analysis/individual.
 *
 * @param {string} evaluatorType - Key of EVALUATOR_CONFIG.
 * @param {string} datasetsDir - Directory handed to the evaluator constructor.
 * @param {object} modelMetadata - Loaded model metadata, passed through to the JSON report.
 * @returns {Promise<Array<object>>} The per-scenario results from the evaluator.
 *
 * NOTE(review): bare "๐" glyphs in some log lines are mis-encoded text whose
 * original character cannot be determined from this file; they are reproduced
 * untouched.
 */
async function runEvaluation(evaluatorType, datasetsDir, modelMetadata) {
    const EvaluatorClass = EVALUATOR_CONFIG[evaluatorType].evaluator;
    const evaluator = new EvaluatorClass(datasetsDir);
    // Capitalised type name used in every status line below.
    const typeLabel = evaluatorType.charAt(0).toUpperCase() + evaluatorType.slice(1);
    console.log(`\n🔬 Starting ${typeLabel} Evaluation\n`);
    // Show dataset stats
    console.log('๐ Dataset Analysis:');
    const stats = evaluator.getDatasetStats();
    console.log(`- Total datasets: ${stats.totalDatasets}`);
    console.log(`- Available models: ${stats.availableModels.join(', ')}`);
    console.log(`- Scenarios with multiple models: ${stats.scenariosWithMultipleModels}`);
    console.log(`- Interaction types: ${stats.interactionTypes.join(', ')}`);
    console.log();
    // Show evaluation phases
    console.log('🎯 Evaluation Phases:');
    const phases = evaluator.getEvaluationPhases();
    for (const phase of phases) {
        console.log(`- ${phase.phase}: ${phase.description}`);
        console.log(` Models: ${phase.availableModels.join(', ')}`);
        console.log(` Scenarios: ${phase.scenarioCount}`);
        console.log();
    }
    // Run comparative evaluation on all scenarios
    console.log('๐ Running Comparative Evaluation...\n');
    const results = await evaluator.evaluateAllScenarios();
    console.log(`✅ ${typeLabel} Evaluation Complete! Analyzed ${results.length} scenarios\n`);
    // Conduct final assessment across all scenarios
    const finalAssessment = await evaluator.conductFinalAssessment(results);
    // Generate dual-format reports using final assessment
    const reportContent = generateMarkdownReport(results, stats, evaluatorType, finalAssessment);
    const jsonResults = generateJsonReport(results, stats, evaluatorType, modelMetadata, finalAssessment);
    // Both report paths derive from one directory constant so they cannot drift apart.
    const reportDir = './eval/analysis/individual';
    const markdownPath = `${reportDir}/${evaluatorType}-evaluation.md`;
    const jsonPath = `${reportDir}/${evaluatorType}-results.json`;
    // Use the already-imported promise-based fs API instead of blocking
    // *Sync calls; recursive mkdir succeeds when the directory already exists.
    await (0, promises_1.mkdir)(reportDir, { recursive: true });
    await (0, promises_1.writeFile)(markdownPath, reportContent);
    await (0, promises_1.writeFile)(jsonPath, JSON.stringify(jsonResults, null, 2));
    console.log(`๐ ${typeLabel} reports generated:`);
    console.log(` ๐ Markdown: ${markdownPath}`);
    console.log(` ๐ JSON: ${jsonPath}`);
    // Brief console summary
    console.log(`๐ ${typeLabel} Results:`);
    results.forEach((result, index) => {
        console.log(` ${index + 1}. ${result.key}: ${result.bestModel} (${result.score})`);
    });
    return results;
}
/**
 * CLI entry point: cleans stale debug/result files, verifies model metadata,
 * detects which dataset types are present and runs the matching evaluations.
 *
 * An optional first command-line argument restricts the run to one evaluation
 * type; it must be an own key of EVALUATOR_CONFIG. The process exits with
 * code 1 on an invalid type, when no datasets are found, or when an
 * evaluation throws.
 *
 * NOTE(review): bare "๐" glyphs in some log lines are mis-encoded text whose
 * original character cannot be determined from this file; they are reproduced
 * untouched.
 *
 * @returns {Promise<void>}
 */
async function main() {
    console.log('🔬 Starting Multi-Model Comparative Evaluation\n');
    // Clean old debug files but preserve evaluation datasets
    console.log('🧹 Cleaning old debug files...');
    try {
        await execAsync('find ./tmp/debug-ai -type f ! -name \'*.jsonl\' -delete 2>/dev/null || true');
        await execAsync('mkdir -p ./tmp/debug-ai');
        console.log('✅ Debug files cleaned (datasets preserved)\n');
    }
    catch (error) {
        // Best effort: a failed cleanup must not block the evaluation.
        console.warn('⚠️ Could not clean debug files:', error instanceof Error ? error.message : String(error));
    }
    // Clean old evaluation result files from eval/results
    console.log('🧹 Cleaning old evaluation result files...');
    try {
        await execAsync('rm -f ./eval/results/*_comparative_evaluation_*.jsonl 2>/dev/null || true');
        await execAsync('mkdir -p ./eval/results');
        console.log('✅ Old evaluation results cleaned\n');
    }
    catch (error) {
        console.warn('⚠️ Could not clean old evaluation results:', error instanceof Error ? error.message : String(error));
    }
    // Check model metadata freshness before starting any evaluation work
    const modelMetadata = loadModelMetadata();
    const datasetsDir = './eval/datasets';
    // Parse command line arguments for subset evaluation
    const [requestedType] = process.argv.slice(2);
    let filterType;
    if (requestedType !== undefined) {
        // Compare against own keys only: the `in` operator would also accept
        // inherited names such as "toString" or "constructor".
        if (!Object.keys(EVALUATOR_CONFIG).includes(requestedType)) {
            console.error(`❌ Invalid evaluation type: "${requestedType}"`);
            console.error(`✅ Available types: ${Object.keys(EVALUATOR_CONFIG).join(', ')}`);
            process.exit(1);
        }
        filterType = requestedType;
    }
    const availableDatasets = await detectAvailableDatasets(datasetsDir, filterType);
    console.log('๐ Dataset Detection:');
    for (const [type, available] of Object.entries(availableDatasets)) {
        console.log(`- ${type.charAt(0).toUpperCase() + type.slice(1)} datasets: ${available ? '✅' : '❌'}`);
    }
    if (filterType) {
        console.log(`\n🎯 Running evaluation for: ${filterType}`);
    }
    const hasAnyDatasets = Object.values(availableDatasets).some(Boolean);
    if (!hasAnyDatasets) {
        if (filterType) {
            console.error(`❌ No datasets found for type: ${filterType}`);
        }
        else {
            console.error('❌ No evaluation datasets found. Please run integration tests first to generate datasets.');
        }
        process.exit(1);
    }
    try {
        const allResults = [];
        // One loop serves both modes: with a filter only that type can run,
        // and the "no datasets for the filtered type" case already exited above.
        for (const [type, available] of Object.entries(availableDatasets)) {
            if (!available || (filterType && type !== filterType)) {
                continue;
            }
            const results = await runEvaluation(type, datasetsDir, modelMetadata);
            allResults.push(...results);
        }
        console.log(`\n๐ All Evaluations Complete! Total scenarios analyzed: ${allResults.length}`);
        // Point at the directory runEvaluation actually writes its reports to.
        console.log(`๐ Check ./eval/analysis/individual/ for detailed analysis reports\n`);
    }
    catch (error) {
        console.error('❌ Evaluation failed:', error);
        process.exit(1);
    }
}
// Run if this file is executed directly
if (require.main === module) {
    main().catch((error) => {
        console.error(error);
        // An unexpected rejection must not look like a successful run to the
        // calling shell or CI job; exitCode lets pending output flush first.
        process.exitCode = 1;
    });
}