@mettamatt/code-reasoning
Version:
Enhanced MCP server for code reasoning using sequential thinking methodology, optimized for programming tasks
361 lines (360 loc) • 15.5 kB
JavaScript
/**
* Main evaluator for prompt testing
*/
import * as fs from 'fs';
import * as path from 'path';
import { pathToFileURL } from 'url';
import * as dotenv from 'dotenv';
import chalk from 'chalk';
import { PROMPT_TEST_SCENARIOS } from './scenarios.js';
import { callAPI, evaluateQuality } from './api.js';
import { getActivePrompt, ALL_PROMPTS, setActivePrompt, setCustomPrompt, SYSTEM_PROMPT, } from './core-prompts.js';
import { getPaths, selectFromList, promptUser, closeReadline, formatDate } from './utils.js';
// Resolve project directories (evaluations root, reports output) from the shared utility
const { evaluationsDir, reportsDir } = getPaths();
// Load environment variables (e.g. ANTHROPIC_API_KEY) from the .env file
// in the evaluations directory rather than from the process cwd
const envPath = path.join(evaluationsDir, '.env');
console.log(`Loading environment from: ${envPath}`);
dotenv.config({ path: envPath });
// Ensure the reports directory exists before any report is written
if (!fs.existsSync(reportsDir)) {
fs.mkdirSync(reportsDir, { recursive: true });
}
/**
 * Validate a thought chain against the tool's parameter contract.
 *
 * Runs five independent checks — required fields, sequential numbering,
 * proper termination, branching parameters, revision parameters — and
 * returns one result object per check.
 *
 * @param {Array<object>} thoughtChain - Thoughts returned by the model.
 * @returns {Array<{name: string, passed: boolean, details: string}>}
 */
function checkParameterAdherence(thoughtChain) {
const checks = [];
// Every thought must carry the four mandatory fields with correct types.
checks.push({
name: 'requiredParameters',
passed: thoughtChain.every(t => typeof t.thought === 'string' &&
typeof t.thought_number === 'number' &&
typeof t.total_thoughts === 'number' &&
typeof t.next_thought_needed === 'boolean'),
details: 'All thoughts must have required parameters',
});
// thought_number must run 1, 2, 3, ... in array order.
checks.push({
name: 'sequentialNumbering',
passed: thoughtChain.every((t, i) => t.thought_number === i + 1),
details: 'Thought numbers must be sequential',
});
// The chain must end with next_thought_needed === false. Boolean()
// guards the empty-chain case, where lastThought is undefined and the
// expression would otherwise yield undefined instead of false.
const lastThought = thoughtChain[thoughtChain.length - 1];
checks.push({
name: 'properTermination',
passed: Boolean(lastThought && lastThought.next_thought_needed === false),
details: 'Final thought must have next_thought_needed set to false',
});
// Branching thoughts must reference an earlier thought and carry a string id.
const branchingThoughts = thoughtChain.filter(t => t.branch_from_thought !== undefined && t.branch_id !== undefined);
const validBranching = branchingThoughts.every(t => typeof t.branch_from_thought === 'number' &&
typeof t.branch_id === 'string' &&
t.branch_from_thought >= 1 &&
t.branch_from_thought < t.thought_number);
checks.push({
name: 'branchingParameters',
passed: branchingThoughts.length === 0 || validBranching,
details: 'Branching thoughts must have valid branch_from_thought and branch_id',
});
// Revisions must point at a strictly earlier thought. (The filter already
// guarantees is_revision === true, so it is not re-checked here.)
const revisionThoughts = thoughtChain.filter(t => t.is_revision === true);
const validRevisions = revisionThoughts.every(t => typeof t.revises_thought === 'number' &&
t.revises_thought >= 1 &&
t.revises_thought < t.thought_number);
checks.push({
name: 'revisionParameters',
passed: revisionThoughts.length === 0 || validRevisions,
details: 'Revision thoughts must have is_revision=true and valid revises_thought',
});
return checks;
}
/**
 * Evaluate a thought chain for a scenario
 *
 * Runs parameter-adherence checks, scores solution quality via the API,
 * and assembles a self-contained result record (prompts and scenario
 * details embedded so reports can stand alone).
 *
 * @param {string} apiKey - Anthropic API key.
 * @param {object} scenario - Scenario definition (id, name, problem, ...).
 * @param {Array<object>} thoughtChain - Thoughts returned by the model.
 * @param {object} [options] - API options (model, temperature, ...).
 * @returns {Promise<object>} The assembled test result.
 */
async function evaluateThoughtChain(apiKey, scenario, thoughtChain, options = {}) {
const checks = checkParameterAdherence(thoughtChain);
const allChecksPassed = checks.every(c => c.passed);
// Ask the judge model for a quality score of the solution itself
const qualityResult = await evaluateQuality(apiKey, scenario, thoughtChain, options);
// Semicolon-separated summary of failed checks (undefined when all passed)
let failureMessage;
if (!allChecksPassed) {
failureMessage = checks
.filter(c => !c.passed)
.map(c => `${c.name}: ${c.details}`)
.join('; ');
}
const activePrompt = getActivePrompt();
return {
scenarioId: scenario.id,
scenarioName: scenario.name,
status: allChecksPassed ? 'PASS' : 'FAIL',
checks,
failureMessage,
thoughtChain,
date: new Date().toISOString(),
modelId: options.model || 'unknown',
temperature: options.temperature,
qualityScore: qualityResult.success ? qualityResult.qualityScore : undefined,
qualityJustification: qualityResult.success ? qualityResult.justification : undefined,
// Include all prompts for standalone report
promptName: activePrompt.key,
corePrompt: activePrompt.prompt,
systemPrompt: SYSTEM_PROMPT,
scenarioPrompt: scenario.problem,
scenarioDetails: scenario,
};
}
/**
 * Run evaluation on scenarios
 *
 * Calls the API once per scenario, evaluates each returned thought
 * chain, and prints a per-scenario status line. Scenarios whose API
 * call fails, returns no chain, or throws are logged and skipped
 * rather than aborting the run.
 *
 * @param {string} apiKey - Anthropic API key.
 * @param {Array<object>} scenarios - Scenarios to evaluate.
 * @param {object} [options] - API options (model, temperature, ...).
 * @returns {Promise<Array<object>>} Results for scenarios that completed.
 */
async function runEvaluation(apiKey, scenarios, options = {}) {
const results = [];
for (const [index, scenario] of scenarios.entries()) {
console.log(chalk.blue(`\nEvaluating scenario ${index + 1} of ${scenarios.length}: ${scenario.name}`));
try {
const apiResponse = await callAPI(apiKey, scenario.problem, options);
if (!apiResponse.success) {
console.error(chalk.red(`API error: ${apiResponse.error}`));
continue;
}
const { thoughtChain } = apiResponse;
if (!thoughtChain || thoughtChain.length === 0) {
console.error(chalk.red('No thought chain found in API response'));
continue;
}
const evaluation = await evaluateThoughtChain(apiKey, scenario, thoughtChain, options);
results.push(evaluation);
// Report pass/fail, any failure reason, and the quality score
let statusText;
if (evaluation.status === 'PASS') {
statusText = chalk.green('PASSED');
}
else {
statusText = chalk.red('FAILED');
}
console.log(`\nScenario ${scenario.name}: ${statusText}`);
if (evaluation.status === 'FAIL' && evaluation.failureMessage) {
console.log(`Failure reason: ${chalk.red(evaluation.failureMessage)}`);
}
if (evaluation.qualityScore !== undefined) {
console.log(`Quality score: ${chalk.yellow(evaluation.qualityScore + '%')}`);
}
}
catch (error) {
console.error(chalk.red(`Error evaluating scenario ${scenario.name}:`), error);
}
}
return results;
}
/**
 * Generate report from results
 *
 * Builds a standalone Markdown report: prompt information, a summary
 * with pass rate and average quality, and a per-scenario detail section
 * including parameter checks and the full thought chain.
 *
 * @param {Array<object>} results - Results from runEvaluation.
 * @returns {string} Markdown report content.
 */
function generateReport(results) {
// Get active prompt info
const activePrompt = getActivePrompt();
// Create report content
let report = `# Prompt Evaluation Report\n\n`;
report += `Generated: ${new Date().toISOString()}\n\n`;
// Prompt information is embedded so the report stands alone
report += `## Prompt Information\n\n`;
report += `**Prompt Name:** ${activePrompt.key}\n\n`;
report += `**System Prompt:**\n\`\`\`\n${SYSTEM_PROMPT}\n\`\`\`\n\n`;
report += `**Core Prompt:**\n\`\`\`\n${activePrompt.prompt}\n\`\`\`\n\n`;
// Add summary
report += `## Summary\n\n`;
report += `Total scenarios evaluated: ${results.length}\n`;
const passedCount = results.filter(r => r.status === 'PASS').length;
// Guard the division so an empty result set reports 0% rather than NaN%
const passRate = results.length > 0 ? Math.round((passedCount / results.length) * 100) : 0;
report += `Scenarios passed: ${passedCount} (${passRate}%)\n`;
// Average quality across scenarios that received a score
const qualityScores = results.filter(r => r.qualityScore !== undefined).map(r => r.qualityScore);
if (qualityScores.length > 0) {
const avgQuality = qualityScores.reduce((sum, score) => sum + score, 0) / qualityScores.length;
report += `Average solution quality: ${Math.round(avgQuality)}%\n\n`;
}
// Add detailed results
report += `## Detailed Results\n\n`;
for (const result of results) {
report += `### ${result.scenarioName}\n\n`;
report += `**Status:** ${result.status}\n`;
if (result.qualityScore !== undefined) {
report += `**Quality Score:** ${result.qualityScore}%\n`;
}
if (result.qualityJustification) {
report += `**Quality Justification:** ${result.qualityJustification}\n`;
}
if (result.failureMessage) {
report += `**Failure Reason:** ${result.failureMessage}\n`;
}
// Add check results
report += `\n**Parameter Checks:**\n\n`;
for (const check of result.checks) {
const status = check.passed ? '✅ PASS' : '❌ FAIL';
report += `- ${status} ${check.name}: ${check.details}\n`;
}
// Add thought chain
report += `\n**Thought Chain:**\n\n`;
for (const thought of result.thoughtChain) {
report += `**Thought ${thought.thought_number}/${thought.total_thoughts}:**\n\n`;
report += `${thought.thought}\n\n`;
// Per-thought metadata line; revision/branch fields only when present
const metadata = [];
metadata.push(`next_thought_needed: ${thought.next_thought_needed}`);
if (thought.is_revision) {
metadata.push(`is_revision: true`);
metadata.push(`revises_thought: ${thought.revises_thought}`);
}
if (thought.branch_from_thought) {
metadata.push(`branch_from_thought: ${thought.branch_from_thought}`);
metadata.push(`branch_id: "${thought.branch_id}"`);
}
report += `*Metadata: ${metadata.join(', ')}*\n\n`;
}
report += `---\n\n`;
}
return report;
}
/**
 * Save report to file
 *
 * Writes the Markdown report into the reports directory, naming the
 * file after the active prompt and a formatted timestamp.
 *
 * @param {string} report - Markdown report content.
 * @returns {string} Path of the written file.
 */
function saveReport(report) {
const filename = `report-${getActivePrompt().key}-${formatDate()}.md`;
const filepath = path.join(reportsDir, filename);
fs.writeFileSync(filepath, report);
return filepath;
}
/**
 * Select prompt interactive function
 *
 * Presents the predefined core prompts (plus a CUSTOM option) and
 * applies the user's choice. A custom prompt is read line by line
 * until a line containing only ".done" is entered.
 */
async function selectPromptInteractive() {
console.log('\n=== SELECT CORE PROMPT ===');
// Build the menu: each known prompt shows a 60-character preview
const options = Object.entries(ALL_PROMPTS).map(([key, value]) => ({
key,
value: value.substring(0, 60) + '...',
}));
options.push({ key: 'CUSTOM', value: 'Enter your own custom prompt' });
const selection = await selectFromList(options, item => `${item.key}: ${item.value}`, 'Select a prompt to use:', 0);
// Predefined prompt: activate it and return early
if (selection.key !== 'CUSTOM') {
setActivePrompt(selection.key);
console.log(chalk.green(`\nSelected prompt: ${selection.key}`));
return;
}
// Custom prompt: collect lines until the ".done" sentinel
console.log('\nEnter your custom prompt. Type .done on a new line when finished:');
const collected = [];
for (;;) {
const line = await promptUser('');
if (line.trim() === '.done') {
break;
}
collected.push(line);
}
setCustomPrompt(collected.join('\n').trim());
console.log(chalk.green('\nCustom prompt set successfully'));
}
/**
 * Main CLI menu
 *
 * Entry point for the interactive evaluator: verifies the API key,
 * builds default model options from environment variables, then loops
 * over a menu (select prompt / run one / run all / list / exit) until
 * the user exits. Closes the shared readline interface on the way out.
 */
async function main() {
console.log(chalk.bold.blue('\n--------------------------------------------------'));
console.log(chalk.bold.blue('CODE REASONING TOOL - PROMPT EVALUATOR'));
console.log(chalk.bold.blue('--------------------------------------------------\n'));
// Check for API key (loaded from the evaluations .env at module init)
const apiKey = process.env.ANTHROPIC_API_KEY;
if (!apiKey) {
console.error(chalk.red('Error: ANTHROPIC_API_KEY environment variable not set'));
console.error(chalk.red('Please add it to your .env file'));
return;
}
// Default options, overridable via CLAUDE_MODEL / TEMPERATURE / MAX_TOKENS env vars
const defaultOptions = {
model: process.env.CLAUDE_MODEL || 'claude-3-7-sonnet-20250219',
temperature: parseFloat(process.env.TEMPERATURE || '0.2'),
maxTokens: parseInt(process.env.MAX_TOKENS || '4000'),
};
// Main menu options
const options = [
{ label: 'Select core prompt', value: 'select-prompt' },
{ label: 'Run evaluation on specific scenario', value: 'run-specific' },
{ label: 'Run evaluation on all scenarios', value: 'run-all' },
{ label: 'List available scenarios', value: 'list-scenarios' },
{ label: 'Exit', value: 'exit' },
];
let running = true;
while (running) {
const activePrompt = getActivePrompt();
console.log(chalk.yellow(`\nActive prompt: ${activePrompt.key}`));
const selection = await selectFromList(options, opt => opt.label, 'Select an option:', 0);
// Variables are declared outside the switch because `case` bodies share
// one scope and redeclaring in multiple cases would be a syntax error
let scenario;
let results;
let report;
let reportPath;
let allResults;
let allReport;
let allReportPath;
switch (selection.value) {
case 'select-prompt':
await selectPromptInteractive();
break;
case 'list-scenarios':
console.log('\n=== AVAILABLE SCENARIOS ===');
PROMPT_TEST_SCENARIOS.forEach((scenario, index) => {
console.log(`${index + 1}. ${scenario.name} (${scenario.difficulty}, ${scenario.targetSkill})`);
});
await promptUser('\nPress Enter to continue...');
break;
case 'run-specific':
// Evaluate a single user-chosen scenario and save its report
console.log('\n=== RUN SPECIFIC SCENARIO ===');
scenario = await selectFromList(PROMPT_TEST_SCENARIOS, s => `${s.name} (${s.difficulty}, ${s.targetSkill})`, 'Select a scenario to evaluate:', 0);
console.log(chalk.yellow(`\nRunning evaluation for scenario: ${scenario.name}`));
results = await runEvaluation(apiKey, [scenario], defaultOptions);
if (results.length > 0) {
report = generateReport(results);
reportPath = saveReport(report);
console.log(chalk.green(`\nReport saved to: ${reportPath}`));
}
await promptUser('\nPress Enter to continue...');
break;
case 'run-all':
// Evaluate every scenario and save one combined report
console.log('\n=== RUN ALL SCENARIOS ===');
console.log(chalk.yellow(`Running evaluation for all ${PROMPT_TEST_SCENARIOS.length} scenarios...`));
allResults = await runEvaluation(apiKey, PROMPT_TEST_SCENARIOS, defaultOptions);
if (allResults.length > 0) {
allReport = generateReport(allResults);
allReportPath = saveReport(allReport);
console.log(chalk.green(`\nReport saved to: ${allReportPath}`));
}
await promptUser('\nPress Enter to continue...');
break;
case 'exit':
console.log('\nExiting...');
running = false;
break;
}
}
closeReadline();
}
// CLI entry point
// For ES modules, we need a different approach than require.main === module
const isMainModule = import.meta.url === `file://${process.argv[1]}`;
if (isMainModule) {
main().catch(console.error);
}
// Export functions for programmatic use
export { runEvaluation, evaluateThoughtChain, generateReport, saveReport };