aiwg
Version:
Cognitive architecture for AI-augmented software development with structured memory, ensemble validation, and closed-loop correction. FAIR-aligned artifacts, 84% cost reduction via human-in-the-loop, standards adopted by 100+ organizations.
636 lines (552 loc) • 20.5 kB
JavaScript
/**
* Best Output Tracker for External Ralph Loop
*
* Tracks quality scores across iterations and selects best output
* (not just final iteration) per REF-015 Self-Refine research.
*
* Research shows quality can fluctuate during iterative refinement,
* with peak quality often occurring at iteration 2-3 before degrading.
*
* @implements @.aiwg/requirements/use-cases/UC-168-best-output-selection.md
* @schema @agentic/code/addons/ralph/schemas/iteration-analytics.yaml
* @research @.aiwg/research/findings/REF-015-self-refine.md
*/
import { existsSync, mkdirSync, readFileSync, writeFileSync, cpSync, rmSync } from 'fs';
import { join, dirname } from 'path';
/**
* @typedef {Object} QualityDimensions
* @property {number} validation - Passes validation checks (0-1)
* @property {number} completeness - All required sections present (0-1)
* @property {number} correctness - Accurate information/behavior (0-1)
* @property {number} readability - Clear, well-structured (0-1)
* @property {number} efficiency - Appropriate length/complexity (0-1)
*/
/**
* @typedef {Object} IterationRecord
* @property {number} iteration_number - Iteration number
* @property {string} timestamp - ISO timestamp
* @property {number} quality_score - Overall quality score 0-100
* @property {number|null} quality_delta - Change from previous iteration
* @property {QualityDimensions} dimensions - Quality dimension scores
* @property {number} tokens_used - Tokens consumed
* @property {number} token_cost_usd - Cost in USD
* @property {number} execution_time_ms - Duration in milliseconds
* @property {string} verification_status - passed|failed|skipped
* @property {string} snapshot_path - Path to artifact snapshot
* @property {string[]} artifacts - Artifact file paths
* @property {string[]} reflections - Self-reflection notes
*/
/**
* @typedef {Object} SelectionCriteria
* @property {string} mode - highest_quality|highest_quality_verified|most_recent_above_threshold
* @property {number} threshold - Minimum quality threshold (0-100)
* @property {boolean} require_verification - Only select verified iterations
*/
/**
* @typedef {Object} SelectionResult
* @property {number} selected_iteration - Selected iteration number
* @property {number} quality_score - Quality score of selected iteration
* @property {string} reason - Explanation of selection
* @property {number} final_iteration - Final iteration number
* @property {number} final_quality - Final iteration quality
* @property {boolean} degradation_detected - Whether quality degraded
*/
/**
* @typedef {Object} TrackingConfig
* @property {string} storage_path - Base storage directory
* @property {SelectionCriteria} selection - Selection criteria
* @property {boolean} keep_all_iterations - Whether to preserve all snapshots
* @property {Object} quality_weights - Dimension weights
*/
// Default quality dimension weights
const DEFAULT_WEIGHTS = {
validation: 0.30,
completeness: 0.25,
correctness: 0.25,
readability: 0.10,
efficiency: 0.10,
};
// Default selection criteria
const DEFAULT_SELECTION = {
mode: 'highest_quality_verified',
threshold: 70,
require_verification: true,
};
export class BestOutputTracker {
/**
* @param {string} loopId - Loop identifier
* @param {Partial<TrackingConfig>} config - Configuration
*/
constructor(loopId, config = {}) {
this.loopId = loopId;
this.config = {
storage_path: config.storage_path || join(process.cwd(), '.aiwg', 'ralph', loopId),
selection: { ...DEFAULT_SELECTION, ...(config.selection || {}) },
keep_all_iterations: config.keep_all_iterations !== false,
quality_weights: { ...DEFAULT_WEIGHTS, ...(config.quality_weights || {}) },
};
this.iterationsDir = join(this.config.storage_path, 'iterations');
this.trackingFile = join(this.config.storage_path, 'best-output-tracking.json');
/** @type {IterationRecord[]} */
this.iterations = [];
/** @type {number|null} */
this.bestIterationNumber = null;
// Initialize storage
this.ensureStorageExists();
// Load existing tracking if present
this.load();
}
/**
* Ensure storage directories exist
*/
ensureStorageExists() {
if (!existsSync(this.iterationsDir)) {
mkdirSync(this.iterationsDir, { recursive: true });
}
}
/**
* Calculate overall quality score from dimensions
* @param {QualityDimensions} dimensions
* @returns {number} Score 0-100
*/
calculateQualityScore(dimensions) {
const weights = this.config.quality_weights;
const weighted =
dimensions.validation * weights.validation +
dimensions.completeness * weights.completeness +
dimensions.correctness * weights.correctness +
dimensions.readability * weights.readability +
dimensions.efficiency * weights.efficiency;
return Math.round(weighted * 100);
}
/**
* Record an iteration with quality metrics
* @param {Object} params
* @param {number} params.iteration_number - Iteration number
* @param {QualityDimensions} params.dimensions - Quality dimensions
* @param {string[]} params.artifacts - Artifact paths
* @param {number} [params.tokens_used=0] - Tokens consumed
* @param {number} [params.token_cost_usd=0] - Cost in USD
* @param {number} [params.execution_time_ms=0] - Execution time
* @param {string} [params.verification_status='skipped'] - Verification status
* @param {string[]} [params.reflections=[]] - Self-reflection notes
*/
recordIteration(params) {
const quality_score = this.calculateQualityScore(params.dimensions);
// Calculate delta from previous iteration
let quality_delta = null;
if (this.iterations.length > 0) {
const previous = this.iterations[this.iterations.length - 1];
quality_delta = quality_score - previous.quality_score;
}
// Create snapshot directory
const snapshotDir = join(
this.iterationsDir,
`iteration-${String(params.iteration_number).padStart(3, '0')}`
);
mkdirSync(snapshotDir, { recursive: true });
// Snapshot artifacts
const snapshotArtifacts = [];
for (const artifactPath of params.artifacts) {
if (existsSync(artifactPath)) {
const basename = artifactPath.split('/').pop();
const destPath = join(snapshotDir, basename);
cpSync(artifactPath, destPath, { recursive: true });
snapshotArtifacts.push(destPath);
}
}
// Create iteration record
/** @type {IterationRecord} */
const record = {
iteration_number: params.iteration_number,
timestamp: new Date().toISOString(),
quality_score,
quality_delta,
dimensions: params.dimensions,
tokens_used: params.tokens_used || 0,
token_cost_usd: params.token_cost_usd || 0,
execution_time_ms: params.execution_time_ms || 0,
verification_status: params.verification_status || 'skipped',
snapshot_path: snapshotDir,
artifacts: snapshotArtifacts,
reflections: params.reflections || [],
};
this.iterations.push(record);
// Update best iteration
this.updateBest(record);
// Persist
this.save();
return record;
}
/**
* Update running best iteration tracker
* @param {IterationRecord} newRecord
*/
updateBest(newRecord) {
if (this.bestIterationNumber === null) {
this.bestIterationNumber = newRecord.iteration_number;
return;
}
const current = this.getBest();
if (!current) {
this.bestIterationNumber = newRecord.iteration_number;
return;
}
// Compare by quality score
if (newRecord.quality_score > current.quality_score) {
this.bestIterationNumber = newRecord.iteration_number;
}
}
/**
* Get current best iteration record
* @returns {IterationRecord|null}
*/
getBest() {
if (this.bestIterationNumber === null) {
return null;
}
return this.iterations.find(
(it) => it.iteration_number === this.bestIterationNumber
) || null;
}
/**
* Select output based on configured criteria
* @returns {SelectionResult}
*/
selectOutput() {
if (this.iterations.length === 0) {
throw new Error('No iterations recorded');
}
const criteria = this.config.selection;
const finalIteration = this.iterations[this.iterations.length - 1];
// Filter by threshold
let candidates = this.iterations.filter(
(it) => it.quality_score >= criteria.threshold
);
// Filter by verification if required
if (criteria.require_verification) {
candidates = candidates.filter(
(it) => it.verification_status === 'passed'
);
}
// If no candidates meet criteria, fall back to all iterations
if (candidates.length === 0) {
candidates = [...this.iterations];
}
let selected;
let reason;
switch (criteria.mode) {
case 'highest_quality':
// Select highest quality regardless of verification
selected = candidates.reduce((best, current) =>
current.quality_score > best.quality_score ? current : best
);
reason = `Highest quality score: ${selected.quality_score}%`;
break;
case 'highest_quality_verified':
// Select highest quality among verified
const verified = candidates.filter(
(it) => it.verification_status === 'passed'
);
if (verified.length > 0) {
selected = verified.reduce((best, current) =>
current.quality_score > best.quality_score ? current : best
);
reason = `Highest verified quality: ${selected.quality_score}%`;
} else {
// Fall back to highest overall
selected = candidates.reduce((best, current) =>
current.quality_score > best.quality_score ? current : best
);
reason = `Highest quality (no verified iterations): ${selected.quality_score}%`;
}
break;
case 'most_recent_above_threshold':
// Select most recent iteration above threshold
selected = candidates[candidates.length - 1];
reason = `Most recent above threshold (${criteria.threshold}%): ${selected.quality_score}%`;
break;
default:
// Default to highest quality
selected = candidates.reduce((best, current) =>
current.quality_score > best.quality_score ? current : best
);
reason = `Highest quality: ${selected.quality_score}%`;
}
// Detect degradation
const degradation_detected =
selected.iteration_number !== finalIteration.iteration_number &&
selected.quality_score > finalIteration.quality_score;
return {
selected_iteration: selected.iteration_number,
quality_score: selected.quality_score,
reason,
final_iteration: finalIteration.iteration_number,
final_quality: finalIteration.quality_score,
degradation_detected,
};
}
/**
* Generate selection report
* @param {SelectionResult} selection - Selection result
* @returns {string} Markdown report
*/
generateSelectionReport(selection) {
const lines = [];
lines.push('# Output Selection Report');
lines.push('');
lines.push(`**Loop ID**: ${this.loopId}`);
lines.push(`**Total Iterations**: ${this.iterations.length}`);
lines.push(`**Selected Iteration**: ${selection.selected_iteration}`);
lines.push('');
// Summary table
lines.push('## Summary');
lines.push('');
lines.push('| Metric | Value |');
lines.push('|--------|-------|');
lines.push(`| Selected Iteration | ${selection.selected_iteration} |`);
lines.push(`| Selected Quality | ${selection.quality_score}% |`);
lines.push(`| Final Iteration | ${selection.final_iteration} |`);
lines.push(`| Final Quality | ${selection.final_quality}% |`);
lines.push(`| Degradation Detected | ${selection.degradation_detected ? 'Yes' : 'No'} |`);
lines.push('');
// Quality scores table
lines.push('## Quality Scores');
lines.push('');
lines.push('| Iteration | Quality | Delta | Tokens | Cost | Verified |');
lines.push('|-----------|---------|-------|--------|------|----------|');
for (const iteration of this.iterations) {
const marker = iteration.iteration_number === selection.selected_iteration ? ' ✓' : '';
const delta = iteration.quality_delta !== null
? (iteration.quality_delta >= 0 ? '+' : '') + iteration.quality_delta.toFixed(1)
: '-';
const cost = iteration.token_cost_usd.toFixed(4);
const verified = iteration.verification_status === 'passed' ? '✓' : '✗';
lines.push(
`| ${iteration.iteration_number}${marker} | ${iteration.quality_score}% | ${delta}% | ${iteration.tokens_used} | $${cost} | ${verified} |`
);
}
lines.push('');
// Selection rationale
lines.push('## Selection Rationale');
lines.push('');
lines.push(`**Selected**: Iteration ${selection.selected_iteration}`);
lines.push(`**Reason**: ${selection.reason}`);
lines.push('');
if (selection.degradation_detected) {
lines.push('### Degradation Detected');
lines.push('');
lines.push(
`Quality degraded from iteration ${selection.selected_iteration} ` +
`(${selection.quality_score}%) to final iteration ${selection.final_iteration} ` +
`(${selection.final_quality}%). Selected best output instead of final.`
);
lines.push('');
}
// Quality trajectory
lines.push('## Quality Trajectory');
lines.push('');
lines.push('```');
const maxScore = Math.max(...this.iterations.map((it) => it.quality_score));
for (const iteration of this.iterations) {
const barLength = Math.round((iteration.quality_score / maxScore) * 40);
const bar = '█'.repeat(barLength);
const marker = iteration.iteration_number === selection.selected_iteration ? ' ← SELECTED' : '';
lines.push(`Iteration ${iteration.iteration_number}: ${bar} ${iteration.quality_score}%${marker}`);
}
lines.push('```');
lines.push('');
// Artifacts applied
const selectedIteration = this.iterations.find(
(it) => it.iteration_number === selection.selected_iteration
);
if (selectedIteration) {
lines.push('## Artifacts Applied');
lines.push('');
for (const artifact of selectedIteration.artifacts) {
lines.push(`- ${artifact}`);
}
lines.push('');
}
// Recommendations
lines.push('## Recommendations');
lines.push('');
if (selection.degradation_detected) {
const degradationAmount = selection.quality_score - selection.final_quality;
lines.push(`- Quality degraded by ${degradationAmount}% in later iterations`);
lines.push('- Consider setting max iterations to avoid over-refinement');
lines.push(`- Optimal iteration count for this task: ~${selection.selected_iteration}`);
} else if (selection.selected_iteration === selection.final_iteration) {
lines.push('- Quality improved or remained stable throughout iterations');
lines.push('- Final iteration selected as best output');
} else {
lines.push('- Selected iteration met criteria while final did not');
lines.push('- Review verification requirements if appropriate');
}
lines.push('');
return lines.join('\n');
}
/**
* Get iteration by number
* @param {number} iterationNumber
* @returns {IterationRecord|null}
*/
getIteration(iterationNumber) {
return this.iterations.find(
(it) => it.iteration_number === iterationNumber
) || null;
}
/**
* Get all iterations
* @returns {IterationRecord[]}
*/
getAllIterations() {
return [...this.iterations];
}
/**
* Get quality trajectory
* @returns {Array<{iteration: number, quality: number}>}
*/
getQualityTrajectory() {
return this.iterations.map((it) => ({
iteration: it.iteration_number,
quality: it.quality_score,
}));
}
/**
* Detect diminishing returns
* @param {number} consecutiveThreshold - Number of low-delta iterations
* @param {number} deltaThreshold - Delta threshold (0-100)
* @returns {{detected: boolean, iteration: number|null}}
*/
detectDiminishingReturns(consecutiveThreshold = 2, deltaThreshold = 5) {
if (this.iterations.length < consecutiveThreshold + 1) {
return { detected: false, iteration: null };
}
let consecutiveLow = 0;
for (let i = 1; i < this.iterations.length; i++) {
const delta = this.iterations[i].quality_delta;
if (delta !== null && Math.abs(delta) < deltaThreshold) {
consecutiveLow++;
if (consecutiveLow >= consecutiveThreshold) {
return {
detected: true,
iteration: this.iterations[i - consecutiveThreshold + 1].iteration_number,
};
}
} else {
consecutiveLow = 0;
}
}
return { detected: false, iteration: null };
}
/**
* Save tracking data to disk
*/
save() {
const data = {
loop_id: this.loopId,
config: this.config,
iterations: this.iterations,
best_iteration_number: this.bestIterationNumber,
last_updated: new Date().toISOString(),
};
mkdirSync(dirname(this.trackingFile), { recursive: true });
writeFileSync(this.trackingFile, JSON.stringify(data, null, 2));
}
/**
* Load tracking data from disk
*/
load() {
if (!existsSync(this.trackingFile)) {
return;
}
try {
const content = readFileSync(this.trackingFile, 'utf8');
const data = JSON.parse(content);
this.iterations = data.iterations || [];
this.bestIterationNumber = data.best_iteration_number || null;
// Merge config (prefer constructor config over loaded)
if (data.config) {
this.config = {
...data.config,
...this.config, // Constructor config takes precedence
};
}
} catch (error) {
console.error('Failed to load tracking data:', error.message);
}
}
/**
* Clean up snapshots except selected
* @param {number} selectedIteration - Iteration to keep
*/
cleanupSnapshots(selectedIteration) {
if (this.config.keep_all_iterations) {
return; // Keep all by configuration
}
for (const iteration of this.iterations) {
if (iteration.iteration_number !== selectedIteration) {
const snapshotDir = iteration.snapshot_path;
if (existsSync(snapshotDir)) {
rmSync(snapshotDir, { recursive: true, force: true });
}
}
}
}
/**
* Export tracking data as CSV
* @returns {string} CSV content
*/
exportCSV() {
const lines = [];
// Header
lines.push('iteration,timestamp,quality_score,quality_delta,tokens_used,cost_usd,execution_time_ms,verification_status');
// Rows
for (const iteration of this.iterations) {
const delta = iteration.quality_delta !== null ? iteration.quality_delta : '';
lines.push(
`${iteration.iteration_number},` +
`${iteration.timestamp},` +
`${iteration.quality_score},` +
`${delta},` +
`${iteration.tokens_used},` +
`${iteration.token_cost_usd},` +
`${iteration.execution_time_ms},` +
`${iteration.verification_status}`
);
}
return lines.join('\n');
}
/**
* Get summary statistics
* @returns {Object}
*/
getSummary() {
if (this.iterations.length === 0) {
return {
total_iterations: 0,
average_quality: 0,
best_quality: 0,
worst_quality: 0,
total_tokens: 0,
total_cost_usd: 0,
total_time_ms: 0,
};
}
const qualities = this.iterations.map((it) => it.quality_score);
const totalTokens = this.iterations.reduce((sum, it) => sum + it.tokens_used, 0);
const totalCost = this.iterations.reduce((sum, it) => sum + it.token_cost_usd, 0);
const totalTime = this.iterations.reduce((sum, it) => sum + it.execution_time_ms, 0);
return {
total_iterations: this.iterations.length,
average_quality: qualities.reduce((a, b) => a + b, 0) / qualities.length,
best_quality: Math.max(...qualities),
worst_quality: Math.min(...qualities),
total_tokens: totalTokens,
total_cost_usd: totalCost,
total_time_ms: totalTime,
};
}
}
export default BestOutputTracker;