aiwg

Version:

Cognitive architecture for AI-augmented software development with structured memory, ensemble validation, and closed-loop correction. FAIR-aligned artifacts, 84% cost reduction via human-in-the-loop, standards adopted by 100+ organizations.

aiwg.io

jmagly/aiwg

787 lines (690 loc) • 22.6 kB

YAML

---
# Ralph State Extension Schema - Agent Persistence
# Extends loop-state.yaml with agent persistence tracking fields
# Issue: #261
#
# Layout of this file:
#   properties     JSON Schema definitions for the extension fields
#   integration    how the fields are merged into loop-state.yaml
#   state_updates  which fields are written on each loop event
#   examples       one complete sample state document
#   references     related schemas, requirements, and research notes
#
# Conventions:
#   - Fields that may be null are declared with a type union
#     (`type: [<type>, "null"]`). `nullable` is an OpenAPI 3.0 keyword and is
#     not defined by JSON Schema draft 2020-12, which this file declares.
#   - Enum members that YAML 1.1 loaders would read as booleans (e.g. OFF)
#     are quoted so they stay strings.

$schema: "https://json-schema.org/draft/2020-12/schema"
$id: "https://aiwg.io/schemas/ralph-persistence-extension/v1"
title: "Ralph Persistence Extension Schema"
# Exactly one top-level description: a second `description` key would be a
# duplicate, and most parsers silently keep only the last occurrence.
description: |
  Extension to loop-state.yaml adding agent persistence tracking:
  - Baseline metrics for regression detection
  - Iteration history with quality scores
  - Best output selection tracking
  - Regression event log
  - Reinforcement escalation state
  - Recovery attempt tracking

  Extends properties object in loop-state.yaml

type: object

properties:
  # ============================================================================
  # Baseline Metrics
  # ============================================================================

  baseline_metrics:
    type: object
    description: |
      Snapshot of codebase health at loop start.
      Used for regression detection throughout loop execution.
    required:
      - captured_at
      - test_count
      - coverage_percentage
    properties:
      captured_at:
        type: string
        format: date-time
        description: "When baseline was captured"
      test_count:
        type: integer
        minimum: 0
        description: "Total number of tests"
      coverage_percentage:
        type: number
        minimum: 0
        maximum: 100
        description: "Overall code coverage percentage"
      typescript_errors:
        type: [integer, "null"]
        minimum: 0
        description: "TypeScript compilation errors (if applicable)"
      lint_errors:
        type: [integer, "null"]
        minimum: 0
        description: "Linting errors"
      lint_warnings:
        type: [integer, "null"]
        minimum: 0
        description: "Linting warnings"
      file_count:
        type: integer
        minimum: 0
        description: "Total files in codebase"
      lines_of_code:
        type: integer
        minimum: 0
        description: "Total lines of code"
      custom_metrics:
        type: object
        additionalProperties: true
        description: "Project-specific baseline metrics"

  # ============================================================================
  # Iteration History
  # ============================================================================

  iteration_history:
    type: array
    description: |
      Complete history of all iterations with quality scores.
      Enables best output selection per REF-015 Self-Refine.
    items:
      type: object
      required:
        - iteration
        - timestamp
        - quality_score
        - artifacts
      properties:
        iteration:
          type: integer
          minimum: 1
          description: "Iteration number (1-based)"
        timestamp:
          type: string
          format: date-time
          description: "When iteration completed"
        quality_score:
          type: number
          minimum: 0
          maximum: 100
          description: |
            Overall quality score for this iteration.
            Weighted combination of:
            - Validation (30%)
            - Completeness (25%)
            - Correctness (25%)
            - Readability (10%)
            - Efficiency (10%)
        quality_delta:
          type: number
          description: "Change from previous iteration"
        quality_breakdown:
          type: object
          description: "Individual quality dimension scores"
          properties:
            validation:
              type: number
              minimum: 0
              maximum: 100
            completeness:
              type: number
              minimum: 0
              maximum: 100
            correctness:
              type: number
              minimum: 0
              maximum: 100
            readability:
              type: number
              minimum: 0
              maximum: 100
            efficiency:
              type: number
              minimum: 0
              maximum: 100
        artifacts:
          type: array
          items:
            type: object
            properties:
              path:
                type: string
              hash:
                type: string
              size_bytes:
                type: integer
          description: "Files created/modified in this iteration"
        snapshot_path:
          type: string
          description: |
            Path to full iteration snapshot.
            Multi-loop: .aiwg/ralph/loops/{loop_id}/iterations/iteration-{n:03d}.json
            Legacy: .aiwg/ralph/iterations/iteration-{n:03d}.json
        test_results:
          type: [object, "null"]
          properties:
            total:
              type: integer
            passed:
              type: integer
            failed:
              type: integer
            skipped:
              type: integer
            coverage:
              type: number
              minimum: 0
              maximum: 100
        metrics_snapshot:
          type: object
          description: "All metrics at iteration completion"
          properties:
            test_count:
              type: integer
            coverage_percentage:
              type: number
            typescript_errors:
              type: integer
            lint_errors:
              type: integer
        regression_detected:
          type: boolean
          default: false
          description: "Whether regression was detected in this iteration"
        recovery_attempted:
          type: boolean
          default: false
          description: "Whether recovery protocol was invoked"

  # ============================================================================
  # Best Output Tracking
  # ============================================================================

  best_iteration:
    type: [object, "null"]
    description: |
      Tracks the highest quality iteration for final output selection.
      Per REF-015, final iteration is not always the best.
    properties:
      iteration:
        type: integer
        minimum: 1
        description: "Iteration number with highest quality"
      quality_score:
        type: number
        minimum: 0
        maximum: 100
        description: "Quality score of best iteration"
      snapshot_path:
        type: string
        description: "Path to best iteration snapshot"
      updated_at:
        type: string
        format: date-time
        description: "When best iteration was last updated"
      selection_reason:
        type: string
        description: "Why this iteration is best"
        examples:
          - "Highest quality score (85%)"
          - "Passed all validation and highest completeness"

  # ============================================================================
  # Regression Events
  # ============================================================================

  regression_events:
    type: array
    description: "Log of all detected regressions"
    items:
      type: object
      required:
        - timestamp
        - iteration
        - regression_type
        - severity
      properties:
        event_id:
          type: string
          format: uuid
          description: "Unique event identifier"
        timestamp:
          type: string
          format: date-time
          description: "When regression was detected"
        iteration:
          type: integer
          minimum: 1
          description: "Iteration where regression occurred"
        regression_type:
          type: string
          enum:
            - test_deletion
            - test_skipping
            - feature_removal
            - coverage_regression
            - validation_bypass
            - assertion_weakening
            - error_suppression
          description: "Type of regression detected"
        severity:
          type: string
          enum: [critical, high, medium, low]
          description: "Severity level"
        details:
          type: object
          description: "Regression-specific details"
          properties:
            # No `type` is declared for the two values below, so any JSON
            # value is accepted for them.
            baseline_value:
              description: "Value before regression"
            current_value:
              description: "Value after regression"
            diff:
              type: object
              description: "Detailed diff information"
        recovery_protocol_invoked:
          type: boolean
          default: false
          description: "Whether recovery was triggered"
        recovery_outcome:
          # `enum` is checked independently of `type`, so null has to be
          # listed here as well for a null value to validate.
          type: [string, "null"]
          enum: [success, failed, escalated, skipped, null]
          description: "Outcome of recovery attempt"
        human_gate_invoked:
          type: boolean
          default: false
          description: "Whether human gate was triggered"
        human_decision:
          # null is listed in the enum for the same reason as above.
          type: [string, "null"]
          enum: [approve, reject, abort, null]
          description: "Human decision if gate was invoked"

  # ============================================================================
  # Reinforcement State
  # ============================================================================

  reinforcement_level:
    type: string
    # Quoted on purpose: a bare OFF is read as boolean false by YAML 1.1
    # loaders, which would make the string "OFF" an invalid level.
    enum: ["OFF", "MINIMAL", "STANDARD", "AGGRESSIVE", "ADAPTIVE"]
    default: "MINIMAL"
    description: |
      Current prompt reinforcement intensity level.
      Escalates based on iteration count and quality trajectory.

  reinforcement_history:
    type: array
    description: "History of reinforcement level changes"
    items:
      type: object
      properties:
        timestamp:
          type: string
          format: date-time
        iteration:
          type: integer
        from_level:
          type: string
        to_level:
          type: string
        reason:
          type: string
          examples:
            - "Iteration 5 threshold reached"
            - "Quality plateau detected (3 iterations < 5% delta)"
            - "Regression detected in iteration 7"

  # ============================================================================
  # Recovery Attempts
  # ============================================================================

  recovery_attempts:
    type: integer
    minimum: 0
    default: 0
    description: "Total recovery protocol invocations"

  recovery_history:
    type: array
    description: "Detailed recovery attempt log"
    items:
      type: object
      required:
        - timestamp
        - iteration
        - trigger
      properties:
        recovery_id:
          type: string
          format: uuid
          description: "Unique recovery attempt ID"
        timestamp:
          type: string
          format: date-time
          description: "When recovery was initiated"
        iteration:
          type: integer
          minimum: 1
          description: "Iteration where recovery was triggered"
        trigger:
          type: string
          description: "What triggered recovery"
          examples:
            - "regression_detected: test_deletion"
            - "stuck_loop_detected: 3 consecutive failures"
        protocol_steps:
          type: object
          description: "PDARE protocol execution"
          # One sub-object per step: pause, diagnose, adapt, retry, escalate.
          properties:
            pause:
              type: object
              properties:
                executed_at:
                  type: string
                  format: date-time
                actions:
                  type: array
                  items:
                    type: string
            diagnose:
              type: object
              properties:
                executed_at:
                  type: string
                  format: date-time
                root_cause:
                  type: string
                confidence:
                  type: number
                  minimum: 0
                  maximum: 1
            adapt:
              type: object
              properties:
                executed_at:
                  type: string
                  format: date-time
                strategy:
                  type: string
                changes_made:
                  type: array
                  items:
                    type: string
            retry:
              type: object
              properties:
                executed_at:
                  type: string
                  format: date-time
                retry_iteration:
                  type: integer
                outcome:
                  type: string
                  enum: [success, failed]
            escalate:
              type: [object, "null"]
              properties:
                executed_at:
                  type: string
                  format: date-time
                reason:
                  type: string
                human_decision:
                  type: string
        outcome:
          type: string
          enum: [success, failed, escalated, aborted]
          description: "Overall recovery outcome"

  # ============================================================================
  # Detection State
  # ============================================================================

  detection_enabled:
    type: boolean
    default: false
    description: "Whether laziness detection is active"

  detected_patterns:
    type: array
    description: "All detected laziness patterns in this loop"
    items:
      type: object
      properties:
        pattern_id:
          type: string
          description: "Pattern identifier (e.g., LP-001)"
        pattern_name:
          type: string
        iteration:
          type: integer
        timestamp:
          type: string
          format: date-time
        severity:
          type: string
          enum: [critical, high, medium, low]
        false_positive:
          type: boolean
          default: false
          description: "Human-marked false positive"

  # ============================================================================
  # Performance Metrics
  # ============================================================================

  persistence_metrics:
    type: object
    description: "Agent persistence framework performance metrics"
    properties:
      detection_latency_p95_ms:
        type: integer
        description: "95th percentile detection latency"
      detection_latency_p99_ms:
        type: integer
        description: "99th percentile detection latency"
      integration_overhead_percentage:
        type: number
        description: "Percentage increase in iteration time due to hooks"
      false_positive_rate:
        type: number
        minimum: 0
        maximum: 1
        description: "False positives / total detections"
      true_positive_count:
        type: integer
        minimum: 0
        description: "Confirmed laziness patterns detected"
      false_positive_count:
        type: integer
        minimum: 0
        description: "False alarms"

# ============================================================================
# Integration with loop-state.yaml
# ============================================================================

integration:
  description: |
    These fields extend the loop-state.yaml properties object.

    Full loop state structure:
    {
      version: "2.0.0",
      loop_id: "ralph-fix-tests-a1b2c3d4",
      status: "running",
      iteration: 5,
      ...
      // Standard loop-state fields above

      // Agent persistence extension fields below
      baseline_metrics: {...},
      iteration_history: [...],
      best_iteration: {...},
      regression_events: [...],
      reinforcement_level: "STANDARD",
      ...
    }

  merge_strategy: "shallow_merge"
  conflict_resolution: "persistence_extension_wins"

# ============================================================================
# State Update Protocol
# ============================================================================

# Each event maps to an ordered list of operations. Every operation is a
# mapping whose first key names the action (set / initialize / append /
# update / increment / finalize) and the target field.
state_updates:
  on_loop_start:
    - set: baseline_metrics
      from: "progress-tracker.baseline"
    - set: detection_enabled
      value: true
    - set: reinforcement_level
      value: "MINIMAL"
    - initialize: iteration_history
      value: []
    - initialize: regression_events
      value: []

  on_iteration_complete:
    - append: iteration_history
      from: "iteration_metrics"
    - update: best_iteration
      from: "best-output-tracker.best"
    # NOTE(review): `metrics.total_iterations` is not defined in this
    # extension; presumably it belongs to the base loop-state schema - confirm.
    - increment: metrics.total_iterations

  on_regression_detected:
    - append: regression_events
      from: "regression_record"
    - increment: recovery_attempts

  on_loop_complete:
    - set: detection_enabled
      value: false
    - finalize: best_iteration

# ============================================================================
# Examples
# ============================================================================

examples:
  full_state_with_persistence:
    # Standard loop-state fields
    version: "2.0.0"
    loop_id: "ralph-fix-tests-a1b2c3d4"
    status: "running"
    iteration: 7
    task: "Fix all TypeScript errors"
    completion_criteria: "npx tsc --noEmit passes"
    started_at: "2026-02-02T21:00:00Z"
    last_updated: "2026-02-02T21:15:00Z"

    # Agent persistence extension fields
    baseline_metrics:
      captured_at: "2026-02-02T21:00:00Z"
      test_count: 150
      coverage_percentage: 85
      typescript_errors: 12
      lines_of_code: 15000

    iteration_history:
      - iteration: 1
        timestamp: "2026-02-02T21:02:00Z"
        quality_score: 60
        quality_delta: 0
        artifacts:
          - path: "src/auth/login.ts"
            hash: "abc123"
        snapshot_path: ".aiwg/ralph/loops/ralph-fix-tests-a1b2c3d4/iterations/iteration-001.json"
        metrics_snapshot:
          test_count: 150
          coverage_percentage: 85
          typescript_errors: 10
        regression_detected: false

      - iteration: 2
        timestamp: "2026-02-02T21:05:00Z"
        quality_score: 85
        quality_delta: 25
        artifacts:
          - path: "src/auth/login.ts"
            hash: "def456"
        snapshot_path: ".aiwg/ralph/loops/ralph-fix-tests-a1b2c3d4/iterations/iteration-002.json"
        metrics_snapshot:
          test_count: 150
          coverage_percentage: 87
          typescript_errors: 5
        regression_detected: false

      - iteration: 7
        timestamp: "2026-02-02T21:15:00Z"
        quality_score: 70
        quality_delta: -15
        artifacts:
          - path: "src/auth/login.ts"
            hash: "ghi789"
        snapshot_path: ".aiwg/ralph/loops/ralph-fix-tests-a1b2c3d4/iterations/iteration-007.json"
        metrics_snapshot:
          test_count: 148  # REGRESSION
          coverage_percentage: 84  # REGRESSION
          typescript_errors: 0
        regression_detected: true
        recovery_attempted: true

    # Iteration 2 stays selected even though iteration 7 is the latest,
    # because its quality_score (85) is higher than iteration 7's (70).
    best_iteration:
      iteration: 2
      quality_score: 85
      snapshot_path: ".aiwg/ralph/loops/ralph-fix-tests-a1b2c3d4/iterations/iteration-002.json"
      updated_at: "2026-02-02T21:05:00Z"
      selection_reason: "Highest quality score (85%)"

    regression_events:
      # NOTE(review): the schema declares `format: uuid` for event_id; this
      # sample ID is illustrative and is not a UUID.
      - event_id: "reg-001"
        timestamp: "2026-02-02T21:15:05Z"
        iteration: 7
        regression_type: "test_deletion"
        severity: "critical"
        details:
          baseline_value: 150
          current_value: 148
          diff:
            deleted_tests:
              - "test/unit/auth/login.test.ts: should validate email format"
              - "test/unit/auth/login.test.ts: should reject weak passwords"
        recovery_protocol_invoked: true
        recovery_outcome: "escalated"
        human_gate_invoked: true
        human_decision: "reject"

    reinforcement_level: "AGGRESSIVE"

    reinforcement_history:
      - timestamp: "2026-02-02T21:00:00Z"
        iteration: 1
        from_level: "OFF"
        to_level: "MINIMAL"
        reason: "Loop initialization"
      - timestamp: "2026-02-02T21:10:00Z"
        iteration: 5
        from_level: "MINIMAL"
        to_level: "STANDARD"
        reason: "Iteration 5 threshold reached"
      - timestamp: "2026-02-02T21:15:05Z"
        iteration: 7
        from_level: "STANDARD"
        to_level: "AGGRESSIVE"
        reason: "Regression detected in iteration 7"

    recovery_attempts: 1

    recovery_history:
      # NOTE(review): the schema declares `format: uuid` for recovery_id;
      # this sample ID is illustrative and is not a UUID.
      - recovery_id: "rec-001"
        timestamp: "2026-02-02T21:15:05Z"
        iteration: 7
        trigger: "regression_detected: test_deletion"
        protocol_steps:
          pause:
            executed_at: "2026-02-02T21:15:05Z"
            actions:
              - "Blocked pending file operations"
              - "Snapshot created: checkpoint-007"
          diagnose:
            executed_at: "2026-02-02T21:15:10Z"
            root_cause: "Agent deleted failing tests instead of fixing validation logic"
            confidence: 0.95
          adapt:
            executed_at: "2026-02-02T21:15:15Z"
            strategy: "Escalate to human gate - critical regression"
            changes_made: []
          escalate:
            executed_at: "2026-02-02T21:15:20Z"
            reason: "Critical regression: test deletion"
            human_decision: "reject - revert iteration 7"
        outcome: "escalated"

    detection_enabled: true

    detected_patterns:
      - pattern_id: "LP-001"
        pattern_name: "Test Deletion"
        iteration: 7
        timestamp: "2026-02-02T21:15:05Z"
        severity: "critical"
        false_positive: false

    persistence_metrics:
      detection_latency_p95_ms: 450
      detection_latency_p99_ms: 850
      integration_overhead_percentage: 8.5
      false_positive_rate: 0.03
      true_positive_count: 1
      false_positive_count: 0

# ============================================================================
# References
# ============================================================================

references:
  base_schema:
    - "@agentic/code/addons/ralph/schemas/loop-state.yaml"
  related_schemas:
    - "@agentic/code/addons/ralph/schemas/checkpoint.yaml"
    - "@agentic/code/addons/ralph/schemas/iteration-analytics.yaml"
    - "@agentic/code/addons/ralph/hooks/persistence-hooks.yaml"
  requirements:
    - "@.aiwg/requirements/nfr-modules/agent-persistence-nfrs.md"
  research:
    - "@.aiwg/research/findings/REF-015-self-refine.md"  # Best output selection
    - "@.aiwg/research/findings/REF-058-r-lam.md"  # Checkpointing
    - "@.aiwg/research/findings/agentic-laziness-research.md"  # Laziness patterns