aiwg
Version:
Cognitive architecture for AI-augmented software development with structured memory, ensemble validation, and closed-loop correction. FAIR-aligned artifacts, 84% cost reduction via human-in-the-loop, standards adopted by 100+ organizations.
640 lines (609 loc) • 18.2 kB
YAML
# Executable Feedback Loop Workflow Schema
# Based on REF-013 MetaGPT
# Finding: +4.2% HumanEval improvement, -63% human revision cost
# Issue: #101
# Schema identity: the JSON Schema dialect in use, this schema's id, and a
# human-readable title and summary.
$schema: 'https://json-schema.org/draft/2020-12/schema'
$id: 'https://aiwg.io/schemas/executable-feedback/v1'
title: 'Executable Feedback Loop Schema'
# Literal block scalar: line breaks in the text below are preserved.
description: |
  Workflow schema for the execute-before-return pattern in code-generating
  agents. Implements MetaGPT's executable feedback loop where generated
  code is tested before being returned to the user, with structured debug
  memory for cross-session learning.
  Key findings from REF-013 MetaGPT:
  - +4.2% HumanEval improvement from execution feedback
  - -63% human revision cost (2.25 → 0.83 cycles)
  - Debug memory enables learning from past failures
# The root value is an object; the five properties listed here are mandatory.
type: object
required: [workflow_id, agent, code_artifact, execution_config, loop_state]
# Top-level properties of one workflow execution record.
properties:
  # Identifier of this execution (string annotated with the uuid format).
  workflow_id:
    type: string
    format: uuid
    description: 'Unique workflow execution identifier'

  # The agent producing the code; name and type are mandatory.
  agent:
    type: object
    required:
      - name
      - type
    properties:
      name:
        type: string
        description: 'Name of the code-generating agent'
      type:
        type: string
        enum: [software_implementer, test_engineer, debugger, code_reviewer]
        description: 'Agent type in the SDLC framework'
      version:
        type: string
        description: 'Agent version'

  # The entries below delegate their shape to definitions under $defs.
  code_artifact:
    $ref: '#/$defs/CodeArtifact'
    description: 'The code being generated and tested'
  execution_config:
    $ref: '#/$defs/ExecutionConfig'
    description: 'Configuration for test execution'
  retry_policy:
    $ref: '#/$defs/RetryPolicy'
    description: 'Policy for retrying failed executions'
  coverage_requirements:
    $ref: '#/$defs/CoverageRequirements'
    description: 'Minimum test coverage requirements by code type'
  escalation_policy:
    $ref: '#/$defs/EscalationPolicy'
    description: 'When and how to escalate to human review'
  loop_state:
    $ref: '#/$defs/LoopState'
    description: 'Current state of the feedback loop'

  # Must start with the literal prefix ".aiwg/ralph/debug-memory/".
  debug_memory_ref:
    type: string
    description: 'Path to debug memory file for this session'
    pattern: '^\.aiwg/ralph/debug-memory/'

  ralph_integration:
    $ref: '#/$defs/RalphIntegration'
    description: 'Integration with Ralph loop if running within one'

  # Optional date-time markers for the run.
  timestamps:
    type: object
    properties:
      started_at: {type: string, format: date-time}
      completed_at: {type: string, format: date-time}
      last_attempt_at: {type: string, format: date-time}
$defs:
# Describes the generated source file under test; path and language are
# mandatory, everything else is optional metadata.
CodeArtifact:
  type: object
  required:
    - path
    - language
  properties:
    path:
      type: string
      description: 'File path of the generated code'
    language:
      type: string
      description: 'Programming language'
      enum:
        - typescript
        - javascript
        - python
        - go
        - rust
        - java
        - other
    code_type:
      type: string
      description: 'Type of code change'
      enum:
        - new_function  # New function or module
        - bug_fix  # Fix for existing bug
        - refactor  # Restructuring existing code
        - api_endpoint  # New API endpoint
        - integration  # Integration with external system
    content_hash:
      type: string
      description: 'SHA-256 hash of current code content'
    test_files:
      type: array
      items: {type: string}
      description: 'Associated test file paths'
    requirements_ref:
      type: array
      items: {type: string}
      description: '@-mention paths to requirements this code implements'
# How the test suite is run: framework, command, and execution limits.
# test_framework and test_command are mandatory.
ExecutionConfig:
  type: object
  required:
    - test_framework
    - test_command
  properties:
    test_framework:
      type: string
      description: 'Test framework being used'
      enum:
        - jest
        - vitest
        - pytest
        - go_test
        - cargo_test
        - junit
        - mocha
        - other
    test_command:
      type: string
      description: 'Shell command to execute tests'
      examples:
        - 'npm test -- --grep auth'
        - 'pytest tests/unit/auth/'
        - 'go test ./auth/...'
    # Bounded to 5..600 seconds; 120 when not specified.
    timeout_seconds:
      type: integer
      description: 'Maximum time for test execution'
      minimum: 5
      maximum: 600
      default: 120
    # Free-form map of variable name -> string value.
    environment:
      type: object
      description: 'Environment variables for test execution'
      additionalProperties: {type: string}
    working_directory:
      type: string
      description: 'Working directory for test execution'
    fail_fast:
      type: boolean
      description: 'Stop on first failure'
      default: false
    verbose:
      type: boolean
      description: 'Verbose test output for better analysis'
      default: true
# Controls how many fix-and-retry rounds are allowed and what stops them.
# Every field is optional and carries a default.
RetryPolicy:
  type: object
  properties:
    # Between 1 and 10 attempts; 3 when not specified.
    max_attempts:
      type: integer
      description: 'Maximum number of fix-and-retry attempts'
      minimum: 1
      maximum: 10
      default: 3
    backoff:
      type: string
      description: 'Backoff strategy between attempts'
      enum:
        - none
        - linear
        - exponential
      default: none
    escalation_on_max:
      type: boolean
      description: 'Escalate to human when max attempts reached'
      default: true
    abort_on_regression:
      type: boolean
      description: 'Stop if previously passing tests start failing'
      default: true
# Per-code-type coverage thresholds and required test categories.
# Each entry has the same shape: a numeric minimum_coverage and a
# required_tests array of strings. All four entries now declare string items
# and explicit coverage bounds, so malformed values are rejected uniformly
# rather than only for new_function.
# NOTE(review): CodeArtifact.code_type also allows `integration`, which has
# no entry here — confirm whether that omission is intentional.
CoverageRequirements:
  type: object
  description: "Minimum test coverage requirements by code type"
  properties:
    new_function:
      type: object
      properties:
        minimum_coverage:
          type: number
          minimum: 0
          maximum: 100
          default: 80
        required_tests:
          type: array
          items:
            type: string
          default:
            - "happy_path"
            - "edge_cases"
            - "error_handling"
    bug_fix:
      type: object
      properties:
        minimum_coverage:
          type: number
          minimum: 0
          maximum: 100
          default: 100
          description: "100% coverage of the fix"
        required_tests:
          type: array
          items:
            type: string
          default:
            - "regression_test"
            - "original_bug_reproduction"
    refactor:
      type: object
      properties:
        minimum_coverage:
          type: number
          # -1 is the documented "match original coverage" sentinel, so the
          # lower bound is -1 rather than 0.
          minimum: -1
          maximum: 100
          default: -1
          description: "Must match original coverage (-1 = match)"
        required_tests:
          type: array
          items:
            type: string
          default:
            - "existing_tests_pass"
    api_endpoint:
      type: object
      properties:
        minimum_coverage:
          type: number
          minimum: 0
          maximum: 100
          default: 90
        required_tests:
          type: array
          items:
            type: string
          default:
            - "happy_path"
            - "error_cases"
            - "integration_test"
            - "validation_test"
# Rules for handing the loop off: which conditions trigger escalation, what
# action follows, what the report contains, and where notification goes.
EscalationPolicy:
  type: object
  properties:
    # List of condition/action pairs; both fields are mandatory per entry.
    triggers:
      type: array
      items:
        type: object
        required:
          - condition
          - action
        properties:
          condition:
            type: string
            description: 'Condition that triggers escalation'
            enum:
              - max_attempts_reached
              - regression_detected
              - security_issue_found
              - coverage_threshold_unmet
              - timeout_exceeded
              - unknown_error_type
          action:
            type: string
            description: 'Action to take on trigger'
            enum:
              - human_review
              - senior_agent
              - abort_with_report
              - flag_and_continue
    # Sections selectable for the escalation report.
    include_in_report:
      type: array
      description: 'What to include in escalation report'
      items:
        type: string
        enum:
          - original_code
          - all_test_results
          - failure_analyses
          - fix_attempts
          - debug_memory_summary
          - stack_traces
    notification:
      type: object
      properties:
        channel:
          type: string
          enum:
            - issue_comment
            - cli
            - slack
            - email
          default: issue_comment
        template:
          type: string
          description: 'Notification template'
# Snapshot of where the feedback loop currently stands, plus the history of
# attempts so far. phase, attempt_number and status are mandatory.
LoopState:
  type: object
  required:
    - phase
    - attempt_number
    - status
  properties:
    phase:
      type: string
      description: 'Current phase of the feedback loop'
      enum:
        - generate_code  # Initial code generation
        - generate_tests  # Test generation for new code
        - execute_tests  # Running test suite
        - analyze_failures  # Analyzing test failures
        - apply_fix  # Applying fix based on analysis
        - verify_fix  # Re-running tests after fix
        - complete  # All tests passing
        - escalated  # Escalated to human
        - aborted  # Aborted due to regression or error
    # Counts from 1.
    attempt_number:
      type: integer
      description: 'Current attempt number'
      minimum: 1
    status:
      type: string
      description: 'Overall loop status'
      enum:
        - in_progress
        - passed
        - failed
        - escalated
        - aborted
    test_results:
      $ref: '#/$defs/TestResults'
    attempts:
      type: array
      description: 'History of all execution attempts'
      items:
        $ref: '#/$defs/ExecutionAttempt'
# Aggregate outcome of one test run. All fields are optional; counters are
# non-negative integers and coverage is a percentage in 0..100.
TestResults:
  type: object
  properties:
    total: {type: integer, minimum: 0}
    passed: {type: integer, minimum: 0}
    failed: {type: integer, minimum: 0}
    errors: {type: integer, minimum: 0}
    skipped: {type: integer, minimum: 0}
    duration_ms: {type: number, minimum: 0}
    coverage_percent: {type: number, minimum: 0, maximum: 100}
# Record of a single execute/analyze/fix attempt within the loop.
# attempt_number, timestamp, phase and test_results are mandatory; failures,
# analysis and fix_applied are optional.
ExecutionAttempt:
  type: object
  required: [attempt_number, timestamp, phase, test_results]
  properties:
    attempt_number:
      type: integer
      minimum: 1
    timestamp:
      type: string
      format: date-time
    phase:
      type: string
      # Constrained to the same values as LoopState.phase so that a recorded
      # attempt cannot name a phase the loop itself cannot be in.
      # Keep this list in sync with LoopState.phase.
      enum:
        - generate_code
        - generate_tests
        - execute_tests
        - analyze_failures
        - apply_fix
        - verify_fix
        - complete
        - escalated
        - aborted
      description: "Phase when this attempt occurred"
    code_hash:
      type: string
      description: "SHA-256 of code at time of attempt"
    test_results:
      $ref: "#/$defs/TestResults"
    # One entry per failing test; name, error type and message are mandatory.
    failures:
      type: array
      items:
        type: object
        required: [test_name, error_type, error_message]
        properties:
          test_name:
            type: string
          test_file:
            type: string
          error_type:
            type: string
            description: "Error class (TypeError, AssertionError, etc.)"
          error_message:
            type: string
          stack_trace:
            type: string
          line_number:
            type: integer
            # Rejects zero and negative values.
            # NOTE(review): assumes line numbers are 1-based — confirm.
            minimum: 1
    analysis:
      type: object
      properties:
        root_cause:
          type: string
          description: "Identified root cause of failure"
        fix_strategy:
          type: string
          description: "Strategy for fixing the failure"
        confidence:
          type: number
          minimum: 0
          maximum: 1
          description: "Agent's confidence in the analysis"
        patterns_matched:
          type: array
          items:
            type: string
          description: "Known patterns from debug memory that matched"
    fix_applied:
      type: object
      properties:
        description:
          type: string
        diff_summary:
          type: string
          description: "Summary of changes (e.g., +5/-2 lines)"
        files_modified:
          type: array
          items:
            type: string
# Optional settings that apply when this workflow runs inside a Ralph loop.
RalphIntegration:
  type: object
  description: 'Integration with Ralph loop when running within one'
  properties:
    loop_id:
      type: string
      description: 'Ralph loop ID if applicable'
    iteration:
      type: integer
      description: 'Current Ralph iteration'
    # Two boolean switches; defaults are require_passing_tests=true and
    # allow_skip=false.
    execution_gate:
      type: object
      properties:
        require_passing_tests: {type: boolean, default: true}
        allow_skip: {type: boolean, default: false}
    # Both debug-memory switches default to true.
    debug_memory:
      type: object
      properties:
        persist_per_iteration: {type: boolean, default: true}
        cross_iteration_learning: {type: boolean, default: true}
    progress_metric:
      type: object
      properties:
        include_test_pass_rate: {type: boolean, default: true}
        weight:
          type: number
          description: 'Weight of test pass rate in overall progress'
          default: 0.3
# Workflow Protocol
#
# The executable feedback loop follows this protocol:
#
# 1. GENERATE code based on requirements
# └─ Agent produces initial code artifact
#
# 2. GENERATE tests (if not present)
# ├─ Happy path tests
# ├─ Edge case tests
# └─ Error handling tests
#
# 3. EXECUTE tests
# ├─ Capture all output
# └─ Record in debug memory
#
# 4. IF tests PASS:
# ├─ Record success in debug memory
# ├─ Check coverage requirements
# └─ Return code to user
#
# 5. IF tests FAIL:
# a. ANALYZE failures
# │ ├─ Parse error messages
# │ ├─ Identify root cause
# │ └─ Check debug memory for known patterns
# │
# b. APPLY fix
# │ ├─ Generate targeted fix
# │ └─ Update code
# │
# c. INCREMENT attempt counter
# │
# d. IF attempts < max_attempts:
# │ └─ GOTO step 3
# │
# e. ELSE:
# ├─ ESCALATE to human
# └─ Include debug memory context
# Pre-Generation Protocol
#
# Before generating code, agents SHOULD:
# 1. Check debug memory for similar past failures
# 2. Load patterns from .aiwg/ralph/debug-memory/
# 3. Apply learnings to avoid known failure patterns
# 4. Set appropriate coverage requirements for code type
# Metrics
#
# Track these metrics for continuous improvement:
#
# | Metric | Target | Purpose |
# |-------------------------|--------|--------------------------------|
# | first_attempt_pass_rate | >70% | Code generation quality |
# | average_attempts | <2.0 | Iteration efficiency |
# | escalation_rate | <10% | Self-sufficiency |
# | debug_memory_reuse | >30% | Learning effectiveness |
# | coverage_met_rate | >90% | Test completeness |
# Examples
examples:
# Example: a new TypeScript function that fails on the first attempt (missing
# null check) and passes on the second after a targeted fix.
simple_function:
  # The schema annotates workflow_id with `format: uuid`, so the example uses
  # a well-formed UUID instead of a short ad-hoc id.
  workflow_id: "3f2b8c1e-7a4d-4e9b-9c6f-1d2e3a4b5c6d"
  agent:
    name: "software-implementer"
    type: software_implementer
  code_artifact:
    path: "src/utils/validate.ts"
    language: typescript
    code_type: new_function
    test_files: ["test/unit/utils/validate.test.ts"]
  execution_config:
    test_framework: jest
    test_command: "npx jest test/unit/utils/validate.test.ts"
    timeout_seconds: 30
  retry_policy:
    max_attempts: 3
    escalation_on_max: true
  coverage_requirements:
    new_function:
      minimum_coverage: 80
      required_tests: [happy_path, edge_cases, error_handling]
  loop_state:
    phase: complete
    attempt_number: 2
    status: passed
    # Final results: all 8 tests pass with 92% coverage.
    test_results:
      total: 8
      passed: 8
      failed: 0
      errors: 0
      skipped: 0
      duration_ms: 450
      coverage_percent: 92
    attempts:
      # Attempt 1: two failures, analysed and fixed.
      - attempt_number: 1
        timestamp: "2026-01-25T10:00:00Z"
        phase: execute_tests
        test_results:
          total: 8
          passed: 6
          failed: 2
          errors: 0
          skipped: 0
        failures:
          - test_name: "should reject empty string"
            error_type: "TypeError"
            error_message: "Cannot read property 'length' of null"
        analysis:
          root_cause: "Missing null check in validateInput()"
          fix_strategy: "Add null/undefined guard at function entry"
          confidence: 0.95
        fix_applied:
          description: "Added null check: if (!input) return { valid: false }"
          diff_summary: "+3/-0 lines"
      # Attempt 2: re-run after the fix, everything passes.
      - attempt_number: 2
        timestamp: "2026-01-25T10:00:30Z"
        phase: verify_fix
        test_results:
          total: 8
          passed: 8
          failed: 0
          errors: 0
          skipped: 0
# Example: a bug fix whose loop state is `escalated` at attempt 3, with
# max_attempts set to 3 and escalation_on_max enabled.
bug_fix_escalated:
  # The schema annotates workflow_id with `format: uuid`, so the example uses
  # a well-formed UUID instead of a short ad-hoc id.
  workflow_id: "a1b2c3d4-e5f6-4a7b-8c9d-0e1f2a3b4c5d"
  agent:
    name: "debugger"
    type: debugger
  code_artifact:
    path: "src/auth/token.ts"
    language: typescript
    code_type: bug_fix
  execution_config:
    test_framework: jest
    test_command: "npx jest test/unit/auth/token.test.ts"
  retry_policy:
    max_attempts: 3
    escalation_on_max: true
  loop_state:
    phase: escalated
    attempt_number: 3
    status: escalated
# Validation Rules
#
# Before returning code to user:
# - [ ] Tests generated for new code
# - [ ] Tests executed (not skipped)
# - [ ] All tests passing
# - [ ] Debug memory updated
# - [ ] Failures analyzed (if any occurred)
# - [ ] Coverage meets minimum for code type
# - [ ] Learnings recorded in debug memory
# References
# Pointers to related material, grouped by kind. Every value stays quoted
# because a leading `@` or `#` is not allowed in a plain YAML scalar.
references:
  research:
    - '@.aiwg/research/findings/REF-013-metagpt.md'
  schemas:
    - '@agentic/code/addons/ralph/schemas/debug-memory.yaml'
    - '@agentic/code/addons/ralph/schemas/actionable-feedback.yaml'
    - '@agentic/code/addons/ralph/schemas/iteration-analytics.yaml'
  rules:
    - '@.claude/rules/executable-feedback.md'
  implementation:
    - '#101'
  guide:
    - '@.aiwg/ralph/docs/executable-feedback-guide.md'