aiwg
Version:
Cognitive architecture for AI-augmented software development with structured memory, ensemble validation, and closed-loop correction. FAIR-aligned artifacts, 84% cost reduction via human-in-the-loop, standards adopted by 100+ organizations.
633 lines (571 loc) • 19.4 kB
YAML
# Reliability Patterns Framework Schema
# Based on REF-001 Agentic AI in Production
# Issues: #239 (Retry Patterns), #240 (Checkpoints), #241 (Fallbacks)
# Root envelope: a draft 2020-12 JSON Schema whose three pattern sections are
# defined under $defs and referenced from `properties`.
$schema: 'https://json-schema.org/draft/2020-12/schema'
$id: 'https://aiwg.io/schemas/reliability-patterns/v1'
title: 'Reliability Patterns Framework Schema'
description: |
  Production reliability patterns for agentic systems implementing structured retry
  configuration, comprehensive checkpointing, and fallback agent assignments per
  REF-001 Agentic AI in Production.
type: object
# The version string and all three pattern sections are mandatory.
required: [version, retry_patterns, checkpoint_artifacts, fallback_assignments]
properties:
  version:
    # Three dot-separated numeric components (default 1.0.0).
    type: string
    pattern: '^\d+\.\d+\.\d+$'
    default: '1.0.0'
  retry_patterns:
    $ref: '#/$defs/RetryPatterns'
  checkpoint_artifacts:
    $ref: '#/$defs/CheckpointArtifacts'
  fallback_assignments:
    $ref: '#/$defs/FallbackAssignments'
# Reusable definitions referenced from the root `properties` via
# "#/$defs/<Name>". Nesting below was reconstructed from a flattened source;
# spots where the parent could not be determined unambiguously carry a
# NOTE(review) marker.
$defs:
  # --- Retry patterns (issue #239) -------------------------------------------
  RetryPatterns:
    type: object
    description: "Structured retry pattern configuration per REF-001"
    properties:
      enabled:
        type: boolean
        default: true
      # Descriptor of the fields a retry policy may carry. Each entry records
      # the field's expected type, bounds and default as data (properties named
      # `type`, `default`, `min`, ...), not as JSON Schema constraints.
      policy_schema:
        type: object
        properties:
          max_attempts:
            type: object
            properties:
              type: { type: string, default: "integer" }
              default: { type: integer, default: 3 }
              min: { type: integer, default: 1 }
              max: { type: integer, default: 10 }
          backoff:
            type: object
            properties:
              strategy:
                type: object
                properties:
                  type: { type: string, default: "string" }
                  enum:
                    type: array
                    default: ["constant", "linear", "exponential"]
                  default: { type: string, default: "exponential" }
              initial_delay:
                type: object
                properties:
                  type: { type: string, default: "string" }
                  pattern: { type: string, default: "^\\d+(ms|s|m)$" }
                  default: { type: string, default: "1s" }
              max_delay:
                type: object
                properties:
                  type: { type: string, default: "string" }
                  # Same duration grammar as initial_delay, so both delay
                  # fields are described consistently.
                  pattern: { type: string, default: "^\\d+(ms|s|m)$" }
                  default: { type: string, default: "30s" }
              multiplier:
                type: object
                properties:
                  type: { type: string, default: "number" }
                  default: { type: number, default: 2.0 }
                  description: { type: string, default: "Exponential backoff multiplier" }
              jitter:
                type: object
                properties:
                  type: { type: string, default: "number" }
                  min: { type: number, default: 0 }
                  max: { type: number, default: 1 }
                  default: { type: number, default: 0.1 }
                  description: { type: string, default: "Randomization factor to prevent thundering herd" }
          circuit_breaker:
            type: object
            properties:
              failure_threshold:
                type: object
                properties:
                  type: { type: string, default: "integer" }
                  default: { type: integer, default: 5 }
                  description: { type: string, default: "Consecutive failures before opening circuit" }
              timeout:
                type: object
                properties:
                  type: { type: string, default: "string" }
                  # Duration string; uses the same grammar as the backoff delays.
                  pattern: { type: string, default: "^\\d+(ms|s|m)$" }
                  default: { type: string, default: "60s" }
                  description: { type: string, default: "Time in open state before half-open" }
              half_open_requests:
                type: object
                properties:
                  type: { type: string, default: "integer" }
                  default: { type: integer, default: 3 }
                  description: { type: string, default: "Test requests in half-open state" }
          # Error class names that decide whether a failure is retried.
          error_classification:
            type: object
            properties:
              retryable_errors:
                type: array
                items: { type: string }
                default:
                  - "RateLimitError"
                  - "NetworkTimeoutError"
                  - "TemporaryAPIError"
                  - "ServiceUnavailableError"
                  - "GatewayTimeoutError"
              non_retryable_errors:
                type: array
                items: { type: string }
                default:
                  - "AuthenticationError"
                  - "ValidationError"
                  - "NotFoundError"
                  - "PermissionDeniedError"
                  - "InvalidInputError"
          # NOTE(review): placed as a sibling of error_classification because
          # the default policies set context_preservation per policy - confirm.
          context_preservation:
            type: object
            properties:
              type: { type: string, default: "string" }
              enum:
                type: array
                default: ["full", "partial", "none"]
              default: { type: string, default: "full" }
              descriptions:
                type: object
                properties:
                  full: { type: string, default: "Preserve all context including conversation history" }
                  partial: { type: string, default: "Preserve task state, discard conversation details" }
                  none: { type: string, default: "Start fresh on retry" }
      # Built-in policies for the three kinds of retried work.
      default_policies:
        type: object
        properties:
          agent_task:
            type: object
            properties:
              max_attempts: { type: integer, default: 3 }
              backoff: { type: string, default: "exponential" }
              initial_delay: { type: string, default: "2s" }
              context_preservation: { type: string, default: "full" }
          api_call:
            type: object
            properties:
              max_attempts: { type: integer, default: 5 }
              backoff: { type: string, default: "exponential" }
              initial_delay: { type: string, default: "1s" }
              max_delay: { type: string, default: "30s" }
              jitter: { type: number, default: 0.1 }
          ralph_loop:
            type: object
            properties:
              max_attempts: { type: integer, default: 5 }
              backoff: { type: string, default: "exponential" }
              initial_delay: { type: string, default: "5s" }
              max_delay: { type: string, default: "60s" }
              context_preservation: { type: string, default: "full" }
              # NOTE(review): assumed to belong to ralph_loop - confirm.
              budget_per_task: { type: integer, default: 3 }
      # Sample agent front matter showing the kebab-case retry-policy keys.
      agent_metadata_extension:
        type: string
        default: |
          # Agent retry policy extension
          ---
          name: Test Engineer
          retry-policy:
            max-attempts: 3
            backoff:
              strategy: exponential
              initial-delay: 1s
              max-delay: 30s
              multiplier: 2
            circuit-breaker:
              failure-threshold: 5
              timeout: 60s
              half-open-requests: 3
            retryable-errors:
              - RateLimitError
              - NetworkTimeoutError
            non-retryable-errors:
              - AuthenticationError
              - ValidationError
          ---

  # --- Checkpoint artifacts (issue #240) -------------------------------------
  CheckpointArtifacts:
    type: object
    description: "Comprehensive checkpoint artifacts per REF-001"
    properties:
      enabled:
        type: boolean
        default: true
      # Events that cause a checkpoint to be written.
      triggers:
        type: array
        items: { type: string }
        default:
          - "task-completion"
          - "error"
          - "manual"
          - "periodic"
          - "phase-transition"
      # Descriptor of a checkpoint document. Leaf defaults name the expected
      # type or shape of each field as free text.
      checkpoint_schema:
        type: object
        properties:
          checkpoint:
            type: object
            properties:
              id:
                type: object
                properties:
                  type: { type: string, default: "string" }
                  format: { type: string, default: "ckpt-YYYYMMDD-HHMMSS" }
              iteration: { type: string, default: "integer" }
              timestamp: { type: string, default: "date-time" }
              trigger: { type: string, default: "string" }
          execution:
            type: object
            properties:
              current_phase: { type: string, default: "string" }
              current_agent: { type: string, default: "string" }
              task_stack:
                type: string
                default: "array of {id, description, status, startedAt}"
              completed_tasks:
                type: string
                default: "array of {id, completedAt}"
          artifacts:
            type: object
            properties:
              created:
                type: string
                default: "array of {path, hash, size}"
              modified:
                type: string
                default: "array of {path, hash, previousHash}"
          context:
            type: object
            properties:
              environment:
                type: string
                default: "{cwd, node, aiwg}"
              variables:
                type: string
                default: "key-value pairs"
          # NOTE(review): kept as its own section (not under context) because
          # modes.full.includes lists "tool_outputs" beside "context" - confirm.
          tool_outputs:
            type: string
            default: "array of {tool, invocation, output, exitCode, timestamp}"
          agent_memory:
            type: object
            properties:
              conversation_history:
                type: string
                default: "array of {role, content}"
              working_memory:
                type: string
                default: "agent-specific state"
          provenance:
            type: object
            properties:
              parent_checkpoint: { type: string, default: "string" }
              derived_from:
                type: string
                default: "array of {artifact, relationship}"
      # Where and how checkpoints are persisted.
      storage:
        type: object
        properties:
          path:
            type: string
            default: ".aiwg/ralph/checkpoints/"
          format:
            type: string
            default: "json"
          compression:
            type: boolean
            default: true
          # NOTE(review): assumed to nest under storage - confirm.
          retention:
            type: object
            properties:
              max_checkpoints: { type: integer, default: 50 }
              max_age_days: { type: integer, default: 30 }
      # Snapshot modes and the checkpoint sections each one captures.
      modes:
        type: object
        properties:
          full:
            type: object
            properties:
              description: { type: string, default: "Complete state snapshot" }
              includes:
                type: array
                items: { type: string }
                default:
                  - "execution"
                  - "artifacts"
                  - "context"
                  - "tool_outputs"
                  - "agent_memory"
                  - "provenance"
          incremental:
            type: object
            properties:
              description: { type: string, default: "Only changes since last checkpoint" }
              includes:
                type: array
                items: { type: string }
                default:
                  - "execution"
                  - "artifacts.modified"
                  - "tool_outputs.recent"

  # --- Fallback assignments (issue #241) -------------------------------------
  FallbackAssignments:
    type: object
    description: "Fallback agent assignments per REF-001"
    properties:
      enabled:
        type: boolean
        default: true
      # Descriptor of one agent's fallback configuration.
      fallback_schema:
        type: object
        properties:
          primary:
            type: object
            properties:
              type: { type: string, default: "string" }
              description: { type: string, default: "First fallback agent to try" }
          secondary:
            type: object
            properties:
              type: { type: string, default: "string" }
              description: { type: string, default: "Second fallback if primary unavailable" }
          ultimate:
            type: object
            properties:
              type: { type: string, default: "string" }
              default: { type: string, default: "Generalist-Agent" }
              description: { type: string, default: "Last resort fallback" }
          strategy:
            type: object
            properties:
              preserve_context:
                type: object
                properties:
                  type: { type: string, default: "boolean" }
                  default: { type: boolean, default: true }
              skill_subset:
                type: object
                properties:
                  type: { type: string, default: "array" }
                  description: { type: string, default: "Skills fallback must support" }
              # NOTE(review): nested under strategy to mirror the
              # fallback-strategy sample further down - confirm.
              degraded_mode:
                type: object
                properties:
                  description: { type: string, default: "Warning about reduced capability" }
                  acceptable: { type: boolean, default: true }
      # Ordered fallback chains per agent.
      default_chains:
        type: object
        description: "Default fallback chains for SDLC agents"
        properties:
          test_engineer:
            type: array
            items: { type: string }
            default:
              - "QA-Specialist"
              - "Software-Engineer"
              - "Generalist-Agent"
          security_auditor:
            type: array
            items: { type: string }
            default:
              - "Software-Engineer"
              - "Generalist-Agent"
          deployment_engineer:
            type: array
            items: { type: string }
            default:
              - "DevOps-Engineer"
              - "Software-Engineer"
              - "Generalist-Agent"
          requirements_analyst:
            type: array
            items: { type: string }
            default:
              - "System-Analyst"
              - "Software-Engineer"
              - "Generalist-Agent"
          architecture_designer:
            type: array
            items: { type: string }
            default:
              - "Software-Engineer"
              - "Generalist-Agent"
      # Sample agent front matter showing the kebab-case fallback keys.
      agent_metadata_extension:
        type: string
        default: |
          # Agent fallback extension
          ---
          name: Test Engineer
          role: testing
          specialization: unit-testing
          fallback:
            primary: QA-Specialist
            secondary: Software-Engineer
            ultimate: Generalist-Agent
          fallback-strategy:
            preserve-context: true
            skill-subset:
              - test-writing
              - test-execution
            degraded-mode:
              description: "Fallback may not validate coverage rigorously"
              acceptable: true
          ---
# CLI commands
# Each entry pairs a command line with a description; `options` is present
# only where flags are listed.
cli_commands:
  retry_config:
    command: 'aiwg retry-config <agent>'
    description: 'Show retry configuration for agent'
    options:
      - name: '--set'
        description: 'Update retry policy'
  checkpoint_list:
    command: 'aiwg checkpoints list'
    description: 'List available checkpoints'
    options:
      - name: '--since'
        description: 'Filter by date'
  checkpoint_inspect:
    command: 'aiwg checkpoints inspect <id>'
    description: 'Show checkpoint details'
  checkpoint_restore:
    command: 'aiwg ralph-resume --checkpoint <id>'
    description: 'Resume from specific checkpoint'
  fallback_chain:
    command: 'aiwg agents fallback-chain <agent>'
    description: 'Show fallback chain for agent'
# Agent protocol
# Step lists for the three reliability procedures. A step written as a
# single-key mapping (e.g. `if_success:`) carries its own nested step list.
agent_protocol:
  execute_with_retry:
    description: "Execute task with retry policy"
    steps:
      - load_retry_policy
      - initialize_circuit_breaker
      - for_attempt_in_max_attempts:
          - check_circuit_breaker_state
          - if_open_fail_fast
          - if_half_open_test_request
          - execute_task
          - if_success:
              - record_success
              - return_result
          - if_retryable_error:
              - record_failure
              - calculate_backoff_delay
              - apply_jitter
              - wait_delay
              - preserve_context
          - if_non_retryable_error:
              - fail_immediately
      # NOTE(review): nesting reconstructed from a flattened source; confirm
      # whether this step runs after the loop or inside each attempt.
      - circuit_breaker_trip_if_threshold
  create_checkpoint:
    description: "Create comprehensive checkpoint"
    triggers:
      - task_completion
      - error_recovery
      - manual_request
      - periodic_timer
      # Added so the protocol covers every default trigger declared by
      # CheckpointArtifacts.triggers, which includes "phase-transition".
      - phase_transition
    steps:
      - determine_checkpoint_mode
      - capture_execution_state
      - capture_artifact_hashes
      - capture_context_variables
      - capture_recent_tool_outputs
      - serialize_agent_memory
      - link_provenance_chain
      - compress_if_configured
      - persist_checkpoint
      - prune_old_checkpoints
  resolve_fallback:
    description: "Resolve agent fallback chain"
    triggers:
      - agent_unavailable
      - agent_task_failure
    steps:
      - get_primary_agent
      - verify_agent_available
      - if_unavailable:
          - load_fallback_chain
          - for_each_fallback:
              - verify_fallback_available
              - check_skill_subset_match
              - transfer_context
              - log_degraded_mode_warning
              - return_fallback_agent
          - if_all_unavailable:
              - fail_with_no_agents_error
# Storage
# Directories (relative paths, trailing slash) for each kind of artifact.
storage:
  retry_policies: '.aiwg/agents/retry-policies/'
  checkpoints: '.aiwg/ralph/checkpoints/'
  fallback_chains: '.aiwg/agents/fallback-chains/'
# Research targets (from REF-001)
# One-line goal statement for each of the three pattern sections.
research_targets:
  retry_patterns: 'Structured retry with exponential backoff and circuit breakers'
  checkpoint_artifacts: 'Comprehensive state snapshots for full recovery'
  fallback_assignments: 'Agent hierarchies for graceful degradation'
# Example retry configuration
# Literal content of a sample per-agent policy file. The nesting mirrors the
# policy_schema field groups (backoff, circuit_breaker).
# NOTE(review): retryable_errors and context_preservation are placed directly
# under `policy`; confirm against the consumer of these files.
example_retry_config: |
  # .aiwg/agents/retry-policies/test-engineer.yaml
  agent: test-engineer
  policy:
    max_attempts: 3
    backoff:
      strategy: exponential
      initial_delay: 2s
      max_delay: 30s
      multiplier: 2
      jitter: 0.1
    circuit_breaker:
      failure_threshold: 5
      timeout: 60s
      half_open_requests: 3
    retryable_errors:
      - RateLimitError
      - NetworkTimeoutError
    context_preservation: full
# Example checkpoint
# Literal JSON for a sample checkpoint document.
# NOTE(review): the sample uses camelCase keys (currentPhase, taskStack,
# parentCheckpoint) while checkpoint_schema names the same fields in
# snake_case - confirm which spelling the checkpoint reader expects.
example_checkpoint: |
  {
    "checkpoint": {
      "id": "ckpt-20260125-143022",
      "iteration": 5,
      "timestamp": "2026-01-25T14:30:22Z",
      "trigger": "task-completion"
    },
    "execution": {
      "currentPhase": "elaboration",
      "currentAgent": "Requirements-Analyst",
      "taskStack": [
        {
          "id": "task-003",
          "description": "Elaborate NFR-Security module",
          "status": "in-progress",
          "startedAt": "2026-01-25T14:28:15Z"
        }
      ]
    },
    "artifacts": {
      "created": [
        {
          "path": ".aiwg/requirements/nfr-modules/security.md",
          "hash": "sha256:abc123..."
        }
      ]
    },
    "provenance": {
      "parentCheckpoint": "ckpt-20260125-142000"
    }
  }
# References
# Source research note, tracking issues, and related paths. Values stay
# quoted because they begin with `@` or `#`, which are not valid at the start
# of a plain YAML scalar.
references:
  research:
    - '@.aiwg/research/findings/REF-001-agentic-ai-production.md'
  implementation:
    - '#239'
    - '#240'
    - '#241'
  related:
    - '@tools/ralph-external/loop.ts'
    - '@agentic/code/frameworks/sdlc-complete/agents/'
    - '@agentic/code/frameworks/sdlc-complete/schemas/flows/agent-efficiency.yaml'