aiwg

Version:

Cognitive architecture for AI-augmented software development with structured memory, ensemble validation, and closed-loop correction. FAIR-aligned artifacts, 84% cost reduction via human-in-the-loop, standards adopted by 100+ organizations.

aiwg.io

jmagly/aiwg

633 lines (571 loc) • 19.4 kB

YAML

---
# Reliability Patterns Framework Schema
# Based on REF-001 Agentic AI in Production
# Issues: #239 (Retry Patterns), #240 (Checkpoints), #241 (Fallbacks)
#
# Layout of this file:
#   1. JSON Schema header plus the four required top-level properties.
#   2. $defs: RetryPatterns, CheckpointArtifacts, FallbackAssignments.
#   3. Supplementary sections (cli_commands, agent_protocol, storage,
#      research_targets, examples, references). These are not referenced
#      by `properties` / `required` above.
#
# Inside the `*_schema` objects, keys such as `type`, `default`, `enum`,
# `min`, `max` and `description` are declared as *properties* whose own
# `default` carries the documented value.

$schema: "https://json-schema.org/draft/2020-12/schema"
$id: "https://aiwg.io/schemas/reliability-patterns/v1"
title: "Reliability Patterns Framework Schema"
description: |
  Production reliability patterns for agentic systems implementing
  structured retry configuration, comprehensive checkpointing, and
  fallback agent assignments per REF-001 Agentic AI in Production.

type: object
required:
  - version
  - retry_patterns
  - checkpoint_artifacts
  - fallback_assignments

properties:
  version:
    type: string
    # Three dot-separated digit groups, e.g. "1.0.0".
    pattern: "^\\d+\\.\\d+\\.\\d+$"
    default: "1.0.0"

  retry_patterns:
    $ref: "#/$defs/RetryPatterns"

  checkpoint_artifacts:
    $ref: "#/$defs/CheckpointArtifacts"

  fallback_assignments:
    $ref: "#/$defs/FallbackAssignments"

$defs:
  RetryPatterns:
    type: object
    description: "Structured retry pattern configuration per REF-001"
    properties:
      enabled:
        type: boolean
        default: true

      # Shape of a single retry policy.
      policy_schema:
        type: object
        properties:
          max_attempts:
            type: object
            properties:
              type: { type: string, default: "integer" }
              default: { type: integer, default: 3 }
              min: { type: integer, default: 1 }
              max: { type: integer, default: 10 }

          backoff:
            type: object
            properties:
              strategy:
                type: object
                properties:
                  type: { type: string, default: "string" }
                  enum:
                    type: array
                    default: ["constant", "linear", "exponential"]
                  default: { type: string, default: "exponential" }
              initial_delay:
                type: object
                properties:
                  type: { type: string, default: "string" }
                  # Digits followed by a unit: ms, s or m.
                  pattern: { type: string, default: "^\\d+(ms|s|m)$" }
                  default: { type: string, default: "1s" }
              max_delay:
                type: object
                properties:
                  type: { type: string, default: "string" }
                  default: { type: string, default: "30s" }
              multiplier:
                type: object
                properties:
                  type: { type: string, default: "number" }
                  default: { type: number, default: 2.0 }
                  description: { type: string, default: "Exponential backoff multiplier" }
              jitter:
                type: object
                properties:
                  type: { type: string, default: "number" }
                  min: { type: number, default: 0 }
                  max: { type: number, default: 1 }
                  default: { type: number, default: 0.1 }
                  description: { type: string, default: "Randomization factor to prevent thundering herd" }

          circuit_breaker:
            type: object
            properties:
              failure_threshold:
                type: object
                properties:
                  type: { type: string, default: "integer" }
                  default: { type: integer, default: 5 }
                  description: { type: string, default: "Consecutive failures before opening circuit" }
              timeout:
                type: object
                properties:
                  type: { type: string, default: "string" }
                  default: { type: string, default: "60s" }
                  description: { type: string, default: "Time in open state before half-open" }
              half_open_requests:
                type: object
                properties:
                  type: { type: string, default: "integer" }
                  default: { type: integer, default: 3 }
                  description: { type: string, default: "Test requests in half-open state" }

          error_classification:
            type: object
            properties:
              retryable_errors:
                type: array
                items: { type: string }
                default:
                  - "RateLimitError"
                  - "NetworkTimeoutError"
                  - "TemporaryAPIError"
                  - "ServiceUnavailableError"
                  - "GatewayTimeoutError"
              non_retryable_errors:
                type: array
                items: { type: string }
                default:
                  - "AuthenticationError"
                  - "ValidationError"
                  - "NotFoundError"
                  - "PermissionDeniedError"
                  - "InvalidInputError"

          context_preservation:
            type: object
            properties:
              type: { type: string, default: "string" }
              enum:
                type: array
                default: ["full", "partial", "none"]
              default: { type: string, default: "full" }
              # Human-readable meaning of each enum value above.
              descriptions:
                type: object
                properties:
                  full: { type: string, default: "Preserve all context including conversation history" }
                  partial: { type: string, default: "Preserve task state, discard conversation details" }
                  none: { type: string, default: "Start fresh on retry" }

      # Ready-made policies keyed by workload kind.
      default_policies:
        type: object
        properties:
          agent_task:
            type: object
            properties:
              max_attempts: { type: integer, default: 3 }
              backoff: { type: string, default: "exponential" }
              initial_delay: { type: string, default: "2s" }
              context_preservation: { type: string, default: "full" }
          api_call:
            type: object
            properties:
              max_attempts: { type: integer, default: 5 }
              backoff: { type: string, default: "exponential" }
              initial_delay: { type: string, default: "1s" }
              max_delay: { type: string, default: "30s" }
              jitter: { type: number, default: 0.1 }
          ralph_loop:
            type: object
            properties:
              max_attempts: { type: integer, default: 5 }
              backoff: { type: string, default: "exponential" }
              initial_delay: { type: string, default: "5s" }
              max_delay: { type: string, default: "60s" }
              context_preservation: { type: string, default: "full" }
              budget_per_task: { type: integer, default: 3 }

      # Sample front-matter showing how an agent declares its retry
      # policy (kebab-case keys, delimited by `---`).
      agent_metadata_extension:
        type: string
        default: |
          # Agent retry policy extension
          ---
          name: Test Engineer
          retry-policy:
            max-attempts: 3
            backoff:
              strategy: exponential
              initial-delay: 1s
              max-delay: 30s
              multiplier: 2
            circuit-breaker:
              failure-threshold: 5
              timeout: 60s
              half-open-requests: 3
            retryable-errors:
              - RateLimitError
              - NetworkTimeoutError
            non-retryable-errors:
              - AuthenticationError
              - ValidationError
          ---

  CheckpointArtifacts:
    type: object
    description: "Comprehensive checkpoint artifacts per REF-001"
    properties:
      enabled:
        type: boolean
        default: true

      triggers:
        type: array
        items: { type: string }
        default:
          - "task-completion"
          - "error"
          - "manual"
          - "periodic"
          - "phase-transition"

      # Sections of a checkpoint document. Leaf defaults describe the
      # expected value informally (e.g. "array of {path, hash, size}").
      checkpoint_schema:
        type: object
        properties:
          checkpoint:
            type: object
            properties:
              id:
                type: object
                properties:
                  type: { type: string, default: "string" }
                  format: { type: string, default: "ckpt-YYYYMMDD-HHMMSS" }
              iteration: { type: string, default: "integer" }
              timestamp: { type: string, default: "date-time" }
              trigger: { type: string, default: "string" }

          execution:
            type: object
            properties:
              current_phase: { type: string, default: "string" }
              current_agent: { type: string, default: "string" }
              task_stack:
                type: string
                default: "array of {id, description, status, startedAt}"
              completed_tasks:
                type: string
                default: "array of {id, completedAt}"

          artifacts:
            type: object
            properties:
              created:
                type: string
                default: "array of {path, hash, size}"
              modified:
                type: string
                default: "array of {path, hash, previousHash}"

          context:
            type: object
            properties:
              environment:
                type: string
                default: "{cwd, node, aiwg}"
              variables:
                type: string
                default: "key-value pairs"

          # NOTE(review): tool_outputs and agent_memory are placed as
          # siblings of `context` because `modes.full.includes` lists
          # them as separate sections - confirm against the upstream file.
          tool_outputs:
            type: string
            default: "array of {tool, invocation, output, exitCode, timestamp}"

          agent_memory:
            type: object
            properties:
              conversation_history:
                type: string
                default: "array of {role, content}"
              working_memory:
                type: string
                default: "agent-specific state"

          provenance:
            type: object
            properties:
              parent_checkpoint: { type: string, default: "string" }
              derived_from:
                type: string
                default: "array of {artifact, relationship}"

      storage:
        type: object
        properties:
          path:
            type: string
            default: ".aiwg/ralph/checkpoints/"
          format:
            type: string
            default: "json"
          compression:
            type: boolean
            default: true
          # NOTE(review): retention is assumed to nest under storage -
          # confirm against the upstream file.
          retention:
            type: object
            properties:
              max_checkpoints: { type: integer, default: 50 }
              max_age_days: { type: integer, default: 30 }

      modes:
        type: object
        properties:
          full:
            type: object
            properties:
              description: { type: string, default: "Complete state snapshot" }
              includes:
                type: array
                items: { type: string }
                default:
                  - "execution"
                  - "artifacts"
                  - "context"
                  - "tool_outputs"
                  - "agent_memory"
                  - "provenance"
          incremental:
            type: object
            properties:
              description: { type: string, default: "Only changes since last checkpoint" }
              includes:
                type: array
                items: { type: string }
                default:
                  - "execution"
                  - "artifacts.modified"
                  - "tool_outputs.recent"

  FallbackAssignments:
    type: object
    description: "Fallback agent assignments per REF-001"
    properties:
      enabled:
        type: boolean
        default: true

      fallback_schema:
        type: object
        properties:
          primary:
            type: object
            properties:
              type: { type: string, default: "string" }
              description: { type: string, default: "First fallback agent to try" }
          secondary:
            type: object
            properties:
              type: { type: string, default: "string" }
              description: { type: string, default: "Second fallback if primary unavailable" }
          ultimate:
            type: object
            properties:
              type: { type: string, default: "string" }
              default: { type: string, default: "Generalist-Agent" }
              description: { type: string, default: "Last resort fallback" }
          # NOTE(review): strategy is assumed to nest under
          # fallback_schema - confirm against the upstream file.
          strategy:
            type: object
            properties:
              preserve_context:
                type: object
                properties:
                  type: { type: string, default: "boolean" }
                  default: { type: boolean, default: true }
              skill_subset:
                type: object
                properties:
                  type: { type: string, default: "array" }
                  description: { type: string, default: "Skills fallback must support" }
              degraded_mode:
                type: object
                properties:
                  description: { type: string, default: "Warning about reduced capability" }
                  acceptable: { type: boolean, default: true }

      default_chains:
        type: object
        description: "Default fallback chains for SDLC agents"
        properties:
          test_engineer:
            type: array
            items: { type: string }
            default:
              - "QA-Specialist"
              - "Software-Engineer"
              - "Generalist-Agent"
          security_auditor:
            type: array
            items: { type: string }
            default:
              - "Software-Engineer"
              - "Generalist-Agent"
          deployment_engineer:
            type: array
            items: { type: string }
            default:
              - "DevOps-Engineer"
              - "Software-Engineer"
              - "Generalist-Agent"
          requirements_analyst:
            type: array
            items: { type: string }
            default:
              - "System-Analyst"
              - "Software-Engineer"
              - "Generalist-Agent"
          architecture_designer:
            type: array
            items: { type: string }
            default:
              - "Software-Engineer"
              - "Generalist-Agent"

      # Sample front-matter showing how an agent declares its fallback
      # chain and strategy (kebab-case keys, delimited by `---`).
      agent_metadata_extension:
        type: string
        default: |
          # Agent fallback extension
          ---
          name: Test Engineer
          role: testing
          specialization: unit-testing
          fallback:
            primary: QA-Specialist
            secondary: Software-Engineer
            ultimate: Generalist-Agent
          fallback-strategy:
            preserve-context: true
            skill-subset:
              - test-writing
              - test-execution
            degraded-mode:
              description: "Fallback may not validate coverage rigorously"
              acceptable: true
          ---

# CLI commands
cli_commands:
  retry_config:
    command: "aiwg retry-config <agent>"
    description: "Show retry configuration for agent"
    options:
      - name: "--set"
        description: "Update retry policy"
  checkpoint_list:
    command: "aiwg checkpoints list"
    description: "List available checkpoints"
    options:
      - name: "--since"
        description: "Filter by date"
  checkpoint_inspect:
    command: "aiwg checkpoints inspect <id>"
    description: "Show checkpoint details"
  checkpoint_restore:
    command: "aiwg ralph-resume --checkpoint <id>"
    description: "Resume from specific checkpoint"
  fallback_chain:
    command: "aiwg agents fallback-chain <agent>"
    description: "Show fallback chain for agent"

# Agent protocol
# Each `steps` list is ordered; a single-key mapping entry (e.g.
# `if_success:`) holds the sub-steps belonging to that step.
agent_protocol:
  execute_with_retry:
    description: "Execute task with retry policy"
    steps:
      - load_retry_policy
      - initialize_circuit_breaker
      - for_attempt_in_max_attempts:
          - check_circuit_breaker_state
          - if_open_fail_fast
          - if_half_open_test_request
          - execute_task
          - if_success:
              - record_success
              - return_result
          - if_retryable_error:
              - record_failure
              - calculate_backoff_delay
              - apply_jitter
              - wait_delay
              - preserve_context
          - if_non_retryable_error:
              - fail_immediately
      # NOTE(review): nesting level of this step was ambiguous in the
      # flattened source; placed after the attempt loop - confirm.
      - circuit_breaker_trip_if_threshold

  create_checkpoint:
    description: "Create comprehensive checkpoint"
    triggers:
      - task_completion
      - error_recovery
      - manual_request
      - periodic_timer
    steps:
      - determine_checkpoint_mode
      - capture_execution_state
      - capture_artifact_hashes
      - capture_context_variables
      - capture_recent_tool_outputs
      - serialize_agent_memory
      - link_provenance_chain
      - compress_if_configured
      - persist_checkpoint
      - prune_old_checkpoints

  resolve_fallback:
    description: "Resolve agent fallback chain"
    triggers:
      - agent_unavailable
      - agent_task_failure
    steps:
      - get_primary_agent
      - verify_agent_available
      - if_unavailable:
          - load_fallback_chain
          - for_each_fallback:
              - verify_fallback_available
              - check_skill_subset_match
              - transfer_context
              - log_degraded_mode_warning
              - return_fallback_agent
      - if_all_unavailable:
          - fail_with_no_agents_error

# Storage
storage:
  retry_policies: ".aiwg/agents/retry-policies/"
  checkpoints: ".aiwg/ralph/checkpoints/"
  fallback_chains: ".aiwg/agents/fallback-chains/"

# Research targets (from REF-001)
research_targets:
  retry_patterns: "Structured retry with exponential backoff and circuit breakers"
  checkpoint_artifacts: "Comprehensive state snapshots for full recovery"
  fallback_assignments: "Agent hierarchies for graceful degradation"

# Example retry configuration
example_retry_config: |
  # .aiwg/agents/retry-policies/test-engineer.yaml
  agent: test-engineer
  policy:
    max_attempts: 3
    backoff:
      strategy: exponential
      initial_delay: 2s
      max_delay: 30s
      multiplier: 2
      jitter: 0.1
    circuit_breaker:
      failure_threshold: 5
      timeout: 60s
      half_open_requests: 3
    retryable_errors:
      - RateLimitError
      - NetworkTimeoutError
    context_preservation: full

# Example checkpoint
example_checkpoint: |
  {
    "checkpoint": {
      "id": "ckpt-20260125-143022",
      "iteration": 5,
      "timestamp": "2026-01-25T14:30:22Z",
      "trigger": "task-completion"
    },
    "execution": {
      "currentPhase": "elaboration",
      "currentAgent": "Requirements-Analyst",
      "taskStack": [
        {
          "id": "task-003",
          "description": "Elaborate NFR-Security module",
          "status": "in-progress",
          "startedAt": "2026-01-25T14:28:15Z"
        }
      ]
    },
    "artifacts": {
      "created": [
        {
          "path": ".aiwg/requirements/nfr-modules/security.md",
          "hash": "sha256:abc123..."
        }
      ]
    },
    "provenance": {
      "parentCheckpoint": "ckpt-20260125-142000"
    }
  }

# References
references:
  research:
    - "@.aiwg/research/findings/REF-001-agentic-ai-production.md"
  implementation:
    - "#239"
    - "#240"
    - "#241"
  related:
    - "@tools/ralph-external/loop.ts"
    - "@agentic/code/frameworks/sdlc-complete/agents/"
    - "@agentic/code/frameworks/sdlc-complete/schemas/flows/agent-efficiency.yaml"