aiwg
Version:
Cognitive architecture for AI-augmented software development with structured memory, ensemble validation, and closed-loop correction. FAIR-aligned artifacts, 84% cost reduction via human-in-the-loop, standards adopted by 100+ organizations.
361 lines (333 loc) • 9.52 kB
YAML
# Flow Error Handling Configuration Schema
# Based on REF-001 Production Agentic (error handling patterns)
# Issue: #110
# JSON Schema dialect (draft 2020-12) and the canonical identifier of this schema.
$schema: "https://json-schema.org/draft/2020-12/schema"
$id: "https://aiwg.io/schemas/error-handling/v1"
title: "Flow Error Handling Configuration Schema"
# Literal block scalar: the body must be indented deeper than the key, otherwise
# the text is not part of the scalar.
description: |
  Structured error handling configuration for flow commands, enabling retry patterns,
  fallback agents, and graceful degradation. Based on production agentic best practices.
# The configuration root is a mapping.
type: object
# Keys every error-handling configuration must provide; all other sections
# declared under `properties` are optional.
required: [strategy, classification]
# Top-level configuration sections. `strategy` and `classification` are listed
# in the root `required` array; the rest are optional and mostly delegate to
# sub-schemas under `$defs`.
properties:
  # Overall policy applied when a flow step fails.
  strategy:
    type: string
    enum:
      - retry_then_escalate  # Retry, then escalate to human
      - retry_then_fallback  # Retry, then use fallback agent
      - fail_fast            # Fail immediately, no retry
      - graceful_degrade     # Continue with reduced functionality
      - checkpoint_recover   # Restore from last checkpoint
    description: "Primary error handling strategy"

  # Rules that map raw error messages onto categories.
  classification:
    type: object
    required: [categories]
    description: "How to classify errors"
    properties:
      categories:
        type: array
        items:
          $ref: "#/$defs/ErrorCategory"
      default_category:
        type: string
        default: "unknown"

  # In draft 2020-12, keywords next to `$ref` (here `description`) are
  # evaluated alongside the referenced schema rather than ignored.
  retry:
    $ref: "#/$defs/RetryConfig"
    description: "Retry configuration"

  fallback:
    $ref: "#/$defs/FallbackConfig"
    description: "Fallback agent configuration"

  escalation:
    $ref: "#/$defs/EscalationConfig"
    description: "Human escalation configuration"

  checkpoint:
    $ref: "#/$defs/CheckpointConfig"
    description: "Checkpoint and recovery configuration"

  # Inline (non-shared) schema for error log output.
  logging:
    type: object
    properties:
      level:
        type: string
        enum: [error, warn, info, debug]
        default: error
      include_stack:
        type: boolean
        default: true
      include_context:
        type: boolean
        default: true
      destination:
        type: string
        default: ".aiwg/logs/errors.log"
    description: "Error logging configuration"
# Reusable sub-schemas, referenced from `properties` as "#/$defs/<Name>".
$defs:
  # One classification rule: a regex over the error message plus the error
  # type and default handling that apply when it matches.
  ErrorCategory:
    type: object
    required:
      - id
      - pattern
      - type
    properties:
      id:
        type: string
        description: "Category identifier"
      pattern:
        type: string
        description: "Regex pattern to match error messages"
      type:
        type: string
        enum:
          - transient   # Temporary failures (network, rate limit)
          - permanent   # Unrecoverable (invalid input, missing resource)
          - user        # User action needed
          - system      # System/infrastructure error
          - timeout     # Operation timed out
          - validation  # Validation failure
          - permission  # Permission denied
        description: "Error type for handling decision"
      severity:
        type: string
        enum: [critical, high, medium, low]
        default: medium
      action:
        type: string
        enum: [retry, fallback, escalate, abort, ignore]
        description: "Default action for this category"
      message_template:
        type: string
        description: "User-friendly message template"

  # Retry budget and delay schedule.
  RetryConfig:
    type: object
    properties:
      enabled:
        type: boolean
        default: true
      max_attempts:
        type: integer
        minimum: 1
        maximum: 10
        default: 3
        description: "Maximum retry attempts"
      initial_delay_ms:
        type: integer
        minimum: 0
        default: 1000
        description: "Initial delay before first retry"
      backoff_strategy:
        type: string
        enum:
          - none         # No delay increase
          - linear       # Add fixed delay each retry
          - exponential  # Double delay each retry
          - fibonacci    # Fibonacci sequence delays
        default: exponential
      max_delay_ms:
        type: integer
        # Same lower bound as initial_delay_ms: a negative delay cap has no
        # meaning. JSON Schema cannot express max_delay_ms >= initial_delay_ms,
        # so that relation is left to the consumer.
        minimum: 0
        default: 30000
        description: "Maximum delay between retries"
      jitter:
        type: boolean
        default: true
        description: "Add random jitter to delays"
      retry_on:
        type: array
        items:
          type: string
          # Mirrors ErrorCategory.type: any other value could never match a
          # classified error, so typos are rejected at validation time.
          enum: [transient, permanent, user, system, timeout, validation, permission]
        default: ["transient", "timeout"]
        description: "Error types to retry on"

  # Substitution of agents and/or model tiers when the primary fails.
  FallbackConfig:
    type: object
    properties:
      enabled:
        type: boolean
        default: false
      agents:
        type: array
        items:
          # NOTE(review): neither `primary` nor `fallback` is required, so an
          # empty mapping entry validates - confirm whether that is intended.
          type: object
          properties:
            primary:
              type: string
              description: "Primary agent"
            fallback:
              type: string
              description: "Fallback agent"
            conditions:
              type: array
              items:
                type: string
              description: "When to trigger fallback"
        description: "Agent fallback mappings"
      model_fallback:
        type: object
        properties:
          enabled:
            type: boolean
            default: false
          primary:
            type: string
            enum: [opus, sonnet, haiku]
          fallback:
            type: string
            enum: [opus, sonnet, haiku]
        description: "Model tier fallback"

  # Hand-off to a human once automated handling is exhausted.
  EscalationConfig:
    type: object
    properties:
      enabled:
        type: boolean
        default: true
      trigger_after:
        # NOTE(review): no lower bound is declared; confirm whether 0 or
        # negative values carry a meaning for the consumer.
        type: integer
        default: 3
        description: "Escalate after N failures"
      channels:
        type: array
        items:
          type: string
          enum: [cli, issue_comment, slack, email]
        default: [cli, issue_comment]
      include_context:
        type: boolean
        default: true
        description: "Include error context in escalation"
      template:
        type: string
        description: "Escalation message template"
      auto_create_issue:
        type: boolean
        default: false
        description: "Auto-create tracking issue"

  # When checkpoints are taken, where they are stored, and how one is chosen
  # for recovery.
  CheckpointConfig:
    type: object
    properties:
      enabled:
        type: boolean
        default: true
      auto_checkpoint:
        type: boolean
        default: true
        description: "Automatically checkpoint before risky operations"
      checkpoint_dir:
        type: string
        default: ".aiwg/checkpoints/"
      retention_count:
        # NOTE(review): no lower bound is declared; confirm how the consumer
        # treats 0 or negative values.
        type: integer
        default: 5
        description: "Number of checkpoints to retain"
      checkpoint_on:
        type: array
        items:
          type: string
          enum:
            - phase_start
            - artifact_complete
            - before_external_call
            - iteration_boundary
        default: [phase_start, artifact_complete]
      recovery_strategy:
        type: string
        enum:
          - last_checkpoint    # Restore most recent
          - select_checkpoint  # Let user choose
          - smart_rollback     # Analyze and select best
        default: last_checkpoint
# Predefined error patterns
# Each entry carries the ErrorCategory fields (id, pattern, type, plus the
# optional severity and action), keyed by a descriptive name.
# NOTE(review): patterns overlap (e.g. "ETIMEDOUT" is listed under
# network_errors while timeout_errors matches "timeout"), and bare status codes
# such as 429 or 404 match any message containing those digits - confirm the
# consumer's matching order and case sensitivity.
common_patterns:
  network_errors:
    id: "network"
    pattern: "(ECONNREFUSED|ETIMEDOUT|ENOTFOUND|network|socket)"
    type: transient
    severity: medium
    action: retry

  rate_limit:
    id: "rate_limit"
    pattern: "(rate.?limit|429|too.?many.?requests)"
    type: transient
    severity: low
    action: retry

  auth_errors:
    id: "auth"
    pattern: "(unauthorized|403|401|permission.?denied|access.?denied)"
    type: permission
    severity: high
    action: escalate

  validation_errors:
    id: "validation"
    pattern: "(invalid|validation.?failed|schema.?error|type.?error)"
    type: validation
    severity: medium
    action: escalate

  timeout_errors:
    id: "timeout"
    pattern: "(timeout|timed.?out|deadline.?exceeded)"
    type: timeout
    severity: medium
    action: retry

  resource_errors:
    id: "resource"
    pattern: "(not.?found|404|missing|does.?not.?exist)"
    type: permanent
    severity: high
    action: escalate
# Flow integration
# Informational section showing where the error handling configuration sits
# inside a flow definition.
flow_integration:
  description: "How to add error handling to flow YAML"
  # Literal block scalar. The sample includes both keys the schema marks as
  # required (`strategy` and `classification`) so that it validates as written.
  example: |
    # In flow command definition
    flow:
      name: my-flow
      error_handling:
        strategy: retry_then_escalate
        classification:
          categories:
            - id: "network"
              pattern: "(ECONNREFUSED|ETIMEDOUT|ENOTFOUND|network|socket)"
              type: transient
              action: retry
        retry:
          max_attempts: 3
          backoff_strategy: exponential
        escalation:
          channels: [cli, issue_comment]
        checkpoint:
          auto_checkpoint: true
          checkpoint_on: [phase_start]
# Ralph integration
ralph_integration:
  description: "Error handling in Ralph loops"
  # Steps listed for a failed iteration; each item is a single-key mapping
  # acting as an on/off flag.
  on_iteration_failure:
    - increment_failure_count: true
    - check_retry_budget: true
    - checkpoint_state: true
    - analyze_error_pattern: true
  # Recovery action per error type.
  # NOTE(review): only four of the seven ErrorCategory types are mapped;
  # `user`, `system` and `permission` have no entry - confirm the intended
  # handling for those.
  recovery_actions:
    transient: retry_with_backoff
    validation: adjust_and_retry
    permanent: skip_or_escalate
    timeout: extend_timeout_and_retry
# Examples
# Sample configurations. `name` is a label only: it is not declared under
# `properties`, and is accepted because the schema does not restrict
# additional properties.
examples:
  - name: "Standard flow error handling"
    strategy: retry_then_escalate
    classification:
      categories:
        - id: "api_error"
          pattern: "(API|api).*(error|failed)"
          type: transient
          action: retry
      default_category: unknown
    retry:
      enabled: true
      max_attempts: 3
      backoff_strategy: exponential
      initial_delay_ms: 1000
    escalation:
      enabled: true
      trigger_after: 3
      channels: [cli, issue_comment]
# References
# Values stay quoted: a plain scalar may not start with `@`, and a leading `#`
# would be read as a comment.
references:
  research:
    - "@.aiwg/research/findings/REF-001-production-agentic.md"
  implementation:
    - "#110"
  related:
    - "@agentic/code/frameworks/sdlc-complete/schemas/flows/hitl-gate.yaml"
    - "@agentic/code/addons/ralph/schemas/reflection-memory.json"