UNPKG

aiwg

Version:

Cognitive architecture for AI-augmented software development with structured memory, ensemble validation, and closed-loop correction. FAIR-aligned artifacts, 84% cost reduction via human-in-the-loop, standards adopted by 100+ organizations.

361 lines (333 loc) 9.52 kB
# Flow Error Handling Configuration Schema
# Based on REF-001 Production Agentic (error handling patterns)
# Issue: #110
#
# JSON Schema (draft 2020-12), written as YAML, describing the error-handling
# configuration of a flow command: a primary strategy, error classification
# rules, and optional retry / fallback / escalation / checkpoint / logging
# sections.
#
# NOTE(review): nesting and the line breaks inside the two literal blocks
# (`description` and `flow_integration.example`) were reconstructed from a
# whitespace-collapsed copy of this file - confirm against upstream.

$schema: "https://json-schema.org/draft/2020-12/schema"
$id: "https://aiwg.io/schemas/error-handling/v1"
title: "Flow Error Handling Configuration Schema"
description: |
  Structured error handling configuration for flow commands, enabling
  retry patterns, fallback agents, and graceful degradation.
  Based on production agentic best practices.

type: object

# Only these two sections are mandatory; every other property is optional.
required:
  - strategy
  - classification

properties:
  strategy:
    type: string
    enum:
      - retry_then_escalate  # Retry, then escalate to human
      - retry_then_fallback  # Retry, then use fallback agent
      - fail_fast            # Fail immediately, no retry
      - graceful_degrade     # Continue with reduced functionality
      - checkpoint_recover   # Restore from last checkpoint
    description: "Primary error handling strategy"

  classification:
    type: object
    required: [categories]
    description: "How to classify errors"
    properties:
      categories:
        type: array
        items:
          $ref: "#/$defs/ErrorCategory"
      default_category:
        type: string
        default: "unknown"

  # The four sections below delegate their shape to sub-schemas in $defs.
  retry:
    $ref: "#/$defs/RetryConfig"
    description: "Retry configuration"

  fallback:
    $ref: "#/$defs/FallbackConfig"
    description: "Fallback agent configuration"

  escalation:
    $ref: "#/$defs/EscalationConfig"
    description: "Human escalation configuration"

  checkpoint:
    $ref: "#/$defs/CheckpointConfig"
    description: "Checkpoint and recovery configuration"

  logging:
    type: object
    properties:
      level:
        type: string
        enum: [error, warn, info, debug]
        default: error
      include_stack:
        type: boolean
        default: true
      include_context:
        type: boolean
        default: true
      destination:
        type: string
        default: ".aiwg/logs/errors.log"
    description: "Error logging configuration"

# Reusable sub-schemas referenced from `properties` via $ref.
$defs:
  # One classification rule: a regex over error messages plus the error type
  # it maps to. `severity`, `action` and `message_template` are optional.
  ErrorCategory:
    type: object
    required:
      - id
      - pattern
      - type
    properties:
      id:
        type: string
        description: "Category identifier"
      pattern:
        type: string
        description: "Regex pattern to match error messages"
      type:
        type: string
        enum:
          - transient   # Temporary failures (network, rate limit)
          - permanent   # Unrecoverable (invalid input, missing resource)
          - user        # User action needed
          - system      # System/infrastructure error
          - timeout     # Operation timed out
          - validation  # Validation failure
          - permission  # Permission denied
        description: "Error type for handling decision"
      severity:
        type: string
        enum: [critical, high, medium, low]
        default: medium
      action:
        type: string
        enum: [retry, fallback, escalate, abort, ignore]
        description: "Default action for this category"
      message_template:
        type: string
        description: "User-friendly message template"

  # Retry behaviour: attempt budget, delay schedule, and which error types
  # are retried.
  RetryConfig:
    type: object
    properties:
      enabled:
        type: boolean
        default: true
      max_attempts:
        type: integer
        minimum: 1
        maximum: 10
        default: 3
        description: "Maximum retry attempts"
      initial_delay_ms:
        type: integer
        minimum: 0
        default: 1000
        description: "Initial delay before first retry"
      backoff_strategy:
        type: string
        enum:
          - none         # No delay increase
          - linear       # Add fixed delay each retry
          - exponential  # Double delay each retry
          - fibonacci    # Fibonacci sequence delays
        default: exponential
      # NOTE(review): unlike initial_delay_ms, no `minimum` is declared here,
      # so negative values validate - confirm whether that is intended.
      max_delay_ms:
        type: integer
        default: 30000
        description: "Maximum delay between retries"
      jitter:
        type: boolean
        default: true
        description: "Add random jitter to delays"
      # NOTE(review): items are unconstrained strings; the defaults match
      # values of ErrorCategory.type, but that enum is not enforced here.
      retry_on:
        type: array
        items:
          type: string
        default: ["transient", "timeout"]
        description: "Error types to retry on"

  # Fallback behaviour: per-agent primary/fallback pairs and an optional
  # model-tier fallback. Disabled by default.
  FallbackConfig:
    type: object
    properties:
      enabled:
        type: boolean
        default: false
      agents:
        type: array
        items:
          type: object
          properties:
            primary:
              type: string
              description: "Primary agent"
            fallback:
              type: string
              description: "Fallback agent"
            conditions:
              type: array
              items:
                type: string
              description: "When to trigger fallback"
        description: "Agent fallback mappings"
      model_fallback:
        type: object
        properties:
          enabled:
            type: boolean
            default: false
          primary:
            type: string
            enum: [opus, sonnet, haiku]
          fallback:
            type: string
            enum: [opus, sonnet, haiku]
        description: "Model tier fallback"

  # Human escalation: failure threshold, notification channels and message
  # options. Enabled by default.
  EscalationConfig:
    type: object
    properties:
      enabled:
        type: boolean
        default: true
      # NOTE(review): no `minimum` is declared - zero or negative thresholds
      # validate; confirm whether that is intended.
      trigger_after:
        type: integer
        default: 3
        description: "Escalate after N failures"
      channels:
        type: array
        items:
          type: string
          enum: [cli, issue_comment, slack, email]
        default: [cli, issue_comment]
      include_context:
        type: boolean
        default: true
        description: "Include error context in escalation"
      template:
        type: string
        description: "Escalation message template"
      auto_create_issue:
        type: boolean
        default: false
        description: "Auto-create tracking issue"

  # Checkpointing: when checkpoints are taken, where they are stored, how
  # many are kept, and how one is chosen on recovery. Enabled by default.
  CheckpointConfig:
    type: object
    properties:
      enabled:
        type: boolean
        default: true
      auto_checkpoint:
        type: boolean
        default: true
        description: "Automatically checkpoint before risky operations"
      checkpoint_dir:
        type: string
        default: ".aiwg/checkpoints/"
      # NOTE(review): no `minimum` is declared - confirm whether zero or
      # negative retention counts should be rejected.
      retention_count:
        type: integer
        default: 5
        description: "Number of checkpoints to retain"
      checkpoint_on:
        type: array
        items:
          type: string
          enum:
            - phase_start
            - artifact_complete
            - before_external_call
            - iteration_boundary
        default: [phase_start, artifact_complete]
      recovery_strategy:
        type: string
        enum:
          - last_checkpoint    # Restore most recent
          - select_checkpoint  # Let user choose
          - smart_rollback     # Analyze and select best
        default: last_checkpoint

# Predefined error patterns
# Each entry carries the ErrorCategory fields (id, pattern, type, severity,
# action). `common_patterns` is not a standard JSON Schema keyword -
# presumably it is read by tooling rather than by validators; confirm.
common_patterns:
  network_errors:
    id: "network"
    pattern: "(ECONNREFUSED|ETIMEDOUT|ENOTFOUND|network|socket)"
    type: transient
    severity: medium
    action: retry

  rate_limit:
    id: "rate_limit"
    pattern: "(rate.?limit|429|too.?many.?requests)"
    type: transient
    severity: low
    action: retry

  auth_errors:
    id: "auth"
    pattern: "(unauthorized|403|401|permission.?denied|access.?denied)"
    type: permission
    severity: high
    action: escalate

  validation_errors:
    id: "validation"
    pattern: "(invalid|validation.?failed|schema.?error|type.?error)"
    type: validation
    severity: medium
    action: escalate

  timeout_errors:
    id: "timeout"
    pattern: "(timeout|timed.?out|deadline.?exceeded)"
    type: timeout
    severity: medium
    action: retry

  resource_errors:
    id: "resource"
    pattern: "(not.?found|404|missing|does.?not.?exist)"
    type: permanent
    severity: high
    action: escalate

# Flow integration
# `example` is a literal block: its content is documentation text showing an
# `error_handling` section inside a flow definition, not part of this schema.
flow_integration:
  description: "How to add error handling to flow YAML"
  example: |
    # In flow command definition
    flow:
      name: my-flow
      error_handling:
        strategy: retry_then_escalate
        retry:
          max_attempts: 3
          backoff_strategy: exponential
        escalation:
          channels: [cli, issue_comment]
        checkpoint:
          auto_checkpoint: true
          checkpoint_on: [phase_start]

# Ralph integration
# `recovery_actions` has entries for four of the seven ErrorCategory types
# (transient, validation, permanent, timeout); user, system and permission
# have none.
ralph_integration:
  description: "Error handling in Ralph loops"
  on_iteration_failure:
    - increment_failure_count: true
    - check_retry_budget: true
    - checkpoint_state: true
    - analyze_error_pattern: true
  recovery_actions:
    transient: retry_with_backoff
    validation: adjust_and_retry
    permanent: skip_or_escalate
    timeout: extend_timeout_and_retry

# Examples
# The sample carries a `name` key that is not declared under `properties`;
# the schema sets no `additionalProperties`, so extra keys are not rejected.
examples:
  - name: "Standard flow error handling"
    strategy: retry_then_escalate
    classification:
      categories:
        - id: "api_error"
          pattern: "(API|api).*(error|failed)"
          type: transient
          action: retry
      default_category: unknown
    retry:
      enabled: true
      max_attempts: 3
      backoff_strategy: exponential
      initial_delay_ms: 1000
    escalation:
      enabled: true
      trigger_after: 3
      channels: [cli, issue_comment]

# References
references:
  research:
    - "@.aiwg/research/findings/REF-001-production-agentic.md"
  implementation:
    - "#110"
  related:
    - "@agentic/code/frameworks/sdlc-complete/schemas/flows/hitl-gate.yaml"
    - "@agentic/code/addons/ralph/schemas/reflection-memory.json"