[RESILIENCE] Define per-agent fallback portfolios and routing doctrine (#170)

2026-04-04 21:40:36 +00:00
parent 2142d20129
commit ff7e22dcc8
3 changed files with 537 additions and 3 deletions
--- a/fallback-portfolios.yaml
+++ b/fallback-portfolios.yaml
@@ -0,0 +1,284 @@
+schema_version: 1  # metadata header for the fallback-portfolio doctrine file
+status: proposed
+runtime_wiring: false  # NOTE(review): presumably nothing consumes this at runtime yet - confirm
+owner: timmy-config
+
+ownership:  # scope boundary: what this file claims (owns) and disclaims (does_not_own)
+  owns:
+    - routing doctrine for task classes  # see routing and promotion_rules
+    - sidecar-readable per-agent fallback portfolios  # see agents.*.portfolio
+    - degraded-mode capability floors  # see role_classes.*.degraded_mode
+  does_not_own:
+    - live queue state outside Gitea truth
+    - launchd or loop process state
+    - ad hoc worktree history
+
+policy:  # invariants the portfolios in this file are expected to satisfy
+  require_four_slots_for_critical_agents: true  # slots: primary, fallback1, fallback2, terminal
+  terminal_fallback_must_be_usable: true  # each agent terminal slot lists usable_output
+  forbid_synchronized_fleet_degradation: true  # NOTE(review): two agents share the anthropic primary - confirm OK
+  forbid_human_token_fallbacks: true
+  anti_correlation_rule: no two critical agents may share the same primary+fallback1 pair  # see cross_checks
+
+sensitive_control_surfaces:  # what "sensitive control surface" refers to in deny lists and promotion_rules
+  - SOUL.md
+  - config.yaml
+  - deploy.sh
+  - tasks.py
+  - playbooks/  # NOTE(review): trailing "/" entries are presumably whole directories - confirm
+  - cron/
+  - memories/
+  - skins/
+  - training/
+
+role_classes:  # per class: current_surfaces, task_classes, degraded_mode allow/deny lists
+  judgment:  # role_class of agents.triage-coordinator and agents.pr-reviewer
+    current_surfaces:
+      - playbooks/issue-triager.yaml
+      - playbooks/pr-reviewer.yaml
+      - playbooks/verified-logic.yaml
+    task_classes:  # each entry is also a key in the top-level routing map
+      - issue-triage
+      - queue-routing
+      - pr-review
+      - proof-check
+      - governance-review
+    degraded_mode:  # keyed by portfolio slot name (fallback2, terminal)
+      fallback2:
+        allowed:
+          - classify backlog
+          - summarize risk
+          - produce draft routing plans
+          - leave bounded labels or comments with evidence
+        denied:
+          - merge pull requests
+          - close or rewrite governing issues or PRs
+          - mutate sensitive control surfaces
+          - bulk-reassign the fleet
+          - silently change routing policy
+      terminal:
+        lane: report-and-route  # NOTE(review): pr-reviewer terminal uses a different lane name - confirm
+        allowed:
+          - classify backlog
+          - summarize risk
+          - produce draft routing artifacts
+        denied:
+          - merge pull requests
+          - bulk-reassign the fleet
+          - mutate sensitive control surfaces
+
+  builder:  # role_class of agents.builder-main
+    current_surfaces:  # same three playbooks as agents.builder-main.current_playbooks
+      - playbooks/bug-fixer.yaml
+      - playbooks/test-writer.yaml
+      - playbooks/refactor-specialist.yaml
+    task_classes:  # each entry is also a key in the top-level routing map
+      - bug-fix
+      - test-writing
+      - refactor
+      - bounded-docs-change
+    degraded_mode:  # keyed by portfolio slot name (fallback2, terminal)
+      fallback2:
+        allowed:
+          - reversible single-issue changes
+          - narrow docs fixes
+          - test scaffolds and reproducers
+        denied:
+          - cross-repo changes
+          - sensitive control-surface edits
+          - merge or release actions
+      terminal:
+        lane: narrow-patch  # same name as agents.builder-main.portfolio.terminal.lane
+        allowed:
+          - single-issue small patch
+          - reproducer test
+          - docs-only repair
+        denied:
+          - sensitive control-surface edits
+          - multi-file architecture work
+          - irreversible actions
+
+  wolf_bulk:  # role_class of agents.wolf-sweeper
+    current_surfaces:  # docs rather than playbooks, unlike the other two role classes
+      - docs/automation-inventory.md
+      - FALSEWORK.md
+    task_classes:  # each entry is also a key in the top-level routing map
+      - docs-inventory
+      - log-summarization
+      - queue-hygiene
+      - repetitive-small-diff
+      - research-sweep
+    degraded_mode:  # keyed by portfolio slot name (fallback2, terminal)
+      fallback2:
+        allowed:
+          - gather evidence
+          - refresh inventories
+          - summarize logs
+          - propose labels or routes
+        denied:
+          - multi-repo branch fanout
+          - mass agent assignment
+          - sensitive control-surface edits
+          - irreversible queue mutation
+      terminal:
+        lane: gather-and-summarize  # same name as agents.wolf-sweeper.portfolio.terminal.lane
+        allowed:
+          - inventory refresh
+          - evidence bundles
+          - summaries
+        denied:
+          - multi-repo branch fanout
+          - mass agent assignment
+          - sensitive control-surface edits
+
+routing:  # task class -> role class; keys mirror role_classes.*.task_classes
+  issue-triage: judgment
+  queue-routing: judgment
+  pr-review: judgment
+  proof-check: judgment
+  governance-review: judgment
+  bug-fix: builder
+  test-writing: builder
+  refactor: builder
+  bounded-docs-change: builder
+  docs-inventory: wolf_bulk
+  log-summarization: wolf_bulk
+  queue-hygiene: wolf_bulk
+  repetitive-small-diff: wolf_bulk
+  research-sweep: wolf_bulk
+
+promotion_rules:
+  - If a wolf/bulk task touches a sensitive control surface, promote it to judgment.
+  - If a builder task expands beyond 5 files, requires architecture review, or needs multi-repo coordination, promote it to judgment.
+  - If a terminal lane cannot produce a usable artifact, the portfolio is invalid and must be redesigned before wiring.
+
+agents:  # per-agent portfolios; each has four slots: primary, fallback1, fallback2, terminal
+  triage-coordinator:
+    role_class: judgment  # key into role_classes
+    critical: true
+    current_playbooks:
+      - playbooks/issue-triager.yaml
+    portfolio:
+      primary:
+        provider: anthropic
+        model: claude-opus-4-6
+        lane: full-judgment
+      fallback1:
+        provider: openai-codex
+        model: codex
+        lane: high-judgment
+      fallback2:
+        provider: gemini
+        model: gemini-2.5-pro
+        lane: bounded-judgment
+      terminal:
+        provider: ollama
+        model: hermes3:latest  # same provider/model as the wolf-sweeper terminal slot
+        lane: report-and-route  # same name as role_classes.judgment.degraded_mode.terminal.lane
+        local_capable: true
+        usable_output:  # artifacts this terminal lane is expected to still produce
+          - backlog classification
+          - routing draft
+          - risk summary
+  pr-reviewer:
+    role_class: judgment  # key into role_classes
+    critical: true
+    current_playbooks:
+      - playbooks/pr-reviewer.yaml
+    portfolio:
+      primary:
+        provider: anthropic
+        model: claude-opus-4-6  # same primary as triage-coordinator; fallback1 differs
+        lane: full-review
+      fallback1:
+        provider: gemini
+        model: gemini-2.5-pro
+        lane: high-review
+      fallback2:
+        provider: grok  # NOTE(review): other agents use "groq" - confirm this spelling is intended
+        model: grok-3-mini-fast
+        lane: comment-only-review
+      terminal:
+        provider: openrouter
+        model: openai/gpt-4.1-mini
+        lane: low-stakes-diff-summary  # NOTE(review): role_classes.judgment terminal lane is report-and-route - confirm
+        local_capable: false  # the only terminal slot in this file that is not local_capable
+        usable_output:  # artifacts this terminal lane is expected to still produce
+          - diff risk summary
+          - explicit uncertainty notes
+          - merge-block recommendation
+  builder-main:
+    role_class: builder  # key into role_classes
+    critical: true
+    current_playbooks:  # same three playbooks as role_classes.builder.current_surfaces
+      - playbooks/bug-fixer.yaml
+      - playbooks/test-writer.yaml
+      - playbooks/refactor-specialist.yaml
+    portfolio:
+      primary:
+        provider: openai-codex
+        model: codex
+        lane: full-builder
+      fallback1:
+        provider: kimi-coding
+        model: kimi-k2.5
+        lane: bounded-builder
+      fallback2:
+        provider: groq
+        model: llama-3.3-70b-versatile
+        lane: small-patch-builder
+      terminal:
+        provider: custom_provider  # the only slot in this file that also sets provider_name
+        provider_name: Local llama.cpp
+        model: hermes4:14b
+        lane: narrow-patch  # same name as role_classes.builder.degraded_mode.terminal.lane
+        local_capable: true
+        usable_output:  # artifacts this terminal lane is expected to still produce
+          - small patch
+          - reproducer test
+          - docs repair
+  wolf-sweeper:
+    role_class: wolf_bulk  # key into role_classes
+    critical: true
+    current_world_state:  # NOTE(review): the other agents use current_playbooks here - confirm key name
+      - docs/automation-inventory.md
+    portfolio:
+      primary:
+        provider: gemini
+        model: gemini-2.5-flash
+        lane: fast-bulk
+      fallback1:
+        provider: groq
+        model: llama-3.3-70b-versatile
+        lane: fast-bulk-backup
+      fallback2:
+        provider: openrouter
+        model: openai/gpt-4.1-mini
+        lane: bounded-bulk-summary
+      terminal:
+        provider: ollama
+        model: hermes3:latest  # same provider/model as the triage-coordinator terminal slot
+        lane: gather-and-summarize  # same name as role_classes.wolf_bulk.degraded_mode.terminal.lane
+        local_capable: true
+        usable_output:  # artifacts this terminal lane is expected to still produce
+          - inventory refresh
+          - evidence bundle
+          - summary comment
+cross_checks:  # restates agents.*.portfolio data as provider/model strings
+  unique_primary_fallback1_pairs:  # per agent: [primary, fallback1]; backs policy.anti_correlation_rule
+    triage-coordinator:
+      - anthropic/claude-opus-4-6
+      - openai-codex/codex
+    pr-reviewer:
+      - anthropic/claude-opus-4-6  # shared primary with triage-coordinator; the pair still differs
+      - gemini/gemini-2.5-pro
+    builder-main:
+      - openai-codex/codex
+      - kimi-coding/kimi-k2.5
+    wolf-sweeper:
+      - gemini/gemini-2.5-flash
+      - groq/llama-3.3-70b-versatile