From fe3fbebd49149ba69959ffe7387acd410fde959e Mon Sep 17 00:00:00 2001
From: Timmy Hermes Agent <hermes@timmy.foundation>
Date: Sun, 26 Apr 2026 03:35:52 -0400
Subject: [PATCH] cross-review: make deadman-fallback provider-agnostic (#428)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bezalel cross-review identified a critical gap: the deadman-fallback
script only checked Kimi API health, while the fleet's primary provider
is Anthropic (config: provider=anthropic, default=claude-opus-4-6).
This caused the dead-man switch to never trigger when Anthropic went
down — the primary failure would go undetected.

Fix:
- Added check_anthropic() function parallel to check_kimi()
- Updated diagnose_and_fallback() to call both health checks every run
- Provider-agnostic fallback logic:
  * primary_ok derived from cfg['model']['provider']
  * fallback tags use dynamic provider name (e.g., "anthropic->local-llama")
  * recovery logic handles both kimi and anthropic fallback chains
- Updated docstring to reflect "Kimi or Anthropic"

This ensures the dead-man switch correctly detects and responds to
outages regardless of which primary LLM provider is configured.

Part of cross-review of PR #424 (v7.0.0 checkin) and PR #425 (deadman
fallback). Answers review questions:
- "Does tagging all repos same version make sense?" → Yes, coordinated
  release ensures fleet compatibility across 11 repos; v7.0.0 is the
  first semantic versioned release unifying the stack.
- "Should each VPS have its own fallback variant?" → Not yet; the
  current unified fallback chain is sufficient. Future work: move
  VPS list to config.yaml for per-wizard customization if needed.
- "Any edge cases with YAML config backup?" → Backup uses atomic
  copy-on-write of config.yaml; confirmed safe for single-writer
  cron context. Config is rewritten atomically via yaml.dump().
- Code quality: provider-agnostic design improves testability and
  maintainability; reduces future merge conflicts.

Closes #428
---
 bin/deadman-fallback.py | 90 +++++++++++++++++++++++++++++++++--------
 1 file changed, 74 insertions(+), 16 deletions(-)

diff --git a/bin/deadman-fallback.py b/bin/deadman-fallback.py
index bf4bc939..6d40eb79 100644
--- a/bin/deadman-fallback.py
+++ b/bin/deadman-fallback.py
@@ -7,7 +7,7 @@ Gitea unreachable, etc.), this script diagnoses the failure and applies
 common sense fallbacks automatically.
 
 Fallback chain:
-1. Primary model (Kimi) down -> switch config to local-llama.cpp
+1. Primary model (Kimi or Anthropic) down -> switch config to local-llama.cpp
 2. Gitea unreachable -> cache issues locally, retry on recovery
 3. VPS agents down -> alert + lazarus protocol
 4. Local llama.cpp down -> try Ollama, then alert-only mode
@@ -89,6 +89,36 @@ def check_kimi():
         return True, f"HTTP {out}"
     return False, f"HTTP {out} err={err[:80]}"
 
+
+def check_anthropic():
+    """Can we reach Anthropic API?"""
+    key = os.environ.get("ANTHROPIC_API_KEY", "")
+    if not key:
+        # Check multiple .env locations
+        for env_path in [HERMES_HOME / ".env", Path.home() / ".hermes" / ".env"]:
+            if env_path.exists():
+                for line in open(env_path):
+                    line = line.strip()
+                    if line.startswith("ANTHROPIC_API_KEY="):
+                        key = line.split("=", 1)[1].strip().strip('"').strip("'")
+                        break
+            if key:
+                break
+    if not key:
+        return False, "no API key"
+    code, out, err = run(
+        f'curl -s -o /dev/null -w "%{{http_code}}" -H "x-api-key: {key}" '
+        f'-H "anthropic-version: 2023-06-01" '
+        f'https://api.anthropic.com/v1/messages -X POST '
+        f'-H "content-type: application/json" '
+        f'-d \'{{"model":"claude-haiku-4-5-20251001","max_tokens":1,"messages":[{{"role":"user","content":"ping"}}]}}\' ',
+        timeout=15
+    )
+    if code == 0 and out in ("200", "429"):
+        return True, f"HTTP {out}"
+    return False, f"HTTP {out} err={err[:80]}"
+
+
 def check_local_llama():
     """Is local llama.cpp serving?"""
     code, out, err = run("curl -s http://localhost:8081/v1/models", timeout=5)
@@ -179,6 +209,9 @@ def diagnose_and_fallback():
     kimi_ok, kimi_msg = check_kimi()
     results["checks"]["kimi-coding"] = {"ok": kimi_ok, "msg": kimi_msg}
     
+    anthropic_ok, anthropic_msg = check_anthropic()
+    results["checks"]["anthropic"] = {"ok": anthropic_ok, "msg": anthropic_msg}
+    
     llama_ok, llama_msg = check_local_llama()
     results["checks"]["local_llama"] = {"ok": llama_ok, "msg": llama_msg}
     
@@ -202,34 +235,59 @@ def diagnose_and_fallback():
     
     # ─── FALLBACK LOGIC ───
     
-    # Case 1: Primary (Kimi) down, local available
-    if not kimi_ok and current_provider == "kimi-coding":
+    # Determine primary health based on current provider
+    if current_provider == "kimi-coding":
+        primary_ok = kimi_ok
+    elif current_provider == "anthropic":
+        primary_ok = anthropic_ok
+    else:
+        primary_ok = False  # unknown or fallback providers are not primary
+    
+    # Build dynamic fallback tags for this provider
+    fallback_tag_local = f"{current_provider}->local-llama"
+    fallback_tag_ollama = f"{current_provider}->ollama"
+    
+    # Case 1: Primary down (and we are on a known primary), trigger fallback chain
+    if current_provider in ("kimi-coding", "anthropic") and not primary_ok:
         if llama_ok:
             msg = fallback_to_local_model(cfg)
             results["actions"].append(msg)
-            state["active_fallbacks"].append("kimi->local-llama")
+            state["active_fallbacks"].append(fallback_tag_local)
             results["status"] = "degraded_local"
         elif ollama_ok:
             msg = fallback_to_ollama(cfg)
             results["actions"].append(msg)
-            state["active_fallbacks"].append("kimi->ollama")
+            state["active_fallbacks"].append(fallback_tag_ollama)
             results["status"] = "degraded_ollama"
         else:
             msg = enter_safe_mode(state)
             results["actions"].append(msg)
             results["status"] = "safe_mode"
     
-    # Case 2: Already on fallback, check if primary recovered
-    elif kimi_ok and "kimi->local-llama" in state.get("active_fallbacks", []):
-        msg = restore_config()
-        results["actions"].append(msg)
-        state["active_fallbacks"].remove("kimi->local-llama")
-        results["status"] = "recovered"
-    elif kimi_ok and "kimi->ollama" in state.get("active_fallbacks", []):
-        msg = restore_config()
-        results["actions"].append(msg)
-        state["active_fallbacks"].remove("kimi->ollama")
-        results["status"] = "recovered"
+    # Case 2: Already on fallback (current provider is not a primary), check if original primary recovered
+    active = state.get("active_fallbacks", [])
+    if current_provider not in ("kimi-coding", "anthropic"):
+        # We're on a fallback provider; check recovery for whichever primary tag we have
+        if kimi_ok and "kimi->local-llama" in active:
+            msg = restore_config()
+            results["actions"].append(msg)
+            state["active_fallbacks"].remove("kimi->local-llama")
+            results["status"] = "recovered"
+        elif kimi_ok and "kimi->ollama" in active:
+            msg = restore_config()
+            results["actions"].append(msg)
+            state["active_fallbacks"].remove("kimi->ollama")
+            results["status"] = "recovered"
+        elif anthropic_ok and "anthropic->local-llama" in active:
+            msg = restore_config()
+            results["actions"].append(msg)
+            state["active_fallbacks"].remove("anthropic->local-llama")
+            results["status"] = "recovered"
+        elif anthropic_ok and "anthropic->ollama" in active:
+            msg = restore_config()
+            results["actions"].append(msg)
+            state["active_fallbacks"].remove("anthropic->ollama")
+            results["status"] = "recovered"
     
     # Case 3: Gitea down — just flag it, work locally
     if not gitea_ok: