#!/usr/bin/env python3
"""
Dead Man Switch Fallback Engine

When the dead man switch triggers (zero commits for 2+ hours, model down,
Gitea unreachable, etc.), this script diagnoses the failure and applies
common sense fallbacks automatically.

Fallback chain:
1. Primary model (Anthropic) down -> switch config to local-llama.cpp
2. Gitea unreachable -> cache issues locally, retry on recovery
3. VPS agents down -> alert + lazarus protocol
4. Local llama.cpp down -> try Ollama, then alert-only mode
5. All inference dead -> safe mode (cron pauses, alert Alexander)

Each fallback is reversible. Recovery auto-restores the previous config.
"""
import os
import sys
import json
import subprocess
import time
import yaml
import shutil
from pathlib import Path
from datetime import datetime, timedelta

HERMES_HOME = Path(os.environ.get("HERMES_HOME", os.path.expanduser("~/.hermes")))
CONFIG_PATH = HERMES_HOME / "config.yaml"
FALLBACK_STATE = HERMES_HOME / "deadman-fallback-state.json"
BACKUP_CONFIG = HERMES_HOME / "config.yaml.pre-fallback"
FORGE_URL = "https://forge.alexanderwhitestone.com"

def load_config():
    """Load and parse the Hermes YAML config."""
    with open(CONFIG_PATH) as f:
        return yaml.safe_load(f)

def save_config(cfg):
    """Write *cfg* back to the Hermes YAML config (block style)."""
    with open(CONFIG_PATH, "w") as f:
        yaml.dump(cfg, f, default_flow_style=False)

def load_state():
    """Load persisted fallback state, or a fresh default if none exists."""
    if FALLBACK_STATE.exists():
        with open(FALLBACK_STATE) as f:
            return json.load(f)
    return {"active_fallbacks": [], "last_check": None, "recovery_pending": False}

def save_state(state):
    """Stamp *state* with the current time and persist it as JSON."""
    state["last_check"] = datetime.now().isoformat()
    with open(FALLBACK_STATE, "w") as f:
        json.dump(state, f, indent=2)

def run(cmd, timeout=10):
    """Run a shell command; return (returncode, stdout, stderr).

    Timeouts and launch failures are mapped to returncode -1 so callers
    never have to catch exceptions from health checks.
    """
    try:
        r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)
        return r.returncode, r.stdout.strip(), r.stderr.strip()
    except subprocess.TimeoutExpired:
        return -1, "", "timeout"
    except Exception as e:
        return -1, "", str(e)

# ─── HEALTH CHECKS ───

def check_anthropic():
    """Can we reach Anthropic API?

    Returns (ok, message). Looks up the API key in the environment first,
    then falls back to scanning known .env files.
    """
    key = os.environ.get("ANTHROPIC_API_KEY", "")
    if not key:
        # Check multiple .env locations (paths differ only when HERMES_HOME
        # is overridden away from ~/.hermes).
        for env_path in [HERMES_HOME / ".env", Path.home() / ".hermes" / ".env"]:
            if env_path.exists():
                # Context manager so the file handle is closed (the original
                # iterated a bare open() and leaked it).
                with open(env_path) as f:
                    for line in f:
                        line = line.strip()
                        # BUG FIX: original had a corrupted, unterminated
                        # string literal on this line (SyntaxError).
                        if line.startswith("ANTHROPIC_API_KEY="):
                            key = line.split("=", 1)[1].strip().strip('"').strip("'")
                            break
            if key:
                break
    if not key:
        return False, "no API key"
    # NOTE(review): the key is interpolated into a shell string, so it is
    # briefly visible in the process list while curl runs. Consider passing
    # headers via a file or using subprocess with shell=False.
    code, out, err = run(
        f'curl -s -o /dev/null -w "%{{http_code}}" -H "x-api-key: {key}" '
        f'-H "anthropic-version: 2023-06-01" '
        f'https://api.anthropic.com/v1/messages -X POST '
        f'-H "content-type: application/json" '
        f'-d \'{{"model":"claude-haiku-4-5-20251001","max_tokens":1,"messages":[{{"role":"user","content":"ping"}}]}}\' ',
        timeout=15
    )
    # 429 still means the API answered (rate-limited) — treat as reachable.
    if code == 0 and out in ("200", "429"):
        return True, f"HTTP {out}"
    return False, f"HTTP {out} err={err[:80]}"

def check_local_llama():
    """Is local llama.cpp serving?"""
    code, out, err = run("curl -s http://localhost:8081/v1/models", timeout=5)
    if code == 0 and "hermes" in out.lower():
        return True, "serving"
    return False, f"exit={code}"

def check_ollama():
    """Is Ollama running?"""
    code, out, err = run("curl -s http://localhost:11434/api/tags", timeout=5)
    if code == 0 and "models" in out:
        return True, "running"
    return False, f"exit={code}"

def check_gitea():
    """Can we reach the Forge?"""
    token_path = Path.home() / ".config" / "gitea" / "timmy-token"
    if not token_path.exists():
        return False, "no token"
    token = token_path.read_text().strip()
    # NOTE(review): token is interpolated into a shell string — same
    # process-list exposure caveat as check_anthropic().
    code, out, err = run(
        f'curl -s -o /dev/null -w "%{{http_code}}" -H "Authorization: token {token}" '
        f'"{FORGE_URL}/api/v1/user"',
        timeout=10
    )
    if code == 0 and out == "200":
        return True, "reachable"
    return False, f"HTTP {out}"

def check_vps(ip, name):
    """Can we SSH into a VPS? *name* is accepted for symmetry with callers."""
    code, out, err = run(f"ssh -o ConnectTimeout=5 root@{ip} 'echo alive'", timeout=10)
    if code == 0 and "alive" in out:
        return True, "alive"
    # Plain string (original used an f-string with no placeholders).
    return False, "unreachable"

# ─── FALLBACK ACTIONS ───

def fallback_to_local_model(cfg):
    """Switch primary model from Anthropic to local llama.cpp."""
    # Only snapshot once, so repeated fallbacks never clobber the true
    # pre-fallback config.
    if not BACKUP_CONFIG.exists():
        shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)

    cfg["model"]["provider"] = "local-llama.cpp"
    cfg["model"]["default"] = "hermes3"
    save_config(cfg)
    return "Switched primary model to local-llama.cpp/hermes3"

def fallback_to_ollama(cfg):
    """Switch to Ollama if llama.cpp is also down."""
    if not BACKUP_CONFIG.exists():
        shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)

    cfg["model"]["provider"] = "ollama"
    cfg["model"]["default"] = "gemma4:latest"
    save_config(cfg)
    return "Switched primary model to ollama/gemma4:latest"

def enter_safe_mode(state):
    """Pause all non-essential cron jobs, alert Alexander."""
    state["safe_mode"] = True
    state["safe_mode_entered"] = datetime.now().isoformat()
    save_state(state)
    return "SAFE MODE: All inference down. Cron jobs should be paused. Alert Alexander."
def restore_config():
    """Restore pre-fallback config when primary recovers."""
    if BACKUP_CONFIG.exists():
        shutil.copy2(BACKUP_CONFIG, CONFIG_PATH)
        BACKUP_CONFIG.unlink()
        return "Restored original config from backup"
    return "No backup config to restore"

# ─── MAIN DIAGNOSIS AND FALLBACK ENGINE ───

# Severity ladder for results["status"]; later entries are worse.
_STATUS_SEVERITY = [
    "healthy",
    "recovered",
    "degraded_gitea",
    "degraded_local",
    "degraded_ollama",
    "safe_mode",
]

def _worse_status(a, b):
    """Return the more severe of two status strings (unknown ranks lowest)."""
    def rank(s):
        return _STATUS_SEVERITY.index(s) if s in _STATUS_SEVERITY else 0
    return max(a, b, key=rank)

def diagnose_and_fallback():
    """Run all health checks, apply/undo fallbacks, and return a report dict.

    The report has keys: timestamp, checks (per-system {ok, msg}),
    actions (human-readable strings), and status (one of _STATUS_SEVERITY).
    Persists updated fallback state as a side effect.
    """
    state = load_state()
    cfg = load_config()

    results = {
        "timestamp": datetime.now().isoformat(),
        "checks": {},
        "actions": [],
        "status": "healthy"
    }

    # Check all systems
    anthropic_ok, anthropic_msg = check_anthropic()
    results["checks"]["anthropic"] = {"ok": anthropic_ok, "msg": anthropic_msg}

    llama_ok, llama_msg = check_local_llama()
    results["checks"]["local_llama"] = {"ok": llama_ok, "msg": llama_msg}

    ollama_ok, ollama_msg = check_ollama()
    results["checks"]["ollama"] = {"ok": ollama_ok, "msg": ollama_msg}

    gitea_ok, gitea_msg = check_gitea()
    results["checks"]["gitea"] = {"ok": gitea_ok, "msg": gitea_msg}

    # VPS checks
    vpses = [
        ("167.99.126.228", "Allegro"),
        ("143.198.27.163", "Ezra"),
        ("159.203.146.185", "Bezalel"),
    ]
    for ip, name in vpses:
        vps_ok, vps_msg = check_vps(ip, name)
        results["checks"][f"vps_{name.lower()}"] = {"ok": vps_ok, "msg": vps_msg}

    current_provider = cfg.get("model", {}).get("provider", "anthropic")

    # ─── FALLBACK LOGIC ───

    # Case 1: Primary (Anthropic) down, local available
    if not anthropic_ok and current_provider == "anthropic":
        if llama_ok:
            msg = fallback_to_local_model(cfg)
            results["actions"].append(msg)
            # Guard against duplicate entries — recovery only removes one.
            if "anthropic->local-llama" not in state["active_fallbacks"]:
                state["active_fallbacks"].append("anthropic->local-llama")
            results["status"] = "degraded_local"
        elif ollama_ok:
            msg = fallback_to_ollama(cfg)
            results["actions"].append(msg)
            if "anthropic->ollama" not in state["active_fallbacks"]:
                state["active_fallbacks"].append("anthropic->ollama")
            results["status"] = "degraded_ollama"
        else:
            msg = enter_safe_mode(state)
            results["actions"].append(msg)
            results["status"] = "safe_mode"

    # Case 2: Already on fallback, check if primary recovered
    elif anthropic_ok and "anthropic->local-llama" in state.get("active_fallbacks", []):
        msg = restore_config()
        results["actions"].append(msg)
        state["active_fallbacks"].remove("anthropic->local-llama")
        results["status"] = "recovered"
    elif anthropic_ok and "anthropic->ollama" in state.get("active_fallbacks", []):
        msg = restore_config()
        results["actions"].append(msg)
        state["active_fallbacks"].remove("anthropic->ollama")
        results["status"] = "recovered"

    # Case 3: Gitea down — just flag it, work locally. A Gitea outage must
    # not mask a more severe model-fallback status, hence the merge.
    if not gitea_ok:
        results["actions"].append("WARN: Gitea unreachable — work cached locally until recovery")
        if "gitea_down" not in state.get("active_fallbacks", []):
            state["active_fallbacks"].append("gitea_down")
        results["status"] = _worse_status(results["status"], "degraded_gitea")
    elif "gitea_down" in state.get("active_fallbacks", []):
        state["active_fallbacks"].remove("gitea_down")
        results["actions"].append("Gitea recovered — resume normal operations")

    # Case 4: VPS agents down
    for ip, name in vpses:
        key = f"vps_{name.lower()}"
        if not results["checks"][key]["ok"]:
            results["actions"].append(f"ALERT: {name} VPS ({ip}) unreachable — lazarus protocol needed")

    save_state(state)
    return results

if __name__ == "__main__":
    results = diagnose_and_fallback()
    print(json.dumps(results, indent=2))

    # Exit codes for cron integration: 2 = safe mode, 1 = degraded, 0 = ok.
    if results["status"] == "safe_mode":
        sys.exit(2)
    elif results["status"].startswith("degraded"):
        sys.exit(1)
    else:
        sys.exit(0)