#!/usr/bin/env python3
"""
Dead Man Switch Fallback Engine

When the dead man switch triggers (zero commits for 2+ hours, model down,
Gitea unreachable, etc.), this script diagnoses the failure and applies
common sense fallbacks automatically.

Fallback chain:
1. Primary model (Anthropic) down -> switch config to local-llama.cpp
2. Gitea unreachable -> cache issues locally, retry on recovery
3. VPS agents down -> alert + lazarus protocol
4. Local llama.cpp down -> try Ollama, then alert-only mode
5. All inference dead -> safe mode (cron pauses, alert Alexander)

Each fallback is reversible. Recovery auto-restores the previous config.
"""
import os
import sys
import json
import subprocess
import time
import yaml
import shutil
from pathlib import Path
from datetime import datetime, timedelta

# All paths live under HERMES_HOME (defaults to ~/.hermes).
HERMES_HOME = Path(os.environ.get("HERMES_HOME", os.path.expanduser("~/.hermes")))
CONFIG_PATH = HERMES_HOME / "config.yaml"
FALLBACK_STATE = HERMES_HOME / "deadman-fallback-state.json"
BACKUP_CONFIG = HERMES_HOME / "config.yaml.pre-fallback"
FORGE_URL = "https://forge.alexanderwhitestone.com"

def load_config():
    """Load the active hermes config (YAML) from CONFIG_PATH."""
    with open(CONFIG_PATH) as f:
        return yaml.safe_load(f)

def save_config(cfg):
    """Write ``cfg`` back to CONFIG_PATH as block-style YAML."""
    with open(CONFIG_PATH, "w") as f:
        yaml.dump(cfg, f, default_flow_style=False)

def load_state():
    """Return the persisted fallback state, or a fresh default structure."""
    if FALLBACK_STATE.exists():
        with open(FALLBACK_STATE) as f:
            return json.load(f)
    return {"active_fallbacks": [], "last_check": None, "recovery_pending": False}

def save_state(state):
    """Persist ``state``, stamping ``last_check`` with the current local time."""
    state["last_check"] = datetime.now().isoformat()
    with open(FALLBACK_STATE, "w") as f:
        json.dump(state, f, indent=2)

def run(cmd, timeout=10):
    """Run ``cmd`` through the shell; return ``(returncode, stdout, stderr)``.

    Returns ``(-1, "", reason)`` on timeout or any other failure so callers
    never need their own try/except — this is a best-effort probe helper.
    """
    try:
        r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)
        return r.returncode, r.stdout.strip(), r.stderr.strip()
    except subprocess.TimeoutExpired:
        return -1, "", "timeout"
    except Exception as e:  # deliberately broad: probes must never raise
        return -1, "", str(e)

# ─── HEALTH CHECKS ───

def check_anthropic():
    """Can we reach Anthropic API? Returns ``(ok, detail_message)``."""
    key = os.environ.get("ANTHROPIC_API_KEY", "")
    if not key:
        # Check multiple .env locations. dict.fromkeys dedupes while keeping
        # order — both entries resolve to the same path when HERMES_HOME is
        # unset, so the original list probed one file twice.
        for env_path in dict.fromkeys([HERMES_HOME / ".env", Path.home() / ".hermes" / ".env"]):
            if env_path.exists():
                # read_text() instead of a bare open() so the handle is closed.
                for line in env_path.read_text().splitlines():
                    line = line.strip()
                    if line.startswith("ANTHROPIC_API_KEY="):
                        key = line.split("=", 1)[1].strip().strip('"').strip("'")
                        break
            if key:
                break
    if not key:
        return False, "no API key"
    # NOTE(review): the key is interpolated into the curl command line, so it
    # is briefly visible in the process list while the probe runs.
    code, out, err = run(
        f'curl -s -o /dev/null -w "%{{http_code}}" -H "x-api-key: {key}" '
        f'-H "anthropic-version: 2023-06-01" '
        f'https://api.anthropic.com/v1/messages -X POST '
        f'-H "content-type: application/json" '
        f'-d \'{{"model":"claude-haiku-4-5-20251001","max_tokens":1,"messages":[{{"role":"user","content":"ping"}}]}}\' ',
        timeout=15
    )
    # 429 (rate limited) still proves the API is up and the key is accepted.
    if code == 0 and out in ("200", "429"):
        return True, f"HTTP {out}"
    return False, f"HTTP {out} err={err[:80]}"

def check_local_llama():
    """Is local llama.cpp serving? Probes its OpenAI-style /v1/models endpoint."""
    code, out, err = run("curl -s http://localhost:8081/v1/models", timeout=5)
    if code == 0 and "hermes" in out.lower():
        return True, "serving"
    return False, f"exit={code}"

def check_ollama():
    """Is Ollama running? Probes its native /api/tags endpoint."""
    code, out, err = run("curl -s http://localhost:11434/api/tags", timeout=5)
    if code == 0 and "models" in out:
        return True, "running"
    return False, f"exit={code}"
def check_gitea():
    """Can we reach the Forge? Returns ``(ok, detail_message)``."""
    token_path = Path.home() / ".config" / "gitea" / "timmy-token"
    if not token_path.exists():
        return False, "no token"
    token = token_path.read_text().strip()
    # NOTE(review): token comes from a trusted local file but is interpolated
    # into a shell string — visible in the process list while curl runs.
    code, out, err = run(
        f'curl -s -o /dev/null -w "%{{http_code}}" -H "Authorization: token {token}" '
        f'"{FORGE_URL}/api/v1/user"',
        timeout=10
    )
    if code == 0 and out == "200":
        return True, "reachable"
    return False, f"HTTP {out}"

def check_vps(ip, name):
    """Can we SSH into a VPS? Returns ``(ok, detail_message)``.

    ``name`` is currently unused here; kept for call-site symmetry with the
    other per-host checks.
    """
    code, out, err = run(f"ssh -o ConnectTimeout=5 root@{ip} 'echo alive'", timeout=10)
    if code == 0 and "alive" in out:
        return True, "alive"
    return False, "unreachable"

# ─── FALLBACK ACTIONS ───

def fallback_to_local_model(cfg):
    """Switch primary model from Anthropic to local llama.cpp.

    Backs up the current config once (the first fallback wins) so recovery
    can restore it verbatim later.
    """
    if not BACKUP_CONFIG.exists():
        shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)

    cfg["model"]["provider"] = "local-llama.cpp"
    cfg["model"]["default"] = "hermes3"
    save_config(cfg)
    return "Switched primary model to local-llama.cpp/hermes3"

def fallback_to_ollama(cfg):
    """Switch to Ollama if llama.cpp is also down.

    Same backup-once behaviour as fallback_to_local_model.
    """
    if not BACKUP_CONFIG.exists():
        shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)

    cfg["model"]["provider"] = "ollama"
    cfg["model"]["default"] = "gemma4:latest"
    save_config(cfg)
    return "Switched primary model to ollama/gemma4:latest"

def enter_safe_mode(state):
    """Pause all non-essential cron jobs, alert Alexander.

    Persists the safe-mode flag immediately so a crash after this point
    still leaves the state on disk.
    """
    state["safe_mode"] = True
    state["safe_mode_entered"] = datetime.now().isoformat()
    save_state(state)
    return "SAFE MODE: All inference down. Cron jobs should be paused. Alert Alexander."

def restore_config():
    """Restore pre-fallback config when primary recovers.

    Removes the backup after restoring so the next fallback re-snapshots.
    """
    if BACKUP_CONFIG.exists():
        shutil.copy2(BACKUP_CONFIG, CONFIG_PATH)
        BACKUP_CONFIG.unlink()
        return "Restored original config from backup"
    return "No backup config to restore"

# ─── MAIN DIAGNOSIS AND FALLBACK ENGINE ───

# Overall-status severity, least to most severe. Replaces the inline
# max(..., key=lambda ...) one-liner that duplicated this list twice.
STATUS_PRIORITY = [
    "healthy",
    "recovered",
    "degraded_gitea",
    "degraded_local",
    "degraded_ollama",
    "safe_mode",
]

def escalate_status(current, candidate):
    """Return the more severe of two status strings.

    Unknown statuses rank lowest (matching the original max() fallback of
    index 0); ties keep ``current``.
    """
    def rank(s):
        return STATUS_PRIORITY.index(s) if s in STATUS_PRIORITY else 0
    return candidate if rank(candidate) > rank(current) else current

def diagnose_and_fallback():
    """Run every health check, apply the fallback chain, persist state.

    Returns a results dict with: ``timestamp``, per-check ``{ok, msg}``
    entries, ``actions`` taken this run, and an overall ``status`` string
    (one of STATUS_PRIORITY).
    """
    state = load_state()
    cfg = load_config()

    results = {
        "timestamp": datetime.now().isoformat(),
        "checks": {},
        "actions": [],
        "status": "healthy"
    }

    # Check all systems
    anthropic_ok, anthropic_msg = check_anthropic()
    results["checks"]["anthropic"] = {"ok": anthropic_ok, "msg": anthropic_msg}

    llama_ok, llama_msg = check_local_llama()
    results["checks"]["local_llama"] = {"ok": llama_ok, "msg": llama_msg}

    ollama_ok, ollama_msg = check_ollama()
    results["checks"]["ollama"] = {"ok": ollama_ok, "msg": ollama_msg}

    gitea_ok, gitea_msg = check_gitea()
    results["checks"]["gitea"] = {"ok": gitea_ok, "msg": gitea_msg}

    # VPS checks
    vpses = [
        ("167.99.126.228", "Allegro"),
        ("143.198.27.163", "Ezra"),
        ("159.203.146.185", "Bezalel"),
    ]
    for ip, name in vpses:
        vps_ok, vps_msg = check_vps(ip, name)
        results["checks"][f"vps_{name.lower()}"] = {"ok": vps_ok, "msg": vps_msg}

    current_provider = cfg.get("model", {}).get("provider", "anthropic")

    # ─── FALLBACK LOGIC ───

    # Case 1: Primary (Anthropic) down, local available
    if not anthropic_ok and current_provider == "anthropic":
        if llama_ok:
            msg = fallback_to_local_model(cfg)
            results["actions"].append(msg)
            state["active_fallbacks"].append("anthropic->local-llama")
            results["status"] = "degraded_local"
        elif ollama_ok:
            msg = fallback_to_ollama(cfg)
            results["actions"].append(msg)
            state["active_fallbacks"].append("anthropic->ollama")
            results["status"] = "degraded_ollama"
        else:
            msg = enter_safe_mode(state)
            results["actions"].append(msg)
            results["status"] = "safe_mode"

    # Case 2: Already on fallback, check if primary recovered
    elif anthropic_ok and "anthropic->local-llama" in state.get("active_fallbacks", []):
        msg = restore_config()
        results["actions"].append(msg)
        state["active_fallbacks"].remove("anthropic->local-llama")
        results["status"] = "recovered"
    elif anthropic_ok and "anthropic->ollama" in state.get("active_fallbacks", []):
        msg = restore_config()
        results["actions"].append(msg)
        state["active_fallbacks"].remove("anthropic->ollama")
        results["status"] = "recovered"

    # Case 3: Gitea down — just flag it, work locally. Status escalates on
    # every run while Gitea is down, never downgrading a worse status.
    if not gitea_ok:
        results["actions"].append("WARN: Gitea unreachable — work cached locally until recovery")
        if "gitea_down" not in state.get("active_fallbacks", []):
            state["active_fallbacks"].append("gitea_down")
        results["status"] = escalate_status(results["status"], "degraded_gitea")
    elif "gitea_down" in state.get("active_fallbacks", []):
        state["active_fallbacks"].remove("gitea_down")
        results["actions"].append("Gitea recovered — resume normal operations")

    # Case 4: VPS agents down — alert only; recovery is manual (lazarus).
    for ip, name in vpses:
        key = f"vps_{name.lower()}"
        if not results["checks"][key]["ok"]:
            results["actions"].append(f"ALERT: {name} VPS ({ip}) unreachable — lazarus protocol needed")

    save_state(state)
    return results

if __name__ == "__main__":
    results = diagnose_and_fallback()
    print(json.dumps(results, indent=2))

    # Exit codes for cron integration: 2 = safe mode, 1 = degraded, 0 = ok
    if results["status"] == "safe_mode":
        sys.exit(2)
    elif results["status"].startswith("degraded"):
        sys.exit(1)
    else:
        sys.exit(0)