#!/usr/bin/env python3
"""
Dead Man Switch Fallback Engine

When the dead man switch triggers (zero commits for 2+ hours, model down,
Gitea unreachable, etc.), this script diagnoses the failure and applies
common sense fallbacks automatically.

Fallback chain:
1. Primary model (Kimi) down -> switch config to local-llama.cpp
2. Gitea unreachable         -> cache issues locally, retry on recovery
3. VPS agents down           -> alert + lazarus protocol
4. Local llama.cpp down      -> try Ollama, then alert-only mode
5. All inference dead        -> safe mode (cron pauses, alert Alexander)

Each fallback is reversible. Recovery auto-restores the previous config.
"""

import os
import sys
import json
import subprocess
import time
import yaml
import shutil
from pathlib import Path
from datetime import datetime, timedelta

HERMES_HOME = Path(os.environ.get("HERMES_HOME", os.path.expanduser("~/.hermes")))
CONFIG_PATH = HERMES_HOME / "config.yaml"
FALLBACK_STATE = HERMES_HOME / "deadman-fallback-state.json"
BACKUP_CONFIG = HERMES_HOME / "config.yaml.pre-fallback"
FORGE_URL = "https://forge.alexanderwhitestone.com"

# Overall status values, ordered least- to most-severe. Used to make sure a
# later check can only escalate the reported status, never downgrade it.
_STATUS_ORDER = [
    "healthy",
    "recovered",
    "degraded_gitea",
    "degraded_local",
    "degraded_ollama",
    "safe_mode",
]


def _escalate(current, candidate):
    """Return the more severe of two status strings.

    Unknown strings rank as least-severe (index 0), matching the original
    ``max(..., key=...)`` behavior.
    """
    def rank(status):
        return _STATUS_ORDER.index(status) if status in _STATUS_ORDER else 0
    return current if rank(current) >= rank(candidate) else candidate


def load_config():
    """Parse and return the Hermes YAML config as a dict."""
    with open(CONFIG_PATH) as f:
        return yaml.safe_load(f)


def save_config(cfg):
    """Write *cfg* back to the Hermes YAML config (block style)."""
    with open(CONFIG_PATH, "w") as f:
        yaml.dump(cfg, f, default_flow_style=False)


def load_state():
    """Load persisted fallback state, or a fresh default if none exists."""
    if FALLBACK_STATE.exists():
        with open(FALLBACK_STATE) as f:
            return json.load(f)
    return {"active_fallbacks": [], "last_check": None, "recovery_pending": False}


def save_state(state):
    """Persist *state* to disk, stamping it with the current time."""
    state["last_check"] = datetime.now().isoformat()
    with open(FALLBACK_STATE, "w") as f:
        json.dump(state, f, indent=2)


def run(cmd, timeout=10):
    """Run a shell command; return (returncode, stdout, stderr).

    Returns (-1, "", reason) on timeout or any launch failure so callers
    never have to handle exceptions themselves.
    """
    try:
        r = subprocess.run(cmd, shell=True, capture_output=True, text=True,
                           timeout=timeout)
        return r.returncode, r.stdout.strip(), r.stderr.strip()
    except subprocess.TimeoutExpired:
        return -1, "", "timeout"
    except Exception as e:
        return -1, "", str(e)


# ─── HEALTH CHECKS ───

def check_kimi():
    """Can we reach Kimi Coding API?  Returns (ok, message)."""
    key = os.environ.get("KIMI_API_KEY", "")
    if not key:
        # Check multiple .env locations for the key. (read_text instead of a
        # bare open() so the file handle is not leaked.)
        for env_path in [HERMES_HOME / ".env", Path.home() / ".hermes" / ".env"]:
            if env_path.exists():
                for line in env_path.read_text().splitlines():
                    line = line.strip()
                    if line.startswith("KIMI_API_KEY="):
                        key = line.split("=", 1)[1].strip().strip('"').strip("'")
                        break
            if key:
                break
    if not key:
        return False, "no API key"
    # NOTE(review): the key is interpolated into a shell string; a key with
    # shell metacharacters would break/injection here. Consider list-form
    # subprocess args. Also, this POSTs a chat payload to a /models path —
    # presumably intentional for this API, but verify the endpoint.
    code, out, err = run(
        f'curl -s -o /dev/null -w "%{{http_code}}" -H "x-api-key: {key}" '
        f'-H "x-api-provider: kimi-coding" '
        f'https://api.kimi.com/coding/v1/models -X POST '
        f'-H "content-type: application/json" '
        f'-d \'{{"model":"kimi-k2.5","max_tokens":1,"messages":[{{"role":"user","content":"ping"}}]}}\' ',
        timeout=15
    )
    # 429 (rate limited) still proves the API is alive and the key is valid.
    if code == 0 and out in ("200", "429"):
        return True, f"HTTP {out}"
    return False, f"HTTP {out} err={err[:80]}"


def check_local_llama():
    """Is local llama.cpp serving?  Returns (ok, message)."""
    code, out, err = run("curl -s http://localhost:8081/v1/models", timeout=5)
    if code == 0 and "hermes" in out.lower():
        return True, "serving"
    return False, f"exit={code}"


def check_ollama():
    """Is Ollama running?  Returns (ok, message)."""
    code, out, err = run("curl -s http://localhost:11434/api/tags", timeout=5)
    if code == 0 and "models" in out:
        return True, "running"
    return False, f"exit={code}"


def check_gitea():
    """Can we reach the Forge?  Returns (ok, message)."""
    token_path = Path.home() / ".config" / "gitea" / "timmy-token"
    if not token_path.exists():
        return False, "no token"
    token = token_path.read_text().strip()
    # NOTE(review): token is interpolated into a shell string (see check_kimi).
    code, out, err = run(
        f'curl -s -o /dev/null -w "%{{http_code}}" -H "Authorization: token {token}" '
        f'"{FORGE_URL}/api/v1/user"',
        timeout=10
    )
    if code == 0 and out == "200":
        return True, "reachable"
    return False, f"HTTP {out}"


def check_vps(ip, name):
    """Can we SSH into a VPS?  Returns (ok, message).

    *name* is unused here but kept for signature parity with callers.
    """
    code, out, err = run(f"ssh -o ConnectTimeout=5 root@{ip} 'echo alive'",
                         timeout=10)
    if code == 0 and "alive" in out:
        return True, "alive"
    return False, "unreachable"  # fixed: was a pointless f-string


# ─── FALLBACK ACTIONS ───

def fallback_to_local_model(cfg):
    """Switch primary model from Kimi to local llama.cpp."""
    # Back up the pristine config exactly once, so repeated fallbacks never
    # overwrite the original with an already-degraded config.
    if not BACKUP_CONFIG.exists():
        shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)
    cfg["model"]["provider"] = "local-llama.cpp"
    cfg["model"]["default"] = "hermes3"
    save_config(cfg)
    return "Switched primary model to local-llama.cpp/hermes3"


def fallback_to_ollama(cfg):
    """Switch to Ollama if llama.cpp is also down."""
    if not BACKUP_CONFIG.exists():
        shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)
    cfg["model"]["provider"] = "ollama"
    cfg["model"]["default"] = "gemma4:latest"
    save_config(cfg)
    return "Switched primary model to ollama/gemma4:latest"


def enter_safe_mode(state):
    """Pause all non-essential cron jobs, alert Alexander."""
    state["safe_mode"] = True
    state["safe_mode_entered"] = datetime.now().isoformat()
    save_state(state)
    return "SAFE MODE: All inference down. Cron jobs should be paused. Alert Alexander."


def exit_safe_mode(state):
    """Leave safe mode once inference is available again.

    Counterpart to enter_safe_mode(); without it safe mode was a one-way
    trap, contradicting the module's "each fallback is reversible" contract.
    """
    state["safe_mode"] = False
    state.pop("safe_mode_entered", None)
    save_state(state)
    return "Exited safe mode: inference backend available again"


def restore_config():
    """Restore pre-fallback config when primary recovers."""
    if BACKUP_CONFIG.exists():
        shutil.copy2(BACKUP_CONFIG, CONFIG_PATH)
        BACKUP_CONFIG.unlink()
        return "Restored original config from backup"
    return "No backup config to restore"


# ─── MAIN DIAGNOSIS AND FALLBACK ENGINE ───

def diagnose_and_fallback():
    """Run all health checks, apply/undo fallbacks, return a result report.

    Returns a dict with keys: timestamp, checks (per-system ok/msg),
    actions (human-readable log of what was done), and status (one of
    _STATUS_ORDER).
    """
    state = load_state()
    cfg = load_config()
    results = {
        "timestamp": datetime.now().isoformat(),
        "checks": {},
        "actions": [],
        "status": "healthy",
    }

    # Check all systems
    kimi_ok, kimi_msg = check_kimi()
    results["checks"]["kimi-coding"] = {"ok": kimi_ok, "msg": kimi_msg}

    llama_ok, llama_msg = check_local_llama()
    results["checks"]["local_llama"] = {"ok": llama_ok, "msg": llama_msg}

    ollama_ok, ollama_msg = check_ollama()
    results["checks"]["ollama"] = {"ok": ollama_ok, "msg": ollama_msg}

    gitea_ok, gitea_msg = check_gitea()
    results["checks"]["gitea"] = {"ok": gitea_ok, "msg": gitea_msg}

    # VPS checks
    vpses = [
        ("167.99.126.228", "Allegro"),
        ("143.198.27.163", "Ezra"),
        ("159.203.146.185", "Bezalel"),
    ]
    for ip, name in vpses:
        vps_ok, vps_msg = check_vps(ip, name)
        results["checks"][f"vps_{name.lower()}"] = {"ok": vps_ok, "msg": vps_msg}

    current_provider = cfg.get("model", {}).get("provider", "kimi-coding")

    # ─── FALLBACK LOGIC ───

    # Case 1: Primary (Kimi) down, local available. Guarded on
    # current_provider so re-runs don't append duplicate fallback entries.
    if not kimi_ok and current_provider == "kimi-coding":
        if llama_ok:
            msg = fallback_to_local_model(cfg)
            results["actions"].append(msg)
            state["active_fallbacks"].append("kimi->local-llama")
            results["status"] = "degraded_local"
        elif ollama_ok:
            msg = fallback_to_ollama(cfg)
            results["actions"].append(msg)
            state["active_fallbacks"].append("kimi->ollama")
            results["status"] = "degraded_ollama"
        else:
            msg = enter_safe_mode(state)
            results["actions"].append(msg)
            results["status"] = "safe_mode"

    # Case 2: Already on fallback, check if primary recovered
    elif kimi_ok and "kimi->local-llama" in state.get("active_fallbacks", []):
        msg = restore_config()
        results["actions"].append(msg)
        state["active_fallbacks"].remove("kimi->local-llama")
        results["status"] = "recovered"
    elif kimi_ok and "kimi->ollama" in state.get("active_fallbacks", []):
        msg = restore_config()
        results["actions"].append(msg)
        state["active_fallbacks"].remove("kimi->ollama")
        results["status"] = "recovered"

    # Safe-mode recovery: previously there was no way out of safe mode once
    # entered. If any inference backend is back, clear the flag.
    if state.get("safe_mode") and (kimi_ok or llama_ok or ollama_ok) \
            and results["status"] != "safe_mode":
        results["actions"].append(exit_safe_mode(state))
        if results["status"] == "healthy":
            results["status"] = "recovered"

    # Case 3: Gitea down — just flag it, work locally
    if not gitea_ok:
        results["actions"].append(
            "WARN: Gitea unreachable — work cached locally until recovery")
        if "gitea_down" not in state.get("active_fallbacks", []):
            state["active_fallbacks"].append("gitea_down")
        # Never downgrade a worse status already set above.
        results["status"] = _escalate(results["status"], "degraded_gitea")
    elif "gitea_down" in state.get("active_fallbacks", []):
        state["active_fallbacks"].remove("gitea_down")
        results["actions"].append("Gitea recovered — resume normal operations")

    # Case 4: VPS agents down — alert only; lazarus protocol is external.
    for ip, name in vpses:
        key = f"vps_{name.lower()}"
        if not results["checks"][key]["ok"]:
            results["actions"].append(
                f"ALERT: {name} VPS ({ip}) unreachable — lazarus protocol needed")

    save_state(state)
    return results


if __name__ == "__main__":
    results = diagnose_and_fallback()
    print(json.dumps(results, indent=2))

    # Exit codes for cron integration: 2 = safe mode, 1 = degraded, 0 = ok.
    if results["status"] == "safe_mode":
        sys.exit(2)
    elif results["status"].startswith("degraded"):
        sys.exit(1)
    else:
        sys.exit(0)