#!/usr/bin/env python3
"""
Dead Man Switch Fallback Engine

When the dead man switch triggers (zero commits for 2+ hours, model down,
Gitea unreachable, etc.), this script diagnoses the failure and applies
common sense fallbacks automatically.

Fallback chain (as implemented in this file):
1. Primary model (Kimi) down -> switch config to OpenRouter (Gemini 2.5 Pro)
2. OpenRouter unavailable as well -> switch config to Ollama (gemma4:latest)
3. All inference dead -> safe mode (flag stored in the state file; cron jobs
   should pause, alert Alexander)
4. Gitea unreachable -> flag it and keep working locally until recovery
5. VPS agents down -> alert + lazarus protocol

The model fallbacks are reversible: the original config is copied aside
before the first switch and restored once Kimi is reachable again.

Diagnostics go to stderr; stdout carries only the JSON report, and the exit
code is 0 (healthy/recovered), 1 (degraded) or 2 (safe mode).
"""

import json
import os
import shutil
import sqlite3
import subprocess
import sys
import time
import urllib.error
import urllib.request
from datetime import datetime, timedelta
from pathlib import Path

import yaml

HERMES_HOME = Path(os.environ.get("HERMES_HOME", os.path.expanduser("~/.hermes")))
CONFIG_PATH = HERMES_HOME / "config.yaml"
FALLBACK_STATE = HERMES_HOME / "deadman-fallback-state.json"
BACKUP_CONFIG = HERMES_HOME / "config.yaml.pre-fallback"
FORGE_URL = "https://forge.alexanderwhitestone.com"

# Golden-state fallback chain: Kimi → OpenRouter (Gemini 2.5 Pro) → Ollama (gemma4:latest)
PROVIDER_TIMEOUT = int(os.getenv("PROVIDER_TIMEOUT", "30"))

# .env files searched (in order) when an API key is not in the environment.
_ENV_FILES = (HERMES_HOME / ".env", Path.home() / ".hermes" / ".env")

# Statuses ordered from least to most severe; unknown statuses rank lowest.
_STATUS_SEVERITY = (
    "healthy",
    "recovered",
    "degraded_gitea",
    "degraded_openrouter",
    "degraded_ollama",
    "safe_mode",
)

# (state tag, provider, model, status while active, label used in log lines),
# listed in the order the fallbacks are attempted.
_KIMI_FALLBACKS = (
    ("kimi->openrouter", "openrouter", "google/gemini-2.5-pro", "degraded_openrouter", "OpenRouter"),
    ("kimi->ollama", "ollama", "gemma4:latest", "degraded_ollama", "Ollama"),
)

# (ip, name) of the VPS agents probed over SSH.
_VPS_HOSTS = (
    ("167.99.126.228", "Allegro"),
    ("143.198.27.163", "Ezra"),
    ("159.203.146.185", "Bezalel"),
)

# Minimal request body sent to Kimi as a liveness ping.
_KIMI_PING_BODY = (
    '{"model":"kimi-k2.5","max_tokens":1,'
    '"messages":[{"role":"user","content":"ping"}]}'
)


def log(message):
    """Write a timestamped diagnostic line to stderr.

    stderr is used because stdout is reserved for the JSON report printed by
    ``main``.
    """
    print(f"[{datetime.now().isoformat()}] {message}", file=sys.stderr)


def load_config():
    """Return the parsed config.yaml as a dict ({} when the file is empty)."""
    with open(CONFIG_PATH, encoding="utf-8") as f:
        # safe_load returns None for an empty document; callers expect a dict.
        return yaml.safe_load(f) or {}


def save_config(cfg):
    """Serialize ``cfg`` to config.yaml in block style."""
    with open(CONFIG_PATH, "w", encoding="utf-8") as f:
        yaml.dump(cfg, f, default_flow_style=False)


def _default_state():
    """Return a fresh state dict, used when no usable state file exists."""
    return {"active_fallbacks": [], "last_check": None, "recovery_pending": False}


def load_state():
    """Load the persisted fallback state.

    A missing, unreadable or malformed state file yields the default state
    instead of raising, so a corrupt file cannot stop the engine from running.
    """
    if not FALLBACK_STATE.exists():
        return _default_state()
    try:
        with open(FALLBACK_STATE, encoding="utf-8") as f:
            state = json.load(f)
    except (OSError, ValueError) as e:  # JSONDecodeError is a ValueError
        log(f"State file unreadable, starting from defaults: {e}")
        return _default_state()
    if not isinstance(state, dict):
        log("State file does not contain an object, starting from defaults")
        return _default_state()
    state.setdefault("active_fallbacks", [])
    return state


def save_state(state):
    """Stamp ``state`` with the current time and persist it as JSON."""
    state["last_check"] = datetime.now().isoformat()
    with open(FALLBACK_STATE, "w", encoding="utf-8") as f:
        json.dump(state, f, indent=2)


def run(cmd, timeout=PROVIDER_TIMEOUT):
    """Run a command and return ``(returncode, stdout, stderr)``.

    ``cmd`` may be a shell string (executed through the shell, as before) or
    an argv list (executed directly, so arguments need no quoting). Timeouts
    and launch failures are reported as return code -1 rather than raised.
    """
    try:
        r = subprocess.run(
            cmd,
            shell=isinstance(cmd, str),
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        return r.returncode, r.stdout.strip(), r.stderr.strip()
    except subprocess.TimeoutExpired:
        return -1, "", "timeout"
    except Exception as e:
        return -1, "", str(e)


def _worse_status(current, candidate):
    """Return whichever status is more severe; ``current`` wins ties."""
    def rank(status):
        return _STATUS_SEVERITY.index(status) if status in _STATUS_SEVERITY else 0

    return max(current, candidate, key=rank)


def _read_env_key(name, env_files=_ENV_FILES):
    """Return the value of ``name`` from the environment or a .env file.

    The process environment wins. Otherwise each file is scanned for the
    first ``NAME=value`` line; surrounding quotes are stripped. Returns ""
    when no non-empty value is found.
    """
    value = os.environ.get(name, "")
    if value:
        return value
    prefix = f"{name}="
    for env_path in env_files:
        try:
            with open(env_path, encoding="utf-8") as f:
                for raw in f:
                    line = raw.strip()
                    if line.startswith(prefix):
                        value = line.split("=", 1)[1].strip().strip("\"'")
                        break
        except (OSError, UnicodeDecodeError):
            continue  # missing or unreadable file: try the next location
        if value:
            return value
    return ""


def _model_section(cfg):
    """Return cfg["model"] as a dict, creating it when absent or empty."""
    model = cfg.get("model")
    if not isinstance(model, dict):
        model = {}
        cfg["model"] = model
    return model


def _backup_config_once():
    """Copy config.yaml aside unless a pre-fallback backup already exists.

    Keeping the first backup means chained fallbacks never overwrite the
    original (pre-fallback) config.
    """
    if not BACKUP_CONFIG.exists():
        shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)


# ─── HEALTH CHECKS ───

def log_fallback_event(agent_name, provider, model, status, error_message=None):
    """Log fallback events to request_log SQLite DB (telemetry).

    Best effort: does nothing when the database file is absent, and a failed
    write is reported on stderr instead of raised.
    """
    try:
        log_path = Path.home() / ".local" / "timmy" / "request_log.db"
        if not log_path.exists():
            return  # telemetry not set up on this machine
        conn = sqlite3.connect(str(log_path))
        try:
            conn.execute("""
                INSERT INTO request_log (timestamp, agent_name, provider, model, endpoint, status, error_message)
                VALUES (datetime('now'), ?, ?, ?, ?, ?, ?)
            """, (agent_name, provider, model, 'fallback_switch', status, error_message))
            conn.commit()
        finally:
            conn.close()  # release the handle even when the insert fails
    except Exception as e:
        log(f"Telemetry write skipped: {e}")


def check_kimi():
    """Can we reach Kimi Coding API?

    Returns ``(ok, message)``. HTTP 200 and 429 both count as reachable.
    """
    key = _read_env_key("KIMI_API_KEY")
    if not key:
        return False, "no API key"
    # NOTE(review): the key is still visible in curl's argv to local users.
    code, out, err = run(
        [
            "curl", "-s", "-o", "/dev/null", "-w", "%{http_code}",
            "-H", f"x-api-key: {key}",
            "-H", "x-api-provider: kimi-coding",
            "https://api.kimi.com/coding/v1/models",
            "-X", "POST",
            "-H", "content-type: application/json",
            "-d", _KIMI_PING_BODY,
        ],
        timeout=15,
    )
    if code == 0 and out in ("200", "429"):
        return True, f"HTTP {out}"
    return False, f"HTTP {out} err={err[:80]}"


def check_openrouter():
    """Check OpenRouter API availability and credentials.

    Returns ``(ok, message)``; on success the message reports how many models
    the API listed.
    """
    key = _read_env_key("OPENROUTER_API_KEY")
    if not key:
        return False, "No OPENROUTER_API_KEY"
    try:
        req = urllib.request.Request(
            "https://openrouter.ai/api/v1/models",
            headers={"Authorization": "Bearer " + key}
        )
        with urllib.request.urlopen(req, timeout=PROVIDER_TIMEOUT) as resp:
            if resp.status != 200:
                return False, f"HTTP {resp.status}"
            data = json.loads(resp.read())
        models = data.get("data", [])
        return True, f"{len(models)} models available"
    except urllib.error.HTTPError as e:
        if e.code == 401:
            return False, "Invalid OPENROUTER_API_KEY"
        return False, f"HTTP {e.code}"
    except Exception as e:
        return False, str(e)[:100]


def check_ollama():
    """Is Ollama running?"""
    code, out, err = run(["curl", "-s", "http://localhost:11434/api/tags"], timeout=5)
    if code == 0 and "models" in out:
        return True, "running"
    return False, f"exit={code}"


def check_gitea():
    """Can we reach the Forge?"""
    token_path = Path.home() / ".config" / "gitea" / "timmy-token"
    if not token_path.exists():
        return False, "no token"
    token = token_path.read_text().strip()
    code, out, err = run(
        [
            "curl", "-s", "-o", "/dev/null", "-w", "%{http_code}",
            "-H", f"Authorization: token {token}",
            f"{FORGE_URL}/api/v1/user",
        ],
        timeout=10,
    )
    if code == 0 and out == "200":
        return True, "reachable"
    return False, f"HTTP {out}"


def check_vps(ip, name):
    """Can we SSH into a VPS?

    ``name`` is accepted for the caller's convenience and is not used here.
    """
    code, out, err = run(
        ["ssh", "-o", "ConnectTimeout=5", f"root@{ip}", "echo alive"],
        timeout=10,
    )
    if code == 0 and "alive" in out:
        return True, "alive"
    return False, "unreachable"


# ─── FALLBACK ACTIONS ───

def fallback_to_openrouter(cfg):
    """Switch primary model from Kimi to OpenRouter (Gemini 2.5 Pro).

    Backs up the current config first (once), then rewrites the model section
    of ``cfg`` in place and saves it. Returns a human-readable summary.
    """
    _backup_config_once()
    openrouter_cfg = (cfg.get("providers") or {}).get("openrouter") or {}
    base_url = openrouter_cfg.get("base_url", "https://openrouter.ai/api/v1")
    model = _model_section(cfg)
    model["provider"] = "openrouter"
    model["default"] = "google/gemini-2.5-pro"
    model["base_url"] = base_url
    save_config(cfg)
    return "Switched primary model to openrouter/google/gemini-2.5-pro"


def fallback_to_ollama(cfg):
    """Switch primary model to local Ollama (gemma4:latest).

    Backs up the current config first (once), then rewrites the model section
    of ``cfg`` in place and saves it. Returns a human-readable summary.
    """
    _backup_config_once()
    model = _model_section(cfg)
    # NOTE(review): model.base_url is left untouched here, unlike the
    # OpenRouter switch — confirm the Ollama provider does not read it.
    model["provider"] = "ollama"
    model["default"] = "gemma4:latest"
    save_config(cfg)
    return "Switched primary model to ollama/gemma4:latest"


def enter_safe_mode(state):
    """Pause all non-essential cron jobs, alert Alexander.

    This function only records the safe-mode flag and timestamp in the state
    file; the returned message tells the operator what should happen next.
    """
    state["safe_mode"] = True
    state["safe_mode_entered"] = datetime.now().isoformat()
    save_state(state)
    return "SAFE MODE: All inference down. Cron jobs should be paused. Alert Alexander."


def restore_config():
    """Restore pre-fallback config when primary recovers.

    Copies the backup over config.yaml and deletes the backup. Returns a
    message saying whether a backup was found.
    """
    if BACKUP_CONFIG.exists():
        shutil.copy2(BACKUP_CONFIG, CONFIG_PATH)
        BACKUP_CONFIG.unlink()
        return "Restored original config from backup"
    return "No backup config to restore"


# ─── MAIN DIAGNOSIS AND FALLBACK ENGINE ───

def _agent_name(cfg):
    """Return the configured agent name, defaulting to "timmy"."""
    return (cfg.get("agent") or {}).get("name", "timmy")


def _apply_model_fallback(cfg, state, results, available):
    """Walk the fallback chain; enter safe mode when nothing can be applied.

    ``available`` maps provider name -> health-check result. The first
    healthy provider whose config switch succeeds wins.
    """
    agent_name = _agent_name(cfg)
    switchers = {"openrouter": fallback_to_openrouter, "ollama": fallback_to_ollama}
    for tag, provider, model, degraded_status, label in _KIMI_FALLBACKS:
        if not available.get(provider):
            continue
        try:
            msg = switchers[provider](cfg)
        except Exception as e:
            log(f"{label} fallback failed: {e}")
            log_fallback_event(agent_name, provider, model, "error", str(e))
            continue  # try the next provider in the chain
        results["actions"].append(msg)
        if tag not in state["active_fallbacks"]:
            state["active_fallbacks"].append(tag)
        results["status"] = degraded_status
        log_fallback_event(agent_name, provider, model, "success")
        return
    try:
        msg = enter_safe_mode(state)
        results["actions"].append(msg)
        results["status"] = "safe_mode"
    except Exception as e:
        log(f"Safe mode failed: {e}")


def _restore_primary(cfg, state, results):
    """Restore the pre-fallback config once Kimi is reachable again.

    Does nothing (and warns about nothing) when no Kimi fallback is active,
    which is the normal healthy case.
    """
    pending = [entry for entry in _KIMI_FALLBACKS if entry[0] in state["active_fallbacks"]]
    if not pending:
        return
    agent_name = _agent_name(cfg)
    for _tag, provider, model, _status, label in pending:
        try:
            msg = restore_config()
        except Exception as e:
            log(f"Restore from {label} failed: {e}")
            log_fallback_event(agent_name, provider, model, "error", str(e))
            continue
        results["actions"].append(msg)
        # One restore undoes every model switch, so clear all Kimi tags.
        cleared = {entry[0] for entry in pending}
        state["active_fallbacks"] = [
            t for t in state["active_fallbacks"] if t not in cleared
        ]
        results["status"] = "recovered"
        log_fallback_event(agent_name, provider, model, "restored")
        return
    log("WARNING: Primary recovered but unable to restore config")


def diagnose_and_fallback():
    """Run every health check, apply or undo fallbacks, and persist state.

    Returns a dict with ``timestamp``, per-system ``checks``, the list of
    ``actions`` taken and an overall ``status`` string.
    """
    state = load_state()
    state.setdefault("active_fallbacks", [])
    cfg = load_config()

    results = {
        "timestamp": datetime.now().isoformat(),
        "checks": {},
        "actions": [],
        "status": "healthy"
    }

    # Check all systems
    kimi_ok, kimi_msg = check_kimi()
    results["checks"]["kimi-coding"] = {"ok": kimi_ok, "msg": kimi_msg}

    openrouter_ok, openrouter_msg = check_openrouter()
    results["checks"]["openrouter"] = {"ok": openrouter_ok, "msg": openrouter_msg}

    ollama_ok, ollama_msg = check_ollama()
    results["checks"]["ollama"] = {"ok": ollama_ok, "msg": ollama_msg}

    gitea_ok, gitea_msg = check_gitea()
    results["checks"]["gitea"] = {"ok": gitea_ok, "msg": gitea_msg}

    # VPS checks
    for ip, name in _VPS_HOSTS:
        vps_ok, vps_msg = check_vps(ip, name)
        results["checks"][f"vps_{name.lower()}"] = {"ok": vps_ok, "msg": vps_msg}

    current_provider = (cfg.get("model") or {}).get("provider", "kimi-coding")

    # ─── FALLBACK LOGIC ───

    # Case 1: Primary (Kimi) down, try fallback chain (OpenRouter -> Ollama)
    if not kimi_ok and current_provider == "kimi-coding":
        _apply_model_fallback(
            cfg, state, results,
            {"openrouter": openrouter_ok, "ollama": ollama_ok},
        )
    # Case 2: Already on fallback, check if primary recovered — restore with resilience
    elif kimi_ok:
        _restore_primary(cfg, state, results)

    # While a model fallback is still in force the system is degraded, not
    # healthy — keep reporting it on every run, not only on the first one.
    # NOTE(review): no escalation happens if the fallback provider itself
    # goes down later; only the checks above will show it.
    for tag, _provider, _model, degraded_status, _label in _KIMI_FALLBACKS:
        if tag in state["active_fallbacks"]:
            results["status"] = _worse_status(results["status"], degraded_status)

    # Leave safe mode as soon as some inference backend is usable again.
    inference_ok = kimi_ok or openrouter_ok or ollama_ok
    if state.get("safe_mode") and inference_ok and results["status"] != "safe_mode":
        state["safe_mode"] = False
        results["actions"].append("Safe mode cleared — inference available again")

    # Case 3: Gitea down — just flag it, work locally
    if not gitea_ok:
        results["actions"].append("WARN: Gitea unreachable — work cached locally until recovery")
        if "gitea_down" not in state["active_fallbacks"]:
            state["active_fallbacks"].append("gitea_down")
        results["status"] = _worse_status(results["status"], "degraded_gitea")
    elif "gitea_down" in state["active_fallbacks"]:
        state["active_fallbacks"].remove("gitea_down")
        results["actions"].append("Gitea recovered — resume normal operations")

    # Case 4: VPS agents down
    for ip, name in _VPS_HOSTS:
        if not results["checks"][f"vps_{name.lower()}"]["ok"]:
            results["actions"].append(f"ALERT: {name} VPS ({ip}) unreachable — lazarus protocol needed")

    save_state(state)
    return results


def main():
    """Print the JSON report and exit with a cron-friendly status code."""
    results = diagnose_and_fallback()
    print(json.dumps(results, indent=2))

    # Exit codes for cron integration
    status = results["status"]
    if status == "safe_mode":
        sys.exit(2)
    if status.startswith("degraded"):
        sys.exit(1)
    sys.exit(0)


if __name__ == "__main__":
    main()