diff --git a/bin/deadman-fallback.py b/bin/deadman-fallback.py
index bf4bc939..166981dc 100644
--- a/bin/deadman-fallback.py
+++ b/bin/deadman-fallback.py
@@ -24,12 +24,17 @@ import yaml
 import shutil
 from pathlib import Path
 from datetime import datetime, timedelta
+import sqlite3
+import urllib.request
+import urllib.error
 
 HERMES_HOME = Path(os.environ.get("HERMES_HOME", os.path.expanduser("~/.hermes")))
 CONFIG_PATH = HERMES_HOME / "config.yaml"
 FALLBACK_STATE = HERMES_HOME / "deadman-fallback-state.json"
 BACKUP_CONFIG = HERMES_HOME / "config.yaml.pre-fallback"
 FORGE_URL = "https://forge.alexanderwhitestone.com"
+# Golden-state fallback chain: Kimi → OpenRouter (Gemini 2.5 Pro) → Ollama (gemma4:latest)
+PROVIDER_TIMEOUT = int(os.getenv("PROVIDER_TIMEOUT", "30"))
 
 def load_config():
     with open(CONFIG_PATH) as f:
@@ -50,7 +55,7 @@ def save_state(state):
     with open(FALLBACK_STATE, "w") as f:
         json.dump(state, f, indent=2)
 
-def run(cmd, timeout=10):
+def run(cmd, timeout=PROVIDER_TIMEOUT):
     try:
         r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)
         return r.returncode, r.stdout.strip(), r.stderr.strip()
@@ -61,6 +66,23 @@ def run(cmd, timeout=10):
 
 # ─── HEALTH CHECKS ───
+
+def log_fallback_event(agent_name, provider, model, status, error_message=None):
+    """Log fallback events to request_log SQLite DB (telemetry)."""
+    try:
+        log_path = Path.home() / ".local" / "timmy" / "request_log.db"
+        if log_path.exists():
+            conn = sqlite3.connect(str(log_path))
+            cursor = conn.cursor()
+            cursor.execute("""
+                INSERT INTO request_log (timestamp, agent_name, provider, model, endpoint, status, error_message)
+                VALUES (datetime('now'), ?, ?, ?, ?, ?, ?)
+            """, (agent_name, provider, model, 'fallback_switch', status, error_message))
+            conn.commit()
+            conn.close()
+    except Exception:
+        pass  # Silent if telemetry unavailable
+
 
 def check_kimi():
     """Can we reach Kimi Coding API?"""
     key = os.environ.get("KIMI_API_KEY", "")
@@ -89,12 +111,38 @@ def check_kimi():
         return True, f"HTTP {out}"
     return False, f"HTTP {out} err={err[:80]}"
 
-def check_local_llama():
-    """Is local llama.cpp serving?"""
-    code, out, err = run("curl -s http://localhost:8081/v1/models", timeout=5)
-    if code == 0 and "hermes" in out.lower():
-        return True, "serving"
-    return False, f"exit={code}"
+def check_openrouter():
+    """Check OpenRouter API availability and credentials."""
+    key = os.environ.get("OPENROUTER_API_KEY", "")
+    if not key:
+        env_file = HERMES_HOME / ".env"
+        if env_file.exists():
+            for line in open(env_file):
+                line = line.strip()
+                if line.startswith("OPENROUTER_API_KEY="):
+                    key = line.split("=", 1)[1].strip().strip('"\'')
+                    break
+    if not key:
+        return False, "No OPENROUTER_API_KEY"
+    try:
+        req = urllib.request.Request(
+            "https://openrouter.ai/api/v1/models",
+            headers={"Authorization": "Bearer " + key}
+        )
+        resp = urllib.request.urlopen(req, timeout=PROVIDER_TIMEOUT)
+        if resp.status == 200:
+            data = json.loads(resp.read())
+            models = data.get("data", [])
+            return True, f"{len(models)} models available"
+        else:
+            return False, f"HTTP {resp.status}"
+    except urllib.error.HTTPError as e:
+        if e.code == 401:
+            return False, "Invalid OPENROUTER_API_KEY"
+        else:
+            return False, f"HTTP {e.code}"
+    except Exception as e:
+        return False, str(e)[:100]
 
 def check_ollama():
     """Is Ollama running?"""
@@ -127,15 +175,18 @@ def check_vps(ip, name):
 
 # ─── FALLBACK ACTIONS ───
 
-def fallback_to_local_model(cfg):
-    """Switch primary model from Kimi to local llama.cpp"""
+def fallback_to_openrouter(cfg):
+    """Switch primary model from Kimi to OpenRouter (Gemini 2.5 Pro)."""
     if not BACKUP_CONFIG.exists():
         shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)
 
-    cfg["model"]["provider"] = "local-llama.cpp"
-    cfg["model"]["default"] = "hermes3"
+    openrouter_cfg = cfg.get("providers", {}).get("openrouter", {})
+    base_url = openrouter_cfg.get("base_url", "https://openrouter.ai/api/v1")
+    cfg["model"]["provider"] = "openrouter"
+    cfg["model"]["default"] = "google/gemini-2.5-pro"
+    cfg["model"]["base_url"] = base_url
     save_config(cfg)
-    return "Switched primary model to local-llama.cpp/hermes3"
+    return "Switched primary model to openrouter/google/gemini-2.5-pro"
 
 def fallback_to_ollama(cfg):
     """Switch to Ollama if llama.cpp is also down"""
@@ -179,11 +230,11 @@ def diagnose_and_fallback():
     kimi_ok, kimi_msg = check_kimi()
     results["checks"]["kimi-coding"] = {"ok": kimi_ok, "msg": kimi_msg}
 
-    llama_ok, llama_msg = check_local_llama()
-    results["checks"]["local_llama"] = {"ok": llama_ok, "msg": llama_msg}
+    openrouter_ok, openrouter_msg = check_openrouter()
+    results["checks"]["openrouter"] = {"ok": openrouter_ok, "msg": openrouter_msg}
 
     ollama_ok, ollama_msg = check_ollama()
     results["checks"]["ollama"] = {"ok": ollama_ok, "msg": ollama_msg}
 
     gitea_ok, gitea_msg = check_gitea()
     results["checks"]["gitea"] = {"ok": gitea_ok, "msg": gitea_msg}
@@ -202,41 +253,79 @@ def diagnose_and_fallback():
 
     # ─── FALLBACK LOGIC ───
 
-    # Case 1: Primary (Kimi) down, local available
+    # Case 1: Primary (Kimi) down, try fallback chain (OpenRouter -> Ollama)
     if not kimi_ok and current_provider == "kimi-coding":
-        if llama_ok:
-            msg = fallback_to_local_model(cfg)
-            results["actions"].append(msg)
-            state["active_fallbacks"].append("kimi->local-llama")
-            results["status"] = "degraded_local"
-        elif ollama_ok:
-            msg = fallback_to_ollama(cfg)
-            results["actions"].append(msg)
-            state["active_fallbacks"].append("kimi->ollama")
-            results["status"] = "degraded_ollama"
-        else:
-            msg = enter_safe_mode(state)
-            results["actions"].append(msg)
-            results["status"] = "safe_mode"
+        agent_name = cfg.get("agent", {}).get("name", "timmy")
+        applied = False
+        # Try OpenRouter fallback
+        if openrouter_ok:
+            try:
+                msg = fallback_to_openrouter(cfg)
+                results["actions"].append(msg)
+                state["active_fallbacks"].append("kimi->openrouter")
+                results["status"] = "degraded_openrouter"
+                log_fallback_event(agent_name, "openrouter", "google/gemini-2.5-pro", "success")
+                applied = True
+            except Exception as e:
+                log(f"OpenRouter fallback failed: {e}")
+                log_fallback_event(agent_name, "openrouter", "google/gemini-2.5-pro", "error", str(e))
+        # If still not applied, try Ollama
+        if not applied and ollama_ok:
+            try:
+                msg = fallback_to_ollama(cfg)
+                results["actions"].append(msg)
+                state["active_fallbacks"].append("kimi->ollama")
+                results["status"] = "degraded_ollama"
+                log_fallback_event(agent_name, "ollama", "gemma4:latest", "success")
+                applied = True
+            except Exception as e:
+                log(f"Ollama fallback failed: {e}")
+                log_fallback_event(agent_name, "ollama", "gemma4:latest", "error", str(e))
+        if not applied:
+            try:
+                msg = enter_safe_mode(state)
+                results["actions"].append(msg)
+                results["status"] = "safe_mode"
+            except Exception as e:
+                log(f"Safe mode failed: {e}")
 
-    # Case 2: Already on fallback, check if primary recovered
-    elif kimi_ok and "kimi->local-llama" in state.get("active_fallbacks", []):
-        msg = restore_config()
-        results["actions"].append(msg)
-        state["active_fallbacks"].remove("kimi->local-llama")
-        results["status"] = "recovered"
-    elif kimi_ok and "kimi->ollama" in state.get("active_fallbacks", []):
-        msg = restore_config()
-        results["actions"].append(msg)
-        state["active_fallbacks"].remove("kimi->ollama")
-        results["status"] = "recovered"
+    # Case 2: Already on fallback, check if primary recovered — restore with resilience
+    elif kimi_ok:
+        restored = False
+        agent_name = cfg.get("agent", {}).get("name", "timmy")
+        # Try restore from OpenRouter fallback
+        if "kimi->openrouter" in state.get("active_fallbacks", []):
+            try:
+                msg = restore_config()
+                results["actions"].append(msg)
+                state["active_fallbacks"].remove("kimi->openrouter")
+                results["status"] = "recovered"
+                restored = True
+                log_fallback_event(agent_name, "openrouter", "google/gemini-2.5-pro", "restored")
+            except Exception as e:
+                log(f"Restore from OpenRouter failed: {e}")
+                log_fallback_event(agent_name, "openrouter", "google/gemini-2.5-pro", "error", str(e))
+        # Try restore from Ollama fallback if still not restored
+        if not restored and "kimi->ollama" in state.get("active_fallbacks", []):
+            try:
+                msg = restore_config()
+                results["actions"].append(msg)
+                state["active_fallbacks"].remove("kimi->ollama")
+                results["status"] = "recovered"
+                restored = True
+                log_fallback_event(agent_name, "ollama", "gemma4:latest", "restored")
+            except Exception as e:
+                log(f"Restore from Ollama failed: {e}")
+                log_fallback_event(agent_name, "ollama", "gemma4:latest", "error", str(e))
+        if not restored:
+            log("WARNING: Primary recovered but unable to restore config")
 
     # Case 3: Gitea down — just flag it, work locally
     if not gitea_ok:
         results["actions"].append("WARN: Gitea unreachable — work cached locally until recovery")
         if "gitea_down" not in state.get("active_fallbacks", []):
             state["active_fallbacks"].append("gitea_down")
-        results["status"] = max(results["status"], "degraded_gitea", key=lambda x: ["healthy", "recovered", "degraded_gitea", "degraded_local", "degraded_ollama", "safe_mode"].index(x) if x in ["healthy", "recovered", "degraded_gitea", "degraded_local", "degraded_ollama", "safe_mode"] else 0)
+        results["status"] = max(results["status"], "degraded_gitea", key=lambda x: ["healthy", "recovered", "degraded_gitea", "degraded_openrouter", "degraded_ollama", "safe_mode"].index(x) if x in ["healthy", "recovered", "degraded_gitea", "degraded_openrouter", "degraded_ollama", "safe_mode"] else 0)
     elif "gitea_down" in state.get("active_fallbacks", []):
         state["active_fallbacks"].remove("gitea_down")
         results["actions"].append("Gitea recovered — resume normal operations")