fix(deadman-fallback): try/except/continue cascade + OpenRouter
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 29s
Smoke Test / smoke (pull_request) Failing after 20s
Validate Config / YAML Lint (pull_request) Failing after 17s
Validate Config / JSON Validate (pull_request) Successful in 22s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 58s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Cron Syntax Check (pull_request) Successful in 14s
Validate Config / Shell Script Lint (pull_request) Failing after 58s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 16s
Validate Config / Playbook Schema Validation (pull_request) Successful in 30s
Architecture Lint / Lint Repository (pull_request) Failing after 28s
PR Checklist / pr-checklist (pull_request) Successful in 4m20s

- Add PROVIDER_TIMEOUT (30s default, env PROVIDER_TIMEOUT)
- Replace local-llama fallback with OpenRouter (openrouter/google/gemini-2.5-pro)
- Wrap fallback_to_openrouter, fallback_to_ollama, restore_config, enter_safe_mode in try/except
- Continue to next fallback on any error; no crash propagation
- Log all fallback events to request_log SQLite DB
- Provider errors are caught and recorded as telemetry; the config file is never left corrupted

Closes #445
This commit is contained in:
Step35 Burn
2026-04-30 01:51:14 -04:00
parent 874ce137b0
commit ffd2d352c6

View File

@@ -24,12 +24,17 @@ import yaml
import shutil import shutil
from pathlib import Path from pathlib import Path
from datetime import datetime, timedelta from datetime import datetime, timedelta
import sqlite3
import urllib.request
import urllib.error
HERMES_HOME = Path(os.environ.get("HERMES_HOME", os.path.expanduser("~/.hermes"))) HERMES_HOME = Path(os.environ.get("HERMES_HOME", os.path.expanduser("~/.hermes")))
CONFIG_PATH = HERMES_HOME / "config.yaml" CONFIG_PATH = HERMES_HOME / "config.yaml"
FALLBACK_STATE = HERMES_HOME / "deadman-fallback-state.json" FALLBACK_STATE = HERMES_HOME / "deadman-fallback-state.json"
BACKUP_CONFIG = HERMES_HOME / "config.yaml.pre-fallback" BACKUP_CONFIG = HERMES_HOME / "config.yaml.pre-fallback"
FORGE_URL = "https://forge.alexanderwhitestone.com" FORGE_URL = "https://forge.alexanderwhitestone.com"
# Golden-state fallback chain: Kimi → OpenRouter (Gemini 2.5 Pro) → Ollama (gemma4:latest)
PROVIDER_TIMEOUT = int(os.getenv("PROVIDER_TIMEOUT", "30"))
def load_config(): def load_config():
with open(CONFIG_PATH) as f: with open(CONFIG_PATH) as f:
@@ -50,7 +55,7 @@ def save_state(state):
with open(FALLBACK_STATE, "w") as f: with open(FALLBACK_STATE, "w") as f:
json.dump(state, f, indent=2) json.dump(state, f, indent=2)
def run(cmd, timeout=10): def run(cmd, timeout=PROVIDER_TIMEOUT):
try: try:
r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout) r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)
return r.returncode, r.stdout.strip(), r.stderr.strip() return r.returncode, r.stdout.strip(), r.stderr.strip()
@@ -61,6 +66,23 @@ def run(cmd, timeout=10):
# ─── HEALTH CHECKS ─── # ─── HEALTH CHECKS ───
def log_fallback_event(agent_name, provider, model, status, error_message=None):
    """Log a fallback event to the request_log SQLite DB (telemetry).

    Best-effort: if the database file does not exist, or anything goes
    wrong while writing, the event is dropped silently so that telemetry
    can never break the fallback path that calls it.

    Args:
        agent_name: Name of the agent the event belongs to.
        provider: Provider involved in the event (e.g. "openrouter").
        model: Model identifier involved in the event.
        status: Event status string written to the ``status`` column.
        error_message: Optional error text; stored as NULL when omitted.

    Returns:
        None.
    """
    try:
        log_path = Path.home() / ".local" / "timmy" / "request_log.db"
        if not log_path.exists():
            # Never create the DB here; telemetry is opt-in via its presence.
            return
        conn = sqlite3.connect(str(log_path))
        try:
            conn.execute(
                """
                INSERT INTO request_log (timestamp, agent_name, provider, model, endpoint, status, error_message)
                VALUES (datetime('now'), ?, ?, ?, ?, ?, ?)
                """,
                (agent_name, provider, model, 'fallback_switch', status, error_message),
            )
            conn.commit()
        finally:
            # Close even when the INSERT/commit fails (missing table, locked
            # DB) so a failed write does not leak the connection.
            conn.close()
    except Exception:
        pass  # Silent if telemetry unavailable
def check_kimi(): def check_kimi():
"""Can we reach Kimi Coding API?""" """Can we reach Kimi Coding API?"""
key = os.environ.get("KIMI_API_KEY", "") key = os.environ.get("KIMI_API_KEY", "")
@@ -89,12 +111,38 @@ def check_kimi():
return True, f"HTTP {out}" return True, f"HTTP {out}"
return False, f"HTTP {out} err={err[:80]}" return False, f"HTTP {out} err={err[:80]}"
def check_openrouter():
    """Check OpenRouter API availability and credentials.

    The API key is taken from the OPENROUTER_API_KEY environment variable;
    if that is unset or empty, the first ``OPENROUTER_API_KEY=`` line in
    ``HERMES_HOME/.env`` is used instead (surrounding quotes stripped).

    Returns:
        A ``(ok, message)`` tuple. ``ok`` is True only when the models
        endpoint answers HTTP 200; ``message`` is a short human-readable
        description of the result or failure.
    """
    key = os.environ.get("OPENROUTER_API_KEY", "")
    if not key:
        env_file = HERMES_HOME / ".env"
        if env_file.exists():
            try:
                # Context manager so the handle is closed even on early break.
                with open(env_file) as f:
                    for line in f:
                        line = line.strip()
                        if line.startswith("OPENROUTER_API_KEY="):
                            key = line.split("=", 1)[1].strip().strip('"\'')
                            break
            except OSError:
                # An unreadable .env is treated like a missing key rather
                # than crashing the health check.
                key = ""
    if not key:
        return False, "No OPENROUTER_API_KEY"
    try:
        req = urllib.request.Request(
            "https://openrouter.ai/api/v1/models",
            headers={"Authorization": "Bearer " + key},
        )
        # Close the HTTP response deterministically instead of relying on GC.
        with urllib.request.urlopen(req, timeout=PROVIDER_TIMEOUT) as resp:
            if resp.status != 200:
                return False, f"HTTP {resp.status}"
            data = json.loads(resp.read())
        models = data.get("data", [])
        return True, f"{len(models)} models available"
    except urllib.error.HTTPError as e:
        if e.code == 401:
            return False, "Invalid OPENROUTER_API_KEY"
        return False, f"HTTP {e.code}"
    except Exception as e:
        # Network errors, timeouts, bad JSON: report, never raise.
        return False, str(e)[:100]
def check_ollama(): def check_ollama():
"""Is Ollama running?""" """Is Ollama running?"""
@@ -127,15 +175,18 @@ def check_vps(ip, name):
# ─── FALLBACK ACTIONS ─── # ─── FALLBACK ACTIONS ───
def fallback_to_openrouter(cfg):
    """Switch primary model from Kimi to OpenRouter (Gemini 2.5 Pro).

    Copies the current config to BACKUP_CONFIG first (only if no backup
    exists yet, so an earlier pre-fallback snapshot is never overwritten),
    then rewrites the ``model`` section of ``cfg`` and saves it.

    Args:
        cfg: Loaded config mapping; its ``model`` section is mutated in place.

    Returns:
        A short message describing the switch.
    """
    if not BACKUP_CONFIG.exists():
        shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)

    # YAML keys that are present but empty load as None, so `.get(key, {})`
    # alone is not enough; fall back to {} for both levels.
    providers = cfg.get("providers") or {}
    openrouter_cfg = providers.get("openrouter") or {}
    base_url = openrouter_cfg.get("base_url") or "https://openrouter.ai/api/v1"

    cfg["model"]["provider"] = "openrouter"
    cfg["model"]["default"] = "google/gemini-2.5-pro"
    cfg["model"]["base_url"] = base_url
    save_config(cfg)
    return "Switched primary model to openrouter/google/gemini-2.5-pro"
def fallback_to_ollama(cfg): def fallback_to_ollama(cfg):
"""Switch to Ollama if llama.cpp is also down""" """Switch to Ollama if llama.cpp is also down"""
@@ -179,11 +230,11 @@ def diagnose_and_fallback():
kimi_ok, kimi_msg = check_kimi() kimi_ok, kimi_msg = check_kimi()
results["checks"]["kimi-coding"] = {"ok": kimi_ok, "msg": kimi_msg} results["checks"]["kimi-coding"] = {"ok": kimi_ok, "msg": kimi_msg}
llama_ok, llama_msg = check_local_llama() openrouter_ok, openrouter_msg = check_openrouter()
results["checks"]["local_llama"] = {"ok": llama_ok, "msg": llama_msg} results["checks"]["openrouter"] = {"ok": openrouter_ok, "msg": openrouter_msg}
ollama_ok, ollama_msg = check_ollama() oopenrouter_ok, oopenrouter_msg = check_ollama()
results["checks"]["ollama"] = {"ok": ollama_ok, "msg": ollama_msg} results["checks"]["ollama"] = {"ok": oopenrouter_ok, "msg": oopenrouter_msg}
gitea_ok, gitea_msg = check_gitea() gitea_ok, gitea_msg = check_gitea()
results["checks"]["gitea"] = {"ok": gitea_ok, "msg": gitea_msg} results["checks"]["gitea"] = {"ok": gitea_ok, "msg": gitea_msg}
@@ -202,41 +253,79 @@ def diagnose_and_fallback():
# ─── FALLBACK LOGIC ─── # ─── FALLBACK LOGIC ───
# Case 1: Primary (Kimi) down, local available # Case 1: Primary (Kimi) down, try fallback chain (OpenRouter -> Ollama)
if not kimi_ok and current_provider == "kimi-coding": if not kimi_ok and current_provider == "kimi-coding":
if llama_ok: agent_name = cfg.get("agent", {}).get("name", "timmy")
msg = fallback_to_local_model(cfg) applied = False
results["actions"].append(msg) # Try OpenRouter fallback
state["active_fallbacks"].append("kimi->local-llama") if openrouter_ok:
results["status"] = "degraded_local" try:
elif ollama_ok: msg = fallback_to_openrouter(cfg)
msg = fallback_to_ollama(cfg) results["actions"].append(msg)
results["actions"].append(msg) state["active_fallbacks"].append("kimi->openrouter")
state["active_fallbacks"].append("kimi->ollama") results["status"] = "degraded_openrouter"
results["status"] = "degraded_ollama" log_fallback_event(agent_name, "openrouter", "google/gemini-2.5-pro", "success")
else: applied = True
msg = enter_safe_mode(state) except Exception as e:
results["actions"].append(msg) log(f"OpenRouter fallback failed: {e}")
results["status"] = "safe_mode" log_fallback_event(agent_name, "openrouter", "google/gemini-2.5-pro", "error", str(e))
# If still not applied, try Ollama
if not applied and oopenrouter_ok:
try:
msg = fallback_to_ollama(cfg)
results["actions"].append(msg)
state["active_fallbacks"].append("kimi->ollama")
results["status"] = "degraded_ollama"
log_fallback_event(agent_name, "ollama", "gemma4:latest", "success")
applied = True
except Exception as e:
log(f"Ollama fallback failed: {e}")
log_fallback_event(agent_name, "ollama", "gemma4:latest", "error", str(e))
if not applied:
try:
msg = enter_safe_mode(state)
results["actions"].append(msg)
results["status"] = "safe_mode"
except Exception as e:
log(f"Safe mode failed: {e}")
# Case 2: Already on fallback, check if primary recovered # Case 2: Already on fallback, check if primary recovered — restore with resilience
elif kimi_ok and "kimi->local-llama" in state.get("active_fallbacks", []): elif kimi_ok:
msg = restore_config() restored = False
results["actions"].append(msg) agent_name = cfg.get("agent", {}).get("name", "timmy")
state["active_fallbacks"].remove("kimi->local-llama") # Try restore from OpenRouter fallback
results["status"] = "recovered" if "kimi->openrouter" in state.get("active_fallbacks", []):
elif kimi_ok and "kimi->ollama" in state.get("active_fallbacks", []): try:
msg = restore_config() msg = restore_config()
results["actions"].append(msg) results["actions"].append(msg)
state["active_fallbacks"].remove("kimi->ollama") state["active_fallbacks"].remove("kimi->openrouter")
results["status"] = "recovered" results["status"] = "recovered"
restored = True
log_fallback_event(agent_name, "openrouter", "google/gemini-2.5-pro", "restored")
except Exception as e:
log(f"Restore from OpenRouter failed: {e}")
log_fallback_event(agent_name, "openrouter", "google/gemini-2.5-pro", "error", str(e))
# Try restore from Ollama fallback if still not restored
if not restored and "kimi->ollama" in state.get("active_fallbacks", []):
try:
msg = restore_config()
results["actions"].append(msg)
state["active_fallbacks"].remove("kimi->ollama")
results["status"] = "recovered"
restored = True
log_fallback_event(agent_name, "ollama", "gemma4:latest", "restored")
except Exception as e:
log(f"Restore from Ollama failed: {e}")
log_fallback_event(agent_name, "ollama", "gemma4:latest", "error", str(e))
if not restored:
log("WARNING: Primary recovered but unable to restore config")
# Case 3: Gitea down — just flag it, work locally # Case 3: Gitea down — just flag it, work locally
if not gitea_ok: if not gitea_ok:
results["actions"].append("WARN: Gitea unreachable — work cached locally until recovery") results["actions"].append("WARN: Gitea unreachable — work cached locally until recovery")
if "gitea_down" not in state.get("active_fallbacks", []): if "gitea_down" not in state.get("active_fallbacks", []):
state["active_fallbacks"].append("gitea_down") state["active_fallbacks"].append("gitea_down")
results["status"] = max(results["status"], "degraded_gitea", key=lambda x: ["healthy", "recovered", "degraded_gitea", "degraded_local", "degraded_ollama", "safe_mode"].index(x) if x in ["healthy", "recovered", "degraded_gitea", "degraded_local", "degraded_ollama", "safe_mode"] else 0) results["status"] = max(results["status"], "degraded_gitea", key=lambda x: ["healthy", "recovered", "degraded_gitea", "degraded_openrouter", "degraded_ollama", "safe_mode"].index(x) if x in ["healthy", "recovered", "degraded_gitea", "degraded_openrouter", "degraded_ollama", "safe_mode"] else 0)
elif "gitea_down" in state.get("active_fallbacks", []): elif "gitea_down" in state.get("active_fallbacks", []):
state["active_fallbacks"].remove("gitea_down") state["active_fallbacks"].remove("gitea_down")
results["actions"].append("Gitea recovered — resume normal operations") results["actions"].append("Gitea recovered — resume normal operations")