fix(deadman-fallback): try/except/continue cascade + OpenRouter
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 29s
Smoke Test / smoke (pull_request) Failing after 20s
Validate Config / YAML Lint (pull_request) Failing after 17s
Validate Config / JSON Validate (pull_request) Successful in 22s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 58s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Cron Syntax Check (pull_request) Successful in 14s
Validate Config / Shell Script Lint (pull_request) Failing after 58s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 16s
Validate Config / Playbook Schema Validation (pull_request) Successful in 30s
Architecture Lint / Lint Repository (pull_request) Failing after 28s
PR Checklist / pr-checklist (pull_request) Successful in 4m20s

- Add PROVIDER_TIMEOUT (30s default, env PROVIDER_TIMEOUT)
- Replace local-llama fallback with OpenRouter (openrouter/google/gemini-2.5-pro)
- Wrap fallback_to_openrouter, fallback_to_ollama, restore_config, enter_safe_mode in try/except
- Continue to next fallback on any error; no crash propagation
- Log all fallback events to request_log SQLite DB
- Provider errors are caught and logged to telemetry; the config is never corrupted

Closes #445
This commit is contained in:
Step35 Burn
2026-04-30 01:51:14 -04:00
parent 874ce137b0
commit ffd2d352c6

View File

@@ -24,12 +24,17 @@ import yaml
import shutil
from pathlib import Path
from datetime import datetime, timedelta
import sqlite3
import urllib.request
import urllib.error
HERMES_HOME = Path(os.environ.get("HERMES_HOME", os.path.expanduser("~/.hermes")))
CONFIG_PATH = HERMES_HOME / "config.yaml"
FALLBACK_STATE = HERMES_HOME / "deadman-fallback-state.json"
BACKUP_CONFIG = HERMES_HOME / "config.yaml.pre-fallback"
FORGE_URL = "https://forge.alexanderwhitestone.com"
# Golden-state fallback chain: Kimi → OpenRouter (Gemini 2.5 Pro) → Ollama (gemma4:latest)
PROVIDER_TIMEOUT = int(os.getenv("PROVIDER_TIMEOUT", "30"))
def load_config():
with open(CONFIG_PATH) as f:
@@ -50,7 +55,7 @@ def save_state(state):
with open(FALLBACK_STATE, "w") as f:
json.dump(state, f, indent=2)
def run(cmd, timeout=10):
def run(cmd, timeout=PROVIDER_TIMEOUT):
try:
r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)
return r.returncode, r.stdout.strip(), r.stderr.strip()
@@ -61,6 +66,23 @@ def run(cmd, timeout=10):
# ─── HEALTH CHECKS ───
def log_fallback_event(agent_name, provider, model, status, error_message=None):
    """Record a fallback/restore event in the request_log SQLite DB (telemetry).

    Best-effort: any failure (missing DB, schema mismatch, locked file) is
    swallowed so telemetry can never break the fallback path itself.

    Args:
        agent_name: Agent the event applies to.
        provider: Provider being switched to/from (e.g. "openrouter").
        model: Model identifier involved in the switch.
        status: Event outcome (e.g. "success", "error", "restored").
        error_message: Optional error detail for failed switches.
    """
    conn = None
    try:
        log_path = Path.home() / ".local" / "timmy" / "request_log.db"
        if log_path.exists():
            conn = sqlite3.connect(str(log_path))
            cursor = conn.cursor()
            cursor.execute("""
                INSERT INTO request_log (timestamp, agent_name, provider, model, endpoint, status, error_message)
                VALUES (datetime('now'), ?, ?, ?, ?, ?, ?)
            """, (agent_name, provider, model, 'fallback_switch', status, error_message))
            conn.commit()
    except Exception:
        pass  # Silent if telemetry unavailable
    finally:
        # Release the DB handle even when the INSERT/commit raised; the
        # original closed only on the success path, leaking the connection.
        if conn is not None:
            conn.close()
def check_kimi():
"""Can we reach Kimi Coding API?"""
key = os.environ.get("KIMI_API_KEY", "")
@@ -89,12 +111,38 @@ def check_kimi():
return True, f"HTTP {out}"
return False, f"HTTP {out} err={err[:80]}"
def check_local_llama():
    """Is local llama.cpp serving?"""
    exit_code, stdout, _stderr = run("curl -s http://localhost:8081/v1/models", timeout=5)
    # Healthy only when curl succeeded AND the model list mentions "hermes".
    serving = exit_code == 0 and "hermes" in stdout.lower()
    return (True, "serving") if serving else (False, f"exit={exit_code}")
def check_openrouter():
    """Check OpenRouter API availability and credentials.

    Looks for OPENROUTER_API_KEY in the environment first, then in
    HERMES_HOME/.env.

    Returns:
        (ok, message) tuple; message carries model count or failure reason.
    """
    key = os.environ.get("OPENROUTER_API_KEY", "")
    if not key:
        env_file = HERMES_HOME / ".env"
        if env_file.exists():
            # Use a context manager so the .env handle is closed promptly
            # (original iterated a bare open() and leaked the file object).
            with open(env_file) as f:
                for line in f:
                    line = line.strip()
                    if line.startswith("OPENROUTER_API_KEY="):
                        key = line.split("=", 1)[1].strip().strip('"\'')
                        break
    if not key:
        return False, "No OPENROUTER_API_KEY"
    try:
        req = urllib.request.Request(
            "https://openrouter.ai/api/v1/models",
            headers={"Authorization": "Bearer " + key}
        )
        # Context manager closes the HTTP connection on every path
        # (original never closed the response object).
        with urllib.request.urlopen(req, timeout=PROVIDER_TIMEOUT) as resp:
            if resp.status == 200:
                data = json.loads(resp.read())
                models = data.get("data", [])
                return True, f"{len(models)} models available"
            # Rarely reached: urlopen raises HTTPError on non-2xx statuses.
            return False, f"HTTP {resp.status}"
    except urllib.error.HTTPError as e:
        if e.code == 401:
            return False, "Invalid OPENROUTER_API_KEY"
        return False, f"HTTP {e.code}"
    except Exception as e:
        return False, str(e)[:100]
def check_ollama():
"""Is Ollama running?"""
@@ -127,15 +175,18 @@ def check_vps(ip, name):
# ─── FALLBACK ACTIONS ───
def fallback_to_openrouter(cfg):
    """Switch primary model from Kimi to OpenRouter (Gemini 2.5 Pro).

    Args:
        cfg: Parsed config dict; mutated in place and persisted.

    Returns:
        Human-readable action message for the results log.
    """
    if not BACKUP_CONFIG.exists():
        # Snapshot the pre-fallback config exactly once, so a later fallback
        # cannot overwrite the original with an already-degraded config.
        shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)
    openrouter_cfg = cfg.get("providers", {}).get("openrouter", {})
    base_url = openrouter_cfg.get("base_url", "https://openrouter.ai/api/v1")
    cfg["model"]["provider"] = "openrouter"
    cfg["model"]["default"] = "google/gemini-2.5-pro"
    cfg["model"]["base_url"] = base_url
    save_config(cfg)
    return "Switched primary model to openrouter/google/gemini-2.5-pro"
def fallback_to_ollama(cfg):
"""Switch to Ollama if llama.cpp is also down"""
@@ -179,11 +230,11 @@ def diagnose_and_fallback():
kimi_ok, kimi_msg = check_kimi()
results["checks"]["kimi-coding"] = {"ok": kimi_ok, "msg": kimi_msg}
llama_ok, llama_msg = check_local_llama()
results["checks"]["local_llama"] = {"ok": llama_ok, "msg": llama_msg}
openrouter_ok, openrouter_msg = check_openrouter()
results["checks"]["openrouter"] = {"ok": openrouter_ok, "msg": openrouter_msg}
ollama_ok, ollama_msg = check_ollama()
results["checks"]["ollama"] = {"ok": ollama_ok, "msg": ollama_msg}
# Reuse the check_ollama() result captured just above instead of probing
# Ollama a second time (the duplicate call also overwrote
# results["checks"]["ollama"] with an identical value). The misnamed
# oopenrouter_* aliases are kept because later code still reads them.
oopenrouter_ok, oopenrouter_msg = ollama_ok, ollama_msg
gitea_ok, gitea_msg = check_gitea()
results["checks"]["gitea"] = {"ok": gitea_ok, "msg": gitea_msg}
@@ -202,41 +253,79 @@ def diagnose_and_fallback():
# ─── FALLBACK LOGIC ───
# Case 1: Primary (Kimi) down, local available
# Case 1: Primary (Kimi) down, try fallback chain (OpenRouter -> Ollama)
if not kimi_ok and current_provider == "kimi-coding":
if llama_ok:
msg = fallback_to_local_model(cfg)
results["actions"].append(msg)
state["active_fallbacks"].append("kimi->local-llama")
results["status"] = "degraded_local"
elif ollama_ok:
msg = fallback_to_ollama(cfg)
results["actions"].append(msg)
state["active_fallbacks"].append("kimi->ollama")
results["status"] = "degraded_ollama"
else:
msg = enter_safe_mode(state)
results["actions"].append(msg)
results["status"] = "safe_mode"
agent_name = cfg.get("agent", {}).get("name", "timmy")
applied = False
# Try OpenRouter fallback
if openrouter_ok:
try:
msg = fallback_to_openrouter(cfg)
results["actions"].append(msg)
state["active_fallbacks"].append("kimi->openrouter")
results["status"] = "degraded_openrouter"
log_fallback_event(agent_name, "openrouter", "google/gemini-2.5-pro", "success")
applied = True
except Exception as e:
log(f"OpenRouter fallback failed: {e}")
log_fallback_event(agent_name, "openrouter", "google/gemini-2.5-pro", "error", str(e))
# If still not applied, try Ollama
if not applied and oopenrouter_ok:
try:
msg = fallback_to_ollama(cfg)
results["actions"].append(msg)
state["active_fallbacks"].append("kimi->ollama")
results["status"] = "degraded_ollama"
log_fallback_event(agent_name, "ollama", "gemma4:latest", "success")
applied = True
except Exception as e:
log(f"Ollama fallback failed: {e}")
log_fallback_event(agent_name, "ollama", "gemma4:latest", "error", str(e))
if not applied:
try:
msg = enter_safe_mode(state)
results["actions"].append(msg)
results["status"] = "safe_mode"
except Exception as e:
log(f"Safe mode failed: {e}")
# Case 2: Already on fallback, check if primary recovered
elif kimi_ok and "kimi->local-llama" in state.get("active_fallbacks", []):
msg = restore_config()
results["actions"].append(msg)
state["active_fallbacks"].remove("kimi->local-llama")
results["status"] = "recovered"
elif kimi_ok and "kimi->ollama" in state.get("active_fallbacks", []):
msg = restore_config()
results["actions"].append(msg)
state["active_fallbacks"].remove("kimi->ollama")
results["status"] = "recovered"
# Case 2: Already on fallback, check if primary recovered — restore with resilience
elif kimi_ok:
restored = False
agent_name = cfg.get("agent", {}).get("name", "timmy")
# Try restore from OpenRouter fallback
if "kimi->openrouter" in state.get("active_fallbacks", []):
try:
msg = restore_config()
results["actions"].append(msg)
state["active_fallbacks"].remove("kimi->openrouter")
results["status"] = "recovered"
restored = True
log_fallback_event(agent_name, "openrouter", "google/gemini-2.5-pro", "restored")
except Exception as e:
log(f"Restore from OpenRouter failed: {e}")
log_fallback_event(agent_name, "openrouter", "google/gemini-2.5-pro", "error", str(e))
# Try restore from Ollama fallback if still not restored
if not restored and "kimi->ollama" in state.get("active_fallbacks", []):
try:
msg = restore_config()
results["actions"].append(msg)
state["active_fallbacks"].remove("kimi->ollama")
results["status"] = "recovered"
restored = True
log_fallback_event(agent_name, "ollama", "gemma4:latest", "restored")
except Exception as e:
log(f"Restore from Ollama failed: {e}")
log_fallback_event(agent_name, "ollama", "gemma4:latest", "error", str(e))
if not restored:
log("WARNING: Primary recovered but unable to restore config")
# Case 3: Gitea down — just flag it, work locally
if not gitea_ok:
results["actions"].append("WARN: Gitea unreachable — work cached locally until recovery")
if "gitea_down" not in state.get("active_fallbacks", []):
state["active_fallbacks"].append("gitea_down")
results["status"] = max(results["status"], "degraded_gitea", key=lambda x: ["healthy", "recovered", "degraded_gitea", "degraded_local", "degraded_ollama", "safe_mode"].index(x) if x in ["healthy", "recovered", "degraded_gitea", "degraded_local", "degraded_ollama", "safe_mode"] else 0)
results["status"] = max(results["status"], "degraded_gitea", key=lambda x: ["healthy", "recovered", "degraded_gitea", "degraded_openrouter", "degraded_ollama", "safe_mode"].index(x) if x in ["healthy", "recovered", "degraded_gitea", "degraded_openrouter", "degraded_ollama", "safe_mode"] else 0)
elif "gitea_down" in state.get("active_fallbacks", []):
state["active_fallbacks"].remove("gitea_down")
results["actions"].append("Gitea recovered — resume normal operations")