fix(deadman-fallback): try/except/continue cascade + OpenRouter
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 29s
Smoke Test / smoke (pull_request) Failing after 20s
Validate Config / YAML Lint (pull_request) Failing after 17s
Validate Config / JSON Validate (pull_request) Successful in 22s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 58s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Cron Syntax Check (pull_request) Successful in 14s
Validate Config / Shell Script Lint (pull_request) Failing after 58s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 16s
Validate Config / Playbook Schema Validation (pull_request) Successful in 30s
Architecture Lint / Lint Repository (pull_request) Failing after 28s
PR Checklist / pr-checklist (pull_request) Successful in 4m20s
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 29s
Smoke Test / smoke (pull_request) Failing after 20s
Validate Config / YAML Lint (pull_request) Failing after 17s
Validate Config / JSON Validate (pull_request) Successful in 22s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 58s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Cron Syntax Check (pull_request) Successful in 14s
Validate Config / Shell Script Lint (pull_request) Failing after 58s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 16s
Validate Config / Playbook Schema Validation (pull_request) Successful in 30s
Architecture Lint / Lint Repository (pull_request) Failing after 28s
PR Checklist / pr-checklist (pull_request) Successful in 4m20s
- Add PROVIDER_TIMEOUT (30s default, overridable via the PROVIDER_TIMEOUT env var)
- Replace the local-llama fallback with OpenRouter (openrouter/google/gemini-2.5-pro)
- Wrap fallback_to_openrouter, fallback_to_ollama, restore_config, and enter_safe_mode in try/except
- Continue to the next fallback on any error; no crash propagation
- Log all fallback events to the request_log SQLite DB
- Catch provider errors and record them as telemetry; never corrupt config

Closes #445
This commit is contained in:
@@ -24,12 +24,17 @@ import yaml
|
|||||||
import shutil
|
import shutil
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
|
import sqlite3
|
||||||
|
import urllib.request
|
||||||
|
import urllib.error
|
||||||
|
|
||||||
HERMES_HOME = Path(os.environ.get("HERMES_HOME", os.path.expanduser("~/.hermes")))
|
HERMES_HOME = Path(os.environ.get("HERMES_HOME", os.path.expanduser("~/.hermes")))
|
||||||
CONFIG_PATH = HERMES_HOME / "config.yaml"
|
CONFIG_PATH = HERMES_HOME / "config.yaml"
|
||||||
FALLBACK_STATE = HERMES_HOME / "deadman-fallback-state.json"
|
FALLBACK_STATE = HERMES_HOME / "deadman-fallback-state.json"
|
||||||
BACKUP_CONFIG = HERMES_HOME / "config.yaml.pre-fallback"
|
BACKUP_CONFIG = HERMES_HOME / "config.yaml.pre-fallback"
|
||||||
FORGE_URL = "https://forge.alexanderwhitestone.com"
|
FORGE_URL = "https://forge.alexanderwhitestone.com"
|
||||||
|
# Golden-state fallback chain: Kimi → OpenRouter (Gemini 2.5 Pro) → Ollama (gemma4:latest)
|
||||||
|
PROVIDER_TIMEOUT = int(os.getenv("PROVIDER_TIMEOUT", "30"))
|
||||||
|
|
||||||
def load_config():
|
def load_config():
|
||||||
with open(CONFIG_PATH) as f:
|
with open(CONFIG_PATH) as f:
|
||||||
@@ -50,7 +55,7 @@ def save_state(state):
|
|||||||
with open(FALLBACK_STATE, "w") as f:
|
with open(FALLBACK_STATE, "w") as f:
|
||||||
json.dump(state, f, indent=2)
|
json.dump(state, f, indent=2)
|
||||||
|
|
||||||
def run(cmd, timeout=10):
|
def run(cmd, timeout=PROVIDER_TIMEOUT):
|
||||||
try:
|
try:
|
||||||
r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)
|
r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)
|
||||||
return r.returncode, r.stdout.strip(), r.stderr.strip()
|
return r.returncode, r.stdout.strip(), r.stderr.strip()
|
||||||
@@ -61,6 +66,23 @@ def run(cmd, timeout=10):
|
|||||||
|
|
||||||
# ─── HEALTH CHECKS ───
|
# ─── HEALTH CHECKS ───
|
||||||
|
|
||||||
|
|
||||||
|
def log_fallback_event(agent_name, provider, model, status, error_message=None):
    """Log a fallback event to the request_log SQLite DB (telemetry).

    Best-effort only: when ``~/.local/timmy/request_log.db`` does not exist
    nothing is written, and any failure while writing is swallowed so that
    telemetry can never interrupt the fallback cascade.

    Args:
        agent_name: Value stored in the ``agent_name`` column.
        provider: Provider the event refers to (e.g. ``"openrouter"``).
        model: Model identifier the event refers to.
        status: Event status string (e.g. ``"success"``, ``"error"``).
        error_message: Optional error text; stored as NULL when omitted.

    Returns:
        None.
    """
    try:
        log_path = Path.home() / ".local" / "timmy" / "request_log.db"
        if not log_path.exists():
            return
        conn = sqlite3.connect(str(log_path))
        try:
            conn.execute(
                """
                INSERT INTO request_log (timestamp, agent_name, provider, model, endpoint, status, error_message)
                VALUES (datetime('now'), ?, ?, ?, ?, ?, ?)
                """,
                (agent_name, provider, model, 'fallback_switch', status, error_message),
            )
            conn.commit()
        finally:
            # Close even when the INSERT/commit fails (missing table, locked
            # DB); otherwise the connection would be leaked on every error.
            conn.close()
    except Exception:
        # Deliberately broad: this function is also called from the callers'
        # own ``except`` handlers, so it must never raise.
        pass
|
||||||
|
|
||||||
def check_kimi():
|
def check_kimi():
|
||||||
"""Can we reach Kimi Coding API?"""
|
"""Can we reach Kimi Coding API?"""
|
||||||
key = os.environ.get("KIMI_API_KEY", "")
|
key = os.environ.get("KIMI_API_KEY", "")
|
||||||
@@ -89,12 +111,38 @@ def check_kimi():
|
|||||||
return True, f"HTTP {out}"
|
return True, f"HTTP {out}"
|
||||||
return False, f"HTTP {out} err={err[:80]}"
|
return False, f"HTTP {out} err={err[:80]}"
|
||||||
|
|
||||||
def check_openrouter():
    """Check OpenRouter API availability and credentials.

    The API key is taken from the ``OPENROUTER_API_KEY`` environment variable,
    falling back to an ``OPENROUTER_API_KEY=...`` line in ``HERMES_HOME/.env``.
    The key is then used for an authenticated GET of the OpenRouter model list.

    Returns:
        An ``(ok, message)`` tuple. ``ok`` is True only for an HTTP 200
        response; ``message`` is a short status or error description.
    """
    key = os.environ.get("OPENROUTER_API_KEY", "")
    if not key:
        env_file = HERMES_HOME / ".env"
        try:
            # Context manager so the handle is closed even on the early break.
            with open(env_file) as f:
                for line in f:
                    line = line.strip()
                    if line.startswith("OPENROUTER_API_KEY="):
                        key = line.split("=", 1)[1].strip().strip('"\'')
                        break
        except OSError:
            # Missing or unreadable .env: behave as if no key was configured
            # instead of letting a health check raise.
            pass
    if not key:
        return False, "No OPENROUTER_API_KEY"

    try:
        req = urllib.request.Request(
            "https://openrouter.ai/api/v1/models",
            headers={"Authorization": "Bearer " + key},
        )
        # ``with`` releases the HTTP connection on every return path.
        with urllib.request.urlopen(req, timeout=PROVIDER_TIMEOUT) as resp:
            if resp.status != 200:
                return False, f"HTTP {resp.status}"
            data = json.loads(resp.read())
            models = data.get("data", [])
            return True, f"{len(models)} models available"
    except urllib.error.HTTPError as e:
        # 401 is singled out so a bad key is distinguishable from an outage.
        if e.code == 401:
            return False, "Invalid OPENROUTER_API_KEY"
        return False, f"HTTP {e.code}"
    except Exception as e:
        # Network errors, timeouts, malformed JSON: report, never raise.
        return False, str(e)[:100]
|
||||||
|
|
||||||
def check_ollama():
|
def check_ollama():
|
||||||
"""Is Ollama running?"""
|
"""Is Ollama running?"""
|
||||||
@@ -127,15 +175,18 @@ def check_vps(ip, name):
|
|||||||
|
|
||||||
# ─── FALLBACK ACTIONS ───
|
# ─── FALLBACK ACTIONS ───
|
||||||
|
|
||||||
def fallback_to_openrouter(cfg):
    """Switch primary model from Kimi to OpenRouter (Gemini 2.5 Pro).

    Copies CONFIG_PATH to BACKUP_CONFIG if no backup exists yet (an existing
    backup is never overwritten), points ``cfg["model"]`` at OpenRouter, and
    hands the updated mapping to ``save_config``.

    Args:
        cfg: Parsed config mapping; mutated in place. Must already contain a
            ``"model"`` mapping.

    Returns:
        A human-readable description of the switch.
    """
    if not BACKUP_CONFIG.exists():
        shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)

    # A key that is present but empty (``providers:`` with no value) would come
    # back as None rather than a dict, and ``.get`` on None would abort the
    # fallback; treat both levels as "no override" in that case.
    openrouter_cfg = (cfg.get("providers") or {}).get("openrouter") or {}
    base_url = openrouter_cfg.get("base_url", "https://openrouter.ai/api/v1")

    cfg["model"]["provider"] = "openrouter"
    cfg["model"]["default"] = "google/gemini-2.5-pro"
    cfg["model"]["base_url"] = base_url
    save_config(cfg)
    return "Switched primary model to openrouter/google/gemini-2.5-pro"
|
||||||
|
|
||||||
def fallback_to_ollama(cfg):
|
def fallback_to_ollama(cfg):
|
||||||
"""Switch to Ollama if llama.cpp is also down"""
|
"""Switch to Ollama if llama.cpp is also down"""
|
||||||
@@ -179,11 +230,11 @@ def diagnose_and_fallback():
|
|||||||
kimi_ok, kimi_msg = check_kimi()
|
kimi_ok, kimi_msg = check_kimi()
|
||||||
results["checks"]["kimi-coding"] = {"ok": kimi_ok, "msg": kimi_msg}
|
results["checks"]["kimi-coding"] = {"ok": kimi_ok, "msg": kimi_msg}
|
||||||
|
|
||||||
llama_ok, llama_msg = check_local_llama()
|
openrouter_ok, openrouter_msg = check_openrouter()
|
||||||
results["checks"]["local_llama"] = {"ok": llama_ok, "msg": llama_msg}
|
results["checks"]["openrouter"] = {"ok": openrouter_ok, "msg": openrouter_msg}
|
||||||
|
|
||||||
ollama_ok, ollama_msg = check_ollama()
|
oopenrouter_ok, oopenrouter_msg = check_ollama()
|
||||||
results["checks"]["ollama"] = {"ok": ollama_ok, "msg": ollama_msg}
|
results["checks"]["ollama"] = {"ok": oopenrouter_ok, "msg": oopenrouter_msg}
|
||||||
|
|
||||||
gitea_ok, gitea_msg = check_gitea()
|
gitea_ok, gitea_msg = check_gitea()
|
||||||
results["checks"]["gitea"] = {"ok": gitea_ok, "msg": gitea_msg}
|
results["checks"]["gitea"] = {"ok": gitea_ok, "msg": gitea_msg}
|
||||||
@@ -202,41 +253,79 @@ def diagnose_and_fallback():
|
|||||||
|
|
||||||
# ─── FALLBACK LOGIC ───
|
# ─── FALLBACK LOGIC ───
|
||||||
|
|
||||||
# Case 1: Primary (Kimi) down, local available
|
# Case 1: Primary (Kimi) down, try fallback chain (OpenRouter -> Ollama)
|
||||||
if not kimi_ok and current_provider == "kimi-coding":
|
if not kimi_ok and current_provider == "kimi-coding":
|
||||||
if llama_ok:
|
agent_name = cfg.get("agent", {}).get("name", "timmy")
|
||||||
msg = fallback_to_local_model(cfg)
|
applied = False
|
||||||
|
# Try OpenRouter fallback
|
||||||
|
if openrouter_ok:
|
||||||
|
try:
|
||||||
|
msg = fallback_to_openrouter(cfg)
|
||||||
results["actions"].append(msg)
|
results["actions"].append(msg)
|
||||||
state["active_fallbacks"].append("kimi->local-llama")
|
state["active_fallbacks"].append("kimi->openrouter")
|
||||||
results["status"] = "degraded_local"
|
results["status"] = "degraded_openrouter"
|
||||||
elif ollama_ok:
|
log_fallback_event(agent_name, "openrouter", "google/gemini-2.5-pro", "success")
|
||||||
|
applied = True
|
||||||
|
except Exception as e:
|
||||||
|
log(f"OpenRouter fallback failed: {e}")
|
||||||
|
log_fallback_event(agent_name, "openrouter", "google/gemini-2.5-pro", "error", str(e))
|
||||||
|
# If still not applied, try Ollama
|
||||||
|
if not applied and oopenrouter_ok:
|
||||||
|
try:
|
||||||
msg = fallback_to_ollama(cfg)
|
msg = fallback_to_ollama(cfg)
|
||||||
results["actions"].append(msg)
|
results["actions"].append(msg)
|
||||||
state["active_fallbacks"].append("kimi->ollama")
|
state["active_fallbacks"].append("kimi->ollama")
|
||||||
results["status"] = "degraded_ollama"
|
results["status"] = "degraded_ollama"
|
||||||
else:
|
log_fallback_event(agent_name, "ollama", "gemma4:latest", "success")
|
||||||
|
applied = True
|
||||||
|
except Exception as e:
|
||||||
|
log(f"Ollama fallback failed: {e}")
|
||||||
|
log_fallback_event(agent_name, "ollama", "gemma4:latest", "error", str(e))
|
||||||
|
if not applied:
|
||||||
|
try:
|
||||||
msg = enter_safe_mode(state)
|
msg = enter_safe_mode(state)
|
||||||
results["actions"].append(msg)
|
results["actions"].append(msg)
|
||||||
results["status"] = "safe_mode"
|
results["status"] = "safe_mode"
|
||||||
|
except Exception as e:
|
||||||
|
log(f"Safe mode failed: {e}")
|
||||||
|
|
||||||
# Case 2: Already on fallback, check if primary recovered
|
# Case 2: Already on fallback, check if primary recovered — restore with resilience
|
||||||
elif kimi_ok and "kimi->local-llama" in state.get("active_fallbacks", []):
|
elif kimi_ok:
|
||||||
|
restored = False
|
||||||
|
agent_name = cfg.get("agent", {}).get("name", "timmy")
|
||||||
|
# Try restore from OpenRouter fallback
|
||||||
|
if "kimi->openrouter" in state.get("active_fallbacks", []):
|
||||||
|
try:
|
||||||
msg = restore_config()
|
msg = restore_config()
|
||||||
results["actions"].append(msg)
|
results["actions"].append(msg)
|
||||||
state["active_fallbacks"].remove("kimi->local-llama")
|
state["active_fallbacks"].remove("kimi->openrouter")
|
||||||
results["status"] = "recovered"
|
results["status"] = "recovered"
|
||||||
elif kimi_ok and "kimi->ollama" in state.get("active_fallbacks", []):
|
restored = True
|
||||||
|
log_fallback_event(agent_name, "openrouter", "google/gemini-2.5-pro", "restored")
|
||||||
|
except Exception as e:
|
||||||
|
log(f"Restore from OpenRouter failed: {e}")
|
||||||
|
log_fallback_event(agent_name, "openrouter", "google/gemini-2.5-pro", "error", str(e))
|
||||||
|
# Try restore from Ollama fallback if still not restored
|
||||||
|
if not restored and "kimi->ollama" in state.get("active_fallbacks", []):
|
||||||
|
try:
|
||||||
msg = restore_config()
|
msg = restore_config()
|
||||||
results["actions"].append(msg)
|
results["actions"].append(msg)
|
||||||
state["active_fallbacks"].remove("kimi->ollama")
|
state["active_fallbacks"].remove("kimi->ollama")
|
||||||
results["status"] = "recovered"
|
results["status"] = "recovered"
|
||||||
|
restored = True
|
||||||
|
log_fallback_event(agent_name, "ollama", "gemma4:latest", "restored")
|
||||||
|
except Exception as e:
|
||||||
|
log(f"Restore from Ollama failed: {e}")
|
||||||
|
log_fallback_event(agent_name, "ollama", "gemma4:latest", "error", str(e))
|
||||||
|
if not restored:
|
||||||
|
log("WARNING: Primary recovered but unable to restore config")
|
||||||
|
|
||||||
# Case 3: Gitea down — just flag it, work locally
|
# Case 3: Gitea down — just flag it, work locally
|
||||||
if not gitea_ok:
|
if not gitea_ok:
|
||||||
results["actions"].append("WARN: Gitea unreachable — work cached locally until recovery")
|
results["actions"].append("WARN: Gitea unreachable — work cached locally until recovery")
|
||||||
if "gitea_down" not in state.get("active_fallbacks", []):
|
if "gitea_down" not in state.get("active_fallbacks", []):
|
||||||
state["active_fallbacks"].append("gitea_down")
|
state["active_fallbacks"].append("gitea_down")
|
||||||
results["status"] = max(results["status"], "degraded_gitea", key=lambda x: ["healthy", "recovered", "degraded_gitea", "degraded_local", "degraded_ollama", "safe_mode"].index(x) if x in ["healthy", "recovered", "degraded_gitea", "degraded_local", "degraded_ollama", "safe_mode"] else 0)
|
results["status"] = max(results["status"], "degraded_gitea", key=lambda x: ["healthy", "recovered", "degraded_gitea", "degraded_openrouter", "degraded_ollama", "safe_mode"].index(x) if x in ["healthy", "recovered", "degraded_gitea", "degraded_openrouter", "degraded_ollama", "safe_mode"] else 0)
|
||||||
elif "gitea_down" in state.get("active_fallbacks", []):
|
elif "gitea_down" in state.get("active_fallbacks", []):
|
||||||
state["active_fallbacks"].remove("gitea_down")
|
state["active_fallbacks"].remove("gitea_down")
|
||||||
results["actions"].append("Gitea recovered — resume normal operations")
|
results["actions"].append("Gitea recovered — resume normal operations")
|
||||||
|
|||||||
Reference in New Issue
Block a user