Files
timmy-config/bin/deadman-fallback.py
Step35 Burn ffd2d352c6
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 29s
Smoke Test / smoke (pull_request) Failing after 20s
Validate Config / YAML Lint (pull_request) Failing after 17s
Validate Config / JSON Validate (pull_request) Successful in 22s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 58s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Cron Syntax Check (pull_request) Successful in 14s
Validate Config / Shell Script Lint (pull_request) Failing after 58s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 16s
Validate Config / Playbook Schema Validation (pull_request) Successful in 30s
Architecture Lint / Lint Repository (pull_request) Failing after 28s
PR Checklist / pr-checklist (pull_request) Successful in 4m20s
fix(deadman-fallback): try/except/continue cascade + OpenRouter
- Add PROVIDER_TIMEOUT (30s default, env PROVIDER_TIMEOUT)
- Replace local-llama fallback with OpenRouter (openrouter/google/gemini-2.5-pro)
- Wrap fallback_to_openrouter, fallback_to_ollama, restore_config, enter_safe_mode in try/except
- Continue to next fallback on any error; no crash propagation
- Log all fallback events to request_log SQLite DB
- Provider errors caught/telemetry; never corrupt config

Closes #445
2026-04-30 01:51:14 -04:00

353 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Dead Man Switch Fallback Engine
When the dead man switch triggers (zero commits for 2+ hours, model down,
Gitea unreachable, etc.), this script diagnoses the failure and applies
common sense fallbacks automatically.
Fallback chain:
1. Primary model (Kimi) down -> switch config to OpenRouter (Gemini 2.5 Pro)
2. Gitea unreachable -> cache issues locally, retry on recovery
3. VPS agents down -> alert + lazarus protocol
4. OpenRouter unavailable -> try Ollama, then alert-only mode
5. All inference dead -> safe mode (cron pauses, alert Alexander)
Each fallback is reversible. Recovery auto-restores the previous config.
"""
import os
import sys
import json
import subprocess
import time
import yaml
import shutil
from pathlib import Path
from datetime import datetime, timedelta
import sqlite3
import urllib.request
import urllib.error
# Root directory for Hermes state; overridable through the HERMES_HOME env var.
HERMES_HOME = Path(os.environ.get("HERMES_HOME", os.path.expanduser("~/.hermes")))
# Live configuration file that the fallback actions rewrite.
CONFIG_PATH = HERMES_HOME / "config.yaml"
# JSON file that carries fallback bookkeeping from one run to the next.
FALLBACK_STATE = HERMES_HOME / "deadman-fallback-state.json"
# Copy of config.yaml taken before a fallback first edits it.
BACKUP_CONFIG = HERMES_HOME / "config.yaml.pre-fallback"
FORGE_URL = "https://forge.alexanderwhitestone.com"

# Golden-state fallback chain: Kimi → OpenRouter (Gemini 2.5 Pro) → Ollama (gemma4:latest)
# Timeout in seconds, used as the default for run() and for the OpenRouter
# probe. A malformed or non-positive PROVIDER_TIMEOUT falls back to 30 so a
# bad environment value cannot stop this script at import time.
try:
    PROVIDER_TIMEOUT = int(os.getenv("PROVIDER_TIMEOUT", "30"))
except ValueError:
    PROVIDER_TIMEOUT = 30
if PROVIDER_TIMEOUT <= 0:
    PROVIDER_TIMEOUT = 30
def load_config():
    """Read and parse the YAML configuration stored at CONFIG_PATH.

    Returns:
        The parsed document. An empty file, which ``yaml.safe_load`` parses
        as ``None``, is returned as an empty dict so callers can use ``.get``
        on the result.

    Raises:
        OSError: if CONFIG_PATH cannot be opened.
        yaml.YAMLError: if the file is not valid YAML.
    """
    with open(CONFIG_PATH) as f:
        cfg = yaml.safe_load(f)
    # safe_load yields None for an empty document.
    return {} if cfg is None else cfg
def save_config(cfg):
    """Write ``cfg`` to CONFIG_PATH as block-style YAML.

    The document is serialized to a string before the file is opened, so a
    serialization error is raised while the existing config is still intact
    (opening with ``"w"`` truncates the file immediately).

    Args:
        cfg: the configuration object to persist.

    Raises:
        yaml.YAMLError: if ``cfg`` cannot be serialized.
        OSError: if CONFIG_PATH cannot be written.
    """
    rendered = yaml.dump(cfg, default_flow_style=False)
    with open(CONFIG_PATH, "w") as f:
        f.write(rendered)
def load_state():
    """Load the persisted fallback state, or return a fresh default state.

    Returns:
        A dict containing at least ``active_fallbacks``, ``last_check`` and
        ``recovery_pending``. Keys missing from the stored file are filled
        from the defaults. A file that does not contain a JSON object (for
        example one truncated by an interrupted write) yields the defaults
        instead of raising, so a damaged state file cannot stop the run.

    Raises:
        OSError: if the state file exists but cannot be read.
    """
    default_state = {"active_fallbacks": [], "last_check": None, "recovery_pending": False}
    if not FALLBACK_STATE.exists():
        return default_state
    try:
        with open(FALLBACK_STATE) as f:
            loaded = json.load(f)
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        print(f"WARNING: ignoring unreadable fallback state: {e}", file=sys.stderr)
        return default_state
    if not isinstance(loaded, dict):
        print("WARNING: ignoring fallback state that is not a JSON object", file=sys.stderr)
        return default_state
    # Stored values win; defaults only fill in keys the file lacks.
    return {**default_state, **loaded}
def save_state(state):
    """Stamp ``state`` with the current time and persist it to FALLBACK_STATE.

    ``state["last_check"]`` is overwritten in place with the current local
    time in ISO format. The JSON text is produced before the file is opened,
    so a value that cannot be serialized raises while the previous state
    file is still intact.

    Args:
        state: the mutable state dict to persist.

    Raises:
        TypeError: if ``state`` holds a value JSON cannot encode.
        OSError: if FALLBACK_STATE cannot be written.
    """
    state["last_check"] = datetime.now().isoformat()
    payload = json.dumps(state, indent=2)
    with open(FALLBACK_STATE, "w") as f:
        f.write(payload)
def run(cmd, timeout=PROVIDER_TIMEOUT):
    """Run a command and capture its result without ever raising.

    Args:
        cmd: a shell command line (``str``), executed through the shell, or
            a sequence of arguments, executed directly without a shell.
        timeout: seconds to wait before giving up.

    Returns:
        A ``(returncode, stdout, stderr)`` tuple with both streams stripped.
        A timeout yields ``(-1, "", "timeout")``; any other failure to run
        the command yields ``(-1, "", str(error))``.
    """
    try:
        r = subprocess.run(
            cmd,
            # Only plain strings need shell parsing; an argv list runs as-is.
            shell=isinstance(cmd, str),
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        return r.returncode, r.stdout.strip(), r.stderr.strip()
    except subprocess.TimeoutExpired:
        return -1, "", "timeout"
    except Exception as e:
        return -1, "", str(e)
# ─── HEALTH CHECKS ───
def log_fallback_event(agent_name, provider, model, status, error_message=None):
    """Record a fallback event in the request_log SQLite DB (telemetry).

    One row is inserted into the ``request_log`` table of
    ``~/.local/timmy/request_log.db`` with the endpoint fixed to
    ``'fallback_switch'`` and the timestamp set by SQLite. Nothing is written
    when the database file does not exist; this function does not create it.

    Telemetry is best-effort: any failure is swallowed so that logging can
    never interrupt a fallback in progress.

    Args:
        agent_name: name of the agent the event belongs to.
        provider: provider the event refers to.
        model: model identifier the event refers to.
        status: outcome label stored in the ``status`` column.
        error_message: optional error text stored with the row.
    """
    try:
        log_path = Path.home() / ".local" / "timmy" / "request_log.db"
        if not log_path.exists():
            return
        conn = sqlite3.connect(str(log_path))
        try:
            conn.execute("""
                INSERT INTO request_log (timestamp, agent_name, provider, model, endpoint, status, error_message)
                VALUES (datetime('now'), ?, ?, ?, ?, ?, ?)
            """, (agent_name, provider, model, 'fallback_switch', status, error_message))
            conn.commit()
        finally:
            # Close even when the insert fails, so the connection is not leaked.
            conn.close()
    except Exception:
        pass  # Silent if telemetry unavailable
def check_kimi():
    """Probe the Kimi Coding API with a one-token request.

    The API key comes from the ``KIMI_API_KEY`` environment variable or,
    failing that, from the first ``KIMI_API_KEY=`` line found in
    ``HERMES_HOME/.env`` or ``~/.hermes/.env``.

    Returns:
        ``(ok, message)``. ``ok`` is True when curl exits 0 and the HTTP
        status is 200 or 429; ``(False, "no API key")`` when no key is found.
    """
    import shlex  # local import: not among this file's top-level imports

    key = os.environ.get("KIMI_API_KEY", "")
    if not key:
        # Check multiple .env locations
        for env_path in [HERMES_HOME / ".env", Path.home() / ".hermes" / ".env"]:
            if not env_path.exists():
                continue
            with open(env_path) as env_file:
                for line in env_file:
                    line = line.strip()
                    if line.startswith("KIMI_API_KEY="):
                        key = line.split("=", 1)[1].strip().strip('"').strip("'")
                        break
            if key:
                break
    if not key:
        return False, "no API key"
    # Quote the header so characters in the key cannot be reinterpreted by
    # the shell that run() hands the command line to.
    api_key_header = shlex.quote(f"x-api-key: {key}")
    code, out, err = run(
        f'curl -s -o /dev/null -w "%{{http_code}}" -H {api_key_header} '
        f'-H "x-api-provider: kimi-coding" '
        f'https://api.kimi.com/coding/v1/models -X POST '
        f'-H "content-type: application/json" '
        f'-d \'{{"model":"kimi-k2.5","max_tokens":1,"messages":[{{"role":"user","content":"ping"}}]}}\' ',
        timeout=15
    )
    if code == 0 and out in ("200", "429"):
        return True, f"HTTP {out}"
    return False, f"HTTP {out} err={err[:80]}"
def check_openrouter():
    """Check OpenRouter API availability and credentials.

    The API key comes from the ``OPENROUTER_API_KEY`` environment variable
    or, failing that, from the first ``OPENROUTER_API_KEY=`` line in
    ``HERMES_HOME/.env``. The probe is a GET of the models listing, bounded
    by PROVIDER_TIMEOUT.

    Returns:
        ``(ok, message)``. On success the message reports how many models
        the listing contains; on failure it names the missing key, the HTTP
        status, or the first 100 characters of the error.
    """
    key = os.environ.get("OPENROUTER_API_KEY", "")
    if not key:
        env_file = HERMES_HOME / ".env"
        if env_file.exists():
            with open(env_file) as fh:
                for line in fh:
                    line = line.strip()
                    if line.startswith("OPENROUTER_API_KEY="):
                        key = line.split("=", 1)[1].strip().strip('"\'')
                        break
    if not key:
        return False, "No OPENROUTER_API_KEY"
    try:
        req = urllib.request.Request(
            "https://openrouter.ai/api/v1/models",
            headers={"Authorization": "Bearer " + key}
        )
        # The context manager closes the HTTP response on every path.
        with urllib.request.urlopen(req, timeout=PROVIDER_TIMEOUT) as resp:
            if resp.status != 200:
                return False, f"HTTP {resp.status}"
            data = json.loads(resp.read())
        models = data.get("data", [])
        return True, f"{len(models)} models available"
    except urllib.error.HTTPError as e:
        if e.code == 401:
            return False, "Invalid OPENROUTER_API_KEY"
        return False, f"HTTP {e.code}"
    except Exception as e:
        return False, str(e)[:100]
def check_ollama():
    """Report whether the local Ollama endpoint on port 11434 responds.

    Returns:
        ``(True, "running")`` when curl exits 0 and the reply contains
        ``models``; otherwise ``(False, "exit=<code>")``.
    """
    exit_code, body, _ = run("curl -s http://localhost:11434/api/tags", timeout=5)
    responding = exit_code == 0 and "models" in body
    if not responding:
        return False, f"exit={exit_code}"
    return True, "running"
def check_gitea():
    """Verify that the Forge API accepts the stored Gitea token.

    The token is read from ``~/.config/gitea/timmy-token``.

    Returns:
        ``(True, "reachable")`` when curl exits 0 with HTTP 200,
        ``(False, "no token")`` when the token file is absent, and
        ``(False, "HTTP <status>")`` otherwise.
    """
    token_file = Path.home() / ".config" / "gitea" / "timmy-token"
    if not token_file.exists():
        return False, "no token"
    token = token_file.read_text().strip()
    command = (
        f'curl -s -o /dev/null -w "%{{http_code}}" -H "Authorization: token {token}" '
        f'"{FORGE_URL}/api/v1/user"'
    )
    exit_code, http_status, _ = run(command, timeout=10)
    if exit_code != 0 or http_status != "200":
        return False, f"HTTP {http_status}"
    return True, "reachable"
def check_vps(ip, name):
    """Probe a VPS by running ``echo alive`` over SSH as root.

    Args:
        ip: address of the host to connect to.
        name: label for the host; not used by the probe itself.

    Returns:
        ``(True, "alive")`` when ssh exits 0 and prints ``alive``,
        otherwise ``(False, "unreachable")``.
    """
    probe = f"ssh -o ConnectTimeout=5 root@{ip} 'echo alive'"
    exit_code, output, _ = run(probe, timeout=10)
    reachable = exit_code == 0 and "alive" in output
    return (True, "alive") if reachable else (False, "unreachable")
# ─── FALLBACK ACTIONS ───
def fallback_to_openrouter(cfg):
    """Switch the primary model from Kimi to OpenRouter (Gemini 2.5 Pro).

    The current config file is copied to BACKUP_CONFIG first, unless a
    backup already exists. The new model settings are built on a copy of
    ``cfg`` and only applied to ``cfg`` itself after the file has been
    written, so a failed save leaves the caller's dict untouched for the
    next fallback in the chain.

    Args:
        cfg: the loaded configuration dict. A missing ``model`` section is
            created. The base URL is taken from
            ``providers.openrouter.base_url`` when present.

    Returns:
        A human-readable description of the switch.

    Raises:
        OSError: if the backup copy or the config write fails.
    """
    import copy  # local import: not among this file's top-level imports

    if not BACKUP_CONFIG.exists():
        shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)
    openrouter_cfg = (cfg.get("providers") or {}).get("openrouter") or {}
    base_url = openrouter_cfg.get("base_url", "https://openrouter.ai/api/v1")

    updated = copy.deepcopy(cfg)
    model = updated.get("model")
    if not isinstance(model, dict):
        model = {}
        updated["model"] = model
    model["provider"] = "openrouter"
    model["default"] = "google/gemini-2.5-pro"
    model["base_url"] = base_url
    save_config(updated)

    # Commit to the caller's dict only once the file is safely on disk.
    cfg["model"] = updated["model"]
    return "Switched primary model to openrouter/google/gemini-2.5-pro"
def fallback_to_ollama(cfg):
    """Switch the primary model to Ollama (gemma4:latest).

    The current config file is copied to BACKUP_CONFIG first, unless a
    backup already exists. The new model settings are built on a copy of
    ``cfg`` and only applied to ``cfg`` itself after the file has been
    written, so a failed save leaves the caller's dict untouched.

    Args:
        cfg: the loaded configuration dict. A missing ``model`` section is
            created.

    Returns:
        A human-readable description of the switch.

    Raises:
        OSError: if the backup copy or the config write fails.
    """
    import copy  # local import: not among this file's top-level imports

    if not BACKUP_CONFIG.exists():
        shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)

    updated = copy.deepcopy(cfg)
    model = updated.get("model")
    if not isinstance(model, dict):
        model = {}
        updated["model"] = model
    model["provider"] = "ollama"
    model["default"] = "gemma4:latest"
    # NOTE(review): any existing model.base_url is left as-is here — confirm
    # the Ollama provider does not read it.
    save_config(updated)

    # Commit to the caller's dict only once the file is safely on disk.
    cfg["model"] = updated["model"]
    return "Switched primary model to ollama/gemma4:latest"
def enter_safe_mode(state):
    """Flag safe mode in ``state`` and persist it.

    Sets ``safe_mode`` to True and ``safe_mode_entered`` to the current
    local time in ISO format, then saves the state. This function only
    records the flag and returns the alert text.

    Args:
        state: the mutable fallback state dict; modified in place.

    Returns:
        The safe-mode alert message.
    """
    entered_at = datetime.now().isoformat()
    state.update({"safe_mode": True, "safe_mode_entered": entered_at})
    save_state(state)
    return "SAFE MODE: All inference down. Cron jobs should be paused. Alert Alexander."
def restore_config():
    """Put the pre-fallback config back in place, if a backup exists.

    When BACKUP_CONFIG is present it is copied over CONFIG_PATH and then
    deleted.

    Returns:
        A message saying whether a backup was restored or none was found.
    """
    if not BACKUP_CONFIG.exists():
        return "No backup config to restore"
    shutil.copy2(BACKUP_CONFIG, CONFIG_PATH)
    BACKUP_CONFIG.unlink()
    return "Restored original config from backup"
# ─── MAIN DIAGNOSIS AND FALLBACK ENGINE ───
def _log_stderr(message):
    """Write a diagnostic line to stderr, keeping stdout free for the JSON report."""
    print(f"[deadman-fallback] {message}", file=sys.stderr)


def diagnose_and_fallback():
    """Run every health check, apply or undo fallbacks, and report the outcome.

    Steps:
      1. Probe Kimi, OpenRouter, Ollama, Gitea and each VPS.
      2. If Kimi is down while the config still points at ``kimi-coding``,
         try the OpenRouter fallback, then the Ollama fallback, and enter
         safe mode when neither could be applied. A failing fallback is
         logged and the next one is tried; errors do not propagate.
      3. If Kimi is up and a Kimi fallback is recorded as active, restore
         the backed-up config.
      4. Track Gitea reachability in the state and add alerts for
         unreachable VPS hosts.
      5. Persist the state.

    Returns:
        A dict with ``timestamp``, ``checks`` (per-system ``ok``/``msg``),
        ``actions`` (human-readable messages) and ``status``, one of
        ``healthy``, ``recovered``, ``degraded_gitea``,
        ``degraded_openrouter``, ``degraded_ollama`` or ``safe_mode``.
    """
    state = load_state()
    active = state.get("active_fallbacks")
    if not isinstance(active, list):
        # Tolerate a state file that lacks the list or stores null for it.
        active = []
        state["active_fallbacks"] = active
    cfg = load_config() or {}
    results = {
        "timestamp": datetime.now().isoformat(),
        "checks": {},
        "actions": [],
        "status": "healthy"
    }
    checks = results["checks"]
    actions = results["actions"]

    # Check all systems
    kimi_ok, kimi_msg = check_kimi()
    checks["kimi-coding"] = {"ok": kimi_ok, "msg": kimi_msg}
    openrouter_ok, openrouter_msg = check_openrouter()
    checks["openrouter"] = {"ok": openrouter_ok, "msg": openrouter_msg}
    ollama_ok, ollama_msg = check_ollama()
    checks["ollama"] = {"ok": ollama_ok, "msg": ollama_msg}
    gitea_ok, gitea_msg = check_gitea()
    checks["gitea"] = {"ok": gitea_ok, "msg": gitea_msg}

    # VPS checks
    vpses = [
        ("167.99.126.228", "Allegro"),
        ("143.198.27.163", "Ezra"),
        ("159.203.146.185", "Bezalel"),
    ]
    for ip, name in vpses:
        vps_ok, vps_msg = check_vps(ip, name)
        checks[f"vps_{name.lower()}"] = {"ok": vps_ok, "msg": vps_msg}

    current_provider = (cfg.get("model") or {}).get("provider", "kimi-coding")
    agent_name = (cfg.get("agent") or {}).get("name", "timmy")

    # ─── FALLBACK LOGIC ───
    # Case 1: Primary (Kimi) down, try fallback chain (OpenRouter -> Ollama)
    # NOTE(review): nothing here handles the active fallback provider itself
    # going down once the config no longer points at kimi-coding — confirm
    # whether that is intended.
    if not kimi_ok and current_provider == "kimi-coding":
        # (probe ok, action, state label, status, provider, model, display name)
        chain = [
            (openrouter_ok, fallback_to_openrouter, "kimi->openrouter",
             "degraded_openrouter", "openrouter", "google/gemini-2.5-pro", "OpenRouter"),
            (ollama_ok, fallback_to_ollama, "kimi->ollama",
             "degraded_ollama", "ollama", "gemma4:latest", "Ollama"),
        ]
        applied = False
        for available, apply_fallback, label, status, provider, model, display in chain:
            if not available:
                continue
            try:
                msg = apply_fallback(cfg)
            except Exception as e:
                # Record the failure and continue with the next fallback.
                _log_stderr(f"{display} fallback failed: {e}")
                log_fallback_event(agent_name, provider, model, "error", str(e))
                continue
            actions.append(msg)
            if label not in active:
                active.append(label)
            results["status"] = status
            log_fallback_event(agent_name, provider, model, "success")
            applied = True
            break
        if not applied:
            try:
                msg = enter_safe_mode(state)
                actions.append(msg)
                results["status"] = "safe_mode"
            except Exception as e:
                _log_stderr(f"Safe mode failed: {e}")
    # Case 2: Already on fallback, check if primary recovered — restore with resilience
    # NOTE(review): state["safe_mode"] is not cleared on recovery in this
    # function — confirm whether another component resets it.
    elif kimi_ok:
        recoveries = [
            ("kimi->openrouter", "openrouter", "google/gemini-2.5-pro", "OpenRouter"),
            ("kimi->ollama", "ollama", "gemma4:latest", "Ollama"),
        ]
        pending = [entry for entry in recoveries if entry[0] in active]
        restored = False
        for label, provider, model, display in pending:
            try:
                msg = restore_config()
            except Exception as e:
                _log_stderr(f"Restore from {display} failed: {e}")
                log_fallback_event(agent_name, provider, model, "error", str(e))
                continue
            actions.append(msg)
            active.remove(label)
            results["status"] = "recovered"
            log_fallback_event(agent_name, provider, model, "restored")
            restored = True
            break
        # Warn only when there was something to restore; a healthy run with
        # no active fallback is the normal case and must stay silent.
        if pending and not restored:
            _log_stderr("WARNING: Primary recovered but unable to restore config")

    # Case 3: Gitea down — just flag it, work locally
    if not gitea_ok:
        actions.append("WARN: Gitea unreachable — work cached locally until recovery")
        if "gitea_down" not in active:
            active.append("gitea_down")
        # Keep whichever status is more severe; unknown statuses rank lowest.
        severity = ["healthy", "recovered", "degraded_gitea",
                    "degraded_openrouter", "degraded_ollama", "safe_mode"]

        def rank(status):
            return severity.index(status) if status in severity else 0

        results["status"] = max(results["status"], "degraded_gitea", key=rank)
    elif "gitea_down" in active:
        active.remove("gitea_down")
        actions.append("Gitea recovered — resume normal operations")

    # Case 4: VPS agents down
    for ip, name in vpses:
        key = f"vps_{name.lower()}"
        if not checks[key]["ok"]:
            actions.append(f"ALERT: {name} VPS ({ip}) unreachable — lazarus protocol needed")

    save_state(state)
    return results
if __name__ == "__main__":
    results = diagnose_and_fallback()
    print(json.dumps(results, indent=2))
    # Exit codes for cron integration:
    #   2 = safe mode, 1 = any degraded_* status, 0 = everything else.
    final_status = results["status"]
    if final_status == "safe_mode":
        exit_code = 2
    elif final_status.startswith("degraded"):
        exit_code = 1
    else:
        exit_code = 0
    sys.exit(exit_code)