Some checks failed
PR Checklist / pr-checklist (pull_request) Failing after 3m11s
Automatic fallback chain: Anthropic -> local-llama.cpp -> Ollama -> safe mode. Auto-recovery when primary returns. Reversible config changes with backup.
264 lines
11 KiB
Python
264 lines
11 KiB
Python
1|#!/usr/bin/env python3
|
|
2|"""
|
|
3|Dead Man Switch Fallback Engine
|
|
4|
|
|
5|When the dead man switch triggers (zero commits for 2+ hours, model down,
|
|
6|Gitea unreachable, etc.), this script diagnoses the failure and applies
|
|
7|common sense fallbacks automatically.
|
|
8|
|
|
9|Fallback chain:
|
|
10|1. Primary model (Anthropic) down -> switch config to local-llama.cpp
|
|
11|2. Gitea unreachable -> cache issues locally, retry on recovery
|
|
12|3. VPS agents down -> alert + lazarus protocol
|
|
13|4. Local llama.cpp down -> try Ollama, then alert-only mode
|
|
14|5. All inference dead -> safe mode (cron pauses, alert Alexander)
|
|
15|
|
|
16|Each fallback is reversible. Recovery auto-restores the previous config.
|
|
17|"""
|
|
18|import os
|
|
19|import sys
|
|
20|import json
|
|
21|import subprocess
|
|
22|import time
|
|
23|import yaml
|
|
24|import shutil
|
|
25|from pathlib import Path
|
|
26|from datetime import datetime, timedelta
|
|
27|
|
|
# Hermes install root; overridable via $HERMES_HOME (defaults to ~/.hermes).
HERMES_HOME = Path(os.environ.get("HERMES_HOME", os.path.expanduser("~/.hermes")))
# Live runtime config that the fallback engine edits in place.
CONFIG_PATH = HERMES_HOME / "config.yaml"
# Persisted JSON record of which fallbacks are currently active.
FALLBACK_STATE = HERMES_HOME / "deadman-fallback-state.json"
# One-shot snapshot of config.yaml taken before the first fallback edit;
# recovery restores from (and then deletes) this file.
BACKUP_CONFIG = HERMES_HOME / "config.yaml.pre-fallback"
# Gitea ("the Forge") base URL probed by check_gitea().
FORGE_URL = "https://forge.alexanderwhitestone.com"
|
|
33|
|
|
def load_config():
    """Parse and return the Hermes YAML config as a dict."""
    with CONFIG_PATH.open() as fh:
        return yaml.safe_load(fh)
|
|
37|
|
|
def save_config(cfg):
    """Write *cfg* back to config.yaml in block (non-flow) YAML style."""
    with CONFIG_PATH.open("w") as fh:
        yaml.dump(cfg, fh, default_flow_style=False)
|
|
41|
|
|
def load_state():
    """Load persisted fallback state, or a fresh default if absent or corrupt.

    Returns a dict with at least "active_fallbacks", "last_check" and
    "recovery_pending".  A truncated/garbled state file previously crashed
    the whole engine with JSONDecodeError; since this script IS the
    last-resort recovery path, corruption is now treated like a missing file.
    """
    default = {"active_fallbacks": [], "last_check": None, "recovery_pending": False}
    if not FALLBACK_STATE.exists():
        return default
    try:
        with open(FALLBACK_STATE) as f:
            return json.load(f)
    except (json.JSONDecodeError, OSError):
        # Corrupt or unreadable state: start clean rather than die here.
        return default
|
|
47|
|
|
def save_state(state):
    """Stamp *state* with the current time and persist it as pretty JSON."""
    state["last_check"] = datetime.now().isoformat()
    with open(FALLBACK_STATE, "w") as fh:
        fh.write(json.dumps(state, indent=2))
|
|
52|
|
|
def run(cmd, timeout=10):
    """Execute *cmd* through the shell and return (returncode, stdout, stderr).

    stdout/stderr are whitespace-stripped.  A timeout or any launch failure
    is reported as returncode -1 with the error text in the stderr slot, so
    callers never have to catch exceptions themselves.
    """
    try:
        proc = subprocess.run(
            cmd, shell=True, capture_output=True, text=True, timeout=timeout
        )
    except subprocess.TimeoutExpired:
        return -1, "", "timeout"
    except Exception as exc:
        return -1, "", str(exc)
    return proc.returncode, proc.stdout.strip(), proc.stderr.strip()
|
|
61|
|
|
62|# ─── HEALTH CHECKS ───
|
|
63|
|
|
def check_anthropic():
    """Can we reach the Anthropic API?

    Sends a minimal 1-token ping via curl.  Returns (ok, detail); HTTP 429
    counts as healthy because rate limiting means the service itself is up.
    """
    key = os.environ.get("ANTHROPIC_API_KEY", "")
    if not key:
        # Check multiple .env locations.  NOTE: the two paths coincide when
        # $HERMES_HOME is unset; they differ only for non-default installs.
        for env_path in [HERMES_HOME / ".env", Path.home() / ".hermes" / ".env"]:
            if env_path.exists():
                # Context manager closes the .env handle promptly (the
                # original leaked the file object from a bare open()).
                with open(env_path) as fh:
                    for line in fh:
                        line = line.strip()
                        # Reconstructed from a corrupted source line: match
                        # the ANTHROPIC_API_KEY=... assignment, strip quotes.
                        if line.startswith("ANTHROPIC_API_KEY="):
                            key = line.split("=", 1)[1].strip().strip('"').strip("'")
                            break
                if key:
                    break
    if not key:
        return False, "no API key"
    # SECURITY NOTE: the key is interpolated into a shell command line, so it
    # is briefly visible in the process list.  Acceptable on a single-user
    # box; switch to urllib.request if that assumption changes.
    code, out, err = run(
        f'curl -s -o /dev/null -w "%{{http_code}}" -H "x-api-key: {key}" '
        f'-H "anthropic-version: 2023-06-01" '
        f'https://api.anthropic.com/v1/messages -X POST '
        f'-H "content-type: application/json" '
        f'-d \'{{"model":"claude-haiku-4-5-20251001","max_tokens":1,"messages":[{{"role":"user","content":"ping"}}]}}\' ',
        timeout=15
    )
    if code == 0 and out in ("200", "429"):
        return True, f"HTTP {out}"
    return False, f"HTTP {out} err={err[:80]}"
|
|
91|
|
|
def check_local_llama():
    """Is the local llama.cpp server answering on :8081?"""
    code, body, _ = run("curl -s http://localhost:8081/v1/models", timeout=5)
    # A healthy server lists a hermes model in its /v1/models payload.
    if code == 0 and "hermes" in body.lower():
        return True, "serving"
    return False, f"exit={code}"
|
|
98|
|
|
def check_ollama():
    """Is the Ollama daemon answering on :11434?"""
    code, body, _ = run("curl -s http://localhost:11434/api/tags", timeout=5)
    # /api/tags returns a JSON object with a "models" key when healthy.
    if code != 0 or "models" not in body:
        return False, f"exit={code}"
    return True, "running"
|
|
105|
|
|
def check_gitea():
    """Can we reach the Forge (Gitea) API with the stored token?

    Returns (ok, detail); ok requires an authenticated HTTP 200 from
    /api/v1/user.
    """
    token_path = Path.home() / ".config" / "gitea" / "timmy-token"
    if not token_path.exists():
        return False, "no token"
    token = token_path.read_text().strip()
    status, http_code, _ = run(
        f'curl -s -o /dev/null -w "%{{http_code}}" -H "Authorization: token {token}" '
        f'"{FORGE_URL}/api/v1/user"',
        timeout=10
    )
    if status == 0 and http_code == "200":
        return True, "reachable"
    return False, f"HTTP {http_code}"
|
|
120|
|
|
def check_vps(ip, name):
    """Can we SSH into a VPS as root?

    `name` is accepted for signature symmetry with the caller's (ip, name)
    tuples; only `ip` is used here.  Returns (ok, detail).
    """
    code, out, err = run(f"ssh -o ConnectTimeout=5 root@{ip} 'echo alive'", timeout=10)
    if code == 0 and "alive" in out:
        return True, "alive"
    # Fixed: was f"unreachable" — an f-string with no placeholders.
    return False, "unreachable"
|
|
127|
|
|
128|# ─── FALLBACK ACTIONS ───
|
|
129|
|
|
def fallback_to_local_model(cfg):
    """Switch the primary model from Anthropic to local llama.cpp.

    Snapshots config.yaml once (the first fallback wins) so recovery can
    restore the exact pre-incident config.  Returns an action message.
    """
    if not BACKUP_CONFIG.exists():
        shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)

    # setdefault guards against a config.yaml with no "model" section, which
    # previously raised KeyError mid-fallback (the read side already uses
    # cfg.get("model", {})).
    model = cfg.setdefault("model", {})
    model["provider"] = "local-llama.cpp"
    model["default"] = "hermes3"
    save_config(cfg)
    return "Switched primary model to local-llama.cpp/hermes3"
|
|
139|
|
|
def fallback_to_ollama(cfg):
    """Switch the primary model to Ollama when llama.cpp is also down.

    Snapshots config.yaml once (the first fallback wins) so recovery can
    restore the exact pre-incident config.  Returns an action message.
    """
    if not BACKUP_CONFIG.exists():
        shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)

    # setdefault guards against a config.yaml with no "model" section,
    # matching fallback_to_local_model.
    model = cfg.setdefault("model", {})
    model["provider"] = "ollama"
    model["default"] = "gemma4:latest"
    save_config(cfg)
    return "Switched primary model to ollama/gemma4:latest"
|
|
149|
|
|
def enter_safe_mode(state):
    """Record safe mode in *state* and persist it immediately.

    Safe mode means every inference backend is down: cron work should pause
    and Alexander must be alerted.  Returns the alert message.
    """
    state["safe_mode"] = True
    state["safe_mode_entered"] = datetime.now().isoformat()
    # Persist right away — if the process dies next, the flag survives.
    save_state(state)
    return "SAFE MODE: All inference down. Cron jobs should be paused. Alert Alexander."
|
|
156|
|
|
def restore_config():
    """Undo a fallback by restoring the snapshotted config, if one exists.

    The backup is consumed (deleted) on restore so the next incident takes
    a fresh snapshot.  Returns an action message either way.
    """
    if not BACKUP_CONFIG.exists():
        return "No backup config to restore"
    shutil.copy2(BACKUP_CONFIG, CONFIG_PATH)
    BACKUP_CONFIG.unlink()
    return "Restored original config from backup"
|
|
164|
|
|
165|# ─── MAIN DIAGNOSIS AND FALLBACK ENGINE ───
|
|
166|
|
|
# Status values ordered from best to worst; used to escalate results["status"]
# without the duplicated inline ranking list the original carried.
_STATUS_ORDER = [
    "healthy", "recovered", "degraded_gitea",
    "degraded_local", "degraded_ollama", "safe_mode",
]


def _worse(a, b):
    """Return the more severe of two status strings (ties keep *a*; unknown
    statuses rank as healthy, matching the original's `else 0` fallback)."""
    def rank(s):
        return _STATUS_ORDER.index(s) if s in _STATUS_ORDER else 0
    return a if rank(a) >= rank(b) else b


def diagnose_and_fallback():
    """Run all health checks, apply/undo fallbacks, and return a report.

    Returns a dict: {"timestamp", "checks": {name: {"ok", "msg"}},
    "actions": [messages], "status": one of _STATUS_ORDER}.

    Side effects: may rewrite config.yaml (fallback or restore) and always
    persists the fallback state file before returning.
    """
    state = load_state()
    cfg = load_config()

    results = {
        "timestamp": datetime.now().isoformat(),
        "checks": {},
        "actions": [],
        "status": "healthy"
    }

    # ─── Health checks ───
    anthropic_ok, anthropic_msg = check_anthropic()
    results["checks"]["anthropic"] = {"ok": anthropic_ok, "msg": anthropic_msg}

    llama_ok, llama_msg = check_local_llama()
    results["checks"]["local_llama"] = {"ok": llama_ok, "msg": llama_msg}

    ollama_ok, ollama_msg = check_ollama()
    results["checks"]["ollama"] = {"ok": ollama_ok, "msg": ollama_msg}

    gitea_ok, gitea_msg = check_gitea()
    results["checks"]["gitea"] = {"ok": gitea_ok, "msg": gitea_msg}

    vpses = [
        ("167.99.126.228", "Allegro"),
        ("143.198.27.163", "Ezra"),
        ("159.203.146.185", "Bezalel"),
    ]
    for ip, name in vpses:
        vps_ok, vps_msg = check_vps(ip, name)
        results["checks"][f"vps_{name.lower()}"] = {"ok": vps_ok, "msg": vps_msg}

    current_provider = cfg.get("model", {}).get("provider", "anthropic")
    # setdefault keeps append/remove below safe even if the state file
    # predates the "active_fallbacks" key.
    active = state.setdefault("active_fallbacks", [])

    # Case 1: primary (Anthropic) down while still configured -> fall back
    # down the chain: local llama.cpp, then Ollama, then safe mode.
    if not anthropic_ok and current_provider == "anthropic":
        if llama_ok:
            results["actions"].append(fallback_to_local_model(cfg))
            active.append("anthropic->local-llama")
            results["status"] = "degraded_local"
        elif ollama_ok:
            results["actions"].append(fallback_to_ollama(cfg))
            active.append("anthropic->ollama")
            results["status"] = "degraded_ollama"
        else:
            results["actions"].append(enter_safe_mode(state))
            results["status"] = "safe_mode"

    # Case 2: already on a fallback and the primary recovered -> restore.
    elif anthropic_ok and "anthropic->local-llama" in active:
        results["actions"].append(restore_config())
        active.remove("anthropic->local-llama")
        results["status"] = "recovered"
    elif anthropic_ok and "anthropic->ollama" in active:
        results["actions"].append(restore_config())
        active.remove("anthropic->ollama")
        results["status"] = "recovered"

    # Fix: leave safe mode once the primary returns.  Previously the
    # safe_mode flag persisted forever, because the all-inference-dead
    # branch records no active_fallbacks entry for Case 2 to undo.
    if state.get("safe_mode") and anthropic_ok:
        state["safe_mode"] = False
        results["actions"].append("Primary recovered — exiting safe mode")
        results["status"] = _worse(results["status"], "recovered")

    # Case 3: Gitea down — just flag it, keep working locally.
    if not gitea_ok:
        results["actions"].append("WARN: Gitea unreachable — work cached locally until recovery")
        if "gitea_down" not in active:
            active.append("gitea_down")
        results["status"] = _worse(results["status"], "degraded_gitea")
    elif "gitea_down" in active:
        active.remove("gitea_down")
        results["actions"].append("Gitea recovered — resume normal operations")

    # Case 4: VPS agents down -> alert only; lazarus protocol is manual.
    for ip, name in vpses:
        key = f"vps_{name.lower()}"
        if not results["checks"][key]["ok"]:
            results["actions"].append(f"ALERT: {name} VPS ({ip}) unreachable — lazarus protocol needed")

    save_state(state)
    return results
|
|
252|
|
|
if __name__ == "__main__":
    report = diagnose_and_fallback()
    print(json.dumps(report, indent=2))

    # Exit codes for cron integration:
    #   0 = healthy/recovered, 1 = degraded (any flavor), 2 = safe mode.
    status = report["status"]
    if status == "safe_mode":
        exit_code = 2
    elif status.startswith("degraded"):
        exit_code = 1
    else:
        exit_code = 0
    sys.exit(exit_code)
|
|
264| |