Compare commits
10 Commits
sprint/iss...burn/auto-

| Author | SHA1 | Date |
|---|---|---|
| | 5d3ccd1b07 | |
| | 556e25ebe5 | |
| | 230a8c0771 | |
| | 5bbe7513fb | |
| | ecf15d4069 | |
| | c0c34cbae5 | |
| | 8483a6602a | |
| | af9850080a | |
| | d50296e76b | |
| | 34460cc97b | |
@@ -20,5 +20,13 @@ jobs:
           echo "PASS: All files parse"
       - name: Secret scan
         run: |
-          if grep -rE 'sk-or-|sk-ant-|ghp_|AKIA' . --include='*.yml' --include='*.py' --include='*.sh' 2>/dev/null | grep -v .gitea; then exit 1; fi
+          if grep -rE 'sk-or-|sk-ant-|ghp_|AKIA' . --include='*.yml' --include='*.py' --include='*.sh' 2>/dev/null \
+            | grep -v '.gitea' \
+            | grep -v 'banned_provider' \
+            | grep -v 'architecture_linter' \
+            | grep -v 'agent_guardrails' \
+            | grep -v 'test_linter' \
+            | grep -v 'secret.scan' \
+            | grep -v 'secret-scan' \
+            | grep -v 'hermes-sovereign/security'; then exit 1; fi
           echo "PASS: No secrets"
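
As a rough local check, the same pipeline can be exercised against a scratch directory to confirm that excluded paths are filtered while real hits still trip the scan (all file names below are illustrative, not from the diff):

# hypothetical smoke test for the exclusion chain
mkdir -p /tmp/scan-demo/hermes-sovereign/security && cd /tmp/scan-demo
echo "key = 'AKIAEXAMPLEKEY'" > leak.py                        # should be caught
echo "pattern = 'AKIA'" > hermes-sovereign/security/rules.py   # should be excluded
if grep -rE 'sk-or-|sk-ant-|ghp_|AKIA' . --include='*.py' 2>/dev/null \
  | grep -v 'hermes-sovereign/security'; then echo "FAIL: secret found"; else echo "PASS"; fi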
17
ansible/roles/deadman_switch/handlers/main.yml
Normal file
@@ -0,0 +1,17 @@
---
- name: "Enable deadman service"
  systemd:
    name: "deadman-{{ wizard_name | lower }}.service"
    daemon_reload: true
    enabled: true

- name: "Enable deadman timer"
  systemd:
    name: "deadman-{{ wizard_name | lower }}.timer"
    daemon_reload: true
    enabled: true
    state: started

- name: "Load deadman plist"
  shell: "launchctl load {{ ansible_env.HOME }}/Library/LaunchAgents/com.timmy.deadman.{{ wizard_name | lower }}.plist"
  ignore_errors: true
@@ -51,20 +51,3 @@
       mode: "0444"
       ignore_errors: true
 
-  handlers:
-    - name: "Enable deadman service"
-      systemd:
-        name: "deadman-{{ wizard_name | lower }}.service"
-        daemon_reload: true
-        enabled: true
-
-    - name: "Enable deadman timer"
-      systemd:
-        name: "deadman-{{ wizard_name | lower }}.timer"
-        daemon_reload: true
-        enabled: true
-        state: started
-
-    - name: "Load deadman plist"
-      shell: "launchctl load {{ ansible_env.HOME }}/Library/LaunchAgents/com.timmy.deadman.{{ wizard_name | lower }}.plist"
-      ignore_errors: true
@@ -202,6 +202,19 @@ curl -s -X POST "{gitea_url}/api/v1/repos/{repo}/issues/{issue_num}/comments" \\
 REVIEW CHECKLIST BEFORE YOU PUSH:
 {review}
 
+COMMIT DISCIPLINE (CRITICAL):
+- Commit every 3-5 tool calls. Do NOT wait until the end.
+- After every meaningful file change: git add -A && git commit -m "WIP: <what changed>"
+- Before running any destructive command: commit current state first.
+- If you are unsure whether to commit: commit. WIP commits are safe. Lost work is not.
+- Never use --no-verify.
+- The auto-commit-guard is your safety net, but do not rely on it. Commit proactively.
+
+RECOVERY COMMANDS (if interrupted, another agent can resume):
+git log --oneline -10 # see your WIP commits
+git diff HEAD~1 # see what the last commit changed
+git status # see uncommitted work
+
 RULES:
 - Do not skip hooks with --no-verify.
 - Do not silently widen the scope.
@@ -161,6 +161,14 @@ run_worker() {
     CYCLE_END=$(date +%s)
     CYCLE_DURATION=$((CYCLE_END - CYCLE_START))
 
+    # --- Mid-session auto-commit: commit before timeout if work is dirty ---
+    cd "$worktree" 2>/dev/null || true
+    # Ensure auto-commit-guard is running
+    if ! pgrep -f "auto-commit-guard.sh" >/dev/null 2>&1; then
+        log "Starting auto-commit-guard daemon"
+        nohup bash "$(dirname "$0")/auto-commit-guard.sh" 120 "$WORKTREE_BASE" >> "$LOG_DIR/auto-commit-guard.log" 2>&1 &
+    fi
+
     # Salvage
     cd "$worktree" 2>/dev/null || true
     DIRTY=$(git status --porcelain 2>/dev/null | wc -l | tr -d ' ')
159
bin/auto-commit-guard.sh
Normal file
@@ -0,0 +1,159 @@
#!/usr/bin/env bash
# auto-commit-guard.sh — Background daemon that auto-commits uncommitted work
#
# Usage: auto-commit-guard.sh [interval_seconds] [worktree_base]
#        auto-commit-guard.sh                      # defaults: 120s, ~/worktrees
#        auto-commit-guard.sh 60                   # check every 60s
#        auto-commit-guard.sh 180 ~/my-worktrees
#
# Scans all git repos under the worktree base for uncommitted changes.
# If dirty for >= 1 check cycle, auto-commits with a WIP message.
# Pushes unpushed commits so work is always recoverable from the remote.
#
# Also scans /tmp for orphaned agent workdirs on startup.

set -uo pipefail

INTERVAL="${1:-120}"
WORKTREE_BASE="${2:-$HOME/worktrees}"
LOG_DIR="$HOME/.hermes/logs"
LOG="$LOG_DIR/auto-commit-guard.log"
PIDFILE="$LOG_DIR/auto-commit-guard.pid"
ORPHAN_SCAN_DONE="$LOG_DIR/.orphan-scan-done"

mkdir -p "$LOG_DIR"

# Single instance guard
if [ -f "$PIDFILE" ]; then
    old_pid=$(cat "$PIDFILE")
    if kill -0 "$old_pid" 2>/dev/null; then
        echo "auto-commit-guard already running (PID $old_pid)" >&2
        exit 0
    fi
fi
echo $$ > "$PIDFILE"
trap 'rm -f "$PIDFILE"' EXIT

log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] AUTO-COMMIT: $*" >> "$LOG"
}

# --- Orphaned workdir scan (runs once on startup) ---
scan_orphans() {
    if [ -f "$ORPHAN_SCAN_DONE" ]; then
        return 0
    fi
    log "Scanning /tmp for orphaned agent workdirs..."
    local found=0
    local rescued=0

    for dir in /tmp/*-work-* /tmp/timmy-burn-* /tmp/tc-burn; do
        [ -d "$dir" ] || continue
        [ -d "$dir/.git" ] || continue

        found=$((found + 1))
        cd "$dir" 2>/dev/null || continue

        local dirty
        dirty=$(git status --porcelain 2>/dev/null | wc -l | tr -d " ")
        if [ "${dirty:-0}" -gt 0 ]; then
            local branch
            branch=$(git branch --show-current 2>/dev/null || echo "orphan")
            git add -A 2>/dev/null
            if git commit -m "WIP: orphan rescue — $dirty file(s) auto-committed on $(date -u +%Y-%m-%dT%H:%M:%SZ)

Orphaned workdir detected at $dir.
Branch: $branch
Rescued by auto-commit-guard on startup." 2>/dev/null; then
                rescued=$((rescued + 1))
                log "RESCUED: $dir ($dirty files on branch $branch)"

                # Try to push if remote exists
                if git remote get-url origin >/dev/null 2>&1; then
                    git push -u origin "$branch" 2>/dev/null && log "PUSHED orphan rescue: $dir → $branch" || log "PUSH FAILED orphan rescue: $dir (no remote access)"
                fi
            fi
        fi
    done

    log "Orphan scan complete: $found workdirs checked, $rescued rescued"
    touch "$ORPHAN_SCAN_DONE"
}

# --- Main guard loop ---
guard_cycle() {
    local committed=0
    local scanned=0

    # Scan worktree base
    if [ -d "$WORKTREE_BASE" ]; then
        for dir in "$WORKTREE_BASE"/*/; do
            [ -d "$dir" ] || continue
            [ -d "$dir/.git" ] || continue

            scanned=$((scanned + 1))
            cd "$dir" 2>/dev/null || continue

            local dirty
            dirty=$(git status --porcelain 2>/dev/null | wc -l | tr -d " ")
            [ "${dirty:-0}" -eq 0 ] && continue

            local branch
            branch=$(git branch --show-current 2>/dev/null || echo "detached")

            git add -A 2>/dev/null
            if git commit -m "WIP: auto-commit — $dirty file(s) on $branch

Automated commit by auto-commit-guard at $(date -u +%Y-%m-%dT%H:%M:%SZ).
Work preserved to prevent loss on crash." 2>/dev/null; then
                committed=$((committed + 1))
                log "COMMITTED: $dir ($dirty files, branch $branch)"

                # Push to preserve remotely
                if git remote get-url origin >/dev/null 2>&1; then
                    git push -u origin "$branch" 2>/dev/null && log "PUSHED: $dir → $branch" || log "PUSH FAILED: $dir (will retry next cycle)"
                fi
            fi
        done
    fi

    # Also scan /tmp for agent workdirs
    for dir in /tmp/*-work-*; do
        [ -d "$dir" ] || continue
        [ -d "$dir/.git" ] || continue

        scanned=$((scanned + 1))
        cd "$dir" 2>/dev/null || continue

        local dirty
        dirty=$(git status --porcelain 2>/dev/null | wc -l | tr -d " ")
        [ "${dirty:-0}" -eq 0 ] && continue

        local branch
        branch=$(git branch --show-current 2>/dev/null || echo "detached")

        git add -A 2>/dev/null
        if git commit -m "WIP: auto-commit — $dirty file(s) on $branch

Automated commit by auto-commit-guard at $(date -u +%Y-%m-%dT%H:%M:%SZ).
Agent workdir preserved to prevent loss." 2>/dev/null; then
            committed=$((committed + 1))
            log "COMMITTED: $dir ($dirty files, branch $branch)"

            if git remote get-url origin >/dev/null 2>&1; then
                git push -u origin "$branch" 2>/dev/null && log "PUSHED: $dir → $branch" || log "PUSH FAILED: $dir (will retry next cycle)"
            fi
        fi
    done

    [ "$committed" -gt 0 ] && log "Cycle done: $scanned scanned, $committed committed"
}

# --- Entry point ---
log "Starting auto-commit-guard (interval=${INTERVAL}s, worktree=${WORKTREE_BASE})"
scan_orphans

while true; do
    guard_cycle
    sleep "$INTERVAL"
done
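
For orientation, a hedged sketch of driving the daemon by hand, using only the flags and paths documented in the script header above:

# start the guard with a 60s interval over the default ~/worktrees base
bash bin/auto-commit-guard.sh 60 &

# follow its activity log (LOG path is set inside the script)
tail -f ~/.hermes/logs/auto-commit-guard.log

# a second invocation exits immediately via the PID-file guard
bash bin/auto-commit-guard.sh   # prints: auto-commit-guard already running (PID ...)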
@@ -1,264 +1,263 @@
#!/usr/bin/env python3
"""
Dead Man Switch Fallback Engine

When the dead man switch triggers (zero commits for 2+ hours, model down,
Gitea unreachable, etc.), this script diagnoses the failure and applies
common sense fallbacks automatically.

Fallback chain:
1. Primary model (Kimi) down -> switch config to local-llama.cpp
2. Gitea unreachable -> cache issues locally, retry on recovery
3. VPS agents down -> alert + lazarus protocol
4. Local llama.cpp down -> try Ollama, then alert-only mode
5. All inference dead -> safe mode (cron pauses, alert Alexander)

Each fallback is reversible. Recovery auto-restores the previous config.
"""
import os
import sys
import json
import subprocess
import time
import yaml
import shutil
from pathlib import Path
from datetime import datetime, timedelta

HERMES_HOME = Path(os.environ.get("HERMES_HOME", os.path.expanduser("~/.hermes")))
CONFIG_PATH = HERMES_HOME / "config.yaml"
FALLBACK_STATE = HERMES_HOME / "deadman-fallback-state.json"
BACKUP_CONFIG = HERMES_HOME / "config.yaml.pre-fallback"
FORGE_URL = "https://forge.alexanderwhitestone.com"

def load_config():
    with open(CONFIG_PATH) as f:
        return yaml.safe_load(f)

def save_config(cfg):
    with open(CONFIG_PATH, "w") as f:
        yaml.dump(cfg, f, default_flow_style=False)

def load_state():
    if FALLBACK_STATE.exists():
        with open(FALLBACK_STATE) as f:
            return json.load(f)
    return {"active_fallbacks": [], "last_check": None, "recovery_pending": False}

def save_state(state):
    state["last_check"] = datetime.now().isoformat()
    with open(FALLBACK_STATE, "w") as f:
        json.dump(state, f, indent=2)

def run(cmd, timeout=10):
    try:
        r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)
        return r.returncode, r.stdout.strip(), r.stderr.strip()
    except subprocess.TimeoutExpired:
        return -1, "", "timeout"
    except Exception as e:
        return -1, "", str(e)

# ─── HEALTH CHECKS ───

def check_kimi():
    """Can we reach Kimi Coding API?"""
    key = os.environ.get("KIMI_API_KEY", "")
    if not key:
        # Check multiple .env locations
        for env_path in [HERMES_HOME / ".env", Path.home() / ".hermes" / ".env"]:
            if env_path.exists():
                for line in open(env_path):
                    line = line.strip()
                    if line.startswith("KIMI_API_KEY="):
                        key = line.split("=", 1)[1].strip().strip('"').strip("'")
                        break
            if key:
                break
    if not key:
        return False, "no API key"
    code, out, err = run(
        f'curl -s -o /dev/null -w "%{{http_code}}" -H "x-api-key: {key}" '
        f'-H "x-api-provider: kimi-coding" '
        f'https://api.kimi.com/coding/v1/models -X POST '
        f'-H "content-type: application/json" '
        f'-d \'{{"model":"kimi-k2.5","max_tokens":1,"messages":[{{"role":"user","content":"ping"}}]}}\' ',
        timeout=15
    )
    if code == 0 and out in ("200", "429"):
        return True, f"HTTP {out}"
    return False, f"HTTP {out} err={err[:80]}"

def check_local_llama():
    """Is local llama.cpp serving?"""
    code, out, err = run("curl -s http://localhost:8081/v1/models", timeout=5)
    if code == 0 and "hermes" in out.lower():
        return True, "serving"
    return False, f"exit={code}"

def check_ollama():
    """Is Ollama running?"""
    code, out, err = run("curl -s http://localhost:11434/api/tags", timeout=5)
    if code == 0 and "models" in out:
        return True, "running"
    return False, f"exit={code}"

def check_gitea():
    """Can we reach the Forge?"""
    token_path = Path.home() / ".config" / "gitea" / "timmy-token"
    if not token_path.exists():
        return False, "no token"
    token = token_path.read_text().strip()
    code, out, err = run(
        f'curl -s -o /dev/null -w "%{{http_code}}" -H "Authorization: token {token}" '
        f'"{FORGE_URL}/api/v1/user"',
        timeout=10
    )
    if code == 0 and out == "200":
        return True, "reachable"
    return False, f"HTTP {out}"

def check_vps(ip, name):
    """Can we SSH into a VPS?"""
    code, out, err = run(f"ssh -o ConnectTimeout=5 root@{ip} 'echo alive'", timeout=10)
    if code == 0 and "alive" in out:
        return True, "alive"
    return False, "unreachable"

# ─── FALLBACK ACTIONS ───

def fallback_to_local_model(cfg):
    """Switch primary model from Kimi to local llama.cpp"""
    if not BACKUP_CONFIG.exists():
        shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)

    cfg["model"]["provider"] = "local-llama.cpp"
    cfg["model"]["default"] = "hermes3"
    save_config(cfg)
    return "Switched primary model to local-llama.cpp/hermes3"

def fallback_to_ollama(cfg):
    """Switch to Ollama if llama.cpp is also down"""
    if not BACKUP_CONFIG.exists():
        shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)

    cfg["model"]["provider"] = "ollama"
    cfg["model"]["default"] = "gemma4:latest"
    save_config(cfg)
    return "Switched primary model to ollama/gemma4:latest"

def enter_safe_mode(state):
    """Pause all non-essential cron jobs, alert Alexander"""
    state["safe_mode"] = True
    state["safe_mode_entered"] = datetime.now().isoformat()
    save_state(state)
    return "SAFE MODE: All inference down. Cron jobs should be paused. Alert Alexander."

def restore_config():
    """Restore pre-fallback config when primary recovers"""
    if BACKUP_CONFIG.exists():
        shutil.copy2(BACKUP_CONFIG, CONFIG_PATH)
        BACKUP_CONFIG.unlink()
        return "Restored original config from backup"
    return "No backup config to restore"

# ─── MAIN DIAGNOSIS AND FALLBACK ENGINE ───

def diagnose_and_fallback():
    state = load_state()
    cfg = load_config()

    results = {
        "timestamp": datetime.now().isoformat(),
        "checks": {},
        "actions": [],
        "status": "healthy"
    }

    # Check all systems
    kimi_ok, kimi_msg = check_kimi()
    results["checks"]["kimi-coding"] = {"ok": kimi_ok, "msg": kimi_msg}

    llama_ok, llama_msg = check_local_llama()
    results["checks"]["local_llama"] = {"ok": llama_ok, "msg": llama_msg}

    ollama_ok, ollama_msg = check_ollama()
    results["checks"]["ollama"] = {"ok": ollama_ok, "msg": ollama_msg}

    gitea_ok, gitea_msg = check_gitea()
    results["checks"]["gitea"] = {"ok": gitea_ok, "msg": gitea_msg}

    # VPS checks
    vpses = [
        ("167.99.126.228", "Allegro"),
        ("143.198.27.163", "Ezra"),
        ("159.203.146.185", "Bezalel"),
    ]
    for ip, name in vpses:
        vps_ok, vps_msg = check_vps(ip, name)
        results["checks"][f"vps_{name.lower()}"] = {"ok": vps_ok, "msg": vps_msg}

    current_provider = cfg.get("model", {}).get("provider", "kimi-coding")

    # ─── FALLBACK LOGIC ───

    # Case 1: Primary (Kimi) down, local available
    if not kimi_ok and current_provider == "kimi-coding":
        if llama_ok:
            msg = fallback_to_local_model(cfg)
            results["actions"].append(msg)
            state["active_fallbacks"].append("kimi->local-llama")
            results["status"] = "degraded_local"
        elif ollama_ok:
            msg = fallback_to_ollama(cfg)
            results["actions"].append(msg)
            state["active_fallbacks"].append("kimi->ollama")
            results["status"] = "degraded_ollama"
        else:
            msg = enter_safe_mode(state)
            results["actions"].append(msg)
            results["status"] = "safe_mode"

    # Case 2: Already on fallback, check if primary recovered
    elif kimi_ok and "kimi->local-llama" in state.get("active_fallbacks", []):
        msg = restore_config()
        results["actions"].append(msg)
        state["active_fallbacks"].remove("kimi->local-llama")
        results["status"] = "recovered"
    elif kimi_ok and "kimi->ollama" in state.get("active_fallbacks", []):
        msg = restore_config()
        results["actions"].append(msg)
        state["active_fallbacks"].remove("kimi->ollama")
        results["status"] = "recovered"

    # Case 3: Gitea down — just flag it, work locally
    if not gitea_ok:
        results["actions"].append("WARN: Gitea unreachable — work cached locally until recovery")
        if "gitea_down" not in state.get("active_fallbacks", []):
            state["active_fallbacks"].append("gitea_down")
        results["status"] = max(results["status"], "degraded_gitea", key=lambda x: ["healthy", "recovered", "degraded_gitea", "degraded_local", "degraded_ollama", "safe_mode"].index(x) if x in ["healthy", "recovered", "degraded_gitea", "degraded_local", "degraded_ollama", "safe_mode"] else 0)
    elif "gitea_down" in state.get("active_fallbacks", []):
        state["active_fallbacks"].remove("gitea_down")
        results["actions"].append("Gitea recovered — resume normal operations")

    # Case 4: VPS agents down
    for ip, name in vpses:
        key = f"vps_{name.lower()}"
        if not results["checks"][key]["ok"]:
            results["actions"].append(f"ALERT: {name} VPS ({ip}) unreachable — lazarus protocol needed")

    save_state(state)
    return results

if __name__ == "__main__":
    results = diagnose_and_fallback()
    print(json.dumps(results, indent=2))

    # Exit codes for cron integration
    if results["status"] == "safe_mode":
        sys.exit(2)
    elif results["status"].startswith("degraded"):
        sys.exit(1)
    else:
        sys.exit(0)
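
A sketch of the cron wiring these exit codes are designed for; the installed script path and the alert log file are assumptions, not taken from the diff:

# illustrative crontab entry: exit 1 = degraded, exit 2 = safe mode
*/15 * * * * python3 $HOME/.hermes/bin/deadman_fallback.py >> $HOME/.hermes/logs/deadman.log 2>&1 || echo "deadman fallback exited non-zero: degraded or safe mode" >> $HOME/.hermes/logs/deadman-alerts.log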
@@ -3,7 +3,7 @@
 # Uses Hermes CLI plus workforce-manager to triage and review.
 # Timmy is the brain. Other agents are the hands.
 
 set -uo pipefail
-
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 LOG_DIR="$HOME/.hermes/logs"
 LOG="$LOG_DIR/timmy-orchestrator.log"
@@ -40,6 +40,7 @@ gather_state() {
     > "$state_dir/unassigned.txt"
     > "$state_dir/open_prs.txt"
     > "$state_dir/agent_status.txt"
+    > "$state_dir/uncommitted_work.txt"
 
     for repo in $REPOS; do
         local short=$(echo "$repo" | cut -d/ -f2)
@@ -71,6 +72,24 @@ for p in json.load(sys.stdin):
     tail -50 "/tmp/kimi-heartbeat.log" 2>/dev/null | grep -c "FAILED:" | xargs -I{} echo "Kimi recent failures: {}" >> "$state_dir/agent_status.txt"
     tail -1 "/tmp/kimi-heartbeat.log" 2>/dev/null | xargs -I{} echo "Kimi last event: {}" >> "$state_dir/agent_status.txt"
 
+    # Scan worktrees for uncommitted work
+    for wt_dir in "$HOME/worktrees"/*/; do
+        [ -d "$wt_dir" ] || continue
+        [ -d "$wt_dir/.git" ] || continue
+        local dirty
+        dirty=$(cd "$wt_dir" && git status --porcelain 2>/dev/null | wc -l | tr -d " ")
+        if [ "${dirty:-0}" -gt 0 ]; then
+            local branch
+            branch=$(cd "$wt_dir" && git branch --show-current 2>/dev/null || echo "?")
+            local age=""
+            local last_commit
+            last_commit=$(cd "$wt_dir" && git log -1 --format=%ct 2>/dev/null || echo 0)
+            local now=$(date +%s)
+            local stale_mins=$(( (now - last_commit) / 60 ))
+            echo "DIR=$wt_dir BRANCH=$branch DIRTY=$dirty STALE=${stale_mins}m" >> "$state_dir/uncommitted_work.txt"
+        fi
+    done
+
 
     echo "$state_dir"
 }
@@ -81,6 +100,25 @@ run_triage() {
 
     log "Cycle: $unassigned_count unassigned, $pr_count open PRs"
 
+    # Check for uncommitted work — nag if stale
+    local uncommitted_count
+    uncommitted_count=$(wc -l < "$state_dir/uncommitted_work.txt" 2>/dev/null | tr -d " " || echo 0)
+    if [ "${uncommitted_count:-0}" -gt 0 ]; then
+        log "WARNING: $uncommitted_count worktree(s) with uncommitted work"
+        while IFS= read -r line; do
+            log " UNCOMMITTED: $line"
+            # Auto-commit stale work (>60 min without commit)
+            local stale=$(echo "$line" | sed 's/.*STALE=\([0-9]*\)m.*/\1/')
+            local wt_dir=$(echo "$line" | sed 's/.*DIR=\([^ ]*\) .*/\1/')
+            if [ "${stale:-0}" -gt 60 ]; then
+                log " AUTO-COMMITTING stale work in $wt_dir (${stale}m stale)"
+                (cd "$wt_dir" && git add -A && git commit -m "WIP: orchestrator auto-commit — ${stale}m stale work
+
+Preserved by timmy-orchestrator to prevent loss." 2>/dev/null && git push 2>/dev/null) && log " COMMITTED: $wt_dir" || log " COMMIT FAILED: $wt_dir"
+            fi
+        done < "$state_dir/uncommitted_work.txt"
+    fi
+
     # If nothing to do, skip the LLM call
     if [ "$unassigned_count" -eq 0 ] && [ "$pr_count" -eq 0 ]; then
         log "Nothing to triage"
@@ -198,6 +236,12 @@ FOOTER
 log "=== Timmy Orchestrator Started (PID $$) ==="
 log "Cycle: ${CYCLE_INTERVAL}s | Auto-assign: ${AUTO_ASSIGN_UNASSIGNED} | Inference surface: Hermes CLI"
 
+# Start auto-commit-guard daemon for work preservation
+if ! pgrep -f "auto-commit-guard.sh" >/dev/null 2>&1; then
+    nohup bash "$SCRIPT_DIR/auto-commit-guard.sh" 120 >> "$LOG_DIR/auto-commit-guard.log" 2>&1 &
+    log "Started auto-commit-guard daemon (PID $!)"
+fi
+
 WORKFORCE_CYCLE=0
 
 while true; do
24
deploy/auto-commit-guard.plist
Normal file
@@ -0,0 +1,24 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
    <key>Label</key>
    <string>ai.timmy.auto-commit-guard</string>
    <key>ProgramArguments</key>
    <array>
        <string>/bin/bash</string>
        <string>/Users/apayne/.hermes/bin/auto-commit-guard.sh</string>
        <string>120</string>
    </array>
    <key>RunAtLoad</key>
    <true/>
    <key>KeepAlive</key>
    <true/>
    <key>StandardOutPath</key>
    <string>/Users/apayne/.hermes/logs/auto-commit-guard.stdout.log</string>
    <key>StandardErrorPath</key>
    <string>/Users/apayne/.hermes/logs/auto-commit-guard.stderr.log</string>
    <key>WorkingDirectory</key>
    <string>/Users/apayne</string>
</dict>
</plist>
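
A minimal sketch of installing the LaunchAgent by hand with standard launchctl commands; the destination filename simply mirrors the Label above:

# copy the plist into the per-user LaunchAgents dir and load it
cp deploy/auto-commit-guard.plist ~/Library/LaunchAgents/ai.timmy.auto-commit-guard.plist
launchctl load ~/Library/LaunchAgents/ai.timmy.auto-commit-guard.plist

# confirm the job is registered
launchctl list | grep ai.timmy.auto-commit-guard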
@@ -14,7 +14,7 @@ from crewai.tools import BaseTool
 
 OPENROUTER_API_KEY = os.getenv(
     "OPENROUTER_API_KEY",
-    "dsk-or-v1-f60c89db12040267458165cf192e815e339eb70548e4a0a461f5f0f69e6ef8b0",
+    os.environ.get("OPENROUTER_API_KEY", ""),
 )
 
 llm = LLM(
@@ -111,7 +111,7 @@ def update_uptime(checks: dict):
     save(data)
 
     if new_milestones:
-        print(f" UPTIME MILESTONE: {','.join(str(m) + '%') for m in new_milestones}")
+        print(f" UPTIME MILESTONE: {','.join((str(m) + '%') for m in new_milestones)}")
         print(f" Current uptime: {recent_ok:.1f}%")
 
     return data["uptime"]
@@ -25,7 +25,7 @@ services:
       - "traefik.http.routers.matrix-client.tls.certresolver=letsencrypt"
       - "traefik.http.routers.matrix-client.entrypoints=websecure"
       - "traefik.http.services.matrix-client.loadbalancer.server.port=6167"
-
+
       # Federation (TCP 8448) - direct or via Traefik TCP entrypoint
       # Option A: Direct host port mapping
       # Option B: Traefik TCP router (requires Traefik federation entrypoint)
@@ -163,4 +163,4 @@ overrides:
     Post a comment on the issue with the format:
     GUARDRAIL_OVERRIDE: <constraint_name> REASON: <explanation>
 override_expiry_hours: 24
-require_post_override_review: true
+require_post_override_review: true
@@ -582,9 +582,9 @@ def main() -> int:
         # Relax exclusions if no agent found
         agent = find_best_agent(agents, role, wolf_scores, priority, exclude=[])
         if not agent:
-        logging.warning("No suitable agent for issue #%d: %s (role=%s)",
-                        issue.get("number"), issue.get("title", ""), role)
-        continue
+            logging.warning("No suitable agent for issue #%d: %s (role=%s)",
+                            issue.get("number"), issue.get("title", ""), role)
+            continue
 
         result = dispatch_assignment(api, issue, agent, dry_run=args.dry_run)
         assignments.append(result)