Compare commits
2 Commits
step35/595
...
step35/444
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0ae1f4823b | ||
|
|
67d4041566 |
@@ -8,7 +8,7 @@
|
|||||||
# --- Deadman Switch ---
|
# --- Deadman Switch ---
|
||||||
deadman_enabled: true
|
deadman_enabled: true
|
||||||
deadman_check_interval: 300 # 5 minutes between health checks
|
deadman_check_interval: 300 # 5 minutes between health checks
|
||||||
deadman_snapshot_dir: "~/.local/timmy/snapshots"
|
deadman_snapshot_dir: "{{ wizard_home }}/.snapshots"
|
||||||
deadman_max_snapshots: 10 # Rolling window of good configs
|
deadman_max_snapshots: 10 # Rolling window of good configs
|
||||||
deadman_restart_cooldown: 60 # Seconds to wait before restart after failure
|
deadman_restart_cooldown: 60 # Seconds to wait before restart after failure
|
||||||
deadman_max_restart_attempts: 3
|
deadman_max_restart_attempts: 3
|
||||||
@@ -50,6 +50,12 @@ cron_jobs:
|
|||||||
hour: "*"
|
hour: "*"
|
||||||
enabled: "{{ deadman_enabled }}"
|
enabled: "{{ deadman_enabled }}"
|
||||||
|
|
||||||
|
- name: "Deadman switch action — {{ wizard_name }}"
|
||||||
|
job: "{{ wizard_home }}/deadman_action.sh >> {{ timmy_log_dir }}/deadman-{{ wizard_name }}.log 2>&1"
|
||||||
|
minute: "*/5"
|
||||||
|
hour: "*"
|
||||||
|
enabled: "{{ deadman_enabled }}"
|
||||||
|
|
||||||
- name: "Muda audit"
|
- name: "Muda audit"
|
||||||
job: "cd {{ wizard_home }}/workspace/timmy-config && bash fleet/muda-audit.sh >> /tmp/muda-audit.log 2>&1"
|
job: "cd {{ wizard_home }}/workspace/timmy-config && bash fleet/muda-audit.sh >> /tmp/muda-audit.log 2>&1"
|
||||||
minute: "0"
|
minute: "0"
|
||||||
|
|||||||
@@ -20,17 +20,6 @@
|
|||||||
loop: "{{ cron_jobs }}"
|
loop: "{{ cron_jobs }}"
|
||||||
when: cron_jobs is defined
|
when: cron_jobs is defined
|
||||||
|
|
||||||
- name: "Deploy deadman switch cron (fallback if systemd timer unavailable)"
|
|
||||||
cron:
|
|
||||||
name: "Deadman switch — {{ wizard_name }}"
|
|
||||||
job: "{{ wizard_home }}/deadman_action.sh >> {{ timmy_log_dir }}/deadman-{{ wizard_name }}.log 2>&1"
|
|
||||||
minute: "*/5"
|
|
||||||
hour: "*"
|
|
||||||
state: present
|
|
||||||
user: "{{ ansible_user | default('root') }}"
|
|
||||||
when: deadman_enabled and machine_type != 'vps'
|
|
||||||
# VPS machines use systemd timers instead
|
|
||||||
|
|
||||||
- name: "Remove legacy cron jobs (cleanup)"
|
- name: "Remove legacy cron jobs (cleanup)"
|
||||||
cron:
|
cron:
|
||||||
name: "{{ item }}"
|
name: "{{ item }}"
|
||||||
|
|||||||
@@ -19,30 +19,6 @@
|
|||||||
dest: "{{ wizard_home }}/deadman_action.sh"
|
dest: "{{ wizard_home }}/deadman_action.sh"
|
||||||
mode: "0755"
|
mode: "0755"
|
||||||
|
|
||||||
- name: "Deploy deadman systemd service"
|
|
||||||
template:
|
|
||||||
src: deadman_switch.service.j2
|
|
||||||
dest: "/etc/systemd/system/deadman-{{ wizard_name | lower }}.service"
|
|
||||||
mode: "0644"
|
|
||||||
when: machine_type == 'vps'
|
|
||||||
notify: "Enable deadman service"
|
|
||||||
|
|
||||||
- name: "Deploy deadman systemd timer"
|
|
||||||
template:
|
|
||||||
src: deadman_switch.timer.j2
|
|
||||||
dest: "/etc/systemd/system/deadman-{{ wizard_name | lower }}.timer"
|
|
||||||
mode: "0644"
|
|
||||||
when: machine_type == 'vps'
|
|
||||||
notify: "Enable deadman timer"
|
|
||||||
|
|
||||||
- name: "Deploy deadman launchd plist (Mac)"
|
|
||||||
template:
|
|
||||||
src: deadman_switch.plist.j2
|
|
||||||
dest: "{{ ansible_env.HOME }}/Library/LaunchAgents/com.timmy.deadman.{{ wizard_name | lower }}.plist"
|
|
||||||
mode: "0644"
|
|
||||||
when: machine_type == 'mac'
|
|
||||||
notify: "Load deadman plist"
|
|
||||||
|
|
||||||
- name: "Take initial config snapshot"
|
- name: "Take initial config snapshot"
|
||||||
copy:
|
copy:
|
||||||
src: "{{ wizard_home }}/config.yaml"
|
src: "{{ wizard_home }}/config.yaml"
|
||||||
|
|||||||
@@ -54,10 +54,18 @@ snapshot_config() {
|
|||||||
|
|
||||||
rollback_config() {
|
rollback_config() {
|
||||||
if [ -f "${SNAPSHOT_FILE}" ]; then
|
if [ -f "${SNAPSHOT_FILE}" ]; then
|
||||||
|
# Compute hashes for rollback audit
|
||||||
|
bad_hash="unknown"
|
||||||
|
good_hash="unknown"
|
||||||
|
if [ -f "${CONFIG_FILE}" ]; then
|
||||||
|
bad_hash=$(sha256sum "${CONFIG_FILE}" 2>/dev/null | awk '{print $1}' || echo "unknown")
|
||||||
|
fi
|
||||||
|
good_hash=$(sha256sum "${SNAPSHOT_FILE}" 2>/dev/null | awk '{print $1}' || echo "unknown")
|
||||||
|
log "Rollback event: agent=${WIZARD_NAME} old_hash=${bad_hash} new_hash=${good_hash}"
|
||||||
log "Rolling back config to last known good..."
|
log "Rolling back config to last known good..."
|
||||||
cp "${SNAPSHOT_FILE}" "${CONFIG_FILE}"
|
cp "${SNAPSHOT_FILE}" "${CONFIG_FILE}"
|
||||||
log "Config rolled back."
|
log "Config rolled back."
|
||||||
log_telemetry "fallback" "Config rolled back to last known good by deadman switch"
|
log_telemetry "fallback" "Config rolled back: old_hash=${bad_hash}, new_hash=${good_hash}"
|
||||||
else
|
else
|
||||||
log "ERROR: No known good snapshot found. Pulling from upstream..."
|
log "ERROR: No known good snapshot found. Pulling from upstream..."
|
||||||
cd "${WIZARD_HOME}/workspace/timmy-config" 2>/dev/null && \
|
cd "${WIZARD_HOME}/workspace/timmy-config" 2>/dev/null && \
|
||||||
|
|||||||
@@ -1,263 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Dead Man Switch Fallback Engine
|
|
||||||
|
|
||||||
When the dead man switch triggers (zero commits for 2+ hours, model down,
|
|
||||||
Gitea unreachable, etc.), this script diagnoses the failure and applies
|
|
||||||
common sense fallbacks automatically.
|
|
||||||
|
|
||||||
Fallback chain:
|
|
||||||
1. Primary model (Kimi) down -> switch config to local-llama.cpp
|
|
||||||
2. Gitea unreachable -> cache issues locally, retry on recovery
|
|
||||||
3. VPS agents down -> alert + lazarus protocol
|
|
||||||
4. Local llama.cpp down -> try Ollama, then alert-only mode
|
|
||||||
5. All inference dead -> safe mode (cron pauses, alert Alexander)
|
|
||||||
|
|
||||||
Each fallback is reversible. Recovery auto-restores the previous config.
|
|
||||||
"""
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import json
|
|
||||||
import subprocess
|
|
||||||
import time
|
|
||||||
import yaml
|
|
||||||
import shutil
|
|
||||||
from pathlib import Path
|
|
||||||
from datetime import datetime, timedelta
|
|
||||||
|
|
||||||
HERMES_HOME = Path(os.environ.get("HERMES_HOME", os.path.expanduser("~/.hermes")))
|
|
||||||
CONFIG_PATH = HERMES_HOME / "config.yaml"
|
|
||||||
FALLBACK_STATE = HERMES_HOME / "deadman-fallback-state.json"
|
|
||||||
BACKUP_CONFIG = HERMES_HOME / "config.yaml.pre-fallback"
|
|
||||||
FORGE_URL = "https://forge.alexanderwhitestone.com"
|
|
||||||
|
|
||||||
def load_config():
|
|
||||||
with open(CONFIG_PATH) as f:
|
|
||||||
return yaml.safe_load(f)
|
|
||||||
|
|
||||||
def save_config(cfg):
|
|
||||||
with open(CONFIG_PATH, "w") as f:
|
|
||||||
yaml.dump(cfg, f, default_flow_style=False)
|
|
||||||
|
|
||||||
def load_state():
|
|
||||||
if FALLBACK_STATE.exists():
|
|
||||||
with open(FALLBACK_STATE) as f:
|
|
||||||
return json.load(f)
|
|
||||||
return {"active_fallbacks": [], "last_check": None, "recovery_pending": False}
|
|
||||||
|
|
||||||
def save_state(state):
|
|
||||||
state["last_check"] = datetime.now().isoformat()
|
|
||||||
with open(FALLBACK_STATE, "w") as f:
|
|
||||||
json.dump(state, f, indent=2)
|
|
||||||
|
|
||||||
def run(cmd, timeout=10):
|
|
||||||
try:
|
|
||||||
r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=timeout)
|
|
||||||
return r.returncode, r.stdout.strip(), r.stderr.strip()
|
|
||||||
except subprocess.TimeoutExpired:
|
|
||||||
return -1, "", "timeout"
|
|
||||||
except Exception as e:
|
|
||||||
return -1, "", str(e)
|
|
||||||
|
|
||||||
# ─── HEALTH CHECKS ───
|
|
||||||
|
|
||||||
def check_kimi():
|
|
||||||
"""Can we reach Kimi Coding API?"""
|
|
||||||
key = os.environ.get("KIMI_API_KEY", "")
|
|
||||||
if not key:
|
|
||||||
# Check multiple .env locations
|
|
||||||
for env_path in [HERMES_HOME / ".env", Path.home() / ".hermes" / ".env"]:
|
|
||||||
if env_path.exists():
|
|
||||||
for line in open(env_path):
|
|
||||||
line = line.strip()
|
|
||||||
if line.startswith("KIMI_API_KEY="):
|
|
||||||
key = line.split("=", 1)[1].strip().strip('"').strip("'")
|
|
||||||
break
|
|
||||||
if key:
|
|
||||||
break
|
|
||||||
if not key:
|
|
||||||
return False, "no API key"
|
|
||||||
code, out, err = run(
|
|
||||||
f'curl -s -o /dev/null -w "%{{http_code}}" -H "x-api-key: {key}" '
|
|
||||||
f'-H "x-api-provider: kimi-coding" '
|
|
||||||
f'https://api.kimi.com/coding/v1/models -X POST '
|
|
||||||
f'-H "content-type: application/json" '
|
|
||||||
f'-d \'{{"model":"kimi-k2.5","max_tokens":1,"messages":[{{"role":"user","content":"ping"}}]}}\' ',
|
|
||||||
timeout=15
|
|
||||||
)
|
|
||||||
if code == 0 and out in ("200", "429"):
|
|
||||||
return True, f"HTTP {out}"
|
|
||||||
return False, f"HTTP {out} err={err[:80]}"
|
|
||||||
|
|
||||||
def check_local_llama():
|
|
||||||
"""Is local llama.cpp serving?"""
|
|
||||||
code, out, err = run("curl -s http://localhost:8081/v1/models", timeout=5)
|
|
||||||
if code == 0 and "hermes" in out.lower():
|
|
||||||
return True, "serving"
|
|
||||||
return False, f"exit={code}"
|
|
||||||
|
|
||||||
def check_ollama():
|
|
||||||
"""Is Ollama running?"""
|
|
||||||
code, out, err = run("curl -s http://localhost:11434/api/tags", timeout=5)
|
|
||||||
if code == 0 and "models" in out:
|
|
||||||
return True, "running"
|
|
||||||
return False, f"exit={code}"
|
|
||||||
|
|
||||||
def check_gitea():
|
|
||||||
"""Can we reach the Forge?"""
|
|
||||||
token_path = Path.home() / ".config" / "gitea" / "timmy-token"
|
|
||||||
if not token_path.exists():
|
|
||||||
return False, "no token"
|
|
||||||
token = token_path.read_text().strip()
|
|
||||||
code, out, err = run(
|
|
||||||
f'curl -s -o /dev/null -w "%{{http_code}}" -H "Authorization: token {token}" '
|
|
||||||
f'"{FORGE_URL}/api/v1/user"',
|
|
||||||
timeout=10
|
|
||||||
)
|
|
||||||
if code == 0 and out == "200":
|
|
||||||
return True, "reachable"
|
|
||||||
return False, f"HTTP {out}"
|
|
||||||
|
|
||||||
def check_vps(ip, name):
|
|
||||||
"""Can we SSH into a VPS?"""
|
|
||||||
code, out, err = run(f"ssh -o ConnectTimeout=5 root@{ip} 'echo alive'", timeout=10)
|
|
||||||
if code == 0 and "alive" in out:
|
|
||||||
return True, "alive"
|
|
||||||
return False, f"unreachable"
|
|
||||||
|
|
||||||
# ─── FALLBACK ACTIONS ───
|
|
||||||
|
|
||||||
def fallback_to_local_model(cfg):
|
|
||||||
"""Switch primary model from Kimi to local llama.cpp"""
|
|
||||||
if not BACKUP_CONFIG.exists():
|
|
||||||
shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)
|
|
||||||
|
|
||||||
cfg["model"]["provider"] = "local-llama.cpp"
|
|
||||||
cfg["model"]["default"] = "hermes3"
|
|
||||||
save_config(cfg)
|
|
||||||
return "Switched primary model to local-llama.cpp/hermes3"
|
|
||||||
|
|
||||||
def fallback_to_ollama(cfg):
|
|
||||||
"""Switch to Ollama if llama.cpp is also down"""
|
|
||||||
if not BACKUP_CONFIG.exists():
|
|
||||||
shutil.copy2(CONFIG_PATH, BACKUP_CONFIG)
|
|
||||||
|
|
||||||
cfg["model"]["provider"] = "ollama"
|
|
||||||
cfg["model"]["default"] = "gemma4:latest"
|
|
||||||
save_config(cfg)
|
|
||||||
return "Switched primary model to ollama/gemma4:latest"
|
|
||||||
|
|
||||||
def enter_safe_mode(state):
|
|
||||||
"""Pause all non-essential cron jobs, alert Alexander"""
|
|
||||||
state["safe_mode"] = True
|
|
||||||
state["safe_mode_entered"] = datetime.now().isoformat()
|
|
||||||
save_state(state)
|
|
||||||
return "SAFE MODE: All inference down. Cron jobs should be paused. Alert Alexander."
|
|
||||||
|
|
||||||
def restore_config():
|
|
||||||
"""Restore pre-fallback config when primary recovers"""
|
|
||||||
if BACKUP_CONFIG.exists():
|
|
||||||
shutil.copy2(BACKUP_CONFIG, CONFIG_PATH)
|
|
||||||
BACKUP_CONFIG.unlink()
|
|
||||||
return "Restored original config from backup"
|
|
||||||
return "No backup config to restore"
|
|
||||||
|
|
||||||
# ─── MAIN DIAGNOSIS AND FALLBACK ENGINE ───
|
|
||||||
|
|
||||||
def diagnose_and_fallback():
|
|
||||||
state = load_state()
|
|
||||||
cfg = load_config()
|
|
||||||
|
|
||||||
results = {
|
|
||||||
"timestamp": datetime.now().isoformat(),
|
|
||||||
"checks": {},
|
|
||||||
"actions": [],
|
|
||||||
"status": "healthy"
|
|
||||||
}
|
|
||||||
|
|
||||||
# Check all systems
|
|
||||||
kimi_ok, kimi_msg = check_kimi()
|
|
||||||
results["checks"]["kimi-coding"] = {"ok": kimi_ok, "msg": kimi_msg}
|
|
||||||
|
|
||||||
llama_ok, llama_msg = check_local_llama()
|
|
||||||
results["checks"]["local_llama"] = {"ok": llama_ok, "msg": llama_msg}
|
|
||||||
|
|
||||||
ollama_ok, ollama_msg = check_ollama()
|
|
||||||
results["checks"]["ollama"] = {"ok": ollama_ok, "msg": ollama_msg}
|
|
||||||
|
|
||||||
gitea_ok, gitea_msg = check_gitea()
|
|
||||||
results["checks"]["gitea"] = {"ok": gitea_ok, "msg": gitea_msg}
|
|
||||||
|
|
||||||
# VPS checks
|
|
||||||
vpses = [
|
|
||||||
("167.99.126.228", "Allegro"),
|
|
||||||
("143.198.27.163", "Ezra"),
|
|
||||||
("159.203.146.185", "Bezalel"),
|
|
||||||
]
|
|
||||||
for ip, name in vpses:
|
|
||||||
vps_ok, vps_msg = check_vps(ip, name)
|
|
||||||
results["checks"][f"vps_{name.lower()}"] = {"ok": vps_ok, "msg": vps_msg}
|
|
||||||
|
|
||||||
current_provider = cfg.get("model", {}).get("provider", "kimi-coding")
|
|
||||||
|
|
||||||
# ─── FALLBACK LOGIC ───
|
|
||||||
|
|
||||||
# Case 1: Primary (Kimi) down, local available
|
|
||||||
if not kimi_ok and current_provider == "kimi-coding":
|
|
||||||
if llama_ok:
|
|
||||||
msg = fallback_to_local_model(cfg)
|
|
||||||
results["actions"].append(msg)
|
|
||||||
state["active_fallbacks"].append("kimi->local-llama")
|
|
||||||
results["status"] = "degraded_local"
|
|
||||||
elif ollama_ok:
|
|
||||||
msg = fallback_to_ollama(cfg)
|
|
||||||
results["actions"].append(msg)
|
|
||||||
state["active_fallbacks"].append("kimi->ollama")
|
|
||||||
results["status"] = "degraded_ollama"
|
|
||||||
else:
|
|
||||||
msg = enter_safe_mode(state)
|
|
||||||
results["actions"].append(msg)
|
|
||||||
results["status"] = "safe_mode"
|
|
||||||
|
|
||||||
# Case 2: Already on fallback, check if primary recovered
|
|
||||||
elif kimi_ok and "kimi->local-llama" in state.get("active_fallbacks", []):
|
|
||||||
msg = restore_config()
|
|
||||||
results["actions"].append(msg)
|
|
||||||
state["active_fallbacks"].remove("kimi->local-llama")
|
|
||||||
results["status"] = "recovered"
|
|
||||||
elif kimi_ok and "kimi->ollama" in state.get("active_fallbacks", []):
|
|
||||||
msg = restore_config()
|
|
||||||
results["actions"].append(msg)
|
|
||||||
state["active_fallbacks"].remove("kimi->ollama")
|
|
||||||
results["status"] = "recovered"
|
|
||||||
|
|
||||||
# Case 3: Gitea down — just flag it, work locally
|
|
||||||
if not gitea_ok:
|
|
||||||
results["actions"].append("WARN: Gitea unreachable — work cached locally until recovery")
|
|
||||||
if "gitea_down" not in state.get("active_fallbacks", []):
|
|
||||||
state["active_fallbacks"].append("gitea_down")
|
|
||||||
results["status"] = max(results["status"], "degraded_gitea", key=lambda x: ["healthy", "recovered", "degraded_gitea", "degraded_local", "degraded_ollama", "safe_mode"].index(x) if x in ["healthy", "recovered", "degraded_gitea", "degraded_local", "degraded_ollama", "safe_mode"] else 0)
|
|
||||||
elif "gitea_down" in state.get("active_fallbacks", []):
|
|
||||||
state["active_fallbacks"].remove("gitea_down")
|
|
||||||
results["actions"].append("Gitea recovered — resume normal operations")
|
|
||||||
|
|
||||||
# Case 4: VPS agents down
|
|
||||||
for ip, name in vpses:
|
|
||||||
key = f"vps_{name.lower()}"
|
|
||||||
if not results["checks"][key]["ok"]:
|
|
||||||
results["actions"].append(f"ALERT: {name} VPS ({ip}) unreachable — lazarus protocol needed")
|
|
||||||
|
|
||||||
save_state(state)
|
|
||||||
return results
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
results = diagnose_and_fallback()
|
|
||||||
print(json.dumps(results, indent=2))
|
|
||||||
|
|
||||||
# Exit codes for cron integration
|
|
||||||
if results["status"] == "safe_mode":
|
|
||||||
sys.exit(2)
|
|
||||||
elif results["status"].startswith("degraded"):
|
|
||||||
sys.exit(1)
|
|
||||||
else:
|
|
||||||
sys.exit(0)
|
|
||||||
@@ -1,78 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
# deadman-switch.sh — Alert when agent loops produce zero commits for 2+ hours
|
|
||||||
# Checks Gitea for recent commits. Sends Telegram alert if threshold exceeded.
|
|
||||||
# Designed to run as a cron job every 30 minutes.
|
|
||||||
|
|
||||||
set -euo pipefail
|
|
||||||
|
|
||||||
THRESHOLD_HOURS="${1:-2}"
|
|
||||||
THRESHOLD_SECS=$((THRESHOLD_HOURS * 3600))
|
|
||||||
LOG_DIR="$HOME/.hermes/logs"
|
|
||||||
LOG_FILE="$LOG_DIR/deadman.log"
|
|
||||||
GITEA_URL="https://forge.alexanderwhitestone.com"
|
|
||||||
GITEA_TOKEN=$(cat "$HOME/.hermes/gitea_token_vps" 2>/dev/null || echo "")
|
|
||||||
TELEGRAM_TOKEN=$(cat "$HOME/.config/telegram/special_bot" 2>/dev/null || echo "")
|
|
||||||
TELEGRAM_CHAT="-1003664764329"
|
|
||||||
|
|
||||||
REPOS=(
|
|
||||||
"Timmy_Foundation/timmy-config"
|
|
||||||
"Timmy_Foundation/the-nexus"
|
|
||||||
)
|
|
||||||
|
|
||||||
mkdir -p "$LOG_DIR"
|
|
||||||
|
|
||||||
log() {
|
|
||||||
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >> "$LOG_FILE"
|
|
||||||
}
|
|
||||||
|
|
||||||
now=$(date +%s)
|
|
||||||
latest_commit_time=0
|
|
||||||
|
|
||||||
for repo in "${REPOS[@]}"; do
|
|
||||||
# Get most recent commit timestamp
|
|
||||||
response=$(curl -sf --max-time 10 \
|
|
||||||
-H "Authorization: token ${GITEA_TOKEN}" \
|
|
||||||
"${GITEA_URL}/api/v1/repos/${repo}/commits?limit=1" 2>/dev/null || echo "[]")
|
|
||||||
|
|
||||||
commit_date=$(echo "$response" | python3 -c "
|
|
||||||
import json, sys, datetime
|
|
||||||
try:
|
|
||||||
commits = json.load(sys.stdin)
|
|
||||||
if commits:
|
|
||||||
ts = commits[0]['created']
|
|
||||||
dt = datetime.datetime.fromisoformat(ts.replace('Z', '+00:00'))
|
|
||||||
print(int(dt.timestamp()))
|
|
||||||
else:
|
|
||||||
print(0)
|
|
||||||
except:
|
|
||||||
print(0)
|
|
||||||
" 2>/dev/null || echo "0")
|
|
||||||
|
|
||||||
if [ "$commit_date" -gt "$latest_commit_time" ]; then
|
|
||||||
latest_commit_time=$commit_date
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
gap=$((now - latest_commit_time))
|
|
||||||
gap_hours=$((gap / 3600))
|
|
||||||
gap_mins=$(((gap % 3600) / 60))
|
|
||||||
|
|
||||||
if [ "$latest_commit_time" -eq 0 ]; then
|
|
||||||
log "WARN: Could not fetch any commit timestamps. API may be down."
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ "$gap" -gt "$THRESHOLD_SECS" ]; then
|
|
||||||
msg="DEADMAN ALERT: No commits in ${gap_hours}h${gap_mins}m across all repos. Loops may be dead. Last commit: $(date -r "$latest_commit_time" '+%Y-%m-%d %H:%M' 2>/dev/null || echo 'unknown')"
|
|
||||||
log "ALERT: $msg"
|
|
||||||
|
|
||||||
# Send Telegram alert
|
|
||||||
if [ -n "$TELEGRAM_TOKEN" ]; then
|
|
||||||
curl -sf --max-time 10 -X POST \
|
|
||||||
"https://api.telegram.org/bot${TELEGRAM_TOKEN}/sendMessage" \
|
|
||||||
-d "chat_id=${TELEGRAM_CHAT}" \
|
|
||||||
-d "text=${msg}" >/dev/null 2>&1 || true
|
|
||||||
fi
|
|
||||||
else
|
|
||||||
log "OK: Last commit ${gap_hours}h${gap_mins}m ago (threshold: ${THRESHOLD_HOURS}h)"
|
|
||||||
fi
|
|
||||||
112
docs/deadman-testing.md
Normal file
112
docs/deadman-testing.md
Normal file
@@ -0,0 +1,112 @@
|
|||||||
|
# Deadman Switch — Test & Verification Procedure
|
||||||
|
|
||||||
|
**Issue:** #444 — Wire Deadman Switch ACTION (Snapshot + Rollback + Restart)
|
||||||
|
**Last updated:** 2026-04-30 (STEP35 burn contribution)
|
||||||
|
|
||||||
|
This document describes how to verify that the deadman switch is operational
|
||||||
|
end-to-end on the wizards fleet. All tests assume Ansible deployment has been
|
||||||
|
run (`ansible-playbook -i ansible/inventory/hosts.yml ansible/playbooks/deadman_switch.yml`).
|
||||||
|
|
||||||
|
## Architecture (post #444)
|
||||||
|
|
||||||
|
- `deadman_action.sh` — deployed to `{{ wizard_home }}/deadman_action.sh` by Ansible.
|
||||||
|
- Scheduling: Ansible `cron_manager` role installs a cron entry `*/5 * * * *` that runs
|
||||||
|
`deadman_action.sh` and logs to `{{ timmy_log_dir }}/deadman-<wizard>.log`.
|
||||||
|
- No systemd timer, no launchd plist, no separate `deadman-switch.sh` watch — single
|
||||||
|
implementation via universal cron (cron_manager).
|
||||||
|
|
||||||
|
## Acceptance Criteria Test Plan
|
||||||
|
|
||||||
|
### 1. Health-check success → config snapshot saved
|
||||||
|
|
||||||
|
**Test:**
|
||||||
|
1. Ensure wizard agent is healthy.
|
||||||
|
2. Manually run: `{{ wizard_home }}/deadman_action.sh`
|
||||||
|
3. Verify: `{{ deadman_snapshot_dir }}/config.yaml.known_good` exists and matches current config.
|
||||||
|
|
||||||
|
Expected log output:
|
||||||
|
```
|
||||||
|
[timestamp] [deadman] [<WIZARD>] Health check starting...
|
||||||
|
[timestamp] [deadman] [<WIZARD>] HEALTHY — snapshotting config.
|
||||||
|
[timestamp] [deadman] [<WIZARD>] Config snapshot saved.
|
||||||
|
[timestamp] [deadman] [<WIZARD>] Health check complete.
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Health-check failure → config rolled back + agent restarted + rollback event logged
|
||||||
|
|
||||||
|
**Test:**
|
||||||
|
1. Corrupt agent config to trigger failure:
|
||||||
|
```bash
|
||||||
|
echo 'provider: anthropic' >> {{ wizard_home }}/config.yaml
|
||||||
|
```
|
||||||
|
2. Run `{{ wizard_home }}/deadman_action.sh`.
|
||||||
|
3. Verify:
|
||||||
|
- `grep -q anthropic {{ wizard_home }}/config.yaml` → exits non-zero (the injected `provider: anthropic` line is gone)
|
||||||
|
- `systemctl status hermes-{{ wizard_name | lower }}` shows active (or recent restart)
|
||||||
|
- Log contains `Rollback event: agent=... old_hash=... new_hash=...`
|
||||||
|
4. Optionally check telemetry:
|
||||||
|
```bash
|
||||||
|
sqlite3 {{ request_log_path }} "SELECT status,error_message FROM request_log WHERE endpoint='health_check' ORDER BY timestamp DESC LIMIT 1;"
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected log snippet:
|
||||||
|
```
|
||||||
|
[timestamp] [deadman] [<WIZARD>] FAIL: Config contains banned provider...
|
||||||
|
[timestamp] [deadman] [<WIZARD>] UNHEALTHY — initiating recovery.
|
||||||
|
[timestamp] [deadman] [<WIZARD>] Rollback event: agent=<WIZARD> old_hash=<sha> new_hash=<sha>
|
||||||
|
[timestamp] [deadman] [<WIZARD>] Rolling back config to last known good...
|
||||||
|
[timestamp] [deadman] [<WIZARD>] Config rolled back.
|
||||||
|
[timestamp] [deadman] [<WIZARD>] Restarting hermes-<wizard>...
|
||||||
|
[timestamp] [deadman] [<WIZARD>] Agent restarted via systemd.
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Simulate a full cascade failure → verify rollback + restart
|
||||||
|
|
||||||
|
- Stop agent: `systemctl stop hermes-<wizard>` or kill process.
|
||||||
|
- Modify config to inject banned provider.
|
||||||
|
- Run `deadman_action.sh` manually (or wait for cron).
|
||||||
|
- Verify that config is rolled back and agent is restarted.
|
||||||
|
|
||||||
|
### 4. Snapshot stored in predictable per-agent location
|
||||||
|
|
||||||
|
**Check on each wizard:**
|
||||||
|
```bash
|
||||||
|
ls -la {{ deadman_snapshot_dir }}/
|
||||||
|
# Expected: config.yaml.known_good + config.yaml.<timestamp> (rolling, max {{ deadman_max_snapshots }})
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5. Works with Ansible-deployed cron schedule
|
||||||
|
|
||||||
|
Run:
|
||||||
|
```bash
|
||||||
|
ansible-playbook -i ansible/inventory/hosts.yml ansible/playbooks/deadman_switch.yml
|
||||||
|
```
|
||||||
|
Verify:
|
||||||
|
- `{{ wizard_home }}/deadman_action.sh` exists, mode 0755
|
||||||
|
- `{{ deadman_snapshot_dir }}` exists
|
||||||
|
- No systemd timer named `deadman-<wizard>.timer` exists (per the one-implementation rule; see section 6)
|
||||||
|
- Cron entry present: `crontab -l | grep deadman`
|
||||||
|
|
||||||
|
### 6. One implementation only — all overlaps killed
|
||||||
|
|
||||||
|
- **Deleted** `bin/deadman-switch.sh` — central watch removed.
|
||||||
|
- **Deleted** `bin/deadman-fallback.py` — provider fallback not part of deadman recovery.
|
||||||
|
- No systemd timer deployed (deadman_switch role no longer deploys service/timer).
|
||||||
|
- Scheduling handled exclusively by `cron_manager`'s universal cron (5 min interval).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
| Symptom | Likely cause | Check |
|
||||||
|
|---------|--------------|-------|
|
||||||
|
| No snapshot created | Config unreadable or permissions wrong | `ls -l {{ wizard_home }}/config.yaml`; run as root |
|
||||||
|
| Rollback doesn't restore | No snapshot file exists | `ls -l {{ deadman_snapshot_dir }}/config.yaml.known_good` |
|
||||||
|
| Agent not restarting | systemd not available or service name mismatch | `systemctl status hermes-{{ wizard_name \| lower }}` |
|
||||||
|
| Deadman check never fires | Cron not running or cron entry missing | `service cron status`; `crontab -l \| grep deadman` |
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
This follows the KT Bezalel Architecture Session (2026-04-08) design. The deadman
|
||||||
|
switch now closes the loop from detection to recovery automatically on a 5-minute
|
||||||
|
cadence. See issue #444 for full acceptance criteria and design rules.
|
||||||
Reference in New Issue
Block a user