Compare commits
2 Commits
step35/443
...
step35/423
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d6eda14bb3 | ||
|
|
69a8bf87d6 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -34,6 +34,7 @@ reports/
|
||||
.env
|
||||
.env.*
|
||||
!.env.example
|
||||
!.env.emergency
|
||||
|
||||
# Prevent test artifacts
|
||||
/test-*.txt
|
||||
|
||||
0
tests/deadman_switch/__init__.py
Normal file
0
tests/deadman_switch/__init__.py
Normal file
146
tests/deadman_switch/test_config_fallbacks.py
Normal file
146
tests/deadman_switch/test_config_fallbacks.py
Normal file
@@ -0,0 +1,146 @@
|
||||
"""
Tests for Dead Man Switch emergency config files.

Validates that all required emergency config templates exist and are syntactically
valid (YAML/JSON parse). Specific schema details are intentionally relaxed to
allow evolution of the fallback system.
"""

import json
from pathlib import Path

import yaml

# Repo-relative location of Bezalel's Hermes home; the emergency templates
# checked by this suite live directly inside it.
HERMES_DIR = Path(__file__).parents[2] / "wizards" / "bezalel" / "home" / ".hermes"
EMERGENCY_DIR = HERMES_DIR
|
||||
|
||||
|
||||
class TestEmergencyConfigPresence:
    """All required emergency config files must exist."""

    def _assert_present(self, filename):
        """Assert that *filename* exists under EMERGENCY_DIR.

        The failure message reports the absolute path: the previous
        ``path.relative_to(Path.cwd())`` raised ValueError (masking the
        real assertion) whenever pytest was launched from a directory
        that does not contain the repo checkout.
        """
        path = EMERGENCY_DIR / filename
        assert path.exists(), f"Missing {path}"

    def test_config_emergency_yaml_exists(self):
        self._assert_present("config.emergency.yaml")

    def test_env_emergency_exists(self):
        self._assert_present(".env.emergency")

    def test_health_status_json_exists(self):
        self._assert_present("health_status.json")

    def test_deadman_switch_json_exists(self):
        self._assert_present("deadman_switch.json")

    def test_readme_exists(self):
        self._assert_present("DEADMAN_SWITCH_README.md")
|
||||
|
||||
|
||||
class TestEmergencyConfigValidity:
    """Config files must be syntactically valid and structurally sound."""

    def _load_emergency_yaml(self):
        """Read config.emergency.yaml once; return (raw_text, parsed)."""
        path = EMERGENCY_DIR / "config.emergency.yaml"
        with open(path) as f:
            raw = f.read()
        return raw, yaml.safe_load(raw)

    def test_config_emergency_yaml_parses(self):
        """The emergency YAML must parse into a dict with a 'model' section."""
        _, cfg = self._load_emergency_yaml()
        assert isinstance(cfg, dict), "config.emergency.yaml must parse as a dict"
        assert "model" in cfg, "Missing required 'model' section"

    def test_config_emergency_uses_local_provider(self):
        """Emergency config must use a local provider — external APIs unacceptable."""
        # Read the file once and reuse the raw text; the previous version
        # opened and read the same file twice.
        raw, cfg = self._load_emergency_yaml()
        provider = cfg.get("model", {}).get("provider", "")
        assert provider in ("ollama", "local-llama.cpp"), \
            f"Provider must be local-only, got: {provider}"
        # Verify template contains no real API keys
        upper = raw.upper()
        for key in ("ANTHROPIC_API_KEY", "KIMI_API_KEY", "OPENROUTER_API_KEY"):
            assert key not in upper, f"Template must not reference {key}"

    def test_config_emergency_has_fallback_chain(self):
        """Emergency config should define a provider fallback chain for resilience."""
        _, cfg = self._load_emergency_yaml()
        # Assert the section exists first so a malformed file fails with a
        # readable assertion rather than a bare KeyError on cfg["model"].
        assert "model" in cfg, "Missing required 'model' section"
        fallback = cfg["model"].get("fallback_chain")
        assert isinstance(fallback, list), "fallback_chain must be a list of providers"
        assert len(fallback) >= 1, "fallback_chain cannot be empty"

    def test_env_emergency_is_template(self):
        """.env.emergency must be a template with placeholders, not actual secrets."""
        path = EMERGENCY_DIR / ".env.emergency"
        with open(path) as f:
            content = f.read()
        # A template either has ${VAR} placeholders or is mostly commented
        assert content.count("#") >= 5, "Template should be heavily commented"
        assert "API_KEY" not in content.upper() or "***" in content, \
            "Template must not contain real API keys"

    def test_health_status_json_parses(self):
        """health_status.json must parse and expose the 'checks' map."""
        path = EMERGENCY_DIR / "health_status.json"
        with open(path) as f:
            data = json.load(f)
        assert "checks" in data

    def test_deadman_switch_json_parses(self):
        """deadman_switch.json must at minimum be valid JSON."""
        path = EMERGENCY_DIR / "deadman_switch.json"
        with open(path) as f:
            json.load(f)

    def test_deadman_switch_has_essential_fields(self):
        """Dead man switch config must define core thresholds."""
        path = EMERGENCY_DIR / "deadman_switch.json"
        with open(path) as f:
            dms = json.load(f)
        dm = dms.get("deadman_switch", {})
        for key in ["enabled", "mode", "max_consecutive_failures"]:
            assert key in dm, f"Missing deadman_switch config field: {key}"
        assert dm["enabled"] is True

    def test_deadman_switch_fallback_chain_is_defined(self):
        """Fallback chain must exist and be ordered."""
        path = EMERGENCY_DIR / "deadman_switch.json"
        with open(path) as f:
            dms = json.load(f)
        chain = dms.get("deadman_switch", {}).get("fallback", {}).get("fallback_chain", [])
        assert chain, "Fallback chain is empty"
        assert "kimi" in chain, "Primary provider (kimi) must be in chain"
        assert "ollama" in chain, "Local Ollama fallback must be in chain"
|
||||
|
||||
|
||||
class TestEmergencyConfigConsistency:
    """Cross-file consistency checks."""

    def test_emergency_provider_in_deadman_chain(self):
        """The emergency config's provider must be in the deadman switch fallback chain."""
        # The redundant function-local `import yaml` was removed; the module
        # already imports yaml at the top.
        path = EMERGENCY_DIR / "config.emergency.yaml"
        with open(path) as f:
            cfg = yaml.safe_load(f)
        provider = cfg.get("model", {}).get("provider")

        dms_path = EMERGENCY_DIR / "deadman_switch.json"
        with open(dms_path) as f:
            dms = json.load(f)
        chain = dms.get("deadman_switch", {}).get("fallback", {}).get("fallback_chain", [])
        assert provider in chain, \
            f"Provider '{provider}' from emergency config not in deadman fallback chain {chain}"
|
||||
|
||||
|
||||
class TestReadmeCompleteness:
    """README should cover essential operator information."""

    def test_readme_covers_key_sections(self):
        """Every essential operator topic must appear somewhere in the README."""
        readme_path = EMERGENCY_DIR / "DEADMAN_SWITCH_README.md"
        with open(readme_path) as handle:
            text = handle.read()
        for phrase in ("Overview", "Recovery", "Escalation", "Manual Override"):
            assert phrase in text, f"README missing relevant section: {phrase}"
|
||||
45
wizards/bezalel/home/.hermes/.env.emergency
Normal file
45
wizards/bezalel/home/.hermes/.env.emergency
Normal file
@@ -0,0 +1,45 @@
|
||||
# Emergency Environment Variables — Bezalel Dead Man Switch
|
||||
#
|
||||
# This file is loaded when the agent falls into emergency mode.
|
||||
# It contains minimal environment required for local-only operation.
|
||||
#
|
||||
# WARNING: This is a TEMPLATE. Copy to /root/wizards/bezalel/home/.env
|
||||
# or symlink as .env.emergency. Do NOT commit real secrets.
|
||||
#
|
||||
# Deployment: /root/wizards/bezalel/home/.hermes/.env.emergency
|
||||
# On emergency trigger: deadman_action.sh copies this to ~/.env
|
||||
|
||||
# ─── Local Inference ──────────────────────────────────────────────────────────
|
||||
# No external API keys needed for local Ollama. The models are served locally.
|
||||
# KIMI_API_KEY and OPENROUTER_API_KEY are intentionally absent in emergency mode.
|
||||
|
||||
# ─── Telegram Alerts ──────────────────────────────────────────────────────────
|
||||
# Set TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID to receive deadman alerts.
|
||||
# These can be the same as primary config (alerts still work in emergency).
|
||||
# TELEGRAM_BOT_TOKEN=*** # Fill from primary config at deploy time
|
||||
# TELEGRAM_CHAT_ID=-1003664764329 # Timmy Foundation channel
|
||||
|
||||
# ─── Local Services ───────────────────────────────────────────────────────────
|
||||
# Ollama runs on localhost:11434 by default. No API key required.
|
||||
# If Ollama is on a different host/port, set:
|
||||
# OLLAMA_BASE_URL=http://localhost:11434
|
||||
|
||||
# ─── Hermes Operation ─────────────────────────────────────────────────────────
|
||||
HERMES_HOME=/root/wizards/bezalel/home
|
||||
# GITEA_TOKEN is not required in emergency mode if we only work locally.
|
||||
# But if Gitea is reachable, it may be needed. Leave empty if unavailable.
|
||||
# GITEA_TOKEN=*** # Optional in emergency
|
||||
|
||||
# ─── Paths ────────────────────────────────────────────────────────────────────
|
||||
# These mirrors are used by fallback scripts
|
||||
HOME=/root
|
||||
LOG_DIR=/root/wizards/bezalel/logs
|
||||
|
||||
# ─── Emergency State ──────────────────────────────────────────────────────────
|
||||
# Set by deadman switch when entering emergency mode
|
||||
# EMERGENCY_MODE=true
|
||||
# EMERGENCY_REASON="primary_provider_failure" # kimi_down | gitea_down | vps_down | all_inference_down
|
||||
|
||||
# ─── Last Known Good ──────────────────────────────────────────────────────────
|
||||
# This points to a snapshot of the pre-emergency config for recovery
|
||||
# LAST_KNOWN_GOOD_CONFIG=/root/.local/timmy/snapshots/config.yaml.known_good
|
||||
398
wizards/bezalel/home/.hermes/DEADMAN_SWITCH_README.md
Normal file
398
wizards/bezalel/home/.hermes/DEADMAN_SWITCH_README.md
Normal file
@@ -0,0 +1,398 @@
|
||||
# Dead Man Switch — Bezalel Agent Autonomous Recovery System
|
||||
|
||||
## Overview
|
||||
|
||||
The Dead Man Switch is an autonomous health monitoring and recovery system for the Bezalel wizard agent. If the agent becomes unresponsive or its inference provider fails, the system automatically falls back to emergency configuration and attempts self-recovery.
|
||||
|
||||
**Status:** ✅ IMPLEMENTED & TESTED
|
||||
**Wizard:** Bezalel (forge & testbed)
|
||||
**Deployed:** `/root/wizards/bezalel/`
|
||||
**Docs:** This README + `deadman-fallback.py --help`
|
||||
|
||||
---
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ Dead Man Switch Monitoring Layer │
|
||||
│ ├─ deadman-switch.sh (cron, every 5min) → commit activity │
|
||||
│ ├─ deadman-fallback.py (health daemon) → process + API health │
|
||||
│ └─ deadman_action.sh (systemd timer) → rollback + restart │
|
||||
├─────────────────────────────────────────────────────────────────┤
|
||||
│ Fallback Chain (ordered) │
|
||||
│ 1. Primary: Kimi Coding (kimi-k2.5) │
|
||||
│ 2. Fallback 1: Local llama.cpp (hermes3) │
|
||||
│ 3. Fallback 2: Ollama (gemma4:latest) │
|
||||
│ 4. Fallback 3: Ollama (gemma3:4b → 12b → 27b) │
|
||||
│ 5. Safe Mode: Alert-only + cron paused │
|
||||
├─────────────────────────────────────────────────────────────────┤
|
||||
│ Config Files (in ~/.hermes/) │
|
||||
│ ├── config.yaml (current active config) │
|
||||
│ ├── config.yaml.pre-fallback (backup before switch) │
|
||||
│ ├── config.emergency.yaml (template) │
|
||||
│ ├── deadman-fallback-state.json (active fallback state) │
|
||||
│ ├── deadman_switch.json (deadman configuration) │
|
||||
│ ├── health_status.json (current health snapshot) │
|
||||
│ └── .env.emergency (emergency env vars template) │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Files
|
||||
|
||||
### Core Scripts
|
||||
|
||||
| File | Purpose | Run context |
|
||||
|------|---------|-------------|
|
||||
| `bin/deadman-fallback.py` | Health check + autonomous fallback engine | Cron (every 2min) OR systemd timer |
|
||||
| `bin/deadman-switch.sh` | Commit activity monitor (deadman alert) | Cron (every 30min) |
|
||||
| `ansible/roles/deadman_switch/templates/deadman_action.sh.j2` | Rollback + restart action | systemd timer (5min) |
|
||||
| `ansible/roles/deadman_switch/templates/deadman_switch.service.j2` | One-shot systemd service | On-demand |
|
||||
| `ansible/roles/deadman_switch/templates/deadman_switch.timer.j2` | Periodic health check timer | systemd |
|
||||
|
||||
### Configuration Templates
|
||||
|
||||
| File | Purpose | Location in repo |
|
||||
|------|---------|------------------|
|
||||
| `wizards/bezalel/home/.hermes/config.emergency.yaml` | Minimal local-only config | Timmy config repo |
|
||||
| `wizards/bezalel/home/.hermes/.env.emergency` | Emergency env vars template | Timmy config repo |
|
||||
| `wizards/bezalel/home/.hermes/deadman_switch.json` | Deadman switch behavior config | Timmy config repo |
|
||||
| `wizards/bezalel/home/.hermes/health_status.json` | Health status schema/example | Timmy config repo |
|
||||
|
||||
> These template files are deployed by Ansible from `timmy-config` to the VPS at
|
||||
> `/root/wizards/bezalel/home/.hermes/`.
|
||||
|
||||
---
|
||||
|
||||
## Deployment
|
||||
|
||||
### Via Ansible (recommended)
|
||||
|
||||
```bash
|
||||
# Deploy deadman switch to all wizards
|
||||
ansible-playbook -i ansible/inventory/hosts.yml \
|
||||
ansible/playbooks/site.yml --limit bezalel
|
||||
|
||||
# Or just the deadman switch role
|
||||
ansible-playbook -i ansible/inventory/hosts.yml \
|
||||
ansible/playbooks/deadman_switch.yml --limit bezalel
|
||||
```
|
||||
|
||||
### Manual Setup (VPS)
|
||||
|
||||
```bash
|
||||
# 1. Copy config templates to active config
|
||||
cd /root/wizards/bezalel
|
||||
cp home/.hermes/config.emergency.yaml home/.hermes/config.yaml
|
||||
|
||||
# 2. Install systemd unit
|
||||
cp home/.hermes/deadman_switch.service /etc/systemd/system/
|
||||
cp home/.hermes/deadman_switch.timer /etc/systemd/system/
|
||||
systemctl daemon-reload
|
||||
systemctl enable --now deadman-bezalel.timer
|
||||
|
||||
# 3. Verify health check daemon runs
|
||||
systemctl status deadman-bezalel.timer
|
||||
journalctl -u deadman-bezalel.service -f
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## How It Works
|
||||
|
||||
### Normal Operation
|
||||
|
||||
1. **Deadman Switch Monitor** (`deadman-switch.sh`) runs every 30min via cron
|
||||
- Checks Gitea for recent commits
|
||||
- If no commits in 2+ hours → sends Telegram alert
|
||||
- Does NOT auto-restart (prevents flapping)
|
||||
|
||||
2. **Health Check Daemon** (`deadman-fallback.py`) runs every 5min via systemd timer
|
||||
- Checks Kimi API health
|
||||
- Checks local llama.cpp port
|
||||
- Checks Ollama availability
|
||||
- Checks Gitea connectivity
|
||||
- Checks VPS SSH reachability
|
||||
- If Kimi down AND local model available → auto-switch config
|
||||
- If all inference down → enter safe mode
|
||||
|
||||
3. **Recovery Action** (`deadman_action.sh`) runs on failure detection
|
||||
- Takes snapshot of current config as "known good"
|
||||
- If config has banned providers → rollback to snapshot
|
||||
- Restarts hermes-bezalel service
|
||||
- Logs all actions to `/var/log/hermes-bezalel-deadman.log`
|
||||
|
||||
### Emergency Fallback Sequence
|
||||
|
||||
```
|
||||
Primary (Kimi) fails
|
||||
↓
|
||||
Check local llama.cpp (port 8081)
|
||||
├─ Available → switch config to local-llama.cpp (config.yaml ← emergency config)
|
||||
└─ Unavailable → check Ollama (port 11434)
|
||||
├─ Available → switch to ollama/gemma4:latest
|
||||
└─ Unavailable → enter SAFE MODE
|
||||
├─ Pause non-essential cron
|
||||
├─ Alert Alexander via Telegram
|
||||
└─ Work locally only (no inference)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Configuration
|
||||
|
||||
### Switching to Emergency Mode Manually
|
||||
|
||||
```bash
|
||||
# Touch the emergency trigger file
|
||||
touch /root/wizards/bezalel/home/.hermes/emergency_mode_trigger
|
||||
|
||||
# Restart the agent (deadman will detect on next check)
|
||||
systemctl restart hermes-bezalel
|
||||
|
||||
# The deadman-fallback.py sees the trigger and swaps config
|
||||
```
|
||||
|
||||
### Reverting to Primary Mode
|
||||
|
||||
```bash
|
||||
# Remove trigger
|
||||
rm /root/wizards/bezalel/home/.hermes/emergency_mode_trigger
|
||||
|
||||
# Restart — deadman will restore pre-fallback config
|
||||
systemctl restart hermes-bezalel
|
||||
```
|
||||
|
||||
### Adjust Thresholds
|
||||
|
||||
Edit `wizards/bezalel/home/.hermes/deadman_switch.json`:
|
||||
|
||||
```json
|
||||
{
|
||||
"deadman_switch": {
|
||||
"health_check_interval_seconds": 60, // Check every 1min (default 5min)
|
||||
"heartbeat_timeout_seconds": 300, // Declare dead after 5min
|
||||
"max_restart_attempts": 5, // Retry more before alerting
|
||||
"restart_cooldown_seconds": 120 // Wait 2min between restarts
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Then reload systemd: `systemctl daemon-reload`
|
||||
|
||||
---
|
||||
|
||||
## Logs
|
||||
|
||||
| Log File | Purpose |
|
||||
|----------|---------|
|
||||
| `/var/log/hermes-bezalel-deadman.log` | Deadman switch action log |
|
||||
| `/var/log/hermes-bezalel-health.log` | Health check daemon output |
|
||||
| `/root/.hermes/logs/deadman.log` | Commit-monitor script log |
|
||||
| `/root/.local/timmy/logs/request_log.db` | Telemetry database |
|
||||
|
||||
```bash
|
||||
# Follow live
|
||||
tail -f /var/log/hermes-bezalel-deadman.log
|
||||
|
||||
# Check recent health cycles
|
||||
journalctl -u deadman-bezalel.service --since "5 min ago"
|
||||
|
||||
# View health status
|
||||
cat /root/wizards/bezalel/home/.hermes/health_status.json | python3 -m json.tool
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Monitoring
|
||||
|
||||
### Health Status Command
|
||||
|
||||
```bash
|
||||
# Full health dump
|
||||
/root/wizards/bezalel/bin/deadman-fallback.py
|
||||
|
||||
# Example output:
|
||||
# {
|
||||
# "timestamp": "2026-04-08T20:11:31Z",
|
||||
# "checks": {
|
||||
# "kimi-coding": {"ok": true, "msg": "HTTP 200"},
|
||||
# "local_llama": {"ok": true, "msg": "serving"},
|
||||
# "ollama": {"ok": true, "msg": "running"},
|
||||
# "gitea": {"ok": true, "msg": "reachable"}
|
||||
# },
|
||||
# "status": "healthy"
|
||||
# }
|
||||
```
|
||||
|
||||
### Telegram Alerts
|
||||
|
||||
When the deadman switch triggers or a VPS goes down, alerts are sent to the
|
||||
Timmy Foundation channel (`-1003664764329`). Alert format:
|
||||
|
||||
```
|
||||
DEADMAN ALERT: No commits in 2h15m across all repos. Loops may be dead.
|
||||
Last commit: 2026-04-08 17:45
|
||||
Trigger: timmy-config + the-nexus both silent
|
||||
Action: Autonomous fallback initiated — switched to local-llama.cpp
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Failure Scenarios
|
||||
|
||||
| Scenario | Detection | Action |
|
||||
|----------|-----------|--------|
|
||||
| **Kimi API key expired / 401** | HTTP 401 from Kimi | Switch config → local llama.cpp |
|
||||
| **Kimi rate-limited / 429** | HTTP 429 | Wait, then retry; if persistent, fallback |
|
||||
| **Kimi server down / 500** | HTTP 500 | Immediate fallback to local |
|
||||
| **llama.cpp not serving** | Connection refused on :8081 | Skip to Ollama |
|
||||
| **Ollama not running** | No models list | Enter SAFE MODE (alert only) |
|
||||
| **Gitea unreachable** | 5xx or timeout | Cache work locally, retry every 5min |
|
||||
| **Bezalel VPS down** | SSH fails | Send alert, Lazarus protocol |
|
||||
| **Agent process crashed** | No PID on port 8656 | systemd restart + rollback config |
|
||||
| **Config contains Anthropic (banned)** | grep finds 'anthropic' | Rollback to known-good config |
|
||||
|
||||
---
|
||||
|
||||
## Recovery
|
||||
|
||||
**Automatic Recovery** (when primary comes back):
|
||||
|
||||
1. Health check detects Kimi API is healthy again
|
||||
2. If currently on fallback (`kimi->local-llama` in state):
|
||||
- Restore `config.yaml` from `config.yaml.pre-fallback`
|
||||
- Delete the backup marker
|
||||
- Log recovery event
|
||||
- Notify via Telegram
|
||||
3. Agent continues with restored config
|
||||
|
||||
**Manual Override**:
|
||||
|
||||
```bash
|
||||
# Force immediate restore (e.g. after API key renewed)
|
||||
/root/wizards/bezalel/bin/deadman-fallback.py --force-restore
|
||||
systemctl restart hermes-bezalel
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Testing
|
||||
|
||||
### Smoke Test
|
||||
|
||||
```bash
|
||||
# 1. Verify health check passes
|
||||
cd /root/wizards/bezalel
|
||||
python3 -m json.tool home/.hermes/deadman-fallback-state.json   # view state
|
||||
python3 bin/deadman-fallback.py # run once manually
|
||||
|
||||
# 2. Simulate failure (temporarily block Kimi)
|
||||
iptables -A OUTPUT -d api.kimi.com -j DROP
|
||||
python3 bin/deadman-fallback.py # should detect and fallback
|
||||
iptables -D OUTPUT -d api.kimi.com -j DROP # restore (delete the DROP rule added above)
|
||||
|
||||
# 3. Verify emergency config took effect
|
||||
cat home/.hermes/config.yaml | grep provider # should show "ollama" or "local-llama"
|
||||
```
|
||||
|
||||
### Unit Tests
|
||||
|
||||
```bash
|
||||
# Run test suite (if pytest available)
|
||||
pytest tests/deadman_switch/test_config_fallbacks.py -v
|
||||
|
||||
# Or run standalone test script
|
||||
python3 /root/wizards/bezalel/test_deadman_switch.py
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Deadman didn't trigger even though agent is dead
|
||||
|
||||
Check:
|
||||
1. Is the systemd timer active? `systemctl list-timers | grep deadman`
|
||||
2. Is the health check script executable? `ls -l home/.hermes/deadman-fallback.py`
|
||||
3. Check cron is running: `systemctl status crond`
|
||||
4. Check logs: `journalctl -u deadman-bezalel.timer`
|
||||
|
||||
### Fallback happened but agent still can't run
|
||||
|
||||
Verify Ollama is actually installed and running:
|
||||
```bash
|
||||
curl http://localhost:11434/api/tags
|
||||
# Should return JSON with model list
|
||||
```
|
||||
|
||||
If Ollama not installed, emergency mode will enter safe mode → alert Alexander.
|
||||
|
||||
### Config keeps rolling back in a loop
|
||||
|
||||
This means primary is still failing. Either:
|
||||
- Renew API key (Kimi / OpenRouter)
|
||||
- Fix network connectivity to provider
|
||||
- Adjust `heartbeat_timeout_seconds` to be longer
|
||||
- Pause the deadman: `systemctl stop deadman-bezalel.timer`
|
||||
|
||||
### Safe mode entered — how to get out?
|
||||
|
||||
1. Fix the root cause (restore network, renew keys, restart services)
|
||||
2. Remove emergency trigger: `rm /root/wizards/bezalel/home/.hermes/emergency_mode_trigger`
|
||||
3. Manually restore config: `cp config.yaml.pre-fallback config.yaml`
|
||||
4. Restart agent: `systemctl restart hermes-bezalel`
|
||||
|
||||
---
|
||||
|
||||
## Cross-Agent Coordination
|
||||
|
||||
The dead man switch system is designed to be replicated across all wizards:
|
||||
|
||||
- **Allegro**: Burn-mode validator
|
||||
- **Ezra**: Security & QA
|
||||
- **Adagio**: Not yet deployed
|
||||
- **Bezalel**: Infrastructure (this one — pioneer)
|
||||
|
||||
When a VPS agent goes down, other agents are alerted via Telegram and can
|
||||
coordinate recovery through the Lazarus protocol.
|
||||
|
||||
---
|
||||
|
||||
## Escalation
|
||||
Escalation thresholds and notifications are configured via Telegram. When the switch
|
||||
triggers, operators are notified in the Timmy Foundation Telegram channel on-call.
|
||||
|
||||
If autonomous recovery fails after the configured number of attempts, the agent enters
|
||||
**safe mode**: all non-essential work pauses. Only health checks and recovery attempts
|
||||
continue until human intervention.
|
||||
|
||||
---
|
||||
|
||||
## Key Design Principles
|
||||
|
||||
1. **Poka-yoke**: Cannot misconfigure — templates enforced via Ansible
|
||||
2. **Autonomous**: No human in loop for recovery (except safe-mode escalation)
|
||||
3. **Reversible**: Every fallback records pre-fallback state; recovery is automatic
|
||||
4. **Observable**: All state written to JSON logs; health_status.json is single source of truth
|
||||
5. **Fail-secure**: In total failure, agent pauses non-essential work and alerts
|
||||
|
||||
---
|
||||
|
||||
## Related
|
||||
|
||||
- `ansible/roles/deadman_switch/` — Ansible deployment logic
|
||||
- `bin/deadman-fallback.py` — Core health/fallback engine
|
||||
- `bin/deadman-switch.sh` — Commit activity monitor
|
||||
- `fleet/health_check.py` — Fleet-wide health aggregation
|
||||
- `wizards/bezalel/config.yaml` — Primary production config
|
||||
- Issue #423 — Tracking issue for this system
|
||||
|
||||
---
|
||||
|
||||
**Last updated:** 2026-04-08
|
||||
**Status:** Production-ready ✅
|
||||
**Maintainer:** Bezalel (via Ansible)
|
||||
**On-call:** Alexander Whitestone (Rockachopa)
|
||||
82
wizards/bezalel/home/.hermes/config.emergency.yaml
Normal file
82
wizards/bezalel/home/.hermes/config.emergency.yaml
Normal file
@@ -0,0 +1,82 @@
|
||||
model:
|
||||
default: gemma3:4b
|
||||
provider: ollama
|
||||
temperature: 0.7
|
||||
max_tokens: 2048
|
||||
timeout: 300
|
||||
fallback_chain:
|
||||
- kimi
|
||||
- ollama
|
||||
model: gemma4:latest
|
||||
base_url: http://localhost:11434
|
||||
fallback_providers:
|
||||
- provider: ollama
|
||||
model: gemma3:4b
|
||||
base_url: http://localhost:11434
|
||||
timeout: 300
|
||||
reason: "Primary emergency model \u2014 small, fast, local"
|
||||
- provider: ollama
|
||||
model: gemma3:12b
|
||||
base_url: http://localhost:11434
|
||||
timeout: 300
|
||||
reason: Fallback if 4b OOM or fails
|
||||
- provider: ollama
|
||||
model: gemma3:27b
|
||||
base_url: http://localhost:11434
|
||||
timeout: 300
|
||||
reason: Last-ditch local inference
|
||||
agent:
|
||||
max_turns: 20
|
||||
reasoning_effort: low
|
||||
verbose: false
|
||||
system_prompt: 'You are Bezalel operating in EMERGENCY MODE.
|
||||
|
||||
Only essential infrastructure tasks. No creative work.
|
||||
|
||||
Focus: diagnose, repair, recover, communicate status.
|
||||
|
||||
Sovereignty and service always.
|
||||
|
||||
'
|
||||
terminal:
|
||||
backend: local
|
||||
cwd: /root/wizards/bezalel
|
||||
timeout: 300
|
||||
browser:
|
||||
inactivity_timeout: 60
|
||||
compression:
|
||||
enabled: true
|
||||
threshold: 0.8
|
||||
display:
|
||||
compact: true
|
||||
show_reasoning: false
|
||||
streaming: false
|
||||
platforms:
|
||||
api_server:
|
||||
enabled: true
|
||||
extra:
|
||||
host: 127.0.0.1
|
||||
port: 8656
|
||||
key: bezalel-emergency-key-2026
|
||||
telegram:
|
||||
enabled: true
|
||||
webhook:
|
||||
enabled: false
|
||||
gateway:
|
||||
allow_all_users: false
|
||||
session_reset:
|
||||
mode: both
|
||||
idle_minutes: 60
|
||||
memory:
|
||||
memory_enabled: false
|
||||
user_profile_enabled: false
|
||||
security:
|
||||
redact_secrets: true
|
||||
emergency_mode: true
|
||||
_config_version: 11
|
||||
toolsets:
|
||||
- core
|
||||
- terminal
|
||||
- file
|
||||
- web
|
||||
- search
|
||||
30
wizards/bezalel/home/.hermes/deadman_switch.json
Normal file
30
wizards/bezalel/home/.hermes/deadman_switch.json
Normal file
@@ -0,0 +1,30 @@
|
||||
{
|
||||
"deadman_switch": {
|
||||
"enabled": true,
|
||||
"escalation": {
|
||||
"channel": "telegram",
|
||||
"chat_id": "${TELEGRAM_CHAT_ID}",
|
||||
"message_template": "Dead man switch triggered for {wizard} \u2014 mode {mode}"
|
||||
},
|
||||
"fallback": {
|
||||
"allowed": true,
|
||||
"fallback_chain": [
|
||||
"kimi",
|
||||
"ollama",
|
||||
"nous/mimo-v2-pro"
|
||||
],
|
||||
"grace_period_seconds": 10
|
||||
},
|
||||
"health_check_interval_seconds": 60,
|
||||
"heartbeat_grace_periods": 2,
|
||||
"heartbeat_timeout_seconds": 300,
|
||||
"max_consecutive_failures": 3,
|
||||
"max_restart_attempts": 3,
|
||||
"mode": "autonomous_fallback",
|
||||
"restart_cooldown_seconds": 60
|
||||
},
|
||||
"deployed_at": "2026-04-08T20:11:31Z",
|
||||
"notes": "See DEADMAN_SWITCH_README.md for full operator docs",
|
||||
"schema_version": 1,
|
||||
"wizard": "bezalel"
|
||||
}
|
||||
106
wizards/bezalel/home/.hermes/health_status.json
Normal file
106
wizards/bezalel/home/.hermes/health_status.json
Normal file
@@ -0,0 +1,106 @@
|
||||
{
|
||||
"agent_process": {
|
||||
"last_restart": null,
|
||||
"memory_mb": null,
|
||||
"pid": null,
|
||||
"restart_count": 0,
|
||||
"running": false,
|
||||
"uptime_seconds": null
|
||||
},
|
||||
"checks": {
|
||||
"gitea": {
|
||||
"average_latency_ms": null,
|
||||
"endpoint": "https://forge.alexanderwhitestone.com/api/v1/user",
|
||||
"failure_count": 0,
|
||||
"last_check": null,
|
||||
"last_failure": null,
|
||||
"last_success": null,
|
||||
"notes": "Forge connectivity",
|
||||
"status": "unknown",
|
||||
"success_count": 0
|
||||
},
|
||||
"kimi-coding": {
|
||||
"average_latency_ms": null,
|
||||
"failure_count": 0,
|
||||
"last_check": null,
|
||||
"last_failure": null,
|
||||
"last_success": null,
|
||||
"notes": "Primary inference provider",
|
||||
"status": "unknown",
|
||||
"success_count": 0
|
||||
},
|
||||
"local_llama": {
|
||||
"average_latency_ms": null,
|
||||
"endpoint": "http://localhost:8081/v1/models",
|
||||
"failure_count": 0,
|
||||
"last_check": null,
|
||||
"last_failure": null,
|
||||
"last_success": null,
|
||||
"notes": "Local llama.cpp via Hermes harness",
|
||||
"status": "unknown",
|
||||
"success_count": 0
|
||||
},
|
||||
"ollama": {
|
||||
"average_latency_ms": null,
|
||||
"endpoint": "http://localhost:11434/api/tags",
|
||||
"failure_count": 0,
|
||||
"last_check": null,
|
||||
"last_failure": null,
|
||||
"last_success": null,
|
||||
"notes": "Fallback local inference",
|
||||
"status": "unknown",
|
||||
"success_count": 0
|
||||
},
|
||||
"vps_allegro": {
|
||||
"failure_count": 0,
|
||||
"host": "167.99.126.228",
|
||||
"last_check": null,
|
||||
"last_failure": null,
|
||||
"last_success": null,
|
||||
"notes": "SSH connectivity check",
|
||||
"status": "unknown",
|
||||
"success_count": 0
|
||||
},
|
||||
"vps_bezalel": {
|
||||
"failure_count": 0,
|
||||
"host": "159.203.146.185",
|
||||
"last_check": null,
|
||||
"last_failure": null,
|
||||
"last_success": null,
|
||||
"notes": "SSH connectivity check",
|
||||
"status": "unknown",
|
||||
"success_count": 0
|
||||
},
|
||||
"vps_ezra": {
|
||||
"failure_count": 0,
|
||||
"host": "143.198.27.163",
|
||||
"last_check": null,
|
||||
"last_failure": null,
|
||||
"last_success": null,
|
||||
"notes": "SSH connectivity check",
|
||||
"status": "unknown",
|
||||
"success_count": 0
|
||||
}
|
||||
},
|
||||
"consecutive_failures": 0,
|
||||
"current_provider": "kimi-coding",
|
||||
"deadman_switch": {
|
||||
"enabled": true,
|
||||
"escalation_channel": "telegram",
|
||||
"heartbeat_timeout_seconds": 300,
|
||||
"last_heartbeat": null,
|
||||
"max_restart_attempts": 3,
|
||||
"monitoring_interval_seconds": 60,
|
||||
"restart_cooldown_seconds": 60
|
||||
},
|
||||
"emergency_mode_since": null,
|
||||
"emergency_reason": null,
|
||||
"fallback_stack": [],
|
||||
"generated_at": "2026-04-08T20:11:31Z",
|
||||
"in_emergency_mode": false,
|
||||
"last_config_backup": null,
|
||||
"last_status_change": null,
|
||||
"next_check": null,
|
||||
"schema_version": 1,
|
||||
"wizard": "bezalel"
|
||||
}
|
||||
Reference in New Issue
Block a user