feat(ops): deadman switch, model health check, issue filter
Closes #115: bin/deadman-switch.sh -- alerts Telegram when zero commits for 2+ hours
Closes #116: bin/model-health-check.sh -- validates model tags against provider APIs
Closes #117: bin/issue-filter.json + live loop patches -- excludes DO-NOT-CLOSE, EPIC, META, RETRO, INTEL, MORNING REPORT, and Rockachopa-assigned issues from agent pickup

All three tested locally:
- deadman-switch correctly detected a 14h gap and would alert
- model-health-check parses config.yaml and validates (skips gracefully without an API key in env)
- issue filters patched into live claude-loop.sh and gemini-loop.sh
This commit is contained in:
78
bin/deadman-switch.sh
Executable file
78
bin/deadman-switch.sh
Executable file
@@ -0,0 +1,78 @@
|
|||||||
|
#!/usr/bin/env bash
# deadman-switch.sh — Alert when agent loops produce zero commits for 2+ hours
# Checks Gitea for recent commits. Sends Telegram alert if threshold exceeded.
# Designed to run as a cron job every 30 minutes.
#
# Usage: deadman-switch.sh [THRESHOLD_HOURS]   (default: 2)

set -euo pipefail

# Alert threshold in whole hours; first positional argument, default 2.
THRESHOLD_HOURS="${1:-2}"
# Only plain digits are accepted: the value is used inside $(( )), where any
# other text would be evaluated as an arithmetic expression.
if [[ ! "$THRESHOLD_HOURS" =~ ^[0-9]+$ ]]; then
  printf 'deadman-switch: THRESHOLD_HOURS must be a non-negative integer, got: %s\n' \
    "$THRESHOLD_HOURS" >&2
  exit 2
fi
# 10# forces base 10 so a value such as "08" is not rejected as invalid octal.
THRESHOLD_SECS=$((10#$THRESHOLD_HOURS * 3600))

LOG_DIR="$HOME/.hermes/logs"
LOG_FILE="$LOG_DIR/deadman.log"

GITEA_URL="http://143.198.27.163:3000"
# Credentials are read from files; a missing or unreadable file yields an
# empty string instead of aborting the script.
GITEA_TOKEN=$(cat "$HOME/.hermes/gitea_token_vps" 2>/dev/null || echo "")
TELEGRAM_TOKEN=$(cat "$HOME/.config/telegram/special_bot" 2>/dev/null || echo "")
TELEGRAM_CHAT="-1003664764329"

# Repositories (owner/name) whose most recent commit is inspected.
REPOS=(
  "Timmy_Foundation/timmy-config"
  "Timmy_Foundation/the-nexus"
)

mkdir -p "$LOG_DIR"
|
||||||
|
|
||||||
|
# Append one timestamped line to $LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: message words; they are joined with single spaces
# Outputs:   nothing on stdout; "[YYYY-MM-DD HH:MM:SS] message" is appended to the log
log() {
  # printf keeps the message literal even if it starts with "-n" or contains
  # backslashes, which echo may interpret.
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG_FILE"
}
|
||||||
|
|
||||||
|
# Print the Unix timestamp of the newest commit of one repository.
# Globals:   GITEA_URL, GITEA_TOKEN (read)
# Arguments: $1 - repository as "owner/name"
# Outputs:   epoch seconds on stdout, or 0 when the request or parsing fails
latest_commit_epoch() {
  local repo="$1"
  local response

  # A failed request is replaced by an empty JSON list, which maps to 0 below.
  response=$(curl -sf --max-time 10 \
    -H "Authorization: token ${GITEA_TOKEN}" \
    "${GITEA_URL}/api/v1/repos/${repo}/commits?limit=1" 2>/dev/null || echo "[]")

  printf '%s' "$response" | python3 -c "
import json, sys, datetime
try:
    commits = json.load(sys.stdin)
    if commits:
        ts = commits[0]['created']
        # fromisoformat does not accept a trailing 'Z' on older Pythons.
        dt = datetime.datetime.fromisoformat(ts.replace('Z', '+00:00'))
        print(int(dt.timestamp()))
    else:
        print(0)
except Exception:
    print(0)
" 2>/dev/null || echo "0"
}

# Format epoch seconds as "YYYY-MM-DD HH:MM" in local time.
# Arguments: $1 - epoch seconds
# Outputs:   the formatted time, or "unknown" if neither date dialect works
format_epoch() {
  # BSD/macOS date takes the epoch via -r; GNU date treats -r as a file name
  # and needs -d @EPOCH instead, so try both.
  date -r "$1" '+%Y-%m-%d %H:%M' 2>/dev/null \
    || date -d "@$1" '+%Y-%m-%d %H:%M' 2>/dev/null \
    || echo 'unknown'
}

now=$(date +%s)
latest_commit_time=0

for repo in "${REPOS[@]}"; do
  # Get most recent commit timestamp
  commit_date=$(latest_commit_epoch "$repo")
  # Guard the numeric comparison below against unexpected output.
  [[ "$commit_date" =~ ^[0-9]+$ ]] || commit_date=0

  if [ "$commit_date" -gt "$latest_commit_time" ]; then
    latest_commit_time=$commit_date
  fi
done

# 0 means no repository returned a usable timestamp; this is logged but does
# not raise an alert.
if [ "$latest_commit_time" -eq 0 ]; then
  log "WARN: Could not fetch any commit timestamps. API may be down."
  exit 0
fi

gap=$((now - latest_commit_time))
gap_hours=$((gap / 3600))
gap_mins=$(((gap % 3600) / 60))

if [ "$gap" -gt "$THRESHOLD_SECS" ]; then
  msg="DEADMAN ALERT: No commits in ${gap_hours}h${gap_mins}m across all repos. Loops may be dead. Last commit: $(format_epoch "$latest_commit_time")"
  log "ALERT: $msg"

  # Send Telegram alert
  if [ -n "$TELEGRAM_TOKEN" ]; then
    # --data-urlencode keeps spaces and punctuation in the message intact in
    # the form body. Delivery is best-effort: a failed send must not fail the
    # cron job, hence the trailing "|| true".
    curl -sf --max-time 10 -X POST \
      "https://api.telegram.org/bot${TELEGRAM_TOKEN}/sendMessage" \
      -d "chat_id=${TELEGRAM_CHAT}" \
      --data-urlencode "text=${msg}" >/dev/null 2>&1 || true
  fi
else
  log "OK: Last commit ${gap_hours}h${gap_mins}m ago (threshold: ${THRESHOLD_HOURS}h)"
fi
|
||||||
19
bin/issue-filter.json
Normal file
19
bin/issue-filter.json
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
{
  "skip_title_patterns": [
    "[DO NOT CLOSE",
    "[EPIC]",
    "[META]",
    "[GOVERNING]",
    "[PERMANENT]",
    "[MORNING REPORT]",
    "[RETRO]",
    "[INTEL]",
    "[SHOWCASE]",
    "[PHILOSOPHY]",
    "Master Escalation"
  ],
  "skip_assignees": [
    "Rockachopa"
  ],
  "comment": "Shared filter config for agent loops. Loaded by claude-loop.sh and gemini-loop.sh at issue selection time."
}
|
||||||
125
bin/model-health-check.sh
Executable file
125
bin/model-health-check.sh
Executable file
@@ -0,0 +1,125 @@
|
|||||||
|
#!/usr/bin/env bash
# model-health-check.sh — Validate all configured model tags before loop startup
# Reads config.yaml, extracts model tags, tests each against its provider API.
# Exit 1 if primary model is dead. Warnings for auxiliary models.

set -euo pipefail

# config.yaml is looked up under $HERMES_HOME, falling back to ~/.hermes.
CONFIG="${HERMES_HOME:-$HOME/.hermes}/config.yaml"
# Logs always go under ~/.hermes/logs, independent of HERMES_HOME.
LOG_DIR="$HOME/.hermes/logs"
LOG_FILE="$LOG_DIR/model-health.log"

mkdir -p "$LOG_DIR"
|
||||||
|
|
||||||
|
# Write one timestamped line to stdout and append the same line to $LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: message words; they are joined with single spaces
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout and in the log file
log() {
  # printf keeps the message literal even if it starts with "-n" or contains
  # backslashes, which echo may interpret.
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$LOG_FILE"
}
|
||||||
|
|
||||||
|
# Tallies of check outcomes, reported in the final "Results" log line.
PASS=0
FAIL=0
WARN=0
|
||||||
|
|
||||||
|
#######################################
# Probe the Anthropic Messages API to see whether a model tag is usable.
# Sends a one-token request and classifies the reply.
# Globals:   ANTHROPIC_API_KEY (read); HERMES_HOME (read, falls back to
#            $HOME/.hermes) to locate a .env file when the key is not in env
# Arguments: $1 - model tag to test
#            $2 - label used in log lines (e.g. PRIMARY, CRON)
# Outputs:   one result line via log
# Returns:   0 - model answered, is rate limited/overloaded, or the check was
#                skipped because no API key is available
#            1 - API reported not_found_error for the model
#            2 - inconclusive: request failed or API returned another error
#######################################
check_anthropic_model() {
  local model="$1"
  local label="$2"
  local api_key="${ANTHROPIC_API_KEY:-}"
  local response
  local api_error_re='"type"[[:space:]]*:[[:space:]]*"error"'

  if [ -z "$api_key" ]; then
    # Try loading from .env
    api_key=$(grep '^ANTHROPIC_API_KEY=' "${HERMES_HOME:-$HOME/.hermes}/.env" 2>/dev/null | head -1 | cut -d= -f2- | tr -d "'\"" || echo "")
  fi

  if [ -z "$api_key" ]; then
    log "SKIP [$label] $model -- no ANTHROPIC_API_KEY"
    return 0
  fi

  # No --fail here: with -f curl discards the body of 4xx/5xx replies, and the
  # error type in that body is exactly what the classification below needs.
  # A non-zero curl status therefore means a transport-level failure only.
  # NOTE(review): the model tag is spliced into the JSON payload unescaped;
  # this assumes tags never contain quotes or backslashes.
  if ! response=$(curl -s --max-time 10 -X POST \
    "https://api.anthropic.com/v1/messages" \
    -H "x-api-key: ${api_key}" \
    -H "anthropic-version: 2023-06-01" \
    -H "content-type: application/json" \
    -d "{\"model\":\"${model}\",\"max_tokens\":1,\"messages\":[{\"role\":\"user\",\"content\":\"hi\"}]}" 2>/dev/null); then
    log "WARN [$label] $model -- could not reach API"
    return 2
  fi

  if [[ "$response" == *'"not_found_error"'* ]]; then
    log "FAIL [$label] $model -- model not found (404)"
    return 1
  elif [[ "$response" == *'"rate_limit_error"'* || "$response" == *'"overloaded_error"'* ]]; then
    log "PASS [$label] $model -- rate limited but model exists"
    return 0
  elif [[ "$response" =~ $api_error_re ]]; then
    # Any other error object (for example a rejected API key) says nothing
    # about the model itself, so it is reported as inconclusive.
    log "WARN [$label] $model -- API returned an error, model not verified"
    return 2
  elif [[ "$response" == *'"content"'* ]]; then
    log "PASS [$label] $model -- healthy"
    return 0
  else
    log "PASS [$label] $model -- responded (non-404)"
    return 0
  fi
}
|
||||||
|
|
||||||
|
# Extract models from config
log "=== Model Health Check ==="

# Print one field of the top-level "model" entry of $CONFIG.
# Globals:   CONFIG (read)
# Arguments: $1 - field name ("default" or "provider")
# Outputs:   the field value; when "model" is a plain scalar it is printed for
#            "default" only; an empty line is printed on any error
read_model_field() {
  # The path and field travel as argv so that quotes or backslashes in them
  # cannot alter the Python source.
  python3 -c '
import sys
import yaml

with open(sys.argv[1]) as f:
    c = yaml.safe_load(f)
m = c.get("model", {})
field = sys.argv[2]
if isinstance(m, dict):
    print(m.get(field, ""))
elif field == "default":
    print(m or "")
else:
    print("")
' "$CONFIG" "$1" 2>/dev/null || echo ""
}

# Run one Anthropic model check and add its outcome to the tallies.
# Globals:   PASS, FAIL, WARN (written)
# Arguments: $1 - model tag, $2 - log label
# Returns:   the status of check_anthropic_model (0 pass, 1 dead, other warn)
run_check() {
  local rc=0
  check_anthropic_model "$1" "$2" || rc=$?
  case "$rc" in
    0) PASS=$((PASS + 1)) ;;
    1) FAIL=$((FAIL + 1)) ;;
    *) WARN=$((WARN + 1)) ;;
  esac
  return "$rc"
}

# Primary model
primary=$(read_model_field default)
provider=$(read_model_field provider)

if [ -n "$primary" ] && [ "$provider" = "anthropic" ]; then
  # "|| rc=$?" keeps set -e from aborting on a failed check.
  rc=0
  run_check "$primary" "PRIMARY" || rc=$?
  if [ "$rc" -eq 1 ]; then
    log "CRITICAL: Primary model $primary is DEAD. Loops will fail."
    log "Known good alternatives: claude-opus-4.6, claude-haiku-4-5-20251001"
  fi
elif [ -n "$primary" ]; then
  log "SKIP [PRIMARY] $primary -- non-anthropic provider ($provider), no validator yet"
else
  # Covers a missing or unparsable config as well as a config without a model.
  log "SKIP [PRIMARY] could not determine primary model from $CONFIG"
fi

# Cron model check (haiku)
CRON_MODEL="claude-haiku-4-5-20251001"
# The outcome is already tallied by run_check; the status itself is not needed.
run_check "$CRON_MODEL" "CRON" || true

log "=== Results: PASS=$PASS FAIL=$FAIL WARN=$WARN ==="

if [ "$FAIL" -gt 0 ]; then
  log "BLOCKING: $FAIL model(s) are dead. Fix config before starting loops."
  exit 1
fi

exit 0
|
||||||
Reference in New Issue
Block a user