Some checks failed
Forge CI / smoke-and-build (pull_request) Failing after 59s
287 lines
10 KiB
Python
287 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Model Watchdog — monitors tmux panes for model drift.
|
|
Checks all hermes TUI sessions in dev and timmy tmux sessions.
|
|
If any pane is running a model other than the one expected for its profile, reports it and, with --fix, kills and restarts it.
|
|
|
|
Usage: python3 ~/.hermes/bin/model-watchdog.py [--fix]
|
|
--fix Actually restart drifted panes (default: dry-run)
|
|
"""
|
|
|
|
import subprocess
|
|
import sys
|
|
import re
|
|
import time
|
|
import os
|
|
|
|
# Model a pane is expected to run when its profile has no explicit entry below.
ALLOWED_MODEL = "mimo-v2-pro"

# Expected model for each known hermes profile. A pane whose status-bar model
# contains the mapped value is treated as healthy; unknown profiles are compared
# against ALLOWED_MODEL instead.
PROFILE_MODELS = {
    **dict.fromkeys(("default", "timmy-sprint", "fenrir"), "mimo-v2-pro"),
    "bezalel": "gpt-5.4",
    "burn": "mimo-v2-pro",
    **dict.fromkeys(("creative", "research", "review"), "claude-sonnet"),
}

# tmux sessions whose panes are inspected.
TMUX_SESSIONS = ["dev", "timmy"]

# Append-only log written by log().
LOG_FILE = os.path.expanduser("~/.hermes/logs/model-watchdog.log")
|
|
|
|
def log(msg):
    """Print a timestamped message and append it to LOG_FILE.

    The log directory is created on demand. Each entry has the form
    ``[YYYY-MM-DD HH:MM:SS] <msg>`` using local time.

    Args:
        msg: Text to record.
    """
    os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
    ts = time.strftime("%Y-%m-%d %H:%M:%S")
    line = f"[{ts}] {msg}"
    print(line)
    # Messages in this file contain non-ASCII glyphs (e.g. "✓", "⚠"); pin the
    # encoding so the write does not depend on the locale the job runs under.
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(line + "\n")
|
|
|
|
def run(cmd):
    """Run *cmd* through the shell and return ``(stdout, returncode)``.

    Args:
        cmd: Shell command line (pipes and redirections are allowed).

    Returns:
        A tuple of the command's stripped standard output and its exit status.
        If the command does not finish within 10 seconds, ``("", 124)`` is
        returned instead of raising, so callers that test ``rc != 0`` treat a
        hung command like any other failure.
    """
    try:
        r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=10)
    except subprocess.TimeoutExpired:
        # 124 mirrors the exit status used by coreutils `timeout`.
        return "", 124
    return r.stdout.strip(), r.returncode
|
|
|
|
def get_panes(session):
    """Collect pane information from every window of a tmux session.

    Args:
        session: tmux session name.

    Returns:
        A list of dicts with keys ``session``, ``window`` (window name),
        ``index`` (int pane index), ``pid`` (int pane PID) and ``tty``.
        The list is empty when the session's windows cannot be listed.
    """
    import shlex

    # Quote names before handing them to the shell: window names may contain
    # spaces or shell metacharacters.
    win_out, win_rc = run(
        f"tmux list-windows -t {shlex.quote(session)} -F '#{{window_name}}' 2>/dev/null"
    )
    if win_rc != 0:
        return []

    panes = []
    for window_name in win_out.split("\n"):
        if not window_name.strip():
            continue
        target = f"{session}:{window_name}"
        out, rc = run(
            f"tmux list-panes -t {shlex.quote(target)} "
            f"-F '#{{pane_index}}|#{{pane_pid}}|#{{pane_tty}}' 2>/dev/null"
        )
        if rc != 0:
            continue
        for line in out.split("\n"):
            fields = line.split("|", 2)
            if len(fields) != 3:
                continue
            idx, pid, tty = fields
            try:
                pane = {
                    "session": session,
                    "window": window_name,
                    "index": int(idx),
                    "pid": int(pid),
                    "tty": tty,
                }
            except ValueError:
                # Skip a malformed line rather than aborting the whole scan.
                continue
            panes.append(pane)
    return panes
|
|
|
|
def get_hermes_pid_for_tty(tty):
    """Find the PID of the hermes process attached to a TTY.

    Args:
        tty: TTY as reported by tmux (e.g. ``/dev/ttys031`` or ``/dev/pts/3``);
            an already-shortened name is accepted as well.

    Returns:
        The PID (int) of the first matching process in ``ps aux`` output whose
        line mentions hermes but not ``gateway`` or ``node``, or ``None`` when
        nothing matches or *tty* is empty.
    """
    # `ps aux` prints the short terminal name, not the device path:
    # /dev/ttys031 -> s031 (macOS), /dev/pts/3 -> pts/3 (Linux).
    short_tty = tty.strip()
    if short_tty.startswith("/dev/"):
        short_tty = short_tty[len("/dev/"):]
    if short_tty.startswith("ttys"):
        short_tty = short_tty[3:]
    if not short_tty:
        # An empty grep pattern would match every process.
        return None

    out, _ = run(
        f"ps aux | grep '{short_tty}' | grep '[h]ermes' | grep -v 'gateway' "
        f"| grep -v 'node' | awk '{{print $2}}'"
    )
    if out:
        return int(out.split("\n")[0])
    return None
|
|
|
|
def get_model_from_pane(session, pane_idx, window=None):
    """Capture a pane and extract the model name shown in its status bar.

    Args:
        session: tmux session name.
        pane_idx: Pane index within the window.
        window: Optional window name; when omitted the target is
            ``<session>.<pane_idx>``.

    Returns:
        The first token matching ``⚕ <model> │`` in the last 30 captured
        lines, or ``None`` when no such marker is present.
    """
    import shlex

    target = f"{session}:{window}.{pane_idx}" if window else f"{session}.{pane_idx}"
    # Quote the target: window names may contain spaces or shell metacharacters.
    out, _ = run(f"tmux capture-pane -t {shlex.quote(target)} -p 2>/dev/null | tail -30")
    # Look for model in status bar: ⚕ model-name │
    match = re.search(r'⚕\s+(\S+)\s+│', out)
    return match.group(1) if match else None
|
|
|
|
def check_session_meta(session_id):
    """Look up the model and provider recorded for a hermes session.

    Two files under ``~/.hermes/sessions`` are consulted in order:

    1. ``session_<session_id>.json`` — a JSON object whose ``model`` and
       ``provider`` keys are returned.
    2. ``<session_id>.jsonl`` — the first record whose ``role`` is
       ``"session_meta"`` supplies ``model`` and ``provider``.

    A missing, unreadable or malformed file is skipped silently (best effort).

    Args:
        session_id: Session identifier used to build the file names.

    Returns:
        ``(model, provider)``; either element may be ``None``. ``(None, None)``
        when nothing could be read.
    """
    import json

    # Errors that mean "this file is unusable": I/O failure, invalid JSON or
    # bad encoding (both ValueError subclasses), or a non-object document.
    unusable = (OSError, ValueError, AttributeError)

    session_file = os.path.expanduser(f"~/.hermes/sessions/session_{session_id}.json")
    try:
        with open(session_file, encoding="utf-8") as f:
            data = json.load(f)
        return data.get("model"), data.get("provider")
    except unusable:
        pass  # fall through to the JSONL variant

    # Try jsonl
    jsonl_file = os.path.expanduser(f"~/.hermes/sessions/{session_id}.jsonl")
    try:
        with open(jsonl_file, encoding="utf-8") as f:
            for raw in f:
                raw = raw.strip()
                if not raw:
                    continue  # tolerate blank lines between records
                record = json.loads(raw)
                if record.get("role") == "session_meta":
                    return record.get("model"), record.get("provider")
    except unusable:
        pass
    return None, None
|
|
|
|
def is_drifted(model_name, profile=None):
    """Decide whether a pane's model deviates from what its profile expects.

    Args:
        model_name: Model string read from the pane, or ``None`` if none was
            detected.
        profile: Optional hermes profile name. Profiles listed in
            PROFILE_MODELS are compared against their mapped model; anything
            else is compared against ALLOWED_MODEL.

    Returns:
        ``(drifted, label)``. With no detected model the result is
        ``(False, "no-model-detected")``; otherwise ``label`` is *model_name*
        and ``drifted`` is True when the expected model name is not a
        substring of it.
    """
    if model_name is None:
        return False, "no-model-detected"

    profile_is_known = bool(profile) and profile in PROFILE_MODELS
    wanted = PROFILE_MODELS[profile] if profile_is_known else ALLOWED_MODEL
    return wanted not in model_name, model_name
|
|
|
|
def get_profile_from_pane(tty):
    """Detect which hermes profile runs on a TTY by inspecting process args.

    Args:
        tty: TTY as reported by tmux (e.g. ``/dev/ttys031`` or ``/dev/pts/3``).

    Returns:
        The value following a ``-p`` flag in the first matching hermes command
        line (lines mentioning ``gateway``, ``node`` or ``cron`` are ignored),
        or ``None`` when no process or no ``-p`` flag is found, or *tty* is
        empty.
    """
    # ps shows the short TTY, not the device path:
    # /dev/ttys031 -> s031 (macOS), /dev/pts/3 -> pts/3 (Linux).
    short_tty = tty.strip()
    if short_tty.startswith("/dev/"):
        short_tty = short_tty[len("/dev/"):]
    if short_tty.startswith("ttys"):
        short_tty = short_tty[3:]
    if not short_tty:
        # An empty grep pattern would match every hermes process on the host.
        return None

    out, _ = run(
        f"ps aux | grep '{short_tty}' | grep '[h]ermes' | grep -v 'gateway' "
        f"| grep -v 'node' | grep -v cron"
    )
    if not out:
        return None
    # Require whitespace (or start of text) before -p so longer flags that
    # merely end in "-p" (e.g. --top-p) are not mistaken for the profile flag.
    match = re.search(r'(?:^|\s)-p\s+(\S+)', out)
    return match.group(1) if match else None
|
|
|
|
def kill_and_restart(session, pane_idx, window=None):
    """Kill the hermes process in a pane and restart it with the same profile.

    Steps: resolve the pane's TTY, detect the running profile, kill the hermes
    PID found on that TTY (if any), send Ctrl+C, type the restart command, wait,
    then re-read the status bar.

    Args:
        session: tmux session name.
        pane_idx: Pane index within the window.
        window: Optional window name; when omitted the target is
            ``<session>.<pane_idx>``.

    Returns:
        True when the model read back after the restart contains the model
        expected for the detected profile (ALLOWED_MODEL when the profile is
        unknown), False otherwise.
    """
    import shlex

    target = f"{session}:{window}.{pane_idx}" if window else f"{session}.{pane_idx}"
    # Quote the target: window names may contain spaces or shell metacharacters.
    quoted_target = shlex.quote(target)

    # Get the pane's TTY. display-message resolves exactly one pane, whereas
    # list-panes prints a line for every pane of the window.
    out, _ = run(f"tmux display-message -p -t {quoted_target} '#{{pane_tty}}'")
    tty = out.strip()

    # Detect which profile was running
    profile = get_profile_from_pane(tty)

    # Find and kill hermes on that TTY
    hermes_pid = get_hermes_pid_for_tty(tty)
    if hermes_pid:
        log(f"Killing hermes PID {hermes_pid} on {target} (tty={tty}, profile={profile})")
        run(f"kill {hermes_pid}")
        time.sleep(2)

    # Send Ctrl+C to clear any state
    run(f"tmux send-keys -t {quoted_target} C-c")
    time.sleep(1)

    # Restart hermes with the same profile
    cmd = f"hermes -p {profile} chat" if profile else "hermes chat"
    run(f"tmux send-keys -t {quoted_target} '{cmd}' Enter")
    log(f"Restarted hermes in {target} with: {cmd}")

    # Wait and verify against the model this profile is supposed to run, so a
    # profile mapped to a different model is not reported as a failed restart.
    time.sleep(8)
    new_model = get_model_from_pane(session, pane_idx, window)
    expected = PROFILE_MODELS.get(profile, ALLOWED_MODEL)
    if new_model and expected in new_model:
        log(f"✓ {target} now on {new_model}")
        return True
    log(f"⚠ {target} model after restart: {new_model}")
    return False
|
|
|
|
def verify_expected_model(provider_yaml, expected):
    """Return True when the configured provider equals the expected one.

    Both strings are compared after removing leading and trailing whitespace.
    """
    actual, wanted = (text.strip() for text in (provider_yaml, expected))
    return actual == wanted
|
|
|
|
def _read_model_provider(content):
    """Return the ``model.provider`` value from YAML text, or "" if absent."""
    try:
        import yaml
    except ImportError:
        yaml = None

    if yaml is not None:
        # Parse YAML to correctly read model.provider (not the first provider: line)
        cfg = yaml.safe_load(content)
        model = cfg.get("model") if isinstance(cfg, dict) else None
        # `model:` may be a plain string; then there is no provider to compare.
        provider = model.get("provider") if isinstance(model, dict) else None
        return "" if provider is None else str(provider)

    # Fallback: find provider under the top-level model: block via an
    # indentation-aware scan.
    in_model = False
    for line in content.split("\n"):
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            continue  # blank lines and comments never open or close a block
        indent = len(line) - len(line.lstrip())
        if indent == 0:
            in_model = stripped.startswith("model:")
            continue
        if in_model and stripped.startswith("provider:"):
            value = stripped.split(":", 1)[1]
            value = value.split(" #", 1)[0].strip()  # drop an inline comment
            return value.strip("'\"")
    return ""


def check_config_drift():
    """Scan the known hermes config.yaml files for provider drift.

    Read-only: nothing is modified. Files that do not exist are skipped.

    Returns:
        A list of message strings: ``CONFIG DRIFT [<name>]: ...`` for a file
        whose ``model.provider`` is set and differs from the expected provider,
        and ``CONFIG CHECK ERROR [<name>]: ...`` for a file that could not be
        read or parsed.
    """
    issues = []
    configs = {
        "main_config": (os.path.expanduser("~/.hermes/config.yaml"), "nous"),
        "fenrir": (os.path.expanduser("~/.hermes/profiles/fenrir/config.yaml"), "nous"),
        "timmy_sprint": (os.path.expanduser("~/.hermes/profiles/timmy-sprint/config.yaml"), "nous"),
        "default_profile": (os.path.expanduser("~/.hermes/profiles/default/config.yaml"), "nous"),
    }
    for name, (path, expected_provider) in configs.items():
        if not os.path.exists(path):
            continue
        try:
            with open(path, "r", encoding="utf-8") as f:
                content = f.read()
            actual = _read_model_provider(content)
        except Exception as e:
            # Reporting boundary: YAML parse errors cannot be named here
            # without importing yaml, so report whatever went wrong and move on.
            issues.append(f"CONFIG CHECK ERROR [{name}]: {e}")
            continue
        if actual and expected_provider and actual != expected_provider:
            issues.append(f"CONFIG DRIFT [{name}]: provider is '{actual}' (expected '{expected_provider}')")
    return issues
|
|
|
|
def main():
    """Check config files and tmux panes for model drift and report it.

    With ``--fix`` on the command line, drifted panes are killed and
    restarted; otherwise this is a dry run.

    Returns:
        Process exit status: 1 when any pane or config issue was found,
        0 otherwise.
    """
    fix_mode = "--fix" in sys.argv
    drift_found = False
    issues = []
    pane_count = 0

    # Always check config files for provider drift (read-only, never writes).
    # Each issue already carries its own "CONFIG DRIFT"/"CONFIG CHECK ERROR"
    # prefix, so log it as-is.
    config_drift_issues = check_config_drift()
    for issue in config_drift_issues:
        log(issue)

    for session in TMUX_SESSIONS:
        for pane in get_panes(session):
            pane_count += 1
            window = pane.get("window")
            target = f"{session}:{window}.{pane['index']}" if window else f"{session}.{pane['index']}"

            # Detect profile from the running process. get_panes() already
            # recorded this pane's TTY; asking `tmux list-panes` again would
            # return the TTYs of every pane in the window.
            profile = get_profile_from_pane(pane["tty"])

            model = get_model_from_pane(session, pane["index"], window)
            drifted, model_name = is_drifted(model, profile)
            if not drifted:
                continue

            drift_found = True
            issues.append(f"{target}: {model_name} (profile={profile})")
            log(f"DRIFT DETECTED: {target} is on '{model_name}' (profile={profile}, expected='{PROFILE_MODELS.get(profile, ALLOWED_MODEL)}')")

            if fix_mode:
                log(f"Auto-fixing {target}...")
                success = kill_and_restart(session, pane["index"], window)
                if not success:
                    issues.append(f"  ↳ RESTART FAILED for {target}")

    if not drift_found:
        # Report the panes actually inspected instead of re-running tmux.
        log(f"All {pane_count} panes healthy (on {ALLOWED_MODEL})")

    # Print summary for cron output
    if issues or config_drift_issues:
        print("\n=== MODEL DRIFT REPORT ===")
        for issue in issues:
            print(f"  [PANE] {issue}")
        for issue in config_drift_issues:
            print(f"  [CONFIG] {issue}")
        if not fix_mode:
            print("\nRun with --fix to auto-restart drifted panes.")
        return 1
    return 0
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|