#!/usr/bin/env python3
"""
Model Watchdog — monitors tmux panes for model drift.

Checks all hermes TUI sessions in the tmux sessions listed in TMUX_SESSIONS.
A pane has "drifted" when the model shown in its status bar does not contain
the model expected for the profile it was started with (PROFILE_MODELS), or
ALLOWED_MODEL when the profile is unknown. Provider settings in a fixed set of
config.yaml files are also checked (read-only).

Usage: python3 ~/.hermes/bin/model-watchdog.py [--fix]
  --fix    Actually restart drifted panes (default: dry-run)

Exit status: 1 when any pane or config drift was reported, otherwise 0.
"""

import json
import os
import re
import shlex
import subprocess
import sys
import time

ALLOWED_MODEL = "mimo-v2-pro"

# Profile -> expected model. If a pane is running this profile with this model, it's healthy.
# Profiles not in this map are checked against ALLOWED_MODEL.
PROFILE_MODELS = {
    "default": "mimo-v2-pro",
    "timmy-sprint": "mimo-v2-pro",
    "fenrir": "mimo-v2-pro",
    "bezalel": "gpt-5.4",
    "burn": "mimo-v2-pro",
    "creative": "claude-sonnet",
    "research": "claude-sonnet",
    "review": "claude-sonnet",
}

TMUX_SESSIONS = ["dev", "timmy"]
LOG_FILE = os.path.expanduser("~/.hermes/logs/model-watchdog.log")

# Model as rendered in the TUI status bar: "⚕ model-name │".
_STATUS_BAR_MODEL_RE = re.compile(r'⚕\s+(\S+)\s+│')
# "-p <profile>" as a standalone flag (not the tail of some longer option).
_PROFILE_FLAG_RE = re.compile(r'(?:^|\s)-p\s+(\S+)')
# Exit status reported by run() when a command exceeds its timeout
# (same value the coreutils `timeout` tool uses).
_TIMEOUT_RC = 124


def log(msg):
    """Print *msg* with a timestamp and append the same line to LOG_FILE."""
    os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
    ts = time.strftime("%Y-%m-%d %H:%M:%S")
    line = f"[{ts}] {msg}"
    print(line)
    # Messages contain non-ASCII markers (✓, ⚠), so do not depend on the locale.
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(line + "\n")


def run(cmd):
    """Run *cmd* through the shell and return ``(stripped stdout, return code)``.

    A command that exceeds the 10 second timeout yields ``("", 124)`` instead
    of raising, so one hung tmux/ps call cannot abort the whole watchdog run.
    """
    try:
        r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=10)
    except subprocess.TimeoutExpired:
        return "", _TIMEOUT_RC
    return r.stdout.strip(), r.returncode


def _pane_target(session, pane_idx, window=None):
    """Build the tmux target string for a pane (``session:window.pane``)."""
    if window is not None and window != "":
        return f"{session}:{window}.{pane_idx}"
    return f"{session}.{pane_idx}"


def get_panes(session):
    """Get all pane info from ALL windows in a tmux session.

    Returns a list of dicts with keys ``session``, ``window`` (the window
    index as a string, safe to use in a tmux target), ``window_name``,
    ``index``, ``pid`` and ``tty``. Returns ``[]`` when tmux fails (for
    example when the session does not exist).
    """
    # Windows are addressed by index, not name: names may contain '.', ':' or
    # spaces, or be duplicated, all of which break "session:name.pane" targets.
    # The name goes last so a '|' inside it cannot shift the other fields.
    fmt = "#{window_index}|#{pane_index}|#{pane_pid}|#{pane_tty}|#{window_name}"
    out, rc = run(
        f"tmux list-panes -s -t {shlex.quote(session)} -F {shlex.quote(fmt)} 2>/dev/null"
    )
    if rc != 0:
        return []

    panes = []
    for line in out.split("\n"):
        parts = line.split("|", 4)
        if len(parts) != 5:
            continue
        window_idx, pane_idx, pid, tty, window_name = parts
        if not (window_idx.isdigit() and pane_idx.isdigit() and pid.isdigit()):
            continue  # malformed line; skip rather than crash the scan
        panes.append({
            "session": session,
            "window": window_idx,
            "window_name": window_name,
            "index": int(pane_idx),
            "pid": int(pid),
            "tty": tty,
        })
    return panes


def _hermes_processes_on_tty(tty, exclude=("gateway", "node")):
    """Return ``[(pid, command line), ...]`` for hermes processes attached to *tty*.

    *tty* is a device path as reported by tmux (``#{pane_tty}``). Processes
    whose command line contains any word in *exclude*, and this watchdog
    process itself, are skipped. An empty *tty* yields ``[]``.
    """
    tty_name = (tty or "").strip()
    if tty_name.startswith("/dev/"):
        # The ps "tty" column names the terminal without the /dev/ prefix.
        tty_name = tty_name[len("/dev/"):]
    if not tty_name:
        # Never fall back to "any terminal": that could select (and later
        # kill) a hermes process that has nothing to do with the pane.
        return []

    out, rc = run("ps -A -o pid=,tty=,args=")
    if rc != 0:
        return []

    own_pid = os.getpid()
    found = []
    for line in out.split("\n"):
        fields = line.split(None, 2)
        if len(fields) != 3:
            continue
        pid_text, proc_tty, command = fields
        if proc_tty != tty_name or not pid_text.isdigit():
            continue
        if "hermes" not in command or any(word in command for word in exclude):
            continue
        if int(pid_text) == own_pid:
            # This script lives under ~/.hermes, so its own command line matches.
            continue
        found.append((int(pid_text), command))
    return found


def get_hermes_pid_for_tty(tty):
    """Find hermes process running on a specific TTY; ``None`` if there is none."""
    processes = _hermes_processes_on_tty(tty)
    if processes:
        return processes[0][0]
    return None


def get_model_from_pane(session, pane_idx, window=None):
    """Capture the pane and extract the model from the status bar.

    Only the last 30 captured lines are inspected. Returns the first model
    name found there, or ``None`` when no status bar is visible.
    """
    target = shlex.quote(_pane_target(session, pane_idx, window))
    out, _ = run(f"tmux capture-pane -t {target} -p 2>/dev/null | tail -30")
    match = _STATUS_BAR_MODEL_RE.search(out)
    return match.group(1) if match else None


def check_session_meta(session_id):
    """Check what model a hermes session was last using from its session file.

    Looks at ``~/.hermes/sessions/session_<id>.json`` first, then at the first
    record of ``~/.hermes/sessions/<id>.jsonl``. Returns ``(model, provider)``;
    both are ``None`` when no readable session metadata is found.
    """
    sessions_dir = os.path.expanduser("~/.hermes/sessions")

    try:
        with open(os.path.join(sessions_dir, f"session_{session_id}.json"), encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, dict):
            return data.get("model"), data.get("provider")
    except (OSError, ValueError):
        # Missing, unreadable or malformed file: fall through to the jsonl form.
        pass

    try:
        with open(os.path.join(sessions_dir, f"{session_id}.jsonl"), encoding="utf-8") as f:
            first_line = f.readline()
        # Only the first record is consulted (as in the original code);
        # presumably the session_meta record is always written first.
        record = json.loads(first_line)
        if isinstance(record, dict) and record.get("role") == "session_meta":
            return record.get("model"), record.get("provider")
    except (OSError, ValueError):
        pass
    return None, None


def is_drifted(model_name, profile=None):
    """Check if a model name indicates drift from the expected model for this profile.

    Returns ``(drifted, label)``. When *model_name* is ``None`` nothing can be
    judged, so the result is ``(False, "no-model-detected")``. Otherwise the
    expected model (PROFILE_MODELS[profile], or ALLOWED_MODEL for an unknown or
    missing profile) must occur as a substring of *model_name*.
    """
    if model_name is None:
        return False, "no-model-detected"
    expected = PROFILE_MODELS.get(profile, ALLOWED_MODEL)
    return expected not in model_name, model_name


def get_profile_from_pane(tty):
    """Detect which hermes profile a pane is running by inspecting its process args.

    Returns the value following ``-p`` on the first matching hermes command
    line attached to *tty*, or ``None`` when no such process or flag exists.
    """
    for _pid, command in _hermes_processes_on_tty(tty, exclude=("gateway", "node", "cron")):
        match = _PROFILE_FLAG_RE.search(command)
        if match:
            return match.group(1)
    return None


def kill_and_restart(session, pane_idx, window=None):
    """Kill the hermes process in a pane and restart it with the same profile.

    Returns ``True`` when, after the restart, the pane shows the model
    expected for its profile; ``False`` otherwise.
    """
    target = _pane_target(session, pane_idx, window)
    quoted_target = shlex.quote(target)

    # Get the pane's TTY. display-message resolves exactly this pane, whereas
    # "list-panes -t" takes a window and would print every pane's tty in it.
    tty, _ = run(f"tmux display-message -p -t {quoted_target} '#{{pane_tty}}'")

    # Detect which profile was running (must happen before the kill).
    profile = get_profile_from_pane(tty)

    # Find and kill hermes on that TTY
    hermes_pid = get_hermes_pid_for_tty(tty)
    if hermes_pid:
        log(f"Killing hermes PID {hermes_pid} on {target} (tty={tty}, profile={profile})")
        run(f"kill {hermes_pid}")
        time.sleep(2)

    # Send Ctrl+C to clear any state
    run(f"tmux send-keys -t {quoted_target} C-c")
    time.sleep(1)

    # Restart hermes with the same profile. The profile is quoted for the
    # shell inside the pane, the whole command for the shell that runs tmux.
    if profile:
        cmd = f"hermes -p {shlex.quote(profile)} chat"
    else:
        cmd = "hermes chat"
    run(f"tmux send-keys -t {quoted_target} {shlex.quote(cmd)} Enter")
    log(f"Restarted hermes in {target} with: {cmd}")

    # Wait and verify against the model expected for THIS profile, not
    # unconditionally ALLOWED_MODEL.
    time.sleep(8)
    new_model = get_model_from_pane(session, pane_idx, window)
    still_drifted, _ = is_drifted(new_model, profile)
    if new_model and not still_drifted:
        log(f"✓ {target} now on {new_model}")
        return True
    log(f"⚠ {target} model after restart: {new_model}")
    return False


def verify_expected_model(provider_yaml, expected):
    """Compare actual provider in a YAML config against expected value.

    Surrounding whitespace is ignored; ``None`` is treated as an empty string.
    """
    return (provider_yaml or "").strip() == (expected or "").strip()


def _scan_model_provider(content):
    """Fallback for missing PyYAML: find ``provider:`` inside the top-level ``model:`` block."""
    in_model = False
    for line in content.split("\n"):
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            continue  # blank lines and comments never open or close a block
        if line[0] not in " \t":
            # Any top-level key starts a new block; only "model:" is of interest.
            in_model = stripped.startswith("model:")
            continue
        if in_model and stripped.startswith("provider:"):
            value = stripped.split(":", 1)[1]
            # Drop a trailing comment and optional quotes around the value.
            return value.split(" #", 1)[0].strip().strip("'\"")
    return ""


def _read_model_provider(content):
    """Return the ``model.provider`` value from YAML text, or ``""`` if absent."""
    try:
        import yaml
    except ImportError:
        return _scan_model_provider(content)
    cfg = yaml.safe_load(content)
    model = cfg.get("model") if isinstance(cfg, dict) else None
    # "model" may also be a plain scalar, in which case there is no provider key.
    provider = model.get("provider") if isinstance(model, dict) else None
    return str(provider) if provider else ""


def check_config_drift():
    """Scan all relevant config.yaml files for provider drift. Does NOT modify anything.

    Returns list of drift issues found (human-readable strings). Files that do
    not exist are skipped; files that cannot be read or parsed are reported as
    ``CONFIG CHECK ERROR`` entries.
    """
    issues = []
    configs = {
        "main_config": (os.path.expanduser("~/.hermes/config.yaml"), "nous"),
        "fenrir": (os.path.expanduser("~/.hermes/profiles/fenrir/config.yaml"), "nous"),
        "timmy_sprint": (os.path.expanduser("~/.hermes/profiles/timmy-sprint/config.yaml"), "nous"),
        "default_profile": (os.path.expanduser("~/.hermes/profiles/default/config.yaml"), "nous"),
    }
    for name, (path, expected_provider) in configs.items():
        if not os.path.exists(path):
            continue
        try:
            with open(path, "r", encoding="utf-8") as f:
                content = f.read()
            # Read model.provider specifically (not the first "provider:" line).
            actual = _read_model_provider(content)
            if actual and expected_provider and not verify_expected_model(actual, expected_provider):
                issues.append(f"CONFIG DRIFT [{name}]: provider is '{actual}' (expected '{expected_provider}')")
        except Exception as e:
            # Reporting boundary: one bad file must not stop the other checks.
            issues.append(f"CONFIG CHECK ERROR [{name}]: {e}")
    return issues


def main():
    """Run the watchdog once; return the process exit status (0 healthy, 1 drift)."""
    fix_mode = "--fix" in sys.argv
    drift_found = False
    issues = []
    total_panes = 0

    # Always check config files for provider drift (read-only, never writes).
    # Each issue already carries its own "CONFIG DRIFT"/"CONFIG CHECK ERROR" tag.
    config_drift_issues = check_config_drift()
    for issue in config_drift_issues:
        log(issue)

    for session in TMUX_SESSIONS:
        for pane in get_panes(session):
            total_panes += 1
            window = pane.get("window")
            target = _pane_target(session, pane["index"], window)

            # Detect profile from the running process on this pane's own tty.
            profile = get_profile_from_pane(pane["tty"])

            model = get_model_from_pane(session, pane["index"], window)
            drifted, model_name = is_drifted(model, profile)
            if not drifted:
                continue

            drift_found = True
            issues.append(f"{target}: {model_name} (profile={profile})")
            log(f"DRIFT DETECTED: {target} is on '{model_name}' (profile={profile}, expected='{PROFILE_MODELS.get(profile, ALLOWED_MODEL)}')")

            if fix_mode:
                log(f"Auto-fixing {target}...")
                success = kill_and_restart(session, pane["index"], window)
                if not success:
                    issues.append(f"  ↳ RESTART FAILED for {target}")

    if not drift_found:
        # Profiles may legitimately run different models, so do not name one.
        log(f"All {total_panes} panes healthy (on expected models)")

    # Print summary for cron output
    if issues or config_drift_issues:
        print("\n=== MODEL DRIFT REPORT ===")
        for issue in issues:
            print(f"  [PANE] {issue}")
        for issue in config_drift_issues:
            print(f"  [CONFIG] {issue}")
        if issues and not fix_mode:
            # --fix only restarts panes; it never touches config files.
            print("\nRun with --fix to auto-restart drifted panes.")
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())