#!/usr/bin/env python3
"""
hermes_cleanup.py — Kill stale hermes processes consuming resources.

Identifies hermes sessions that have been idle too long and terminates them
along with their child processes (MCP servers, etc.).

Usage:
    python3 hermes_cleanup.py                 # dry run (report only)
    python3 hermes_cleanup.py --kill          # kill stale processes
    python3 hermes_cleanup.py --max-age 24    # custom age threshold (hours)
    python3 hermes_cleanup.py --max-cpu 0.5   # custom idle CPU threshold (%)
    python3 hermes_cleanup.py --dry-run       # force report-only mode
    python3 hermes_cleanup.py --json          # JSON output
"""

import json
import os
import signal
import subprocess
import sys
import time
from datetime import datetime, timedelta, timezone
from typing import Dict, List, Optional


def get_hermes_processes() -> List[dict]:
    """Return one dict per ``ps aux`` row whose command mentions hermes.

    Each dict has the keys ``user``, ``pid``, ``cpu``, ``mem``, ``vsz``,
    ``rss``, ``tty``, ``stat``, ``start``, ``time`` and ``command``.
    Rows that cannot be parsed are skipped individually, and this script's
    own process is never reported.

    Returns:
        The matching processes, or an empty list when ``ps`` cannot be run
        or times out.
    """
    try:
        result = subprocess.run(
            ["ps", "aux"],
            capture_output=True, text=True, timeout=10
        )
    except (subprocess.TimeoutExpired, OSError):
        return []

    own_pid = os.getpid()
    processes = []
    for line in result.stdout.split('\n'):
        parts = line.split(None, 10)
        if len(parts) < 11:
            continue
        command = parts[10]
        # Match on the command column only, so that a user name containing
        # "hermes" does not drag every process of that user into the list.
        if 'hermes' not in command.lower() or 'grep' in command:
            continue
        try:
            proc = {
                "user": parts[0],
                "pid": int(parts[1]),
                "cpu": float(parts[2]),
                "mem": float(parts[3]),
                "vsz": int(parts[4]),
                "rss": int(parts[5]),
                "tty": parts[6],
                "stat": parts[7],
                "start": parts[8],
                "time": parts[9],
                "command": command,
            }
        except ValueError:
            # A single malformed row must not discard the whole listing.
            continue
        # This script has "hermes" in its own command line; never list it.
        if proc["pid"] == own_pid:
            continue
        processes.append(proc)
    return processes


def get_process_age_hours(pid: int) -> Optional[float]:
    """Return the elapsed running time of *pid* in hours.

    The value comes from ``ps -o etimes=`` (elapsed seconds).

    Returns:
        The age in hours, or ``None`` when ``ps`` fails, times out, cannot
        be started, or prints something that is not an integer.
    """
    try:
        result = subprocess.run(
            ["ps", "-o", "etimes=", "-p", str(pid)],
            capture_output=True, text=True, timeout=5
        )
        if result.returncode == 0:
            elapsed_seconds = int(result.stdout.strip())
            return elapsed_seconds / 3600
    except (subprocess.TimeoutExpired, ValueError, OSError):
        pass
    return None


def get_child_pids(pid: int) -> List[int]:
    """Return the PIDs that ``pgrep -P`` reports as children of *pid*.

    Returns:
        The child PIDs, or an empty list when there are none or when
        ``pgrep`` fails, times out, or cannot be started.
    """
    try:
        result = subprocess.run(
            ["pgrep", "-P", str(pid)],
            capture_output=True, text=True, timeout=5
        )
        output = result.stdout.strip()
        if result.returncode == 0 and output:
            return [int(p) for p in output.split('\n')]
    except (subprocess.TimeoutExpired, ValueError, OSError):
        pass
    return []


def get_session_processes() -> Dict[str, List[dict]]:
    """Group hermes processes into sessions keyed by the main PID.

    Every process whose command contains ``"hermes"`` becomes a session.
    The first list element is that process itself; the remaining elements
    describe its direct children (``pid``, ``cpu``, ``mem``, ``rss``,
    ``command``). Children whose details cannot be read are left out.

    Returns:
        Mapping of ``str(pid)`` to the list of process dicts of the session.
    """
    sessions: Dict[str, List[dict]] = {}
    for proc in get_hermes_processes():
        if "hermes" not in proc["command"]:
            continue

        # The PID is used as the session key; nothing better is extracted.
        key = str(proc["pid"])
        sessions[key] = [proc]

        for child_pid in get_child_pids(proc["pid"]):
            try:
                # pcpu/pmem are the percentage columns; the bare names
                # "cpu"/"mem" are not procps format specifiers.
                child_result = subprocess.run(
                    ["ps", "-p", str(child_pid), "-o", "pid,pcpu,pmem,rss,command"],
                    capture_output=True, text=True, timeout=5
                )
                if child_result.returncode != 0:
                    continue
                lines = child_result.stdout.strip().split('\n')
                if len(lines) < 2:
                    continue
                # lines[0] is the header row, lines[1] the process row.
                parts = lines[1].split(None, 4)
                if len(parts) >= 5:
                    sessions[key].append({
                        "pid": int(parts[0]),
                        "cpu": float(parts[1]),
                        "mem": float(parts[2]),
                        "rss": int(parts[3]),
                        "command": parts[4],
                    })
            except (subprocess.TimeoutExpired, ValueError, OSError):
                # Best effort: the child may have exited between the two calls.
                continue
    return sessions


def identify_stale_sessions(max_age_hours: float = 24,
                            max_cpu_threshold: float = 0.5) -> List[dict]:
    """Identify sessions that are stale (old AND idle).

    A session is stale when its main process is older than *max_age_hours*
    and its CPU value is below *max_cpu_threshold*. Sessions whose age
    cannot be determined are skipped.

    Returns:
        One summary dict per stale session (``session_key``, ``main_pid``,
        ``age_hours``, ``cpu_percent``, ``total_rss_kb``, ``total_rss_mb``,
        ``process_count``, ``command``, ``children``), oldest first.
    """
    stale = []
    for session_key, procs in get_session_processes().items():
        if not procs:
            continue

        main_proc = procs[0]
        pid = main_proc["pid"]
        age = get_process_age_hours(pid)
        if age is None:
            continue

        is_old = age > max_age_hours
        is_idle = main_proc["cpu"] < max_cpu_threshold
        if not (is_old and is_idle):
            continue

        total_rss = sum(p.get("rss", 0) for p in procs)
        stale.append({
            "session_key": session_key,
            "main_pid": pid,
            "age_hours": round(age, 1),
            "cpu_percent": main_proc["cpu"],
            "total_rss_kb": total_rss,
            "total_rss_mb": round(total_rss / 1024, 1),
            "process_count": len(procs),
            "command": main_proc["command"][:100],
            "children": [p["pid"] for p in procs[1:]],
        })

    return sorted(stale, key=lambda x: -x["age_hours"])


def _terminate(pid: int, dry_run: bool, killed: List[int], errors: List[str]) -> None:
    """Send SIGTERM to *pid* (or only record it in a dry run)."""
    if dry_run:
        killed.append(pid)
        return
    try:
        os.kill(pid, signal.SIGTERM)
        killed.append(pid)
    except ProcessLookupError:
        # Already gone: nothing to kill and nothing to report.
        pass
    except Exception as e:
        errors.append(f"PID {pid}: {e}")


def kill_session(session: dict, dry_run: bool = True) -> dict:
    """Kill a stale session and its children.

    Children are signalled before the main process. In a dry run no signal
    is sent and every PID is reported as if it had been killed.

    Args:
        session: Summary dict with ``session_key``, ``main_pid`` and
            ``children`` keys.
        dry_run: When true (the default), do not send any signal.

    Returns:
        Dict with ``session``, ``killed`` (PIDs), ``errors`` (messages) and
        ``dry_run``.
    """
    killed: List[int] = []
    errors: List[str] = []

    for child_pid in session["children"]:
        _terminate(child_pid, dry_run, killed, errors)
    _terminate(session["main_pid"], dry_run, killed, errors)

    return {
        "session": session["session_key"],
        "killed": killed,
        "errors": errors,
        "dry_run": dry_run,
    }


def generate_report(stale: List[dict]) -> str:
    """Generate the human-readable report for the given stale sessions.

    At most the first 20 sessions are listed individually; any remainder
    is summarised in a single trailing line.
    """
    lines = []
    lines.append("=" * 60)
    lines.append(" HERMES STALE PROCESS REPORT")
    # Timezone-aware replacement for the deprecated datetime.utcnow().
    lines.append(f" {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}")
    lines.append("=" * 60)

    if not stale:
        lines.append("\n No stale sessions found. System healthy.")
        lines.append("=" * 60)
        return "\n".join(lines)

    total_rss = sum(s["total_rss_mb"] for s in stale)
    total_procs = sum(s["process_count"] for s in stale)

    lines.append(f"\n Stale sessions: {len(stale)}")
    lines.append(f" Total processes: {total_procs}")
    lines.append(f" Total memory waste: {total_rss:.1f} MB ({total_rss/1024:.1f} GB)")
    lines.append("")

    for i, s in enumerate(stale[:20], 1):
        lines.append(f" {i:>2}. PID {s['main_pid']:<8} age={s['age_hours']:>6.1f}h "
                     f"cpu={s['cpu_percent']:>5.1f}% rss={s['total_rss_mb']:>6.1f}MB "
                     f"procs={s['process_count']}")
        lines.append(f" cmd: {s['command'][:70]}")

    if len(stale) > 20:
        lines.append(f"\n ... and {len(stale) - 20} more")

    lines.append("=" * 60)
    return "\n".join(lines)


def main():
    """Parse the command line, report stale sessions and optionally kill them.

    Nothing is killed unless ``--kill`` is given, and ``--dry-run`` always
    wins over ``--kill``. With ``--json`` only the JSON document is written
    to stdout; status messages go to stderr.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Hermes stale process cleanup")
    parser.add_argument("--kill", action="store_true",
                        help="Actually kill stale processes")
    parser.add_argument("--max-age", type=float, default=24,
                        help="Max age in hours (default: 24)")
    # argparse %-formats help strings, so a literal percent must be doubled.
    parser.add_argument("--max-cpu", type=float, default=0.5,
                        help="Max CPU%% to consider idle (default: 0.5)")
    parser.add_argument("--json", action="store_true", help="JSON output")
    parser.add_argument("--dry-run", action="store_true",
                        help="Report only (default)")
    args = parser.parse_args()

    stale = identify_stale_sessions(args.max_age, args.max_cpu)

    if args.json:
        output = {
            "stale_count": len(stale),
            "total_memory_mb": sum(s["total_rss_mb"] for s in stale),
            "sessions": stale,
        }
        print(json.dumps(output, indent=2))
    else:
        print(generate_report(stale))

    if not stale:
        return

    # Keep stdout machine-readable in JSON mode.
    status = sys.stderr if args.json else sys.stdout
    do_kill = args.kill and not args.dry_run

    if do_kill:
        print(f"\nKilling {len(stale)} stale sessions...", file=status)
        for session in stale:
            result = kill_session(session, dry_run=False)
            if result["errors"]:
                print(f" PID {session['main_pid']}: errors: {result['errors']}",
                      file=status)
            else:
                print(f" PID {session['main_pid']}: killed "
                      f"{len(result['killed'])} processes", file=status)
    else:
        print(f"\nDry run. Use --kill to terminate {len(stale)} stale sessions.",
              file=status)


if __name__ == "__main__":
    main()