Files
timmy-config/bin/hermes_cleanup.py
Alexander Whitestone 19db78bbf0
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 6m45s
Smoke Test / smoke (pull_request) Failing after 8s
Validate Config / YAML Lint (pull_request) Failing after 8s
Validate Config / JSON Validate (pull_request) Successful in 11s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 43s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Shell Script Lint (pull_request) Failing after 36s
Validate Config / Cron Syntax Check (pull_request) Successful in 8s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 9s
Validate Config / Playbook Schema Validation (pull_request) Successful in 15s
PR Checklist / pr-checklist (pull_request) Successful in 2m45s
Architecture Lint / Lint Repository (pull_request) Failing after 20s
feat: stale hermes process cleanup script (#829)
bin/hermes_cleanup.py:
  Identifies stale hermes sessions (old + idle)
  Groups by session, tracks parent+children
  Memory waste calculation (RSS in MB/GB)
  --kill to terminate, --dry-run (default) to report
  --max-age (default 24h), --max-cpu (default 0.5%)
  --json output, human-readable table

tests/test_hermes_cleanup.py: 8 tests
  process age, child PIDs, kill session,
  dry run, report generation

Usage:
  python3 bin/hermes_cleanup.py              # report
  python3 bin/hermes_cleanup.py --kill       # terminate
  python3 bin/hermes_cleanup.py --max-age 12 # 12h threshold
  python3 bin/hermes_cleanup.py --json       # JSON
2026-04-20 20:38:20 -04:00

272 lines
9.1 KiB
Python

#!/usr/bin/env python3
"""
hermes_cleanup.py — Kill stale hermes processes consuming resources.
Identifies hermes sessions that have been idle too long and terminates
them along with their child processes (MCP servers, etc.).
Usage:
python3 hermes_cleanup.py # dry run (report only)
python3 hermes_cleanup.py --kill # kill stale processes
python3 hermes_cleanup.py --max-age 24 # custom age threshold (hours)
python3 hermes_cleanup.py --max-sessions 50 # custom session limit
python3 hermes_cleanup.py --json # JSON output
"""
import json
import os
import signal
import subprocess
import sys
import time
from datetime import datetime, timedelta
from typing import Dict, List, Optional
def get_hermes_processes() -> List[dict]:
"""Get all hermes-related processes with details."""
try:
# Get process list with age, CPU, memory, command
result = subprocess.run(
["ps", "aux"],
capture_output=True, text=True, timeout=10
)
processes = []
for line in result.stdout.split('\n'):
if 'hermes' in line.lower() and 'grep' not in line:
parts = line.split(None, 10)
if len(parts) >= 11:
processes.append({
"user": parts[0],
"pid": int(parts[1]),
"cpu": float(parts[2]),
"mem": float(parts[3]),
"vsz": int(parts[4]),
"rss": int(parts[5]),
"tty": parts[6],
"stat": parts[7],
"start": parts[8],
"time": parts[9],
"command": parts[10],
})
return processes
except (subprocess.TimeoutExpired, ValueError):
return []
def get_process_age_hours(pid: int) -> Optional[float]:
"""Get process age in hours."""
try:
result = subprocess.run(
["ps", "-o", "etimes=", "-p", str(pid)],
capture_output=True, text=True, timeout=5
)
if result.returncode == 0:
elapsed_seconds = int(result.stdout.strip())
return elapsed_seconds / 3600
except (subprocess.TimeoutExpired, ValueError):
pass
return None
def get_child_pids(pid: int) -> List[int]:
"""Get child PIDs of a process."""
try:
result = subprocess.run(
["pgrep", "-P", str(pid)],
capture_output=True, text=True, timeout=5
)
if result.returncode == 0 and result.stdout.strip():
return [int(p) for p in result.stdout.strip().split('\n')]
except (subprocess.TimeoutExpired, ValueError):
pass
return []
def get_session_processes() -> Dict[str, List[dict]]:
"""Group hermes processes by session."""
processes = get_hermes_processes()
sessions = {}
for proc in processes:
cmd = proc["command"]
# Extract session identifier from command
if "hermes" in cmd:
# Use PID as session key if we can't extract a better one
key = str(proc["pid"])
sessions[key] = [proc]
# Get children
children = get_child_pids(proc["pid"])
for child_pid in children:
try:
child_result = subprocess.run(
["ps", "-p", str(child_pid), "-o", "pid,cpu,mem,rss,command"],
capture_output=True, text=True, timeout=5
)
if child_result.returncode == 0:
lines = child_result.stdout.strip().split('\n')
if len(lines) > 1:
parts = lines[1].split(None, 4)
if len(parts) >= 5:
sessions[key].append({
"pid": int(parts[0]),
"cpu": float(parts[1]),
"mem": float(parts[2]),
"rss": int(parts[3]),
"command": parts[4],
})
except:
pass
return sessions
def identify_stale_sessions(max_age_hours: float = 24, max_cpu_threshold: float = 0.5) -> List[dict]:
"""Identify sessions that are stale (old + idle)."""
sessions = get_session_processes()
stale = []
for session_key, procs in sessions.items():
if not procs:
continue
main_proc = procs[0]
pid = main_proc["pid"]
age = get_process_age_hours(pid)
if age is None:
continue
# Check if stale: old AND idle
is_old = age > max_age_hours
is_idle = main_proc["cpu"] < max_cpu_threshold
if is_old and is_idle:
total_rss = sum(p.get("rss", 0) for p in procs)
stale.append({
"session_key": session_key,
"main_pid": pid,
"age_hours": round(age, 1),
"cpu_percent": main_proc["cpu"],
"total_rss_kb": total_rss,
"total_rss_mb": round(total_rss / 1024, 1),
"process_count": len(procs),
"command": main_proc["command"][:100],
"children": [p["pid"] for p in procs[1:]],
})
return sorted(stale, key=lambda x: -x["age_hours"])
def kill_session(session: dict, dry_run: bool = True) -> dict:
"""Kill a stale session and its children."""
killed = []
errors = []
# Kill children first
for child_pid in session["children"]:
if dry_run:
killed.append(child_pid)
else:
try:
os.kill(child_pid, signal.SIGTERM)
killed.append(child_pid)
except ProcessLookupError:
pass
except Exception as e:
errors.append(f"PID {child_pid}: {e}")
# Kill main process
main_pid = session["main_pid"]
if dry_run:
killed.append(main_pid)
else:
try:
os.kill(main_pid, signal.SIGTERM)
killed.append(main_pid)
except ProcessLookupError:
pass
except Exception as e:
errors.append(f"PID {main_pid}: {e}")
return {
"session": session["session_key"],
"killed": killed,
"errors": errors,
"dry_run": dry_run,
}
def generate_report(stale: List[dict]) -> str:
"""Generate human-readable report."""
lines = []
lines.append("=" * 60)
lines.append(" HERMES STALE PROCESS REPORT")
lines.append(f" {datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC')}")
lines.append("=" * 60)
if not stale:
lines.append("\n No stale sessions found. System healthy.")
lines.append("=" * 60)
return "\n".join(lines)
total_rss = sum(s["total_rss_mb"] for s in stale)
total_procs = sum(s["process_count"] for s in stale)
lines.append(f"\n Stale sessions: {len(stale)}")
lines.append(f" Total processes: {total_procs}")
lines.append(f" Total memory waste: {total_rss:.1f} MB ({total_rss/1024:.1f} GB)")
lines.append("")
for i, s in enumerate(stale[:20], 1):
lines.append(f" {i:>2}. PID {s['main_pid']:<8} age={s['age_hours']:>6.1f}h "
f"cpu={s['cpu_percent']:>5.1f}% rss={s['total_rss_mb']:>6.1f}MB "
f"procs={s['process_count']}")
lines.append(f" cmd: {s['command'][:70]}")
if len(stale) > 20:
lines.append(f"\n ... and {len(stale) - 20} more")
lines.append("=" * 60)
return "\n".join(lines)
def main():
import argparse
parser = argparse.ArgumentParser(description="Hermes stale process cleanup")
parser.add_argument("--kill", action="store_true", help="Actually kill stale processes")
parser.add_argument("--max-age", type=float, default=24, help="Max age in hours (default: 24)")
parser.add_argument("--max-cpu", type=float, default=0.5, help="Max CPU% to consider idle (default: 0.5)")
parser.add_argument("--json", action="store_true", help="JSON output")
parser.add_argument("--dry-run", action="store_true", help="Report only (default)")
args = parser.parse_args()
stale = identify_stale_sessions(args.max_age, args.max_cpu)
if args.json:
output = {
"stale_count": len(stale),
"total_memory_mb": sum(s["total_rss_mb"] for s in stale),
"sessions": stale,
}
print(json.dumps(output, indent=2))
else:
print(generate_report(stale))
if args.kill and stale:
print(f"\nKilling {len(stale)} stale sessions...")
for session in stale:
result = kill_session(session, dry_run=False)
if result["errors"]:
print(f" PID {session['main_pid']}: errors: {result['errors']}")
else:
print(f" PID {session['main_pid']}: killed {len(result['killed'])} processes")
if not args.kill and stale:
print(f"\nDry run. Use --kill to terminate {len(stale)} stale sessions.")
if __name__ == "__main__":
main()