Files
Timmy-time-dashboard/src/infrastructure/hermes/monitor.py
Claude (Opus 4.6) 495c1ac2bd
Some checks failed
Tests / lint (push) Has been cancelled
Tests / test (push) Has been cancelled
[claude] Fix 27 ruff lint errors blocking all pushes (#1149) (#1153)
Co-authored-by: Claude (Opus 4.6) <claude@hermes.local>
Co-committed-by: Claude (Opus 4.6) <claude@hermes.local>
2026-03-23 19:06:11 +00:00

661 lines
24 KiB
Python

"""Hermes health monitor — system resources + model management.
Monitors the local machine (Hermes/M3 Max) and keeps it running smoothly.
Runs every 5 minutes, auto-resolves issues where possible, alerts when
human intervention is needed.
Monitors:
1. Memory pressure — unified memory, alert if <4GB free, unload models
2. Disk usage — alert if <10GB free, clean temp files
3. Ollama status — verify reachable, restart if crashed, manage loaded models
4. Process health — detect zombie processes
5. Network — verify Gitea connectivity
Refs: #1073
"""
import asyncio
import json
import logging
import shutil
import subprocess
import tempfile
import time
import urllib.request
from dataclasses import dataclass, field
from datetime import UTC, datetime
from enum import StrEnum
from typing import Any
from config import settings
logger = logging.getLogger(__name__)
class HealthLevel(StrEnum):
"""Severity level for a health check result."""
OK = "ok"
WARNING = "warning"
CRITICAL = "critical"
UNKNOWN = "unknown"
@dataclass
class CheckResult:
"""Result of a single health check."""
name: str
level: HealthLevel
message: str
details: dict[str, Any] = field(default_factory=dict)
auto_resolved: bool = False
needs_human: bool = False
def to_dict(self) -> dict[str, Any]:
return {
"name": self.name,
"level": self.level.value,
"message": self.message,
"details": self.details,
"auto_resolved": self.auto_resolved,
"needs_human": self.needs_human,
}
@dataclass
class HealthReport:
"""Full health report from a single monitor cycle."""
timestamp: str
checks: list[CheckResult]
overall: HealthLevel
@property
def has_issues(self) -> bool:
return any(c.level != HealthLevel.OK for c in self.checks)
def to_dict(self) -> dict[str, Any]:
return {
"timestamp": self.timestamp,
"overall": self.overall.value,
"has_issues": self.has_issues,
"checks": [c.to_dict() for c in self.checks],
}
class HermesMonitor:
"""System health monitor for Hermes (local M3 Max machine).
All blocking I/O (subprocess, HTTP) is wrapped in asyncio.to_thread()
so it never blocks the event loop. Results are cached so the dashboard
can read the last report without triggering a new cycle.
"""
OLLAMA_REQUEST_TIMEOUT = 5
NETWORK_REQUEST_TIMEOUT = 5
def __init__(self) -> None:
self._last_report: HealthReport | None = None
self._last_run_ts: float = 0.0
@property
def last_report(self) -> HealthReport | None:
"""Most recent health report, or None if no cycle has run yet."""
return self._last_report
@property
def seconds_since_last_run(self) -> float:
if self._last_run_ts == 0.0:
return float("inf")
return time.monotonic() - self._last_run_ts
async def run_cycle(self) -> HealthReport:
"""Run a full health check cycle and return the report."""
self._last_run_ts = time.monotonic()
logger.info("Hermes health cycle starting")
check_fns = [
self._check_memory(),
self._check_disk(),
self._check_ollama(),
self._check_processes(),
self._check_network(),
]
raw_results = await asyncio.gather(*check_fns, return_exceptions=True)
checks: list[CheckResult] = []
for i, r in enumerate(raw_results):
if isinstance(r, Exception):
name = ["memory", "disk", "ollama", "processes", "network"][i]
logger.warning("Hermes check '%s' raised: %s", name, r)
checks.append(
CheckResult(
name=name,
level=HealthLevel.UNKNOWN,
message=f"Check error: {r}",
)
)
else:
checks.append(r)
# Compute overall level
levels = {c.level for c in checks}
if HealthLevel.CRITICAL in levels:
overall = HealthLevel.CRITICAL
elif HealthLevel.WARNING in levels:
overall = HealthLevel.WARNING
elif HealthLevel.UNKNOWN in levels:
overall = HealthLevel.UNKNOWN
else:
overall = HealthLevel.OK
report = HealthReport(
timestamp=datetime.now(UTC).isoformat(),
checks=checks,
overall=overall,
)
self._last_report = report
await self._handle_alerts(report)
logger.info("Hermes health cycle complete — overall: %s", overall.value)
return report
# ── Memory ───────────────────────────────────────────────────────────────
async def _check_memory(self) -> CheckResult:
"""Check unified memory usage (macOS vm_stat)."""
memory_free_min_gb = getattr(settings, "hermes_memory_free_min_gb", 4.0)
try:
info = await asyncio.to_thread(self._get_memory_info)
free_gb = info.get("free_gb", 0.0)
total_gb = info.get("total_gb", 0.0)
details: dict[str, Any] = {
"free_gb": round(free_gb, 2),
"total_gb": round(total_gb, 2),
}
if free_gb < memory_free_min_gb:
# Attempt auto-remediation: unload Ollama models
unloaded = await self._unload_ollama_models()
if unloaded:
return CheckResult(
name="memory",
level=HealthLevel.WARNING,
message=(
f"Low memory ({free_gb:.1f}GB free) — "
f"unloaded {unloaded} Ollama model(s)"
),
details={**details, "models_unloaded": unloaded},
auto_resolved=True,
)
return CheckResult(
name="memory",
level=HealthLevel.CRITICAL,
message=(
f"Critical: only {free_gb:.1f}GB free (threshold: {memory_free_min_gb}GB)"
),
details=details,
needs_human=True,
)
return CheckResult(
name="memory",
level=HealthLevel.OK,
message=f"Memory OK — {free_gb:.1f}GB free of {total_gb:.1f}GB",
details=details,
)
except Exception as exc:
logger.warning("Memory check failed: %s", exc)
return CheckResult(
name="memory",
level=HealthLevel.UNKNOWN,
message=f"Memory check unavailable: {exc}",
)
def _get_memory_info(self) -> dict[str, float]:
"""Get memory stats via macOS sysctl + vm_stat.
Falls back gracefully on non-macOS systems.
"""
gb = 1024**3
total_bytes = 0.0
free_bytes = 0.0
# Total memory via sysctl
try:
result = subprocess.run(
["sysctl", "-n", "hw.memsize"],
capture_output=True,
text=True,
timeout=3,
)
total_bytes = float(result.stdout.strip())
except Exception:
pass
# Free + inactive pages via vm_stat (macOS)
try:
result = subprocess.run(
["vm_stat"],
capture_output=True,
text=True,
timeout=3,
)
page_size = 16384 # 16 KB default on Apple Silicon
for line in result.stdout.splitlines():
if "page size of" in line:
parts = line.split()
for i, part in enumerate(parts):
if part == "of" and i + 1 < len(parts):
try:
page_size = int(parts[i + 1])
except ValueError:
pass
elif "Pages free:" in line:
pages = int(line.split(":")[1].strip().rstrip("."))
free_bytes += pages * page_size
elif "Pages inactive:" in line:
pages = int(line.split(":")[1].strip().rstrip("."))
free_bytes += pages * page_size
except Exception:
pass
return {
"total_gb": total_bytes / gb if total_bytes else 0.0,
"free_gb": free_bytes / gb if free_bytes else 0.0,
}
# ── Disk ─────────────────────────────────────────────────────────────────
async def _check_disk(self) -> CheckResult:
"""Check disk usage via shutil.disk_usage."""
disk_free_min_gb = getattr(settings, "hermes_disk_free_min_gb", 10.0)
try:
usage = await asyncio.to_thread(shutil.disk_usage, "/")
free_gb = usage.free / (1024**3)
total_gb = usage.total / (1024**3)
used_pct = (usage.used / usage.total) * 100
details: dict[str, Any] = {
"free_gb": round(free_gb, 2),
"total_gb": round(total_gb, 2),
"used_pct": round(used_pct, 1),
}
if free_gb < disk_free_min_gb:
cleaned_gb = await self._cleanup_temp_files()
if cleaned_gb > 0.01:
return CheckResult(
name="disk",
level=HealthLevel.WARNING,
message=(
f"Low disk ({free_gb:.1f}GB free) — "
f"cleaned {cleaned_gb:.2f}GB from /tmp"
),
details={**details, "cleaned_gb": round(cleaned_gb, 2)},
auto_resolved=True,
)
return CheckResult(
name="disk",
level=HealthLevel.CRITICAL,
message=(
f"Critical: only {free_gb:.1f}GB free (threshold: {disk_free_min_gb}GB)"
),
details=details,
needs_human=True,
)
return CheckResult(
name="disk",
level=HealthLevel.OK,
message=f"Disk OK — {free_gb:.1f}GB free ({used_pct:.0f}% used)",
details=details,
)
except Exception as exc:
logger.warning("Disk check failed: %s", exc)
return CheckResult(
name="disk",
level=HealthLevel.UNKNOWN,
message=f"Disk check unavailable: {exc}",
)
async def _cleanup_temp_files(self) -> float:
"""Remove /tmp files older than 24 hours. Returns GB freed."""
return await asyncio.to_thread(self._cleanup_temp_files_sync)
def _cleanup_temp_files_sync(self) -> float:
"""Synchronous /tmp cleanup — only touches files older than 24 hours."""
from pathlib import Path
freed_bytes = 0
cutoff = time.time() - 86400 # 24 hours ago
try:
tmp = Path(tempfile.gettempdir())
for item in tmp.iterdir():
try:
stat = item.stat()
if stat.st_mtime >= cutoff:
continue
if item.is_file():
freed_bytes += stat.st_size
item.unlink(missing_ok=True)
elif item.is_dir():
dir_size = sum(f.stat().st_size for f in item.rglob("*") if f.is_file())
freed_bytes += dir_size
shutil.rmtree(str(item), ignore_errors=True)
except (PermissionError, OSError):
pass # Skip files we can't touch
except Exception as exc:
logger.warning("Temp cleanup error: %s", exc)
freed_gb = freed_bytes / (1024**3)
if freed_gb > 0.001:
logger.info("Hermes disk cleanup: freed %.2fGB from /tmp", freed_gb)
return freed_gb
# ── Ollama ───────────────────────────────────────────────────────────────
async def _check_ollama(self) -> CheckResult:
"""Check Ollama status and loaded models."""
try:
status = await asyncio.to_thread(self._get_ollama_status)
if not status.get("reachable"):
restarted = await self._restart_ollama()
if restarted:
return CheckResult(
name="ollama",
level=HealthLevel.WARNING,
message="Ollama was unreachable — restart initiated",
details={"restart_attempted": True},
auto_resolved=True,
)
return CheckResult(
name="ollama",
level=HealthLevel.CRITICAL,
message="Ollama unreachable and restart failed",
details={"reachable": False},
needs_human=True,
)
models = status.get("models", [])
loaded = status.get("loaded_models", [])
return CheckResult(
name="ollama",
level=HealthLevel.OK,
message=(f"Ollama OK — {len(models)} model(s) available, {len(loaded)} loaded"),
details={
"reachable": True,
"model_count": len(models),
"loaded_count": len(loaded),
"loaded_models": [m.get("name", "") for m in loaded],
},
)
except Exception as exc:
logger.warning("Ollama check failed: %s", exc)
return CheckResult(
name="ollama",
level=HealthLevel.UNKNOWN,
message=f"Ollama check failed: {exc}",
)
def _get_ollama_status(self) -> dict[str, Any]:
"""Synchronous Ollama status — checks /api/tags and /api/ps."""
url = settings.normalized_ollama_url
try:
req = urllib.request.Request(
f"{url}/api/tags",
method="GET",
headers={"Accept": "application/json"},
)
with urllib.request.urlopen(req, timeout=self.OLLAMA_REQUEST_TIMEOUT) as resp:
data = json.loads(resp.read().decode())
models = data.get("models", [])
except Exception:
return {"reachable": False, "models": [], "loaded_models": []}
# /api/ps lists currently loaded (in-memory) models — Ollama >=0.2
loaded: list[dict] = []
try:
req = urllib.request.Request(
f"{url}/api/ps",
method="GET",
headers={"Accept": "application/json"},
)
with urllib.request.urlopen(req, timeout=self.OLLAMA_REQUEST_TIMEOUT) as resp:
ps_data = json.loads(resp.read().decode())
loaded = ps_data.get("models", [])
except Exception:
pass # /api/ps absent on older Ollama — non-fatal
return {"reachable": True, "models": models, "loaded_models": loaded}
async def _unload_ollama_models(self) -> int:
"""Unload in-memory Ollama models to free unified memory.
Uses the keep_alive=0 trick: POSTing to /api/generate with
keep_alive=0 causes Ollama to immediately evict the model.
Returns the number of models successfully unloaded.
"""
return await asyncio.to_thread(self._unload_ollama_models_sync)
def _unload_ollama_models_sync(self) -> int:
"""Synchronous model unload implementation."""
url = settings.normalized_ollama_url
unloaded = 0
try:
req = urllib.request.Request(
f"{url}/api/ps",
method="GET",
headers={"Accept": "application/json"},
)
with urllib.request.urlopen(req, timeout=self.OLLAMA_REQUEST_TIMEOUT) as resp:
ps_data = json.loads(resp.read().decode())
loaded = ps_data.get("models", [])
except Exception:
return 0
for model in loaded:
name = model.get("name", "")
if not name:
continue
try:
payload = json.dumps({"model": name, "keep_alive": 0}).encode()
req = urllib.request.Request(
f"{url}/api/generate",
data=payload,
method="POST",
headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req, timeout=10) as _:
pass
logger.info("Hermes: unloaded Ollama model %s", name)
unloaded += 1
except Exception as exc:
logger.warning("Hermes: failed to unload model %s: %s", name, exc)
return unloaded
async def _restart_ollama(self) -> bool:
"""Attempt to restart the Ollama service via launchctl or brew."""
return await asyncio.to_thread(self._restart_ollama_sync)
def _restart_ollama_sync(self) -> bool:
"""Try launchctl first, then brew services."""
# macOS launchctl (installed via official Ollama installer)
try:
result = subprocess.run(
["launchctl", "stop", "com.ollama.ollama"],
capture_output=True,
timeout=10,
)
if result.returncode == 0:
time.sleep(2)
subprocess.run(
["launchctl", "start", "com.ollama.ollama"],
capture_output=True,
timeout=10,
)
logger.info("Hermes: Ollama restarted via launchctl")
return True
except Exception:
pass
# Homebrew fallback
try:
result = subprocess.run(
["brew", "services", "restart", "ollama"],
capture_output=True,
timeout=20,
)
if result.returncode == 0:
logger.info("Hermes: Ollama restarted via brew services")
return True
except Exception:
pass
logger.warning("Hermes: Ollama restart failed — manual intervention needed")
return False
# ── Processes ────────────────────────────────────────────────────────────
async def _check_processes(self) -> CheckResult:
"""Check for zombie processes via ps aux."""
try:
result = await asyncio.to_thread(self._get_zombie_processes)
zombies = result.get("zombies", [])
if zombies:
return CheckResult(
name="processes",
level=HealthLevel.WARNING,
message=f"Found {len(zombies)} zombie process(es)",
details={"zombies": zombies[:5]},
needs_human=len(zombies) > 3,
)
return CheckResult(
name="processes",
level=HealthLevel.OK,
message="Processes OK — no zombies detected",
details={"zombie_count": 0},
)
except Exception as exc:
logger.warning("Process check failed: %s", exc)
return CheckResult(
name="processes",
level=HealthLevel.UNKNOWN,
message=f"Process check unavailable: {exc}",
)
def _get_zombie_processes(self) -> dict[str, Any]:
"""Detect zombie processes (state 'Z') via ps aux."""
result = subprocess.run(
["ps", "aux"],
capture_output=True,
text=True,
timeout=5,
)
zombies = []
for line in result.stdout.splitlines()[1:]: # Skip header row
parts = line.split(None, 10)
if len(parts) >= 8 and parts[7] == "Z":
zombies.append(
{
"pid": parts[1],
"command": parts[10][:80] if len(parts) > 10 else "",
}
)
return {"zombies": zombies}
# ── Network ──────────────────────────────────────────────────────────────
async def _check_network(self) -> CheckResult:
"""Check Gitea connectivity."""
try:
result = await asyncio.to_thread(self._check_gitea_connectivity)
reachable = result.get("reachable", False)
latency_ms = result.get("latency_ms", -1.0)
if not reachable:
return CheckResult(
name="network",
level=HealthLevel.WARNING,
message=f"Gitea unreachable: {result.get('error', 'unknown')}",
details=result,
needs_human=True,
)
return CheckResult(
name="network",
level=HealthLevel.OK,
message=f"Network OK — Gitea reachable ({latency_ms:.0f}ms)",
details=result,
)
except Exception as exc:
logger.warning("Network check failed: %s", exc)
return CheckResult(
name="network",
level=HealthLevel.UNKNOWN,
message=f"Network check unavailable: {exc}",
)
def _check_gitea_connectivity(self) -> dict[str, Any]:
"""Synchronous Gitea reachability check."""
url = settings.gitea_url
start = time.monotonic()
try:
req = urllib.request.Request(
f"{url}/api/v1/version",
method="GET",
headers={"Accept": "application/json"},
)
with urllib.request.urlopen(req, timeout=self.NETWORK_REQUEST_TIMEOUT) as resp:
latency_ms = (time.monotonic() - start) * 1000
return {
"reachable": resp.status == 200,
"latency_ms": round(latency_ms, 1),
"url": url,
}
except Exception as exc:
return {
"reachable": False,
"error": str(exc),
"url": url,
"latency_ms": -1.0,
}
# ── Alerts ───────────────────────────────────────────────────────────────
async def _handle_alerts(self, report: HealthReport) -> None:
"""Send push notifications for issues that need attention."""
try:
from infrastructure.notifications.push import notifier
except Exception:
return
for check in report.checks:
if check.level == HealthLevel.CRITICAL or check.needs_human:
notifier.notify(
title=f"Hermes Alert: {check.name}",
message=check.message,
category="system",
native=check.level == HealthLevel.CRITICAL,
)
elif check.level == HealthLevel.WARNING and check.auto_resolved:
notifier.notify(
title=f"Hermes: {check.name} auto-fixed",
message=check.message,
category="system",
)
# Module-level singleton
hermes_monitor = HermesMonitor()