# Co-authored-by: Claude (Opus 4.6) <claude@hermes.local>
# Co-committed-by: Claude (Opus 4.6) <claude@hermes.local>
"""Hermes health monitor — system resources + model management.
|
|
|
|
Monitors the local machine (Hermes/M3 Max) and keeps it running smoothly.
|
|
Runs every 5 minutes, auto-resolves issues where possible, alerts when
|
|
human intervention is needed.
|
|
|
|
Monitors:
|
|
1. Memory pressure — unified memory, alert if <4GB free, unload models
|
|
2. Disk usage — alert if <10GB free, clean temp files
|
|
3. Ollama status — verify reachable, restart if crashed, manage loaded models
|
|
4. Process health — detect zombie processes
|
|
5. Network — verify Gitea connectivity
|
|
|
|
Refs: #1073
|
|
"""
|
|
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import shutil
|
|
import subprocess
|
|
import tempfile
|
|
import time
|
|
import urllib.request
|
|
from dataclasses import dataclass, field
|
|
from datetime import UTC, datetime
|
|
from enum import StrEnum
|
|
from typing import Any
|
|
|
|
from config import settings
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class HealthLevel(StrEnum):
|
|
"""Severity level for a health check result."""
|
|
|
|
OK = "ok"
|
|
WARNING = "warning"
|
|
CRITICAL = "critical"
|
|
UNKNOWN = "unknown"
|
|
|
|
|
|
@dataclass
class CheckResult:
    """Outcome of one individual health probe."""

    name: str  # probe identifier, e.g. "memory"
    level: HealthLevel  # severity of the finding
    message: str  # human-readable summary
    details: dict[str, Any] = field(default_factory=dict)  # probe-specific extras
    auto_resolved: bool = False  # True when remediation already ran
    needs_human: bool = False  # True when operator action is required

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-friendly mapping."""
        return dict(
            name=self.name,
            level=self.level.value,
            message=self.message,
            details=self.details,
            auto_resolved=self.auto_resolved,
            needs_human=self.needs_human,
        )
|
|
|
|
|
|
@dataclass
class HealthReport:
    """Aggregated outcome of one complete monitor cycle."""

    timestamp: str  # ISO-8601 UTC time the cycle ran
    checks: list[CheckResult]  # individual probe results
    overall: HealthLevel  # worst severity across all checks

    @property
    def has_issues(self) -> bool:
        """True when at least one check reported something other than OK."""
        return not all(check.level == HealthLevel.OK for check in self.checks)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a JSON-friendly mapping, checks included."""
        payload: dict[str, Any] = {
            "timestamp": self.timestamp,
            "overall": self.overall.value,
            "has_issues": self.has_issues,
        }
        payload["checks"] = [check.to_dict() for check in self.checks]
        return payload
|
|
|
|
|
|
class HermesMonitor:
|
|
"""System health monitor for Hermes (local M3 Max machine).
|
|
|
|
All blocking I/O (subprocess, HTTP) is wrapped in asyncio.to_thread()
|
|
so it never blocks the event loop. Results are cached so the dashboard
|
|
can read the last report without triggering a new cycle.
|
|
"""
|
|
|
|
OLLAMA_REQUEST_TIMEOUT = 5
|
|
NETWORK_REQUEST_TIMEOUT = 5
|
|
|
|
def __init__(self) -> None:
|
|
self._last_report: HealthReport | None = None
|
|
self._last_run_ts: float = 0.0
|
|
|
|
    @property
    def last_report(self) -> HealthReport | None:
        """Most recent health report, or None if no cycle has run yet.

        Read-only cache access: lets the dashboard show the last result
        without triggering a new monitoring cycle.
        """
        return self._last_report
|
|
|
|
@property
|
|
def seconds_since_last_run(self) -> float:
|
|
if self._last_run_ts == 0.0:
|
|
return float("inf")
|
|
return time.monotonic() - self._last_run_ts
|
|
|
|
async def run_cycle(self) -> HealthReport:
|
|
"""Run a full health check cycle and return the report."""
|
|
self._last_run_ts = time.monotonic()
|
|
logger.info("Hermes health cycle starting")
|
|
|
|
check_fns = [
|
|
self._check_memory(),
|
|
self._check_disk(),
|
|
self._check_ollama(),
|
|
self._check_processes(),
|
|
self._check_network(),
|
|
]
|
|
|
|
raw_results = await asyncio.gather(*check_fns, return_exceptions=True)
|
|
|
|
checks: list[CheckResult] = []
|
|
for i, r in enumerate(raw_results):
|
|
if isinstance(r, Exception):
|
|
name = ["memory", "disk", "ollama", "processes", "network"][i]
|
|
logger.warning("Hermes check '%s' raised: %s", name, r)
|
|
checks.append(
|
|
CheckResult(
|
|
name=name,
|
|
level=HealthLevel.UNKNOWN,
|
|
message=f"Check error: {r}",
|
|
)
|
|
)
|
|
else:
|
|
checks.append(r)
|
|
|
|
# Compute overall level
|
|
levels = {c.level for c in checks}
|
|
if HealthLevel.CRITICAL in levels:
|
|
overall = HealthLevel.CRITICAL
|
|
elif HealthLevel.WARNING in levels:
|
|
overall = HealthLevel.WARNING
|
|
elif HealthLevel.UNKNOWN in levels:
|
|
overall = HealthLevel.UNKNOWN
|
|
else:
|
|
overall = HealthLevel.OK
|
|
|
|
report = HealthReport(
|
|
timestamp=datetime.now(UTC).isoformat(),
|
|
checks=checks,
|
|
overall=overall,
|
|
)
|
|
self._last_report = report
|
|
|
|
await self._handle_alerts(report)
|
|
|
|
logger.info("Hermes health cycle complete — overall: %s", overall.value)
|
|
return report
|
|
|
|
# ── Memory ───────────────────────────────────────────────────────────────
|
|
|
|
async def _check_memory(self) -> CheckResult:
|
|
"""Check unified memory usage (macOS vm_stat)."""
|
|
memory_free_min_gb = getattr(settings, "hermes_memory_free_min_gb", 4.0)
|
|
try:
|
|
info = await asyncio.to_thread(self._get_memory_info)
|
|
free_gb = info.get("free_gb", 0.0)
|
|
total_gb = info.get("total_gb", 0.0)
|
|
details: dict[str, Any] = {
|
|
"free_gb": round(free_gb, 2),
|
|
"total_gb": round(total_gb, 2),
|
|
}
|
|
|
|
if free_gb < memory_free_min_gb:
|
|
# Attempt auto-remediation: unload Ollama models
|
|
unloaded = await self._unload_ollama_models()
|
|
if unloaded:
|
|
return CheckResult(
|
|
name="memory",
|
|
level=HealthLevel.WARNING,
|
|
message=(
|
|
f"Low memory ({free_gb:.1f}GB free) — "
|
|
f"unloaded {unloaded} Ollama model(s)"
|
|
),
|
|
details={**details, "models_unloaded": unloaded},
|
|
auto_resolved=True,
|
|
)
|
|
return CheckResult(
|
|
name="memory",
|
|
level=HealthLevel.CRITICAL,
|
|
message=(
|
|
f"Critical: only {free_gb:.1f}GB free (threshold: {memory_free_min_gb}GB)"
|
|
),
|
|
details=details,
|
|
needs_human=True,
|
|
)
|
|
|
|
return CheckResult(
|
|
name="memory",
|
|
level=HealthLevel.OK,
|
|
message=f"Memory OK — {free_gb:.1f}GB free of {total_gb:.1f}GB",
|
|
details=details,
|
|
)
|
|
except Exception as exc:
|
|
logger.warning("Memory check failed: %s", exc)
|
|
return CheckResult(
|
|
name="memory",
|
|
level=HealthLevel.UNKNOWN,
|
|
message=f"Memory check unavailable: {exc}",
|
|
)
|
|
|
|
def _get_memory_info(self) -> dict[str, float]:
|
|
"""Get memory stats via macOS sysctl + vm_stat.
|
|
|
|
Falls back gracefully on non-macOS systems.
|
|
"""
|
|
gb = 1024**3
|
|
total_bytes = 0.0
|
|
free_bytes = 0.0
|
|
|
|
# Total memory via sysctl
|
|
try:
|
|
result = subprocess.run(
|
|
["sysctl", "-n", "hw.memsize"],
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=3,
|
|
)
|
|
total_bytes = float(result.stdout.strip())
|
|
except Exception:
|
|
pass
|
|
|
|
# Free + inactive pages via vm_stat (macOS)
|
|
try:
|
|
result = subprocess.run(
|
|
["vm_stat"],
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=3,
|
|
)
|
|
page_size = 16384 # 16 KB default on Apple Silicon
|
|
for line in result.stdout.splitlines():
|
|
if "page size of" in line:
|
|
parts = line.split()
|
|
for i, part in enumerate(parts):
|
|
if part == "of" and i + 1 < len(parts):
|
|
try:
|
|
page_size = int(parts[i + 1])
|
|
except ValueError:
|
|
pass
|
|
elif "Pages free:" in line:
|
|
pages = int(line.split(":")[1].strip().rstrip("."))
|
|
free_bytes += pages * page_size
|
|
elif "Pages inactive:" in line:
|
|
pages = int(line.split(":")[1].strip().rstrip("."))
|
|
free_bytes += pages * page_size
|
|
except Exception:
|
|
pass
|
|
|
|
return {
|
|
"total_gb": total_bytes / gb if total_bytes else 0.0,
|
|
"free_gb": free_bytes / gb if free_bytes else 0.0,
|
|
}
|
|
|
|
# ── Disk ─────────────────────────────────────────────────────────────────
|
|
|
|
async def _check_disk(self) -> CheckResult:
|
|
"""Check disk usage via shutil.disk_usage."""
|
|
disk_free_min_gb = getattr(settings, "hermes_disk_free_min_gb", 10.0)
|
|
try:
|
|
usage = await asyncio.to_thread(shutil.disk_usage, "/")
|
|
free_gb = usage.free / (1024**3)
|
|
total_gb = usage.total / (1024**3)
|
|
used_pct = (usage.used / usage.total) * 100
|
|
|
|
details: dict[str, Any] = {
|
|
"free_gb": round(free_gb, 2),
|
|
"total_gb": round(total_gb, 2),
|
|
"used_pct": round(used_pct, 1),
|
|
}
|
|
|
|
if free_gb < disk_free_min_gb:
|
|
cleaned_gb = await self._cleanup_temp_files()
|
|
if cleaned_gb > 0.01:
|
|
return CheckResult(
|
|
name="disk",
|
|
level=HealthLevel.WARNING,
|
|
message=(
|
|
f"Low disk ({free_gb:.1f}GB free) — "
|
|
f"cleaned {cleaned_gb:.2f}GB from /tmp"
|
|
),
|
|
details={**details, "cleaned_gb": round(cleaned_gb, 2)},
|
|
auto_resolved=True,
|
|
)
|
|
return CheckResult(
|
|
name="disk",
|
|
level=HealthLevel.CRITICAL,
|
|
message=(
|
|
f"Critical: only {free_gb:.1f}GB free (threshold: {disk_free_min_gb}GB)"
|
|
),
|
|
details=details,
|
|
needs_human=True,
|
|
)
|
|
|
|
return CheckResult(
|
|
name="disk",
|
|
level=HealthLevel.OK,
|
|
message=f"Disk OK — {free_gb:.1f}GB free ({used_pct:.0f}% used)",
|
|
details=details,
|
|
)
|
|
except Exception as exc:
|
|
logger.warning("Disk check failed: %s", exc)
|
|
return CheckResult(
|
|
name="disk",
|
|
level=HealthLevel.UNKNOWN,
|
|
message=f"Disk check unavailable: {exc}",
|
|
)
|
|
|
|
    async def _cleanup_temp_files(self) -> float:
        """Remove /tmp files older than 24 hours. Returns GB freed.

        Thin async wrapper: the blocking filesystem walk runs in a
        worker thread so it never stalls the event loop.
        """
        return await asyncio.to_thread(self._cleanup_temp_files_sync)
|
|
|
|
def _cleanup_temp_files_sync(self) -> float:
|
|
"""Synchronous /tmp cleanup — only touches files older than 24 hours."""
|
|
from pathlib import Path
|
|
|
|
freed_bytes = 0
|
|
cutoff = time.time() - 86400 # 24 hours ago
|
|
|
|
try:
|
|
tmp = Path(tempfile.gettempdir())
|
|
for item in tmp.iterdir():
|
|
try:
|
|
stat = item.stat()
|
|
if stat.st_mtime >= cutoff:
|
|
continue
|
|
if item.is_file():
|
|
freed_bytes += stat.st_size
|
|
item.unlink(missing_ok=True)
|
|
elif item.is_dir():
|
|
dir_size = sum(f.stat().st_size for f in item.rglob("*") if f.is_file())
|
|
freed_bytes += dir_size
|
|
shutil.rmtree(str(item), ignore_errors=True)
|
|
except (PermissionError, OSError):
|
|
pass # Skip files we can't touch
|
|
except Exception as exc:
|
|
logger.warning("Temp cleanup error: %s", exc)
|
|
|
|
freed_gb = freed_bytes / (1024**3)
|
|
if freed_gb > 0.001:
|
|
logger.info("Hermes disk cleanup: freed %.2fGB from /tmp", freed_gb)
|
|
return freed_gb
|
|
|
|
# ── Ollama ───────────────────────────────────────────────────────────────
|
|
|
|
async def _check_ollama(self) -> CheckResult:
|
|
"""Check Ollama status and loaded models."""
|
|
try:
|
|
status = await asyncio.to_thread(self._get_ollama_status)
|
|
|
|
if not status.get("reachable"):
|
|
restarted = await self._restart_ollama()
|
|
if restarted:
|
|
return CheckResult(
|
|
name="ollama",
|
|
level=HealthLevel.WARNING,
|
|
message="Ollama was unreachable — restart initiated",
|
|
details={"restart_attempted": True},
|
|
auto_resolved=True,
|
|
)
|
|
return CheckResult(
|
|
name="ollama",
|
|
level=HealthLevel.CRITICAL,
|
|
message="Ollama unreachable and restart failed",
|
|
details={"reachable": False},
|
|
needs_human=True,
|
|
)
|
|
|
|
models = status.get("models", [])
|
|
loaded = status.get("loaded_models", [])
|
|
return CheckResult(
|
|
name="ollama",
|
|
level=HealthLevel.OK,
|
|
message=(f"Ollama OK — {len(models)} model(s) available, {len(loaded)} loaded"),
|
|
details={
|
|
"reachable": True,
|
|
"model_count": len(models),
|
|
"loaded_count": len(loaded),
|
|
"loaded_models": [m.get("name", "") for m in loaded],
|
|
},
|
|
)
|
|
except Exception as exc:
|
|
logger.warning("Ollama check failed: %s", exc)
|
|
return CheckResult(
|
|
name="ollama",
|
|
level=HealthLevel.UNKNOWN,
|
|
message=f"Ollama check failed: {exc}",
|
|
)
|
|
|
|
def _get_ollama_status(self) -> dict[str, Any]:
|
|
"""Synchronous Ollama status — checks /api/tags and /api/ps."""
|
|
url = settings.normalized_ollama_url
|
|
|
|
try:
|
|
req = urllib.request.Request(
|
|
f"{url}/api/tags",
|
|
method="GET",
|
|
headers={"Accept": "application/json"},
|
|
)
|
|
with urllib.request.urlopen(req, timeout=self.OLLAMA_REQUEST_TIMEOUT) as resp:
|
|
data = json.loads(resp.read().decode())
|
|
models = data.get("models", [])
|
|
except Exception:
|
|
return {"reachable": False, "models": [], "loaded_models": []}
|
|
|
|
# /api/ps lists currently loaded (in-memory) models — Ollama >=0.2
|
|
loaded: list[dict] = []
|
|
try:
|
|
req = urllib.request.Request(
|
|
f"{url}/api/ps",
|
|
method="GET",
|
|
headers={"Accept": "application/json"},
|
|
)
|
|
with urllib.request.urlopen(req, timeout=self.OLLAMA_REQUEST_TIMEOUT) as resp:
|
|
ps_data = json.loads(resp.read().decode())
|
|
loaded = ps_data.get("models", [])
|
|
except Exception:
|
|
pass # /api/ps absent on older Ollama — non-fatal
|
|
|
|
return {"reachable": True, "models": models, "loaded_models": loaded}
|
|
|
|
    async def _unload_ollama_models(self) -> int:
        """Unload in-memory Ollama models to free unified memory.

        Uses the keep_alive=0 trick: POSTing to /api/generate with
        keep_alive=0 causes Ollama to immediately evict the model.
        Returns the number of models successfully unloaded.
        """
        # Blocking HTTP calls run in a worker thread to keep the loop free.
        return await asyncio.to_thread(self._unload_ollama_models_sync)
|
|
|
|
def _unload_ollama_models_sync(self) -> int:
|
|
"""Synchronous model unload implementation."""
|
|
url = settings.normalized_ollama_url
|
|
unloaded = 0
|
|
|
|
try:
|
|
req = urllib.request.Request(
|
|
f"{url}/api/ps",
|
|
method="GET",
|
|
headers={"Accept": "application/json"},
|
|
)
|
|
with urllib.request.urlopen(req, timeout=self.OLLAMA_REQUEST_TIMEOUT) as resp:
|
|
ps_data = json.loads(resp.read().decode())
|
|
loaded = ps_data.get("models", [])
|
|
except Exception:
|
|
return 0
|
|
|
|
for model in loaded:
|
|
name = model.get("name", "")
|
|
if not name:
|
|
continue
|
|
try:
|
|
payload = json.dumps({"model": name, "keep_alive": 0}).encode()
|
|
req = urllib.request.Request(
|
|
f"{url}/api/generate",
|
|
data=payload,
|
|
method="POST",
|
|
headers={"Content-Type": "application/json"},
|
|
)
|
|
with urllib.request.urlopen(req, timeout=10) as _:
|
|
pass
|
|
logger.info("Hermes: unloaded Ollama model %s", name)
|
|
unloaded += 1
|
|
except Exception as exc:
|
|
logger.warning("Hermes: failed to unload model %s: %s", name, exc)
|
|
|
|
return unloaded
|
|
|
|
    async def _restart_ollama(self) -> bool:
        """Attempt to restart the Ollama service via launchctl or brew.

        Thin async wrapper: subprocess calls block, so the real work
        runs in a worker thread. Returns True on apparent success.
        """
        return await asyncio.to_thread(self._restart_ollama_sync)
|
|
|
|
def _restart_ollama_sync(self) -> bool:
|
|
"""Try launchctl first, then brew services."""
|
|
# macOS launchctl (installed via official Ollama installer)
|
|
try:
|
|
result = subprocess.run(
|
|
["launchctl", "stop", "com.ollama.ollama"],
|
|
capture_output=True,
|
|
timeout=10,
|
|
)
|
|
if result.returncode == 0:
|
|
time.sleep(2)
|
|
subprocess.run(
|
|
["launchctl", "start", "com.ollama.ollama"],
|
|
capture_output=True,
|
|
timeout=10,
|
|
)
|
|
logger.info("Hermes: Ollama restarted via launchctl")
|
|
return True
|
|
except Exception:
|
|
pass
|
|
|
|
# Homebrew fallback
|
|
try:
|
|
result = subprocess.run(
|
|
["brew", "services", "restart", "ollama"],
|
|
capture_output=True,
|
|
timeout=20,
|
|
)
|
|
if result.returncode == 0:
|
|
logger.info("Hermes: Ollama restarted via brew services")
|
|
return True
|
|
except Exception:
|
|
pass
|
|
|
|
logger.warning("Hermes: Ollama restart failed — manual intervention needed")
|
|
return False
|
|
|
|
# ── Processes ────────────────────────────────────────────────────────────
|
|
|
|
async def _check_processes(self) -> CheckResult:
|
|
"""Check for zombie processes via ps aux."""
|
|
try:
|
|
result = await asyncio.to_thread(self._get_zombie_processes)
|
|
zombies = result.get("zombies", [])
|
|
|
|
if zombies:
|
|
return CheckResult(
|
|
name="processes",
|
|
level=HealthLevel.WARNING,
|
|
message=f"Found {len(zombies)} zombie process(es)",
|
|
details={"zombies": zombies[:5]},
|
|
needs_human=len(zombies) > 3,
|
|
)
|
|
|
|
return CheckResult(
|
|
name="processes",
|
|
level=HealthLevel.OK,
|
|
message="Processes OK — no zombies detected",
|
|
details={"zombie_count": 0},
|
|
)
|
|
except Exception as exc:
|
|
logger.warning("Process check failed: %s", exc)
|
|
return CheckResult(
|
|
name="processes",
|
|
level=HealthLevel.UNKNOWN,
|
|
message=f"Process check unavailable: {exc}",
|
|
)
|
|
|
|
def _get_zombie_processes(self) -> dict[str, Any]:
|
|
"""Detect zombie processes (state 'Z') via ps aux."""
|
|
result = subprocess.run(
|
|
["ps", "aux"],
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=5,
|
|
)
|
|
zombies = []
|
|
for line in result.stdout.splitlines()[1:]: # Skip header row
|
|
parts = line.split(None, 10)
|
|
if len(parts) >= 8 and parts[7] == "Z":
|
|
zombies.append(
|
|
{
|
|
"pid": parts[1],
|
|
"command": parts[10][:80] if len(parts) > 10 else "",
|
|
}
|
|
)
|
|
return {"zombies": zombies}
|
|
|
|
# ── Network ──────────────────────────────────────────────────────────────
|
|
|
|
async def _check_network(self) -> CheckResult:
|
|
"""Check Gitea connectivity."""
|
|
try:
|
|
result = await asyncio.to_thread(self._check_gitea_connectivity)
|
|
reachable = result.get("reachable", False)
|
|
latency_ms = result.get("latency_ms", -1.0)
|
|
|
|
if not reachable:
|
|
return CheckResult(
|
|
name="network",
|
|
level=HealthLevel.WARNING,
|
|
message=f"Gitea unreachable: {result.get('error', 'unknown')}",
|
|
details=result,
|
|
needs_human=True,
|
|
)
|
|
|
|
return CheckResult(
|
|
name="network",
|
|
level=HealthLevel.OK,
|
|
message=f"Network OK — Gitea reachable ({latency_ms:.0f}ms)",
|
|
details=result,
|
|
)
|
|
except Exception as exc:
|
|
logger.warning("Network check failed: %s", exc)
|
|
return CheckResult(
|
|
name="network",
|
|
level=HealthLevel.UNKNOWN,
|
|
message=f"Network check unavailable: {exc}",
|
|
)
|
|
|
|
def _check_gitea_connectivity(self) -> dict[str, Any]:
|
|
"""Synchronous Gitea reachability check."""
|
|
url = settings.gitea_url
|
|
start = time.monotonic()
|
|
try:
|
|
req = urllib.request.Request(
|
|
f"{url}/api/v1/version",
|
|
method="GET",
|
|
headers={"Accept": "application/json"},
|
|
)
|
|
with urllib.request.urlopen(req, timeout=self.NETWORK_REQUEST_TIMEOUT) as resp:
|
|
latency_ms = (time.monotonic() - start) * 1000
|
|
return {
|
|
"reachable": resp.status == 200,
|
|
"latency_ms": round(latency_ms, 1),
|
|
"url": url,
|
|
}
|
|
except Exception as exc:
|
|
return {
|
|
"reachable": False,
|
|
"error": str(exc),
|
|
"url": url,
|
|
"latency_ms": -1.0,
|
|
}
|
|
|
|
# ── Alerts ───────────────────────────────────────────────────────────────
|
|
|
|
async def _handle_alerts(self, report: HealthReport) -> None:
|
|
"""Send push notifications for issues that need attention."""
|
|
try:
|
|
from infrastructure.notifications.push import notifier
|
|
except Exception:
|
|
return
|
|
|
|
for check in report.checks:
|
|
if check.level == HealthLevel.CRITICAL or check.needs_human:
|
|
notifier.notify(
|
|
title=f"Hermes Alert: {check.name}",
|
|
message=check.message,
|
|
category="system",
|
|
native=check.level == HealthLevel.CRITICAL,
|
|
)
|
|
elif check.level == HealthLevel.WARNING and check.auto_resolved:
|
|
notifier.notify(
|
|
title=f"Hermes: {check.name} auto-fixed",
|
|
message=check.message,
|
|
category="system",
|
|
)
|
|
|
|
|
|
# Module-level singleton
|
|
hermes_monitor = HermesMonitor()
|