[claude] Hermes health monitor — system resources + model management (#1073) (#1133)

Co-authored-by: Claude (Opus 4.6) <claude@hermes.local> Co-committed-by: Claude (Opus 4.6) <claude@hermes.local>
2026-03-23 18:36:06 +00:00
parent 05e1196ea4
commit 1c1bfb6407
6 changed files with 1214 additions and 0 deletions
--- a/tests/unit/test_hermes_monitor.py
+++ b/tests/unit/test_hermes_monitor.py
@@ -0,0 +1,452 @@
+"""Unit tests for the Hermes health monitor.
+
+Tests all five checks (memory, disk, Ollama, processes, network) using mocks
+so no real subprocesses or network calls are made.
+
+Refs: #1073
+"""
+
+import json
+from io import BytesIO
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from infrastructure.hermes.monitor import CheckResult, HealthLevel, HealthReport, HermesMonitor
+
+
+@pytest.fixture()
+def monitor():
+    """Provide a fresh ``HermesMonitor`` built with its default arguments."""
+    hermes_monitor = HermesMonitor()
+    return hermes_monitor
+
+
+# ── Unit helpers ──────────────────────────────────────────────────────────────
+
+
+class _FakeHTTPResponse:
+    """Stand-in for the object the tests return from a patched ``urlopen``.
+
+    Only ``status``, ``read()`` and the context-manager protocol are provided.
+    """
+
+    def __init__(self, body: bytes, status: int = 200):
+        self.status = status
+        self._body = body
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *_exc_info):
+        # Returning None means exceptions raised inside the ``with`` propagate.
+        return None
+
+    def read(self) -> bytes:
+        """Return the canned body bytes given to the constructor."""
+        return self._body
+
+
+# ── Memory check ──────────────────────────────────────────────────────────────
+
+
+def test_get_memory_info_parses_vm_stat(monitor):
+    """``_get_memory_info`` converts canned sysctl / vm_stat output into GB figures."""
+    vm_stat_text = "\n".join(
+        [
+            "Mach Virtual Memory Statistics: (page size of 16384 bytes)",
+            "Pages free:                           12800.",
+            "Pages active:                         50000.",
+            "Pages inactive:                       25600.",
+            "Pages speculative:                     1000.",
+        ]
+    ) + "\n"
+
+    sysctl_proc = MagicMock(stdout="68719476736\n")  # 64 GB of total memory
+    vm_stat_proc = MagicMock(stdout=vm_stat_text)
+
+    # The patched subprocess.run hands these out in order: the sysctl reply
+    # first, then the vm_stat reply.
+    with patch("subprocess.run", side_effect=[sysctl_proc, vm_stat_proc]):
+        info = monitor._get_memory_info()
+
+    # Free memory is expected to be free pages (12800) plus inactive pages
+    # (25600), multiplied by the 16384-byte page size.
+    reclaimable_bytes = (12800 + 25600) * 16384
+    assert info["total_gb"] == pytest.approx(64.0, abs=0.1)
+    assert info["free_gb"] == pytest.approx(reclaimable_bytes / (1024**3), abs=0.001)
+
+
+def test_get_memory_info_handles_subprocess_failure(monitor):
+    """An ``OSError`` from ``subprocess.run`` yields zeroed figures, not a raise."""
+    failing_run = patch("subprocess.run", side_effect=OSError("no sysctl"))
+    with failing_run:
+        info = monitor._get_memory_info()
+
+    for key in ("total_gb", "free_gb"):
+        assert info[key] == 0.0
+
+
+@pytest.mark.asyncio
+async def test_check_memory_ok(monitor):
+    """With 20 GB of 64 GB reported free, the memory check comes back OK."""
+    reading = {"free_gb": 20.0, "total_gb": 64.0}
+    with patch.object(monitor, "_get_memory_info", return_value=reading):
+        outcome = await monitor._check_memory()
+
+    assert outcome.name == "memory"
+    assert outcome.level == HealthLevel.OK
+    assert "20.0GB" in outcome.message
+
+
+@pytest.mark.asyncio
+async def test_check_memory_low_triggers_unload(monitor):
+    """2 GB free plus an unload helper returning 2 gives an auto-resolved WARNING."""
+    reading = {"free_gb": 2.0, "total_gb": 64.0}
+    with patch.object(monitor, "_get_memory_info", return_value=reading):
+        with patch.object(monitor, "_unload_ollama_models", return_value=2):
+            outcome = await monitor._check_memory()
+
+    assert outcome.level == HealthLevel.WARNING
+    assert outcome.auto_resolved is True
+    assert "unloaded 2" in outcome.message
+
+
+@pytest.mark.asyncio
+async def test_check_memory_critical_no_models_to_unload(monitor):
+    """1 GB free with the unload helper returning 0 escalates to CRITICAL."""
+    reading = {"free_gb": 1.0, "total_gb": 64.0}
+    with patch.object(monitor, "_get_memory_info", return_value=reading):
+        with patch.object(monitor, "_unload_ollama_models", return_value=0):
+            outcome = await monitor._check_memory()
+
+    assert outcome.level == HealthLevel.CRITICAL
+    assert outcome.needs_human is True
+
+
+@pytest.mark.asyncio
+async def test_check_memory_exception_returns_unknown(monitor):
+    """A ``RuntimeError`` from ``_get_memory_info`` is reported as UNKNOWN."""
+    exploding_probe = patch.object(monitor, "_get_memory_info", side_effect=RuntimeError("boom"))
+    with exploding_probe:
+        outcome = await monitor._check_memory()
+
+    assert outcome.level == HealthLevel.UNKNOWN
+
+
+# ── Disk check ────────────────────────────────────────────────────────────────
+
+
+@pytest.mark.asyncio
+async def test_check_disk_ok(monitor):
+    """A disk reporting 100 GB free out of 500 GB passes the disk check."""
+    gib = 1024**3
+    fake_usage = MagicMock(free=100 * gib, total=500 * gib, used=400 * gib)
+
+    with patch("shutil.disk_usage", return_value=fake_usage):
+        outcome = await monitor._check_disk()
+
+    assert outcome.level == HealthLevel.OK
+    assert "100.0GB free" in outcome.message
+
+
+@pytest.mark.asyncio
+async def test_check_disk_low_triggers_cleanup(monitor):
+    """5 GB free plus a cleanup returning 2.5 gives an auto-resolved WARNING."""
+    gib = 1024**3
+    # 5 GB free is the "low disk" scenario this test exercises.
+    fake_usage = MagicMock(free=5 * gib, total=500 * gib, used=495 * gib)
+
+    with patch("shutil.disk_usage", return_value=fake_usage):
+        with patch.object(monitor, "_cleanup_temp_files", return_value=2.5):
+            outcome = await monitor._check_disk()
+
+    assert outcome.level == HealthLevel.WARNING
+    assert outcome.auto_resolved is True
+    assert "cleaned 2.50GB" in outcome.message
+
+
+@pytest.mark.asyncio
+async def test_check_disk_critical_when_cleanup_fails(monitor):
+    """5 GB free with the cleanup helper returning 0.0 escalates to CRITICAL."""
+    gib = 1024**3
+    fake_usage = MagicMock(free=5 * gib, total=500 * gib, used=495 * gib)
+
+    with patch("shutil.disk_usage", return_value=fake_usage):
+        with patch.object(monitor, "_cleanup_temp_files", return_value=0.0):
+            outcome = await monitor._check_disk()
+
+    assert outcome.level == HealthLevel.CRITICAL
+    assert outcome.needs_human is True
+
+
+# ── Ollama check ──────────────────────────────────────────────────────────────
+
+
+def test_get_ollama_status_reachable(monitor):
+    """Two canned HTTP replies produce a reachable status with both model lists."""
+    installed = {"models": [{"name": "qwen3:30b"}, {"name": "llama3.1:8b"}]}
+    running = {"models": [{"name": "qwen3:30b", "size": 1000}]}
+    # The patched urlopen hands these out in order: the two-model payload
+    # first, then the one-model payload.
+    canned = [_FakeHTTPResponse(json.dumps(doc).encode()) for doc in (installed, running)]
+
+    with patch("urllib.request.urlopen", side_effect=canned):
+        status = monitor._get_ollama_status()
+
+    assert status["reachable"] is True
+    assert len(status["models"]) == 2
+    assert len(status["loaded_models"]) == 1
+
+
+def test_get_ollama_status_unreachable(monitor):
+    """An ``OSError`` from ``urlopen`` is reported as unreachable with empty lists."""
+    refused = patch("urllib.request.urlopen", side_effect=OSError("connection refused"))
+    with refused:
+        status = monitor._get_ollama_status()
+
+    assert status["reachable"] is False
+    for key in ("models", "loaded_models"):
+        assert status[key] == []
+
+
+@pytest.mark.asyncio
+async def test_check_ollama_ok(monitor):
+    """A reachable Ollama status yields an OK result that echoes reachability."""
+    reachable_status = {
+        "reachable": True,
+        "models": [{"name": "qwen3:30b"}],
+        "loaded_models": [],
+    }
+    with patch.object(monitor, "_get_ollama_status", return_value=reachable_status):
+        outcome = await monitor._check_ollama()
+
+    assert outcome.level == HealthLevel.OK
+    assert outcome.details["reachable"] is True
+
+
+@pytest.mark.asyncio
+async def test_check_ollama_unreachable_restart_success(monitor):
+    """Unreachable Ollama with ``_restart_ollama`` returning True is a resolved WARNING."""
+    down_status = {"reachable": False, "models": [], "loaded_models": []}
+    with patch.object(monitor, "_get_ollama_status", return_value=down_status):
+        with patch.object(monitor, "_restart_ollama", return_value=True):
+            outcome = await monitor._check_ollama()
+
+    assert outcome.level == HealthLevel.WARNING
+    assert outcome.auto_resolved is True
+
+
+@pytest.mark.asyncio
+async def test_check_ollama_unreachable_restart_fails(monitor):
+    """Unreachable Ollama with ``_restart_ollama`` returning False is CRITICAL."""
+    down_status = {"reachable": False, "models": [], "loaded_models": []}
+    with patch.object(monitor, "_get_ollama_status", return_value=down_status):
+        with patch.object(monitor, "_restart_ollama", return_value=False):
+            outcome = await monitor._check_ollama()
+
+    assert outcome.level == HealthLevel.CRITICAL
+    assert outcome.needs_human is True
+
+
+# ── Process check ─────────────────────────────────────────────────────────────
+
+
+def test_get_zombie_processes_none(monitor):
+    """A ``ps`` listing whose STAT column shows only ``S`` reports no zombies."""
+    ps_output = (
+        "USER  PID  %CPU  %MEM  VSZ   RSS  TT  STAT  STARTED  TIME  COMMAND\n"
+        "alex  123   0.1   0.2  100   200  s0  S      1:00   0:01  python\n"
+        "alex  456   0.0   0.1   50   100  s0  S      1:01   0:00  bash\n"
+    )
+    fake_proc = MagicMock(stdout=ps_output)
+
+    with patch("subprocess.run", return_value=fake_proc):
+        info = monitor._get_zombie_processes()
+
+    assert info["zombies"] == []
+
+
+def test_get_zombie_processes_found(monitor):
+    """A ``ps`` listing with one ``Z``-state row reports that row's PID."""
+    ps_output = (
+        "USER  PID  %CPU  %MEM  VSZ   RSS  TT  STAT  STARTED  TIME  COMMAND\n"
+        "alex  123   0.1   0.2  100   200  s0  S      1:00   0:01  python\n"
+        "alex  789   0.0   0.0    0     0  s0  Z      1:02   0:00  defunct\n"
+    )
+    fake_proc = MagicMock(stdout=ps_output)
+
+    with patch("subprocess.run", return_value=fake_proc):
+        info = monitor._get_zombie_processes()
+
+    found = info["zombies"]
+    assert len(found) == 1
+    assert found[0]["pid"] == "789"
+
+
+@pytest.mark.asyncio
+async def test_check_processes_no_zombies(monitor):
+    """An empty zombie list makes the process check report OK."""
+    nothing_found = {"zombies": []}
+    with patch.object(monitor, "_get_zombie_processes", return_value=nothing_found):
+        outcome = await monitor._check_processes()
+
+    assert outcome.level == HealthLevel.OK
+
+
+@pytest.mark.asyncio
+async def test_check_processes_zombies_warning(monitor):
+    """Two zombies produce a WARNING that does not ask for a human."""
+    scan = {
+        "zombies": [
+            {"pid": "100", "command": "defunct"},
+            {"pid": "101", "command": "defunct"},
+        ]
+    }
+    with patch.object(monitor, "_get_zombie_processes", return_value=scan):
+        outcome = await monitor._check_processes()
+
+    assert outcome.level == HealthLevel.WARNING
+    # Two zombies are expected to stay below the escalation point
+    # (NOTE(review): the exact threshold lives in the monitor, not shown here).
+    assert outcome.needs_human is False
+
+
+@pytest.mark.asyncio
+async def test_check_processes_many_zombies_needs_human(monitor):
+    """Five zombies make the process check ask for human attention."""
+    scan = {"zombies": [{"pid": pid, "command": "defunct"} for pid in map(str, range(5))]}
+    with patch.object(monitor, "_get_zombie_processes", return_value=scan):
+        outcome = await monitor._check_processes()
+
+    assert outcome.needs_human is True
+
+
+# ── Network check ─────────────────────────────────────────────────────────────
+
+
+def test_check_gitea_connectivity_ok(monitor):
+    """A 200 reply from the patched ``urlopen`` is reported as reachable."""
+    payload = json.dumps({"version": "1.22.0"}).encode()
+    reply = _FakeHTTPResponse(payload, status=200)
+
+    with patch("urllib.request.urlopen", return_value=reply):
+        info = monitor._check_gitea_connectivity()
+
+    assert info["reachable"] is True
+    assert info["latency_ms"] >= 0
+
+
+def test_check_gitea_connectivity_unreachable(monitor):
+    """An ``OSError`` from ``urlopen`` is reported as unreachable with an error key."""
+    refused = patch("urllib.request.urlopen", side_effect=OSError("refused"))
+    with refused:
+        info = monitor._check_gitea_connectivity()
+
+    assert info["reachable"] is False
+    assert "error" in info
+
+
+@pytest.mark.asyncio
+async def test_check_network_ok(monitor):
+    """A reachable Gitea probe makes the network check report OK."""
+    probe = {"reachable": True, "latency_ms": 5.0, "url": "http://localhost:3000"}
+    with patch.object(monitor, "_check_gitea_connectivity", return_value=probe):
+        outcome = await monitor._check_network()
+
+    assert outcome.level == HealthLevel.OK
+    assert "Gitea reachable" in outcome.message
+
+
+@pytest.mark.asyncio
+async def test_check_network_unreachable(monitor):
+    """An unreachable Gitea probe gives a WARNING that asks for a human."""
+    probe = {"reachable": False, "error": "refused", "url": "http://localhost:3000"}
+    with patch.object(monitor, "_check_gitea_connectivity", return_value=probe):
+        outcome = await monitor._check_network()
+
+    assert outcome.level == HealthLevel.WARNING
+    assert outcome.needs_human is True
+
+
+# ── Full cycle ────────────────────────────────────────────────────────────────
+
+
+@pytest.mark.asyncio
+async def test_run_cycle_all_ok(monitor):
+    """When every check returns OK, the cycle report is OK and is remembered."""
+    shared_ok = CheckResult(name="test", level=HealthLevel.OK, message="ok")
+
+    async def _always_ok():
+        # Every patched check hands back the very same result object.
+        return shared_ok
+
+    with (
+        patch.object(monitor, "_check_memory", _always_ok),
+        patch.object(monitor, "_check_disk", _always_ok),
+        patch.object(monitor, "_check_ollama", _always_ok),
+        patch.object(monitor, "_check_processes", _always_ok),
+        patch.object(monitor, "_check_network", _always_ok),
+        patch.object(monitor, "_handle_alerts"),
+    ):
+        cycle_report = await monitor.run_cycle()
+
+    assert cycle_report.overall == HealthLevel.OK
+    assert not cycle_report.has_issues
+    assert monitor.last_report is cycle_report
+
+
+@pytest.mark.asyncio
+async def test_run_cycle_sets_overall_to_worst(monitor):
+    """One CRITICAL check among OK checks makes the whole report CRITICAL."""
+
+    async def _healthy():
+        return CheckResult(name="ok", level=HealthLevel.OK, message="ok")
+
+    async def _failing():
+        return CheckResult(name="critical", level=HealthLevel.CRITICAL, message="bad")
+
+    # Only the disk check is swapped for the failing stub.
+    with (
+        patch.object(monitor, "_check_memory", _healthy),
+        patch.object(monitor, "_check_disk", _failing),
+        patch.object(monitor, "_check_ollama", _healthy),
+        patch.object(monitor, "_check_processes", _healthy),
+        patch.object(monitor, "_check_network", _healthy),
+        patch.object(monitor, "_handle_alerts"),
+    ):
+        cycle_report = await monitor.run_cycle()
+
+    assert cycle_report.overall == HealthLevel.CRITICAL
+    assert cycle_report.has_issues is True
+
+
+@pytest.mark.asyncio
+async def test_run_cycle_exception_becomes_unknown(monitor):
+    """A check that raises shows up in the report as an UNKNOWN-level entry."""
+
+    async def _healthy():
+        return CheckResult(name="ok", level=HealthLevel.OK, message="ok")
+
+    async def _raises():
+        raise RuntimeError("unexpected error")
+
+    # Only the Ollama check is swapped for the raising stub.
+    with (
+        patch.object(monitor, "_check_memory", _healthy),
+        patch.object(monitor, "_check_disk", _healthy),
+        patch.object(monitor, "_check_ollama", _raises),
+        patch.object(monitor, "_check_processes", _healthy),
+        patch.object(monitor, "_check_network", _healthy),
+        patch.object(monitor, "_handle_alerts"),
+    ):
+        cycle_report = await monitor.run_cycle()
+
+    observed_levels = {check.level for check in cycle_report.checks}
+    assert HealthLevel.UNKNOWN in observed_levels
+
+
+# ── to_dict serialisation ────────────────────────────────────────────────────
+
+
+def test_check_result_to_dict():
+    """``CheckResult.to_dict`` exposes name, lowercase level, flag and details."""
+    fields = dict(
+        name="memory",
+        level=HealthLevel.WARNING,
+        message="low",
+        details={"free_gb": 3.5},
+        auto_resolved=True,
+    )
+    serialised = CheckResult(**fields).to_dict()
+
+    assert serialised["name"] == "memory"
+    assert serialised["level"] == "warning"
+    assert serialised["auto_resolved"] is True
+    assert serialised["details"]["free_gb"] == 3.5
+
+
+def test_health_report_to_dict():
+    """``HealthReport.to_dict`` exposes overall level, issue flag and checks."""
+    only_check = CheckResult(name="disk", level=HealthLevel.OK, message="ok")
+    report = HealthReport(
+        timestamp="2026-01-01T00:00:00+00:00",
+        checks=[only_check],
+        overall=HealthLevel.OK,
+    )
+
+    serialised = report.to_dict()
+
+    assert serialised["overall"] == "ok"
+    assert serialised["has_issues"] is False
+    assert len(serialised["checks"]) == 1