[claude] Hermes health monitor — system resources + model management (#1073) (#1133)
Some checks failed
Tests / lint (push) Has been cancelled
Tests / test (push) Has been cancelled

Co-authored-by: Claude (Opus 4.6) <claude@hermes.local>
Co-committed-by: Claude (Opus 4.6) <claude@hermes.local>
This commit was merged in pull request #1133.
This commit is contained in:
2026-03-23 18:36:06 +00:00
committed by rockachopa
parent 05e1196ea4
commit 1c1bfb6407
6 changed files with 1214 additions and 0 deletions

View File

@@ -0,0 +1,452 @@
"""Unit tests for the Hermes health monitor.
Tests all five checks (memory, disk, Ollama, processes, network) using mocks
so no real subprocesses or network calls are made.
Refs: #1073
"""
import json
from io import BytesIO
from unittest.mock import MagicMock, patch
import pytest
from infrastructure.hermes.monitor import CheckResult, HealthLevel, HealthReport, HermesMonitor
@pytest.fixture()
def monitor():
return HermesMonitor()
# ── Unit helpers ──────────────────────────────────────────────────────────────
class _FakeHTTPResponse:
"""Minimal urllib response stub."""
def __init__(self, body: bytes, status: int = 200):
self._body = body
self.status = status
def read(self) -> bytes:
return self._body
def __enter__(self):
return self
def __exit__(self, *_):
pass
# ── Memory check ──────────────────────────────────────────────────────────────
def test_get_memory_info_parses_vm_stat(monitor):
vm_stat_output = (
"Mach Virtual Memory Statistics: (page size of 16384 bytes)\n"
"Pages free: 12800.\n"
"Pages active: 50000.\n"
"Pages inactive: 25600.\n"
"Pages speculative: 1000.\n"
)
with (
patch("subprocess.run") as mock_run,
):
# First call: sysctl hw.memsize (total)
sysctl_result = MagicMock()
sysctl_result.stdout = "68719476736\n" # 64 GB
# Second call: vm_stat
vmstat_result = MagicMock()
vmstat_result.stdout = vm_stat_output
mock_run.side_effect = [sysctl_result, vmstat_result]
info = monitor._get_memory_info()
assert info["total_gb"] == pytest.approx(64.0, abs=0.1)
# pages free (12800) + inactive (25600) = 38400 * 16384 bytes = 629145600 bytes ≈ 0.586 GB
expected_free_gb = (38400 * 16384) / (1024**3)
assert info["free_gb"] == pytest.approx(expected_free_gb, abs=0.001)
def test_get_memory_info_handles_subprocess_failure(monitor):
with patch("subprocess.run", side_effect=OSError("no sysctl")):
info = monitor._get_memory_info()
assert info["total_gb"] == 0.0
assert info["free_gb"] == 0.0
@pytest.mark.asyncio
async def test_check_memory_ok(monitor):
with patch.object(monitor, "_get_memory_info", return_value={"free_gb": 20.0, "total_gb": 64.0}):
result = await monitor._check_memory()
assert result.name == "memory"
assert result.level == HealthLevel.OK
assert "20.0GB" in result.message
@pytest.mark.asyncio
async def test_check_memory_low_triggers_unload(monitor):
with (
patch.object(monitor, "_get_memory_info", return_value={"free_gb": 2.0, "total_gb": 64.0}),
patch.object(monitor, "_unload_ollama_models", return_value=2),
):
result = await monitor._check_memory()
assert result.level == HealthLevel.WARNING
assert result.auto_resolved is True
assert "unloaded 2" in result.message
@pytest.mark.asyncio
async def test_check_memory_critical_no_models_to_unload(monitor):
with (
patch.object(monitor, "_get_memory_info", return_value={"free_gb": 1.0, "total_gb": 64.0}),
patch.object(monitor, "_unload_ollama_models", return_value=0),
):
result = await monitor._check_memory()
assert result.level == HealthLevel.CRITICAL
assert result.needs_human is True
@pytest.mark.asyncio
async def test_check_memory_exception_returns_unknown(monitor):
with patch.object(monitor, "_get_memory_info", side_effect=RuntimeError("boom")):
result = await monitor._check_memory()
assert result.level == HealthLevel.UNKNOWN
# ── Disk check ────────────────────────────────────────────────────────────────
@pytest.mark.asyncio
async def test_check_disk_ok(monitor):
usage = MagicMock()
usage.free = 100 * (1024**3) # 100 GB
usage.total = 500 * (1024**3) # 500 GB
usage.used = 400 * (1024**3)
with patch("shutil.disk_usage", return_value=usage):
result = await monitor._check_disk()
assert result.level == HealthLevel.OK
assert "100.0GB free" in result.message
@pytest.mark.asyncio
async def test_check_disk_low_triggers_cleanup(monitor):
usage = MagicMock()
usage.free = 5 * (1024**3) # 5 GB — below threshold
usage.total = 500 * (1024**3)
usage.used = 495 * (1024**3)
with (
patch("shutil.disk_usage", return_value=usage),
patch.object(monitor, "_cleanup_temp_files", return_value=2.5),
):
result = await monitor._check_disk()
assert result.level == HealthLevel.WARNING
assert result.auto_resolved is True
assert "cleaned 2.50GB" in result.message
@pytest.mark.asyncio
async def test_check_disk_critical_when_cleanup_fails(monitor):
usage = MagicMock()
usage.free = 5 * (1024**3)
usage.total = 500 * (1024**3)
usage.used = 495 * (1024**3)
with (
patch("shutil.disk_usage", return_value=usage),
patch.object(monitor, "_cleanup_temp_files", return_value=0.0),
):
result = await monitor._check_disk()
assert result.level == HealthLevel.CRITICAL
assert result.needs_human is True
# ── Ollama check ──────────────────────────────────────────────────────────────
def test_get_ollama_status_reachable(monitor):
tags_body = json.dumps({
"models": [{"name": "qwen3:30b"}, {"name": "llama3.1:8b"}]
}).encode()
ps_body = json.dumps({
"models": [{"name": "qwen3:30b", "size": 1000}]
}).encode()
responses = [
_FakeHTTPResponse(tags_body),
_FakeHTTPResponse(ps_body),
]
with patch("urllib.request.urlopen", side_effect=responses):
status = monitor._get_ollama_status()
assert status["reachable"] is True
assert len(status["models"]) == 2
assert len(status["loaded_models"]) == 1
def test_get_ollama_status_unreachable(monitor):
with patch("urllib.request.urlopen", side_effect=OSError("connection refused")):
status = monitor._get_ollama_status()
assert status["reachable"] is False
assert status["models"] == []
assert status["loaded_models"] == []
@pytest.mark.asyncio
async def test_check_ollama_ok(monitor):
status = {
"reachable": True,
"models": [{"name": "qwen3:30b"}],
"loaded_models": [],
}
with patch.object(monitor, "_get_ollama_status", return_value=status):
result = await monitor._check_ollama()
assert result.level == HealthLevel.OK
assert result.details["reachable"] is True
@pytest.mark.asyncio
async def test_check_ollama_unreachable_restart_success(monitor):
status = {"reachable": False, "models": [], "loaded_models": []}
with (
patch.object(monitor, "_get_ollama_status", return_value=status),
patch.object(monitor, "_restart_ollama", return_value=True),
):
result = await monitor._check_ollama()
assert result.level == HealthLevel.WARNING
assert result.auto_resolved is True
@pytest.mark.asyncio
async def test_check_ollama_unreachable_restart_fails(monitor):
status = {"reachable": False, "models": [], "loaded_models": []}
with (
patch.object(monitor, "_get_ollama_status", return_value=status),
patch.object(monitor, "_restart_ollama", return_value=False),
):
result = await monitor._check_ollama()
assert result.level == HealthLevel.CRITICAL
assert result.needs_human is True
# ── Process check ─────────────────────────────────────────────────────────────
def test_get_zombie_processes_none(monitor):
ps_output = (
"USER PID %CPU %MEM VSZ RSS TT STAT STARTED TIME COMMAND\n"
"alex 123 0.1 0.2 100 200 s0 S 1:00 0:01 python\n"
"alex 456 0.0 0.1 50 100 s0 S 1:01 0:00 bash\n"
)
result = MagicMock()
result.stdout = ps_output
with patch("subprocess.run", return_value=result):
info = monitor._get_zombie_processes()
assert info["zombies"] == []
def test_get_zombie_processes_found(monitor):
ps_output = (
"USER PID %CPU %MEM VSZ RSS TT STAT STARTED TIME COMMAND\n"
"alex 123 0.1 0.2 100 200 s0 S 1:00 0:01 python\n"
"alex 789 0.0 0.0 0 0 s0 Z 1:02 0:00 defunct\n"
)
result = MagicMock()
result.stdout = ps_output
with patch("subprocess.run", return_value=result):
info = monitor._get_zombie_processes()
assert len(info["zombies"]) == 1
assert info["zombies"][0]["pid"] == "789"
@pytest.mark.asyncio
async def test_check_processes_no_zombies(monitor):
with patch.object(monitor, "_get_zombie_processes", return_value={"zombies": []}):
result = await monitor._check_processes()
assert result.level == HealthLevel.OK
@pytest.mark.asyncio
async def test_check_processes_zombies_warning(monitor):
zombies = [{"pid": "100", "command": "defunct"}, {"pid": "101", "command": "defunct"}]
with patch.object(monitor, "_get_zombie_processes", return_value={"zombies": zombies}):
result = await monitor._check_processes()
assert result.level == HealthLevel.WARNING
assert result.needs_human is False # Only 2, threshold is >3
@pytest.mark.asyncio
async def test_check_processes_many_zombies_needs_human(monitor):
zombies = [{"pid": str(i), "command": "defunct"} for i in range(5)]
with patch.object(monitor, "_get_zombie_processes", return_value={"zombies": zombies}):
result = await monitor._check_processes()
assert result.needs_human is True
# ── Network check ─────────────────────────────────────────────────────────────
def test_check_gitea_connectivity_ok(monitor):
body = json.dumps({"version": "1.22.0"}).encode()
with patch("urllib.request.urlopen", return_value=_FakeHTTPResponse(body, status=200)):
info = monitor._check_gitea_connectivity()
assert info["reachable"] is True
assert info["latency_ms"] >= 0
def test_check_gitea_connectivity_unreachable(monitor):
with patch("urllib.request.urlopen", side_effect=OSError("refused")):
info = monitor._check_gitea_connectivity()
assert info["reachable"] is False
assert "error" in info
@pytest.mark.asyncio
async def test_check_network_ok(monitor):
with patch.object(
monitor,
"_check_gitea_connectivity",
return_value={"reachable": True, "latency_ms": 5.0, "url": "http://localhost:3000"},
):
result = await monitor._check_network()
assert result.level == HealthLevel.OK
assert "Gitea reachable" in result.message
@pytest.mark.asyncio
async def test_check_network_unreachable(monitor):
with patch.object(
monitor,
"_check_gitea_connectivity",
return_value={"reachable": False, "error": "refused", "url": "http://localhost:3000"},
):
result = await monitor._check_network()
assert result.level == HealthLevel.WARNING
assert result.needs_human is True
# ── Full cycle ────────────────────────────────────────────────────────────────
@pytest.mark.asyncio
async def test_run_cycle_all_ok(monitor):
ok_result = CheckResult(name="test", level=HealthLevel.OK, message="ok")
async def _ok_check():
return ok_result
with (
patch.object(monitor, "_check_memory", _ok_check),
patch.object(monitor, "_check_disk", _ok_check),
patch.object(monitor, "_check_ollama", _ok_check),
patch.object(monitor, "_check_processes", _ok_check),
patch.object(monitor, "_check_network", _ok_check),
patch.object(monitor, "_handle_alerts"),
):
report = await monitor.run_cycle()
assert report.overall == HealthLevel.OK
assert not report.has_issues
assert monitor.last_report is report
@pytest.mark.asyncio
async def test_run_cycle_sets_overall_to_worst(monitor):
async def _ok():
return CheckResult(name="ok", level=HealthLevel.OK, message="ok")
async def _critical():
return CheckResult(name="critical", level=HealthLevel.CRITICAL, message="bad")
with (
patch.object(monitor, "_check_memory", _ok),
patch.object(monitor, "_check_disk", _critical),
patch.object(monitor, "_check_ollama", _ok),
patch.object(monitor, "_check_processes", _ok),
patch.object(monitor, "_check_network", _ok),
patch.object(monitor, "_handle_alerts"),
):
report = await monitor.run_cycle()
assert report.overall == HealthLevel.CRITICAL
assert report.has_issues is True
@pytest.mark.asyncio
async def test_run_cycle_exception_becomes_unknown(monitor):
async def _ok():
return CheckResult(name="ok", level=HealthLevel.OK, message="ok")
async def _boom():
raise RuntimeError("unexpected error")
with (
patch.object(monitor, "_check_memory", _ok),
patch.object(monitor, "_check_disk", _ok),
patch.object(monitor, "_check_ollama", _boom),
patch.object(monitor, "_check_processes", _ok),
patch.object(monitor, "_check_network", _ok),
patch.object(monitor, "_handle_alerts"),
):
report = await monitor.run_cycle()
levels = {c.level for c in report.checks}
assert HealthLevel.UNKNOWN in levels
# ── to_dict serialisation ────────────────────────────────────────────────────
def test_check_result_to_dict():
c = CheckResult(
name="memory",
level=HealthLevel.WARNING,
message="low",
details={"free_gb": 3.5},
auto_resolved=True,
)
d = c.to_dict()
assert d["name"] == "memory"
assert d["level"] == "warning"
assert d["auto_resolved"] is True
assert d["details"]["free_gb"] == 3.5
def test_health_report_to_dict():
checks = [
CheckResult(name="disk", level=HealthLevel.OK, message="ok"),
]
report = HealthReport(
timestamp="2026-01-01T00:00:00+00:00",
checks=checks,
overall=HealthLevel.OK,
)
d = report.to_dict()
assert d["overall"] == "ok"
assert d["has_issues"] is False
assert len(d["checks"]) == 1