Co-authored-by: Claude (Opus 4.6) <claude@hermes.local> Co-committed-by: Claude (Opus 4.6) <claude@hermes.local>
This commit was merged in pull request #1133.
This commit is contained in:
452
tests/unit/test_hermes_monitor.py
Normal file
452
tests/unit/test_hermes_monitor.py
Normal file
@@ -0,0 +1,452 @@
|
||||
"""Unit tests for the Hermes health monitor.
|
||||
|
||||
Tests all five checks (memory, disk, Ollama, processes, network) using mocks
|
||||
so no real subprocesses or network calls are made.
|
||||
|
||||
Refs: #1073
|
||||
"""
|
||||
|
||||
import json
|
||||
from io import BytesIO
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from infrastructure.hermes.monitor import CheckResult, HealthLevel, HealthReport, HermesMonitor
|
||||
|
||||
|
||||
@pytest.fixture
def monitor():
    """Return a fresh HermesMonitor for each test."""
    return HermesMonitor()
|
||||
|
||||
|
||||
# ── Unit helpers ──────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class _FakeHTTPResponse:
|
||||
"""Minimal urllib response stub."""
|
||||
|
||||
def __init__(self, body: bytes, status: int = 200):
|
||||
self._body = body
|
||||
self.status = status
|
||||
|
||||
def read(self) -> bytes:
|
||||
return self._body
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, *_):
|
||||
pass
|
||||
|
||||
|
||||
# ── Memory check ──────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_get_memory_info_parses_vm_stat(monitor):
    """Total comes from sysctl; free = (free + inactive) pages * page size."""
    vm_stat_output = (
        "Mach Virtual Memory Statistics: (page size of 16384 bytes)\n"
        "Pages free: 12800.\n"
        "Pages active: 50000.\n"
        "Pages inactive: 25600.\n"
        "Pages speculative: 1000.\n"
    )
    with patch("subprocess.run") as mock_run:
        # First call: sysctl hw.memsize (total)
        sysctl_result = MagicMock()
        sysctl_result.stdout = "68719476736\n"  # 64 GB
        # Second call: vm_stat
        vmstat_result = MagicMock()
        vmstat_result.stdout = vm_stat_output
        # Call order matters: sysctl first, vm_stat second.
        mock_run.side_effect = [sysctl_result, vmstat_result]

        info = monitor._get_memory_info()

    assert info["total_gb"] == pytest.approx(64.0, abs=0.1)
    # pages free (12800) + inactive (25600) = 38400 * 16384 bytes = 629145600 bytes ≈ 0.586 GB
    expected_free_gb = (38400 * 16384) / (1024**3)
    assert info["free_gb"] == pytest.approx(expected_free_gb, abs=0.001)
|
||||
|
||||
|
||||
def test_get_memory_info_handles_subprocess_failure(monitor):
    """A failing subprocess degrades gracefully to zeroed figures."""
    with patch("subprocess.run", side_effect=OSError("no sysctl")):
        info = monitor._get_memory_info()

    for key in ("total_gb", "free_gb"):
        assert info[key] == 0.0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_check_memory_ok(monitor):
    """Ample free memory yields an OK result that reports the free amount."""
    fake_info = {"free_gb": 20.0, "total_gb": 64.0}
    with patch.object(monitor, "_get_memory_info", return_value=fake_info):
        result = await monitor._check_memory()

    assert result.name == "memory"
    assert result.level == HealthLevel.OK
    assert "20.0GB" in result.message
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_check_memory_low_triggers_unload(monitor):
    """Low memory auto-unloads Ollama models and reports a resolved warning."""
    low_info = {"free_gb": 2.0, "total_gb": 64.0}
    with (
        patch.object(monitor, "_get_memory_info", return_value=low_info),
        patch.object(monitor, "_unload_ollama_models", return_value=2),
    ):
        result = await monitor._check_memory()

    assert result.level == HealthLevel.WARNING
    assert result.auto_resolved is True
    assert "unloaded 2" in result.message
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_check_memory_critical_no_models_to_unload(monitor):
    """Critically low memory with nothing to unload escalates to a human."""
    critical_info = {"free_gb": 1.0, "total_gb": 64.0}
    with (
        patch.object(monitor, "_get_memory_info", return_value=critical_info),
        patch.object(monitor, "_unload_ollama_models", return_value=0),
    ):
        result = await monitor._check_memory()

    assert result.level == HealthLevel.CRITICAL
    assert result.needs_human is True
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_check_memory_exception_returns_unknown(monitor):
    """An unexpected error inside the check maps to UNKNOWN, not a crash."""
    with patch.object(monitor, "_get_memory_info", side_effect=RuntimeError("boom")):
        result = await monitor._check_memory()

    assert result.level == HealthLevel.UNKNOWN
|
||||
|
||||
|
||||
# ── Disk check ────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_check_disk_ok(monitor):
    """Plenty of free disk yields an OK result naming the free amount."""
    gib = 1024**3
    usage = MagicMock(free=100 * gib, total=500 * gib, used=400 * gib)

    with patch("shutil.disk_usage", return_value=usage):
        result = await monitor._check_disk()

    assert result.level == HealthLevel.OK
    assert "100.0GB free" in result.message
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_check_disk_low_triggers_cleanup(monitor):
    """Low disk triggers temp-file cleanup and reports a resolved warning."""
    gib = 1024**3
    # 5 GB free is below the monitor's threshold.
    usage = MagicMock(free=5 * gib, total=500 * gib, used=495 * gib)

    with (
        patch("shutil.disk_usage", return_value=usage),
        patch.object(monitor, "_cleanup_temp_files", return_value=2.5),
    ):
        result = await monitor._check_disk()

    assert result.level == HealthLevel.WARNING
    assert result.auto_resolved is True
    assert "cleaned 2.50GB" in result.message
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_check_disk_critical_when_cleanup_fails(monitor):
    """When cleanup recovers nothing, low disk escalates to CRITICAL."""
    gib = 1024**3
    usage = MagicMock(free=5 * gib, total=500 * gib, used=495 * gib)

    with (
        patch("shutil.disk_usage", return_value=usage),
        patch.object(monitor, "_cleanup_temp_files", return_value=0.0),
    ):
        result = await monitor._check_disk()

    assert result.level == HealthLevel.CRITICAL
    assert result.needs_human is True
|
||||
|
||||
|
||||
# ── Ollama check ──────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_get_ollama_status_reachable(monitor):
    """When /api/tags and /api/ps both answer, status mirrors their payloads."""
    tags_payload = {"models": [{"name": "qwen3:30b"}, {"name": "llama3.1:8b"}]}
    ps_payload = {"models": [{"name": "qwen3:30b", "size": 1000}]}

    # urlopen is called twice: tags first, then ps — order matters.
    responses = [
        _FakeHTTPResponse(json.dumps(tags_payload).encode()),
        _FakeHTTPResponse(json.dumps(ps_payload).encode()),
    ]

    with patch("urllib.request.urlopen", side_effect=responses):
        status = monitor._get_ollama_status()

    assert status["reachable"] is True
    assert len(status["models"]) == 2
    assert len(status["loaded_models"]) == 1
|
||||
|
||||
|
||||
def test_get_ollama_status_unreachable(monitor):
    """A connection failure yields an unreachable status with empty model lists."""
    with patch("urllib.request.urlopen", side_effect=OSError("connection refused")):
        status = monitor._get_ollama_status()

    assert status["reachable"] is False
    assert status["models"] == []
    assert status["loaded_models"] == []
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_check_ollama_ok(monitor):
    """A reachable Ollama with models installed passes the check."""
    fake_status = {
        "reachable": True,
        "models": [{"name": "qwen3:30b"}],
        "loaded_models": [],
    }
    with patch.object(monitor, "_get_ollama_status", return_value=fake_status):
        result = await monitor._check_ollama()

    assert result.level == HealthLevel.OK
    assert result.details["reachable"] is True
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_check_ollama_unreachable_restart_success(monitor):
    """An unreachable Ollama that restarts cleanly is a resolved warning."""
    down = {"reachable": False, "models": [], "loaded_models": []}
    with (
        patch.object(monitor, "_get_ollama_status", return_value=down),
        patch.object(monitor, "_restart_ollama", return_value=True),
    ):
        result = await monitor._check_ollama()

    assert result.level == HealthLevel.WARNING
    assert result.auto_resolved is True
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_check_ollama_unreachable_restart_fails(monitor):
    """A failed restart of an unreachable Ollama escalates to a human."""
    down = {"reachable": False, "models": [], "loaded_models": []}
    with (
        patch.object(monitor, "_get_ollama_status", return_value=down),
        patch.object(monitor, "_restart_ollama", return_value=False),
    ):
        result = await monitor._check_ollama()

    assert result.level == HealthLevel.CRITICAL
    assert result.needs_human is True
|
||||
|
||||
|
||||
# ── Process check ─────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_get_zombie_processes_none(monitor):
    """ps output with no 'Z' state rows produces an empty zombie list."""
    ps_output = (
        "USER PID %CPU %MEM VSZ RSS TT STAT STARTED TIME COMMAND\n"
        "alex 123 0.1 0.2 100 200 s0 S 1:00 0:01 python\n"
        "alex 456 0.0 0.1 50 100 s0 S 1:01 0:00 bash\n"
    )
    fake_run = MagicMock(stdout=ps_output)
    with patch("subprocess.run", return_value=fake_run):
        info = monitor._get_zombie_processes()

    assert info["zombies"] == []
|
||||
|
||||
|
||||
def test_get_zombie_processes_found(monitor):
    """A row with STAT 'Z' is reported as a zombie with its PID."""
    ps_output = (
        "USER PID %CPU %MEM VSZ RSS TT STAT STARTED TIME COMMAND\n"
        "alex 123 0.1 0.2 100 200 s0 S 1:00 0:01 python\n"
        "alex 789 0.0 0.0 0 0 s0 Z 1:02 0:00 defunct\n"
    )
    fake_run = MagicMock(stdout=ps_output)
    with patch("subprocess.run", return_value=fake_run):
        info = monitor._get_zombie_processes()

    assert len(info["zombies"]) == 1
    assert info["zombies"][0]["pid"] == "789"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_check_processes_no_zombies(monitor):
    """No zombies means the process check is OK."""
    with patch.object(monitor, "_get_zombie_processes", return_value={"zombies": []}):
        result = await monitor._check_processes()

    assert result.level == HealthLevel.OK
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_check_processes_zombies_warning(monitor):
    """A couple of zombies warn but stay below the needs-human threshold."""
    two_zombies = [
        {"pid": "100", "command": "defunct"},
        {"pid": "101", "command": "defunct"},
    ]
    fake = {"zombies": two_zombies}
    with patch.object(monitor, "_get_zombie_processes", return_value=fake):
        result = await monitor._check_processes()

    assert result.level == HealthLevel.WARNING
    assert result.needs_human is False  # Only 2, threshold is >3
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_check_processes_many_zombies_needs_human(monitor):
    """More zombies than the threshold flags the check for a human."""
    many = {"zombies": [{"pid": str(n), "command": "defunct"} for n in range(5)]}
    with patch.object(monitor, "_get_zombie_processes", return_value=many):
        result = await monitor._check_processes()

    assert result.needs_human is True
|
||||
|
||||
|
||||
# ── Network check ─────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_check_gitea_connectivity_ok(monitor):
    """A 200 response marks Gitea reachable with a non-negative latency."""
    body = json.dumps({"version": "1.22.0"}).encode()
    fake_response = _FakeHTTPResponse(body, status=200)
    with patch("urllib.request.urlopen", return_value=fake_response):
        info = monitor._check_gitea_connectivity()

    assert info["reachable"] is True
    assert info["latency_ms"] >= 0
|
||||
|
||||
|
||||
def test_check_gitea_connectivity_unreachable(monitor):
    """A connection failure yields reachable=False and an error entry."""
    with patch("urllib.request.urlopen", side_effect=OSError("refused")):
        info = monitor._check_gitea_connectivity()

    assert info["reachable"] is False
    assert "error" in info
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_check_network_ok(monitor):
    """Reachable Gitea passes the network check and is named in the message."""
    connectivity = {
        "reachable": True,
        "latency_ms": 5.0,
        "url": "http://localhost:3000",
    }
    with patch.object(monitor, "_check_gitea_connectivity", return_value=connectivity):
        result = await monitor._check_network()

    assert result.level == HealthLevel.OK
    assert "Gitea reachable" in result.message
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_check_network_unreachable(monitor):
    """Unreachable Gitea warns and flags the check for a human."""
    connectivity = {
        "reachable": False,
        "error": "refused",
        "url": "http://localhost:3000",
    }
    with patch.object(monitor, "_check_gitea_connectivity", return_value=connectivity):
        result = await monitor._check_network()

    assert result.level == HealthLevel.WARNING
    assert result.needs_human is True
|
||||
|
||||
|
||||
# ── Full cycle ────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_run_cycle_all_ok(monitor):
    """A cycle where every check passes is OK overall and caches the report."""
    passing = CheckResult(name="test", level=HealthLevel.OK, message="ok")

    async def _passing_check():
        return passing

    with (
        patch.object(monitor, "_check_memory", _passing_check),
        patch.object(monitor, "_check_disk", _passing_check),
        patch.object(monitor, "_check_ollama", _passing_check),
        patch.object(monitor, "_check_processes", _passing_check),
        patch.object(monitor, "_check_network", _passing_check),
        patch.object(monitor, "_handle_alerts"),
    ):
        report = await monitor.run_cycle()

    assert report.overall == HealthLevel.OK
    assert not report.has_issues
    assert monitor.last_report is report
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_run_cycle_sets_overall_to_worst(monitor):
    """Overall health is the worst level among the individual checks."""

    async def _healthy():
        return CheckResult(name="ok", level=HealthLevel.OK, message="ok")

    async def _failing():
        return CheckResult(name="critical", level=HealthLevel.CRITICAL, message="bad")

    with (
        patch.object(monitor, "_check_memory", _healthy),
        patch.object(monitor, "_check_disk", _failing),
        patch.object(monitor, "_check_ollama", _healthy),
        patch.object(monitor, "_check_processes", _healthy),
        patch.object(monitor, "_check_network", _healthy),
        patch.object(monitor, "_handle_alerts"),
    ):
        report = await monitor.run_cycle()

    assert report.overall == HealthLevel.CRITICAL
    assert report.has_issues is True
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_run_cycle_exception_becomes_unknown(monitor):
    """A check that raises is recorded as UNKNOWN rather than aborting the cycle."""

    async def _healthy():
        return CheckResult(name="ok", level=HealthLevel.OK, message="ok")

    async def _raises():
        raise RuntimeError("unexpected error")

    with (
        patch.object(monitor, "_check_memory", _healthy),
        patch.object(monitor, "_check_disk", _healthy),
        patch.object(monitor, "_check_ollama", _raises),
        patch.object(monitor, "_check_processes", _healthy),
        patch.object(monitor, "_check_network", _healthy),
        patch.object(monitor, "_handle_alerts"),
    ):
        report = await monitor.run_cycle()

    observed_levels = {check.level for check in report.checks}
    assert HealthLevel.UNKNOWN in observed_levels
|
||||
|
||||
|
||||
# ── to_dict serialisation ────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_check_result_to_dict():
    """CheckResult.to_dict serialises the level as a string and keeps fields."""
    check = CheckResult(
        name="memory",
        level=HealthLevel.WARNING,
        message="low",
        details={"free_gb": 3.5},
        auto_resolved=True,
    )

    serialised = check.to_dict()

    assert serialised["name"] == "memory"
    assert serialised["level"] == "warning"
    assert serialised["auto_resolved"] is True
    assert serialised["details"]["free_gb"] == 3.5
|
||||
|
||||
|
||||
def test_health_report_to_dict():
    """HealthReport.to_dict exposes overall, has_issues and serialised checks."""
    report = HealthReport(
        timestamp="2026-01-01T00:00:00+00:00",
        checks=[CheckResult(name="disk", level=HealthLevel.OK, message="ok")],
        overall=HealthLevel.OK,
    )

    serialised = report.to_dict()

    assert serialised["overall"] == "ok"
    assert serialised["has_issues"] is False
    assert len(serialised["checks"]) == 1
|
||||
Reference in New Issue
Block a user