Files
Timmy-time-dashboard/tests/unit/test_hermes_monitor.py
Claude (Opus 4.6) 495c1ac2bd
Some checks failed
Tests / lint (push) Has been cancelled
Tests / test (push) Has been cancelled
[claude] Fix 27 ruff lint errors blocking all pushes (#1149) (#1153)
Co-authored-by: Claude (Opus 4.6) <claude@hermes.local>
Co-committed-by: Claude (Opus 4.6) <claude@hermes.local>
2026-03-23 19:06:11 +00:00

450 lines
15 KiB
Python

"""Unit tests for the Hermes health monitor.
Tests all five checks (memory, disk, Ollama, processes, network) using mocks
so no real subprocesses or network calls are made.
Refs: #1073
"""
import json
from unittest.mock import MagicMock, patch
import pytest
from infrastructure.hermes.monitor import CheckResult, HealthLevel, HealthReport, HermesMonitor
@pytest.fixture()
def monitor():
    """Provide a freshly constructed ``HermesMonitor`` for each test."""
    hermes_monitor = HermesMonitor()
    return hermes_monitor
# ── Unit helpers ──────────────────────────────────────────────────────────────
class _FakeHTTPResponse:
    """Minimal stand-in for the object returned by ``urllib.request.urlopen``.

    Only the surface the tests rely on is provided: a ``status`` attribute,
    a ``read()`` method returning the canned body, and the context-manager
    protocol so the stub can be used in a ``with`` statement.

    Args:
        body: Raw bytes handed back by ``read()``.
        status: HTTP status code exposed as ``status`` (defaults to 200).
    """

    def __init__(self, body: bytes, status: int = 200) -> None:
        self._body = body
        self.status = status

    def read(self) -> bytes:
        """Return the canned response body unchanged."""
        return self._body

    def __enter__(self) -> "_FakeHTTPResponse":
        """Return the stub itself as the ``with`` target."""
        return self

    def __exit__(self, *_: object) -> None:
        """Release nothing; a ``None`` return lets exceptions propagate."""
        return None
# ── Memory check ──────────────────────────────────────────────────────────────
def test_get_memory_info_parses_vm_stat(monitor):
    """Total and free memory are derived from mocked sysctl and vm_stat output."""
    fake_vm_stat = (
        "Mach Virtual Memory Statistics: (page size of 16384 bytes)\n"
        "Pages free: 12800.\n"
        "Pages active: 50000.\n"
        "Pages inactive: 25600.\n"
        "Pages speculative: 1000.\n"
    )
    # Replies are consumed in call order: sysctl hw.memsize (total), then vm_stat.
    canned_replies = [
        MagicMock(stdout="68719476736\n"),  # 64 GB
        MagicMock(stdout=fake_vm_stat),
    ]
    with patch("subprocess.run", side_effect=canned_replies):
        memory = monitor._get_memory_info()

    assert memory["total_gb"] == pytest.approx(64.0, abs=0.1)
    # Free (12800) + inactive (25600) pages at 16384 bytes each ≈ 0.586 GB.
    reclaimable_bytes = (12800 + 25600) * 16384
    assert memory["free_gb"] == pytest.approx(reclaimable_bytes / (1024**3), abs=0.001)
def test_get_memory_info_handles_subprocess_failure(monitor):
    """An ``OSError`` from ``subprocess.run`` yields zeroed memory figures."""
    failing_run = patch("subprocess.run", side_effect=OSError("no sysctl"))
    with failing_run:
        memory = monitor._get_memory_info()

    for key in ("total_gb", "free_gb"):
        assert memory[key] == 0.0
@pytest.mark.asyncio
async def test_check_memory_ok(monitor):
    """With 20 GB free the memory check is OK and reports the free amount."""
    healthy_memory = {"free_gb": 20.0, "total_gb": 64.0}
    with patch.object(monitor, "_get_memory_info", return_value=healthy_memory):
        outcome = await monitor._check_memory()

    assert outcome.name == "memory"
    assert outcome.level == HealthLevel.OK
    assert "20.0GB" in outcome.message
@pytest.mark.asyncio
async def test_check_memory_low_triggers_unload(monitor):
    """Low memory with two models unloaded is reported as an auto-resolved WARNING."""
    low_memory = patch.object(
        monitor, "_get_memory_info", return_value={"free_gb": 2.0, "total_gb": 64.0}
    )
    unload_two = patch.object(monitor, "_unload_ollama_models", return_value=2)
    with low_memory, unload_two:
        outcome = await monitor._check_memory()

    assert outcome.level == HealthLevel.WARNING
    assert outcome.auto_resolved is True
    assert "unloaded 2" in outcome.message
@pytest.mark.asyncio
async def test_check_memory_critical_no_models_to_unload(monitor):
    """Low memory with nothing unloaded escalates to CRITICAL and needs a human."""
    low_memory = patch.object(
        monitor, "_get_memory_info", return_value={"free_gb": 1.0, "total_gb": 64.0}
    )
    nothing_unloaded = patch.object(monitor, "_unload_ollama_models", return_value=0)
    with low_memory, nothing_unloaded:
        outcome = await monitor._check_memory()

    assert outcome.level == HealthLevel.CRITICAL
    assert outcome.needs_human is True
@pytest.mark.asyncio
async def test_check_memory_exception_returns_unknown(monitor):
    """An exception raised while gathering memory info maps to UNKNOWN."""
    exploding_probe = patch.object(
        monitor, "_get_memory_info", side_effect=RuntimeError("boom")
    )
    with exploding_probe:
        outcome = await monitor._check_memory()

    assert outcome.level == HealthLevel.UNKNOWN
# ── Disk check ────────────────────────────────────────────────────────────────
@pytest.mark.asyncio
async def test_check_disk_ok(monitor):
    """With 100 GB of 500 GB free the disk check is OK and reports free space."""
    gib = 1024**3
    fake_usage = MagicMock(
        free=100 * gib,  # 100 GB
        total=500 * gib,  # 500 GB
        used=400 * gib,
    )
    with patch("shutil.disk_usage", return_value=fake_usage):
        outcome = await monitor._check_disk()

    assert outcome.level == HealthLevel.OK
    assert "100.0GB free" in outcome.message
@pytest.mark.asyncio
async def test_check_disk_low_triggers_cleanup(monitor):
    """Low disk space with a successful cleanup is an auto-resolved WARNING."""
    gib = 1024**3
    # 5 GB free — below threshold
    fake_usage = MagicMock(free=5 * gib, total=500 * gib, used=495 * gib)
    nearly_full = patch("shutil.disk_usage", return_value=fake_usage)
    cleanup_frees_space = patch.object(monitor, "_cleanup_temp_files", return_value=2.5)
    with nearly_full, cleanup_frees_space:
        outcome = await monitor._check_disk()

    assert outcome.level == HealthLevel.WARNING
    assert outcome.auto_resolved is True
    assert "cleaned 2.50GB" in outcome.message
@pytest.mark.asyncio
async def test_check_disk_critical_when_cleanup_fails(monitor):
    """Low disk space with a cleanup that frees nothing is CRITICAL and needs a human."""
    gib = 1024**3
    fake_usage = MagicMock(free=5 * gib, total=500 * gib, used=495 * gib)
    nearly_full = patch("shutil.disk_usage", return_value=fake_usage)
    cleanup_frees_nothing = patch.object(monitor, "_cleanup_temp_files", return_value=0.0)
    with nearly_full, cleanup_frees_nothing:
        outcome = await monitor._check_disk()

    assert outcome.level == HealthLevel.CRITICAL
    assert outcome.needs_human is True
# ── Ollama check ──────────────────────────────────────────────────────────────
def test_get_ollama_status_reachable(monitor):
    """Two successful HTTP replies produce a reachable status with both model lists."""
    tags_payload = {"models": [{"name": "qwen3:30b"}, {"name": "llama3.1:8b"}]}
    ps_payload = {"models": [{"name": "qwen3:30b", "size": 1000}]}
    # urlopen hands these back in order: the tags reply first, then the ps reply.
    canned_replies = [
        _FakeHTTPResponse(json.dumps(payload).encode())
        for payload in (tags_payload, ps_payload)
    ]
    with patch("urllib.request.urlopen", side_effect=canned_replies):
        ollama = monitor._get_ollama_status()

    assert ollama["reachable"] is True
    assert len(ollama["models"]) == 2
    assert len(ollama["loaded_models"]) == 1
def test_get_ollama_status_unreachable(monitor):
    """A refused connection yields an unreachable status with empty model lists."""
    refused = patch("urllib.request.urlopen", side_effect=OSError("connection refused"))
    with refused:
        ollama = monitor._get_ollama_status()

    assert ollama["reachable"] is False
    for key in ("models", "loaded_models"):
        assert ollama[key] == []
@pytest.mark.asyncio
async def test_check_ollama_ok(monitor):
    """A reachable Ollama produces an OK result whose details record reachability."""
    reachable_status = dict(
        reachable=True,
        models=[{"name": "qwen3:30b"}],
        loaded_models=[],
    )
    with patch.object(monitor, "_get_ollama_status", return_value=reachable_status):
        outcome = await monitor._check_ollama()

    assert outcome.level == HealthLevel.OK
    assert outcome.details["reachable"] is True
@pytest.mark.asyncio
async def test_check_ollama_unreachable_restart_success(monitor):
    """An unreachable Ollama that restarts successfully is an auto-resolved WARNING."""
    down_status = dict(reachable=False, models=[], loaded_models=[])
    ollama_down = patch.object(monitor, "_get_ollama_status", return_value=down_status)
    restart_works = patch.object(monitor, "_restart_ollama", return_value=True)
    with ollama_down, restart_works:
        outcome = await monitor._check_ollama()

    assert outcome.level == HealthLevel.WARNING
    assert outcome.auto_resolved is True
@pytest.mark.asyncio
async def test_check_ollama_unreachable_restart_fails(monitor):
    """An unreachable Ollama whose restart fails is CRITICAL and needs a human."""
    down_status = dict(reachable=False, models=[], loaded_models=[])
    ollama_down = patch.object(monitor, "_get_ollama_status", return_value=down_status)
    restart_fails = patch.object(monitor, "_restart_ollama", return_value=False)
    with ollama_down, restart_fails:
        outcome = await monitor._check_ollama()

    assert outcome.level == HealthLevel.CRITICAL
    assert outcome.needs_human is True
# ── Process check ─────────────────────────────────────────────────────────────
def test_get_zombie_processes_none(monitor):
    """A process table with no ``Z`` state entries reports an empty zombie list."""
    process_table = (
        "USER PID %CPU %MEM VSZ RSS TT STAT STARTED TIME COMMAND\n"
        "alex 123 0.1 0.2 100 200 s0 S 1:00 0:01 python\n"
        "alex 456 0.0 0.1 50 100 s0 S 1:01 0:00 bash\n"
    )
    fake_ps = MagicMock(stdout=process_table)
    with patch("subprocess.run", return_value=fake_ps):
        found = monitor._get_zombie_processes()

    assert found["zombies"] == []
def test_get_zombie_processes_found(monitor):
    """A single ``Z`` state row is reported as one zombie carrying its PID as a string."""
    process_table = (
        "USER PID %CPU %MEM VSZ RSS TT STAT STARTED TIME COMMAND\n"
        "alex 123 0.1 0.2 100 200 s0 S 1:00 0:01 python\n"
        "alex 789 0.0 0.0 0 0 s0 Z 1:02 0:00 defunct\n"
    )
    fake_ps = MagicMock(stdout=process_table)
    with patch("subprocess.run", return_value=fake_ps):
        found = monitor._get_zombie_processes()

    zombie_rows = found["zombies"]
    assert len(zombie_rows) == 1
    assert zombie_rows[0]["pid"] == "789"
@pytest.mark.asyncio
async def test_check_processes_no_zombies(monitor):
    """An empty zombie list makes the process check OK."""
    no_zombies = {"zombies": []}
    with patch.object(monitor, "_get_zombie_processes", return_value=no_zombies):
        outcome = await monitor._check_processes()

    assert outcome.level == HealthLevel.OK
@pytest.mark.asyncio
async def test_check_processes_zombies_warning(monitor):
    """Two zombies raise a WARNING without requesting human attention."""
    two_zombies = {
        "zombies": [
            {"pid": "100", "command": "defunct"},
            {"pid": "101", "command": "defunct"},
        ]
    }
    with patch.object(monitor, "_get_zombie_processes", return_value=two_zombies):
        outcome = await monitor._check_processes()

    assert outcome.level == HealthLevel.WARNING
    # Only 2, threshold is >3
    assert outcome.needs_human is False
@pytest.mark.asyncio
async def test_check_processes_many_zombies_needs_human(monitor):
    """Five zombies cause the process check to request human attention."""
    many_zombies = {
        "zombies": [dict(pid=str(number), command="defunct") for number in range(5)]
    }
    with patch.object(monitor, "_get_zombie_processes", return_value=many_zombies):
        outcome = await monitor._check_processes()

    assert outcome.needs_human is True
# ── Network check ─────────────────────────────────────────────────────────────
def test_check_gitea_connectivity_ok(monitor):
    """A 200 reply marks Gitea reachable and records a non-negative latency."""
    version_payload = json.dumps({"version": "1.22.0"}).encode()
    version_reply = _FakeHTTPResponse(version_payload, status=200)
    with patch("urllib.request.urlopen", return_value=version_reply):
        gitea = monitor._check_gitea_connectivity()

    assert gitea["reachable"] is True
    assert gitea["latency_ms"] >= 0
def test_check_gitea_connectivity_unreachable(monitor):
    """A refused connection marks Gitea unreachable and includes an error entry."""
    refused = patch("urllib.request.urlopen", side_effect=OSError("refused"))
    with refused:
        gitea = monitor._check_gitea_connectivity()

    assert gitea["reachable"] is False
    assert "error" in gitea
@pytest.mark.asyncio
async def test_check_network_ok(monitor):
    """A reachable Gitea makes the network check OK with a matching message."""
    gitea_up = {"reachable": True, "latency_ms": 5.0, "url": "http://localhost:3000"}
    with patch.object(monitor, "_check_gitea_connectivity", return_value=gitea_up):
        outcome = await monitor._check_network()

    assert outcome.level == HealthLevel.OK
    assert "Gitea reachable" in outcome.message
@pytest.mark.asyncio
async def test_check_network_unreachable(monitor):
    """An unreachable Gitea is a WARNING that requests human attention."""
    gitea_down = {"reachable": False, "error": "refused", "url": "http://localhost:3000"}
    with patch.object(monitor, "_check_gitea_connectivity", return_value=gitea_down):
        outcome = await monitor._check_network()

    assert outcome.level == HealthLevel.WARNING
    assert outcome.needs_human is True
# ── Full cycle ────────────────────────────────────────────────────────────────
@pytest.mark.asyncio
async def test_run_cycle_all_ok(monitor):
    """When every check is OK the report is OK, issue-free, and stored as last_report."""
    shared_ok = CheckResult(name="test", level=HealthLevel.OK, message="ok")

    async def _always_ok():
        return shared_ok

    check_names = (
        "_check_memory",
        "_check_disk",
        "_check_ollama",
        "_check_processes",
        "_check_network",
    )
    mem, disk, ollama, procs, net = (
        patch.object(monitor, attr, _always_ok) for attr in check_names
    )
    # Alert handling is stubbed out for the duration of the cycle.
    with mem, disk, ollama, procs, net, patch.object(monitor, "_handle_alerts"):
        report = await monitor.run_cycle()

    assert report.overall == HealthLevel.OK
    assert not report.has_issues
    assert monitor.last_report is report
@pytest.mark.asyncio
async def test_run_cycle_sets_overall_to_worst(monitor):
    """One CRITICAL check among OK checks makes the overall report CRITICAL."""

    async def _healthy():
        return CheckResult(name="ok", level=HealthLevel.OK, message="ok")

    async def _failing():
        return CheckResult(name="critical", level=HealthLevel.CRITICAL, message="bad")

    # Only the disk check reports a problem; every other check is healthy.
    stubs = {
        "_check_memory": _healthy,
        "_check_disk": _failing,
        "_check_ollama": _healthy,
        "_check_processes": _healthy,
        "_check_network": _healthy,
    }
    mem, disk, ollama, procs, net = (
        patch.object(monitor, attr, stub) for attr, stub in stubs.items()
    )
    with mem, disk, ollama, procs, net, patch.object(monitor, "_handle_alerts"):
        report = await monitor.run_cycle()

    assert report.overall == HealthLevel.CRITICAL
    assert report.has_issues is True
@pytest.mark.asyncio
async def test_run_cycle_exception_becomes_unknown(monitor):
    """A check that raises during the cycle shows up as an UNKNOWN entry in the report."""

    async def _healthy():
        return CheckResult(name="ok", level=HealthLevel.OK, message="ok")

    async def _raises():
        raise RuntimeError("unexpected error")

    # Only the Ollama check blows up; every other check is healthy.
    stubs = {
        "_check_memory": _healthy,
        "_check_disk": _healthy,
        "_check_ollama": _raises,
        "_check_processes": _healthy,
        "_check_network": _healthy,
    }
    mem, disk, ollama, procs, net = (
        patch.object(monitor, attr, stub) for attr, stub in stubs.items()
    )
    with mem, disk, ollama, procs, net, patch.object(monitor, "_handle_alerts"):
        report = await monitor.run_cycle()

    observed_levels = {check.level for check in report.checks}
    assert HealthLevel.UNKNOWN in observed_levels
# ── to_dict serialisation ────────────────────────────────────────────────────
def test_check_result_to_dict():
    """``CheckResult.to_dict`` exposes name, level value, flags, and details."""
    fields = dict(
        name="memory",
        level=HealthLevel.WARNING,
        message="low",
        details={"free_gb": 3.5},
        auto_resolved=True,
    )
    serialised = CheckResult(**fields).to_dict()

    assert serialised["name"] == "memory"
    assert serialised["level"] == "warning"
    assert serialised["auto_resolved"] is True
    assert serialised["details"]["free_gb"] == 3.5
def test_health_report_to_dict():
    """``HealthReport.to_dict`` exposes overall level, issue flag, and the checks."""
    disk_ok = CheckResult(name="disk", level=HealthLevel.OK, message="ok")
    report = HealthReport(
        timestamp="2026-01-01T00:00:00+00:00",
        checks=[disk_ok],
        overall=HealthLevel.OK,
    )
    serialised = report.to_dict()

    assert serialised["overall"] == "ok"
    assert serialised["has_issues"] is False
    assert len(serialised["checks"]) == 1