Timmy-time-dashboard/tests/unit/test_hermes_monitor.py

"""Unit tests for the Hermes health monitor.

Tests all five checks (memory, disk, Ollama, processes, network) using mocks
so no real subprocesses or network calls are made.

Refs: #1073
"""

import json
from unittest.mock import MagicMock, patch

import pytest

from infrastructure.hermes.monitor import CheckResult, HealthLevel, HealthReport, HermesMonitor


@pytest.fixture()
def monitor():
    """Provide a default-constructed ``HermesMonitor`` to the requesting test."""
    hermes_monitor = HermesMonitor()
    return hermes_monitor


# ── Unit helpers ──────────────────────────────────────────────────────────────


class _FakeHTTPResponse:
    """Canned stand-in for a urllib HTTP response.

    Provides a ``status`` attribute, ``read()`` and the context-manager
    protocol, so it can be handed out by a patched ``urlopen``.
    """

    def __init__(self, body: bytes, status: int = 200):
        self.status = status
        self._body = body

    def __enter__(self):
        # ``with ... as resp`` binds the response object itself.
        return self

    def __exit__(self, *exc_info):
        # Nothing to release; returning None lets any exception propagate.
        return None

    def read(self) -> bytes:
        """Return the canned payload; repeated calls return the same bytes."""
        return self._body


# ── Memory check ──────────────────────────────────────────────────────────────


def test_get_memory_info_parses_vm_stat(monitor):
    """``_get_memory_info`` should turn sysctl + vm_stat output into GB figures."""
    fake_vm_stat = (
        "Mach Virtual Memory Statistics: (page size of 16384 bytes)\n"
        "Pages free:                           12800.\n"
        "Pages active:                         50000.\n"
        "Pages inactive:                       25600.\n"
        "Pages speculative:                     1000.\n"
    )
    # subprocess.run is consumed twice, in this order:
    #   1. sysctl hw.memsize (total bytes)
    #   2. vm_stat
    fake_sysctl = MagicMock(stdout="68719476736\n")  # 64 GB
    fake_vmstat = MagicMock(stdout=fake_vm_stat)

    with patch("subprocess.run", side_effect=[fake_sysctl, fake_vmstat]):
        memory = monitor._get_memory_info()

    # Free memory counts free + inactive pages:
    # (12800 + 25600) * 16384 bytes = 629145600 bytes ≈ 0.586 GB
    free_bytes = (12800 + 25600) * 16384
    assert memory["total_gb"] == pytest.approx(64.0, abs=0.1)
    assert memory["free_gb"] == pytest.approx(free_bytes / (1024**3), abs=0.001)


def test_get_memory_info_handles_subprocess_failure(monitor):
    """If ``subprocess.run`` raises ``OSError``, both figures should come back as 0.0."""
    failing_run = patch("subprocess.run", side_effect=OSError("no sysctl"))
    with failing_run:
        memory = monitor._get_memory_info()

    assert memory["total_gb"] == 0.0
    assert memory["free_gb"] == 0.0


@pytest.mark.asyncio
async def test_check_memory_ok(monitor):
    """With 20 GB free, the memory check should be OK and mention the free amount."""
    plenty_free = {"free_gb": 20.0, "total_gb": 64.0}
    with patch.object(monitor, "_get_memory_info", return_value=plenty_free):
        outcome = await monitor._check_memory()

    assert outcome.name == "memory"
    assert outcome.level == HealthLevel.OK
    assert "20.0GB" in outcome.message


@pytest.mark.asyncio
async def test_check_memory_low_triggers_unload(monitor):
    """Low memory plus a successful model unload should yield an auto-resolved WARNING."""
    low_memory = {"free_gb": 2.0, "total_gb": 64.0}
    memory_patch = patch.object(monitor, "_get_memory_info", return_value=low_memory)
    # Pretend two Ollama models were unloaded.
    unload_patch = patch.object(monitor, "_unload_ollama_models", return_value=2)

    with memory_patch, unload_patch:
        outcome = await monitor._check_memory()

    assert outcome.level == HealthLevel.WARNING
    assert outcome.auto_resolved is True
    assert "unloaded 2" in outcome.message


@pytest.mark.asyncio
async def test_check_memory_critical_no_models_to_unload(monitor):
    """Low memory with nothing unloaded should be CRITICAL and flagged for a human."""
    low_memory = {"free_gb": 1.0, "total_gb": 64.0}
    memory_patch = patch.object(monitor, "_get_memory_info", return_value=low_memory)
    # Zero models unloaded: the automatic remedy freed nothing.
    unload_patch = patch.object(monitor, "_unload_ollama_models", return_value=0)

    with memory_patch, unload_patch:
        outcome = await monitor._check_memory()

    assert outcome.level == HealthLevel.CRITICAL
    assert outcome.needs_human is True


@pytest.mark.asyncio
async def test_check_memory_exception_returns_unknown(monitor):
    """An error raised while gathering memory info should surface as UNKNOWN."""
    exploding_info = patch.object(monitor, "_get_memory_info", side_effect=RuntimeError("boom"))
    with exploding_info:
        outcome = await monitor._check_memory()

    assert outcome.level == HealthLevel.UNKNOWN


# ── Disk check ────────────────────────────────────────────────────────────────


@pytest.mark.asyncio
async def test_check_disk_ok(monitor):
    """With 100 GB free of 500 GB, the disk check should be OK and report the free space."""
    gib = 1024**3
    fake_usage = MagicMock(
        free=100 * gib,  # 100 GB
        total=500 * gib,  # 500 GB
        used=400 * gib,
    )

    with patch("shutil.disk_usage", return_value=fake_usage):
        outcome = await monitor._check_disk()

    assert outcome.level == HealthLevel.OK
    assert "100.0GB free" in outcome.message


@pytest.mark.asyncio
async def test_check_disk_low_triggers_cleanup(monitor):
    """Low disk plus a cleanup that frees space should yield an auto-resolved WARNING."""
    gib = 1024**3
    fake_usage = MagicMock(
        free=5 * gib,  # 5 GB — below threshold
        total=500 * gib,
        used=495 * gib,
    )
    usage_patch = patch("shutil.disk_usage", return_value=fake_usage)
    # Cleanup reports 2.5 GB reclaimed.
    cleanup_patch = patch.object(monitor, "_cleanup_temp_files", return_value=2.5)

    with usage_patch, cleanup_patch:
        outcome = await monitor._check_disk()

    assert outcome.level == HealthLevel.WARNING
    assert outcome.auto_resolved is True
    assert "cleaned 2.50GB" in outcome.message


@pytest.mark.asyncio
async def test_check_disk_critical_when_cleanup_fails(monitor):
    """Low disk with a cleanup that frees nothing should be CRITICAL and need a human."""
    gib = 1024**3
    fake_usage = MagicMock(free=5 * gib, total=500 * gib, used=495 * gib)
    usage_patch = patch("shutil.disk_usage", return_value=fake_usage)
    # Cleanup reports 0.0 GB reclaimed.
    cleanup_patch = patch.object(monitor, "_cleanup_temp_files", return_value=0.0)

    with usage_patch, cleanup_patch:
        outcome = await monitor._check_disk()

    assert outcome.level == HealthLevel.CRITICAL
    assert outcome.needs_human is True


# ── Ollama check ──────────────────────────────────────────────────────────────


def test_get_ollama_status_reachable(monitor):
    """Two successful HTTP replies should yield a reachable status with both model lists."""
    # First reply stands in for the tags (available models) listing,
    # second for the ps (loaded models) listing.
    tags_payload = {"models": [{"name": "qwen3:30b"}, {"name": "llama3.1:8b"}]}
    ps_payload = {"models": [{"name": "qwen3:30b", "size": 1000}]}
    canned_replies = [
        _FakeHTTPResponse(json.dumps(payload).encode())
        for payload in (tags_payload, ps_payload)
    ]

    with patch("urllib.request.urlopen", side_effect=canned_replies):
        ollama = monitor._get_ollama_status()

    assert ollama["reachable"] is True
    assert len(ollama["models"]) == 2
    assert len(ollama["loaded_models"]) == 1


def test_get_ollama_status_unreachable(monitor):
    """If ``urlopen`` raises ``OSError``, the status should be unreachable with empty lists."""
    refused = patch("urllib.request.urlopen", side_effect=OSError("connection refused"))
    with refused:
        ollama = monitor._get_ollama_status()

    assert ollama["reachable"] is False
    assert ollama["models"] == []
    assert ollama["loaded_models"] == []


@pytest.mark.asyncio
async def test_check_ollama_ok(monitor):
    """A reachable Ollama should produce an OK result whose details say it is reachable."""
    reachable_status = {
        "reachable": True,
        "models": [{"name": "qwen3:30b"}],
        "loaded_models": [],
    }
    status_patch = patch.object(monitor, "_get_ollama_status", return_value=reachable_status)
    with status_patch:
        outcome = await monitor._check_ollama()

    assert outcome.level == HealthLevel.OK
    assert outcome.details["reachable"] is True


@pytest.mark.asyncio
async def test_check_ollama_unreachable_restart_success(monitor):
    """Unreachable Ollama plus a successful restart should yield an auto-resolved WARNING."""
    down_status = {"reachable": False, "models": [], "loaded_models": []}
    status_patch = patch.object(monitor, "_get_ollama_status", return_value=down_status)
    restart_patch = patch.object(monitor, "_restart_ollama", return_value=True)

    with status_patch, restart_patch:
        outcome = await monitor._check_ollama()

    assert outcome.level == HealthLevel.WARNING
    assert outcome.auto_resolved is True


@pytest.mark.asyncio
async def test_check_ollama_unreachable_restart_fails(monitor):
    """Unreachable Ollama plus a failed restart should be CRITICAL and need a human."""
    down_status = {"reachable": False, "models": [], "loaded_models": []}
    status_patch = patch.object(monitor, "_get_ollama_status", return_value=down_status)
    restart_patch = patch.object(monitor, "_restart_ollama", return_value=False)

    with status_patch, restart_patch:
        outcome = await monitor._check_ollama()

    assert outcome.level == HealthLevel.CRITICAL
    assert outcome.needs_human is True


# ── Process check ─────────────────────────────────────────────────────────────


def test_get_zombie_processes_none(monitor):
    """A ``ps`` listing with no ``Z`` state rows should yield an empty zombie list."""
    fake_ps_listing = (
        "USER  PID  %CPU  %MEM  VSZ   RSS  TT  STAT  STARTED  TIME  COMMAND\n"
        "alex  123   0.1   0.2  100   200  s0  S      1:00   0:01  python\n"
        "alex  456   0.0   0.1   50   100  s0  S      1:01   0:00  bash\n"
    )
    completed = MagicMock(stdout=fake_ps_listing)

    with patch("subprocess.run", return_value=completed):
        processes = monitor._get_zombie_processes()

    assert processes["zombies"] == []


def test_get_zombie_processes_found(monitor):
    """A ``ps`` row in state ``Z`` should be reported as a zombie with its pid as a string."""
    fake_ps_listing = (
        "USER  PID  %CPU  %MEM  VSZ   RSS  TT  STAT  STARTED  TIME  COMMAND\n"
        "alex  123   0.1   0.2  100   200  s0  S      1:00   0:01  python\n"
        "alex  789   0.0   0.0    0     0  s0  Z      1:02   0:00  defunct\n"
    )
    completed = MagicMock(stdout=fake_ps_listing)

    with patch("subprocess.run", return_value=completed):
        processes = monitor._get_zombie_processes()

    assert len(processes["zombies"]) == 1
    assert processes["zombies"][0]["pid"] == "789"


@pytest.mark.asyncio
async def test_check_processes_no_zombies(monitor):
    """An empty zombie list should produce an OK process check."""
    nothing_found = {"zombies": []}
    with patch.object(monitor, "_get_zombie_processes", return_value=nothing_found):
        outcome = await monitor._check_processes()

    assert outcome.level == HealthLevel.OK


@pytest.mark.asyncio
async def test_check_processes_zombies_warning(monitor):
    """Two zombies should produce a WARNING that does not need a human."""
    scan = {
        "zombies": [
            {"pid": "100", "command": "defunct"},
            {"pid": "101", "command": "defunct"},
        ]
    }
    with patch.object(monitor, "_get_zombie_processes", return_value=scan):
        outcome = await monitor._check_processes()

    assert outcome.level == HealthLevel.WARNING
    assert outcome.needs_human is False  # Only 2, threshold is >3


@pytest.mark.asyncio
async def test_check_processes_many_zombies_needs_human(monitor):
    """Five zombies should flag the process check for human attention."""
    # Five defunct entries with pids "0" through "4".
    scan = {"zombies": [{"pid": str(pid), "command": "defunct"} for pid in range(5)]}
    with patch.object(monitor, "_get_zombie_processes", return_value=scan):
        outcome = await monitor._check_processes()

    assert outcome.needs_human is True


# ── Network check ─────────────────────────────────────────────────────────────


def test_check_gitea_connectivity_ok(monitor):
    """A 200 reply from ``urlopen`` should count as reachable with a non-negative latency."""
    version_reply = _FakeHTTPResponse(json.dumps({"version": "1.22.0"}).encode(), status=200)

    with patch("urllib.request.urlopen", return_value=version_reply):
        connectivity = monitor._check_gitea_connectivity()

    assert connectivity["reachable"] is True
    assert connectivity["latency_ms"] >= 0


def test_check_gitea_connectivity_unreachable(monitor):
    """If ``urlopen`` raises ``OSError``, the result should be unreachable and carry an error."""
    refused = patch("urllib.request.urlopen", side_effect=OSError("refused"))
    with refused:
        connectivity = monitor._check_gitea_connectivity()

    assert connectivity["reachable"] is False
    assert "error" in connectivity


@pytest.mark.asyncio
async def test_check_network_ok(monitor):
    """A reachable Gitea should produce an OK network check that says so in its message."""
    gitea_up = {"reachable": True, "latency_ms": 5.0, "url": "http://localhost:3000"}
    connectivity_patch = patch.object(monitor, "_check_gitea_connectivity", return_value=gitea_up)

    with connectivity_patch:
        outcome = await monitor._check_network()

    assert outcome.level == HealthLevel.OK
    assert "Gitea reachable" in outcome.message


@pytest.mark.asyncio
async def test_check_network_unreachable(monitor):
    """An unreachable Gitea should produce a WARNING that needs a human."""
    gitea_down = {"reachable": False, "error": "refused", "url": "http://localhost:3000"}
    connectivity_patch = patch.object(
        monitor, "_check_gitea_connectivity", return_value=gitea_down
    )

    with connectivity_patch:
        outcome = await monitor._check_network()

    assert outcome.level == HealthLevel.WARNING
    assert outcome.needs_human is True


# ── Full cycle ────────────────────────────────────────────────────────────────


@pytest.mark.asyncio
async def test_run_cycle_all_ok(monitor):
    """When every check is OK, the cycle report should be OK and stored as ``last_report``."""
    # One shared result object is returned by all five stubbed checks.
    healthy = CheckResult(name="test", level=HealthLevel.OK, message="ok")

    async def _always_healthy():
        return healthy

    with (
        patch.object(monitor, "_check_memory", _always_healthy),
        patch.object(monitor, "_check_disk", _always_healthy),
        patch.object(monitor, "_check_ollama", _always_healthy),
        patch.object(monitor, "_check_processes", _always_healthy),
        patch.object(monitor, "_check_network", _always_healthy),
        # Alert handling is mocked out; only the report is under test.
        patch.object(monitor, "_handle_alerts"),
    ):
        cycle_report = await monitor.run_cycle()

    assert cycle_report.overall == HealthLevel.OK
    assert not cycle_report.has_issues
    assert monitor.last_report is cycle_report


@pytest.mark.asyncio
async def test_run_cycle_sets_overall_to_worst(monitor):
    """A single CRITICAL check among OK ones should make the overall level CRITICAL."""

    async def _healthy():
        return CheckResult(name="ok", level=HealthLevel.OK, message="ok")

    async def _failing():
        return CheckResult(name="critical", level=HealthLevel.CRITICAL, message="bad")

    with (
        patch.object(monitor, "_check_memory", _healthy),
        # Disk is the only check stubbed to report CRITICAL.
        patch.object(monitor, "_check_disk", _failing),
        patch.object(monitor, "_check_ollama", _healthy),
        patch.object(monitor, "_check_processes", _healthy),
        patch.object(monitor, "_check_network", _healthy),
        patch.object(monitor, "_handle_alerts"),
    ):
        cycle_report = await monitor.run_cycle()

    assert cycle_report.overall == HealthLevel.CRITICAL
    assert cycle_report.has_issues is True


@pytest.mark.asyncio
async def test_run_cycle_exception_becomes_unknown(monitor):
    """A check that raises should appear in the report as an UNKNOWN-level entry."""

    async def _healthy():
        return CheckResult(name="ok", level=HealthLevel.OK, message="ok")

    async def _raises():
        raise RuntimeError("unexpected error")

    with (
        patch.object(monitor, "_check_memory", _healthy),
        patch.object(monitor, "_check_disk", _healthy),
        # The Ollama check is the one stubbed to raise.
        patch.object(monitor, "_check_ollama", _raises),
        patch.object(monitor, "_check_processes", _healthy),
        patch.object(monitor, "_check_network", _healthy),
        patch.object(monitor, "_handle_alerts"),
    ):
        cycle_report = await monitor.run_cycle()

    seen_levels = {check.level for check in cycle_report.checks}
    assert HealthLevel.UNKNOWN in seen_levels


# ── to_dict serialisation ────────────────────────────────────────────────────


def test_check_result_to_dict():
    """``CheckResult.to_dict`` should expose name, level string, flag and details."""
    check = CheckResult(
        name="memory",
        level=HealthLevel.WARNING,
        message="low",
        details={"free_gb": 3.5},
        auto_resolved=True,
    )

    payload = check.to_dict()

    assert payload["name"] == "memory"
    # The level is expected as the string "warning".
    assert payload["level"] == "warning"
    assert payload["auto_resolved"] is True
    assert payload["details"]["free_gb"] == 3.5


def test_health_report_to_dict():
    """``HealthReport.to_dict`` should include overall level, ``has_issues`` and the checks."""
    only_check = CheckResult(name="disk", level=HealthLevel.OK, message="ok")
    health_report = HealthReport(
        timestamp="2026-01-01T00:00:00+00:00",
        checks=[only_check],
        overall=HealthLevel.OK,
    )

    payload = health_report.to_dict()

    assert payload["overall"] == "ok"
    assert payload["has_issues"] is False
    assert len(payload["checks"]) == 1