# Source: the-nexus/tests/test_cron_heartbeats.py (Python, 342 lines, 12 KiB)

"""Tests for the poka-yoke cron heartbeat system.
Covers:
- nexus/cron_heartbeat.py (write utility)
- bin/check_cron_heartbeats.py (meta-heartbeat checker)
Refs: #1096
"""
from __future__ import annotations
import importlib.util
import json
import os
import sys
import time
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
# ── Load modules under test ───────────────────────────────────────────
# Directory two levels above this file; the "nexus" and "bin" folders
# referenced below are resolved relative to it.
PROJECT_ROOT = Path(__file__).parent.parent

# The modules under test are loaded straight from their file paths under
# private aliases.  Each module object is placed in sys.modules before its
# code is executed.

# -- nexus/cron_heartbeat.py (heartbeat writer) --
_hb_spec = importlib.util.spec_from_file_location(
    "_cron_heartbeat",
    PROJECT_ROOT.joinpath("nexus", "cron_heartbeat.py"),
)
_hb = importlib.util.module_from_spec(_hb_spec)
sys.modules["_cron_heartbeat"] = _hb
_hb_spec.loader.exec_module(_hb)

# -- bin/check_cron_heartbeats.py (heartbeat checker) --
_chk_spec = importlib.util.spec_from_file_location(
    "_check_cron_heartbeats",
    PROJECT_ROOT.joinpath("bin", "check_cron_heartbeats.py"),
)
_chk = importlib.util.module_from_spec(_chk_spec)
sys.modules["_check_cron_heartbeats"] = _chk
_chk_spec.loader.exec_module(_chk)

# Short aliases for the writer helpers used by the tests.
write_cron_heartbeat, heartbeat_path = _hb.write_cron_heartbeat, _hb.heartbeat_path

# Short aliases for the checker API, including two private helpers that the
# tests exercise directly.
scan_heartbeats, build_report = _chk.scan_heartbeats, _chk.build_report
HeartbeatReport, JobStatus = _chk.HeartbeatReport, _chk.JobStatus
_read_job_status, _fmt_duration = _chk._read_job_status, _chk._fmt_duration
# ── nexus/cron_heartbeat.py ───────────────────────────────────────────
class TestWriteCronHeartbeat:
    """Tests for ``write_cron_heartbeat`` and ``heartbeat_path``."""

    def test_creates_file(self, tmp_path):
        """A write returns the path ``<directory>/<job>.last`` and the file exists."""
        written = write_cron_heartbeat("my_job", interval_seconds=300, directory=tmp_path)
        expected = tmp_path / "my_job.last"
        assert written == expected
        assert written.exists()

    def test_file_content(self, tmp_path):
        """The written JSON carries job, interval, status, pid and a recent timestamp."""
        write_cron_heartbeat("my_job", interval_seconds=600, status="ok", directory=tmp_path)
        payload = json.loads((tmp_path / "my_job.last").read_text())
        expected_fields = {
            "job": "my_job",
            "interval_seconds": 600,
            "status": "ok",
            "pid": os.getpid(),
        }
        for key, value in expected_fields.items():
            assert payload[key] == value
        # The timestamp must be within two seconds of "now".
        assert abs(payload["timestamp"] - time.time()) < 2

    def test_atomic_write_no_temp_files(self, tmp_path):
        """After a successful write the directory holds only the heartbeat file."""
        write_cron_heartbeat("my_job", interval_seconds=300, directory=tmp_path)
        entry_names = [entry.name for entry in tmp_path.iterdir()]
        assert entry_names == ["my_job.last"]

    def test_overwrites_cleanly(self, tmp_path):
        """A second write for the same job replaces the earlier content."""
        for status in ("ok", "warn"):
            write_cron_heartbeat("j", interval_seconds=60, status=status, directory=tmp_path)
        payload = json.loads((tmp_path / "j.last").read_text())
        assert payload["status"] == "warn"

    def test_creates_parent_dirs(self, tmp_path):
        """Writing into a not-yet-existing nested directory succeeds."""
        nested = tmp_path.joinpath("a", "b", "c")
        write_cron_heartbeat("j", interval_seconds=60, directory=nested)
        assert nested.joinpath("j.last").exists()

    def test_heartbeat_path_helper(self, tmp_path):
        """``heartbeat_path`` computes the file location without creating it."""
        target = heartbeat_path("myjob", directory=tmp_path)
        assert target == tmp_path / "myjob.last"
        assert not target.exists()

    def test_env_var_override(self, tmp_path, monkeypatch):
        """With no ``directory=``, BEZALEL_HEARTBEAT_DIR selects the output directory."""
        monkeypatch.setenv("BEZALEL_HEARTBEAT_DIR", str(tmp_path))
        # Deliberately omit directory= so the environment variable is consulted.
        written = write_cron_heartbeat("env_job", interval_seconds=120)
        assert written.parent == tmp_path
# ── bin/check_cron_heartbeats.py ─────────────────────────────────────
class TestScanHeartbeats:
    """Tests for ``scan_heartbeats`` and the ``_read_job_status`` helper."""

    @staticmethod
    def _write_beat(directory, job, *, age=0, interval=300, pid=1, status="ok"):
        """Write ``<job>.last`` in *directory*, stamped *age* seconds in the past."""
        (directory / f"{job}.last").write_text(json.dumps({
            "job": job,
            "timestamp": time.time() - age,
            "interval_seconds": interval,
            "pid": pid,
            "status": status,
        }))

    def test_empty_dir(self, tmp_path):
        """No .last files → empty list."""
        assert scan_heartbeats(tmp_path) == []

    def test_nonexistent_dir(self, tmp_path):
        """Missing directory → empty list (no exception)."""
        assert scan_heartbeats(tmp_path / "nope") == []

    def test_healthy_job(self, tmp_path):
        """Fresh heartbeat → healthy."""
        self._write_beat(tmp_path, "myjob")
        jobs = scan_heartbeats(tmp_path)
        assert len(jobs) == 1
        assert jobs[0].healthy is True
        assert jobs[0].job == "myjob"

    def test_stale_job(self, tmp_path):
        """Heartbeat older than 2× interval → stale."""
        # 700 s old against a 300 s interval gives a ratio of about 2.33.
        self._write_beat(tmp_path, "slow", age=700, interval=300)
        jobs = scan_heartbeats(tmp_path)
        # Check the count first so an empty scan fails as an assertion
        # rather than as an IndexError on jobs[0].
        assert len(jobs) == 1
        assert jobs[0].healthy is False
        assert jobs[0].staleness_ratio > 2.0

    def test_missing_heartbeat_file(self, tmp_path):
        """_read_job_status handles a file that disappears mid-scan."""
        ghost_path = tmp_path / "ghost.last"
        status = _read_job_status("ghost", ghost_path)
        assert status.healthy is False
        assert "missing" in status.raw_status

    def test_corrupt_heartbeat(self, tmp_path):
        """Corrupt JSON → unhealthy with 'corrupt' status."""
        bad_path = tmp_path / "bad.last"
        bad_path.write_text("{not valid json")
        status = _read_job_status("bad", bad_path)
        assert status.healthy is False
        assert "corrupt" in status.raw_status

    def test_multiple_jobs(self, tmp_path):
        """Multiple .last files are all reported."""
        for i, name in enumerate(["alpha", "beta", "gamma"]):
            self._write_beat(tmp_path, name, age=i * 10, pid=i + 1)
        jobs = scan_heartbeats(tmp_path)
        assert len(jobs) == 3
        assert {j.job for j in jobs} == {"alpha", "beta", "gamma"}

    def test_non_last_files_ignored(self, tmp_path):
        """.json and other extensions are ignored."""
        (tmp_path / "other.json").write_text("{}")
        (tmp_path / "notes.txt").write_text("hello")
        assert scan_heartbeats(tmp_path) == []
class TestHeartbeatReport:
    """Tests for ``HeartbeatReport`` aggregation and its rendered outputs."""

    def _fresh_job(self, name="j"):
        """Return a healthy ``JobStatus`` whose last beat was 30 s ago."""
        return JobStatus(
            job=name, path=Path(f"/tmp/{name}.last"),
            healthy=True, age_seconds=30, interval_seconds=300,
            staleness_ratio=0.1, last_timestamp=time.time() - 30,
            pid=1, raw_status="ok",
            message="Last beat 30s ago (ratio 0.1x)",
        )

    def _stale_job(self, name="s"):
        """Return an unhealthy ``JobStatus`` whose last beat was 700 s ago."""
        return JobStatus(
            job=name, path=Path(f"/tmp/{name}.last"),
            healthy=False, age_seconds=700, interval_seconds=300,
            staleness_ratio=2.33, last_timestamp=time.time() - 700,
            pid=1, raw_status="stale",
            message="Silent for 11m 40s (2.3x interval of 5m 0s)",
        )

    def _report(self, *jobs):
        """Build a ``HeartbeatReport`` stamped "now" that holds *jobs*."""
        return HeartbeatReport(
            timestamp=time.time(),
            heartbeat_dir=Path("/tmp"),
            jobs=list(jobs),
        )

    def test_overall_healthy(self):
        """A report with only healthy jobs is healthy overall."""
        report = self._report(self._fresh_job())
        assert report.overall_healthy is True

    def test_overall_unhealthy(self):
        """One stale job makes the report unhealthy and appears in ``stale_jobs``."""
        report = self._report(self._fresh_job(), self._stale_job())
        assert report.overall_healthy is False
        assert len(report.stale_jobs) == 1

    def test_panel_markdown_contains_table(self):
        """The panel has a heading, one row per job, both states and an ALERT verdict."""
        report = self._report(self._fresh_job("alpha"), self._stale_job("beta"))
        panel = report.to_panel_markdown()
        assert "## Heartbeat Panel" in panel
        assert "| `alpha` |" in panel
        assert "| `beta` |" in panel
        assert "STALE" in panel
        assert "OK" in panel
        assert "**Overall:** ALERT" in panel

    def test_panel_markdown_no_jobs(self):
        """With no jobs the panel says that no heartbeat files were found."""
        panel = self._report().to_panel_markdown()
        assert "no heartbeat files found" in panel

    def test_panel_overall_ok(self):
        """With only healthy jobs the panel verdict is OK."""
        panel = self._report(self._fresh_job()).to_panel_markdown()
        assert "**Overall:** OK" in panel

    def test_alert_body_lists_stale_jobs(self):
        """The alert names the stale job, describes it as stale/silent, mentions crontab."""
        body = self._report(self._stale_job("slow")).to_alert_body()
        lowered = body.lower()
        assert "slow" in body
        # A single case-insensitive check replaces the former redundant
        # upper()/lower() pair.
        assert "stale" in lowered or "silent" in lowered
        assert "crontab" in lowered

    def test_to_json(self):
        """``to_json`` exposes the overall health flag and one entry per job."""
        data = self._report(self._fresh_job()).to_json()
        assert data["healthy"] is True
        assert len(data["jobs"]) == 1
        assert data["jobs"][0]["job"] == "j"
class TestFmtDuration:
    """Spot checks of the text produced by ``_fmt_duration``."""

    def test_seconds(self):
        """45 seconds is rendered as ``45s``."""
        rendered = _fmt_duration(45)
        assert rendered == "45s"

    def test_minutes(self):
        """90 seconds is rendered as ``1m 30s``."""
        rendered = _fmt_duration(90)
        assert rendered == "1m 30s"

    def test_hours(self):
        """3661 seconds is rendered as ``1h 1m``."""
        rendered = _fmt_duration(3661)
        assert rendered == "1h 1m"
class TestBuildReport:
    """Tests for ``build_report`` when given an explicit directory."""

    def test_build_report_with_dir(self, tmp_path):
        """A single fresh heartbeat in the directory yields one healthy job."""
        beat = {
            "job": "myjob",
            "timestamp": time.time(),
            "interval_seconds": 300,
            "pid": 1,
            "status": "ok",
        }
        beat_file = tmp_path / "myjob.last"
        beat_file.write_text(json.dumps(beat))

        result = build_report(directory=tmp_path)

        assert len(result.jobs) == 1
        assert result.overall_healthy is True

    def test_build_report_empty_dir(self, tmp_path):
        """An empty directory yields a report with no jobs that counts as healthy."""
        result = build_report(directory=tmp_path)
        assert result.jobs == []
        assert result.overall_healthy is True  # nothing stale = healthy
# ── Integration: nexus_watchdog writes its own heartbeat ─────────────
class TestWatchdogHeartbeatIntegration:
    """Verify nexus_watchdog.py writes a cron heartbeat after run_once()."""

    def test_watchdog_writes_cron_heartbeat(self, tmp_path, monkeypatch):
        """After run_once, nexus_watchdog writes nexus_watchdog.last."""
        monkeypatch.setenv("BEZALEL_HEARTBEAT_DIR", str(tmp_path))

        # Load the watchdog script from its file path under a private name.
        module_name = "_watchdog_hb_test"
        spec = importlib.util.spec_from_file_location(
            module_name,
            PROJECT_ROOT / "bin" / "nexus_watchdog.py",
        )
        assert spec is not None and spec.loader is not None, \
            "could not build an import spec for bin/nexus_watchdog.py"
        wd = importlib.util.module_from_spec(spec)
        # Register via monkeypatch so the sys.modules entry is removed at
        # teardown instead of leaking into later tests.
        monkeypatch.setitem(sys.modules, module_name, wd)

        # Patch out network calls for both module execution and run_once().
        with patch("socket.socket") as mock_sock, \
                patch("subprocess.run") as mock_run:
            mock_sock.return_value.connect_ex.return_value = 111  # port closed
            mock_run.return_value = MagicMock(returncode=1, stdout="")

            spec.loader.exec_module(wd)

            # Stand-in for the parsed command-line arguments.
            args = MagicMock()
            args.ws_host = "localhost"
            args.ws_port = 8765
            args.heartbeat_path = str(tmp_path / "nexus_heartbeat.json")
            args.stale_threshold = 300
            args.dry_run = True  # don't touch Gitea

            wd.run_once(args)

        hb_file = tmp_path / "nexus_watchdog.last"
        assert hb_file.exists(), "nexus_watchdog.last was not written"
        data = json.loads(hb_file.read_text())
        assert data["job"] == "nexus_watchdog"
        assert data["interval_seconds"] == 300