"""Tests for the poka-yoke cron heartbeat system. Covers: - nexus/cron_heartbeat.py (write utility) - bin/check_cron_heartbeats.py (meta-heartbeat checker) Refs: #1096 """ from __future__ import annotations import importlib.util import json import os import sys import time from pathlib import Path from unittest.mock import MagicMock, patch import pytest # ── Load modules under test ─────────────────────────────────────────── PROJECT_ROOT = Path(__file__).parent.parent _hb_spec = importlib.util.spec_from_file_location( "_cron_heartbeat", PROJECT_ROOT / "nexus" / "cron_heartbeat.py", ) _hb = importlib.util.module_from_spec(_hb_spec) sys.modules["_cron_heartbeat"] = _hb _hb_spec.loader.exec_module(_hb) _chk_spec = importlib.util.spec_from_file_location( "_check_cron_heartbeats", PROJECT_ROOT / "bin" / "check_cron_heartbeats.py", ) _chk = importlib.util.module_from_spec(_chk_spec) sys.modules["_check_cron_heartbeats"] = _chk _chk_spec.loader.exec_module(_chk) write_cron_heartbeat = _hb.write_cron_heartbeat heartbeat_path = _hb.heartbeat_path scan_heartbeats = _chk.scan_heartbeats build_report = _chk.build_report HeartbeatReport = _chk.HeartbeatReport JobStatus = _chk.JobStatus _read_job_status = _chk._read_job_status _fmt_duration = _chk._fmt_duration # ── nexus/cron_heartbeat.py ─────────────────────────────────────────── class TestWriteCronHeartbeat: def test_creates_file(self, tmp_path): """write_cron_heartbeat creates .last in the given directory.""" path = write_cron_heartbeat("my_job", interval_seconds=300, directory=tmp_path) assert path == tmp_path / "my_job.last" assert path.exists() def test_file_content(self, tmp_path): """Written file has all required fields.""" write_cron_heartbeat("my_job", interval_seconds=600, status="ok", directory=tmp_path) data = json.loads((tmp_path / "my_job.last").read_text()) assert data["job"] == "my_job" assert data["interval_seconds"] == 600 assert data["status"] == "ok" assert data["pid"] == os.getpid() assert abs(data["timestamp"] - time.time()) < 2 def test_atomic_write_no_temp_files(self, tmp_path): """No temporary files remain after a successful write.""" write_cron_heartbeat("my_job", interval_seconds=300, directory=tmp_path) files = list(tmp_path.iterdir()) assert len(files) == 1 assert files[0].name == "my_job.last" def test_overwrites_cleanly(self, tmp_path): """Successive writes update, not append.""" write_cron_heartbeat("j", interval_seconds=60, status="ok", directory=tmp_path) write_cron_heartbeat("j", interval_seconds=60, status="warn", directory=tmp_path) data = json.loads((tmp_path / "j.last").read_text()) assert data["status"] == "warn" def test_creates_parent_dirs(self, tmp_path): """Parent directories are created as needed.""" deep_dir = tmp_path / "a" / "b" / "c" write_cron_heartbeat("j", interval_seconds=60, directory=deep_dir) assert (deep_dir / "j.last").exists() def test_heartbeat_path_helper(self, tmp_path): """heartbeat_path() returns the correct path without writing.""" p = heartbeat_path("myjob", directory=tmp_path) assert p == tmp_path / "myjob.last" assert not p.exists() def test_env_var_override(self, tmp_path, monkeypatch): """BEZALEL_HEARTBEAT_DIR env var overrides the directory.""" monkeypatch.setenv("BEZALEL_HEARTBEAT_DIR", str(tmp_path)) # Call without directory= so it uses the env var path = write_cron_heartbeat("env_job", interval_seconds=120) assert path.parent == tmp_path # ── bin/check_cron_heartbeats.py ───────────────────────────────────── class TestScanHeartbeats: def test_empty_dir(self, tmp_path): """No .last files → empty list.""" assert scan_heartbeats(tmp_path) == [] def test_nonexistent_dir(self, tmp_path): """Missing directory → empty list (no exception).""" assert scan_heartbeats(tmp_path / "nope") == [] def test_healthy_job(self, tmp_path): """Fresh heartbeat → healthy.""" (tmp_path / "myjob.last").write_text(json.dumps({ "job": "myjob", "timestamp": time.time(), "interval_seconds": 300, "pid": 1, "status": "ok", })) jobs = scan_heartbeats(tmp_path) assert len(jobs) == 1 assert jobs[0].healthy is True assert jobs[0].job == "myjob" def test_stale_job(self, tmp_path): """Heartbeat older than 2× interval → stale.""" (tmp_path / "slow.last").write_text(json.dumps({ "job": "slow", "timestamp": time.time() - 700, # 11.7 minutes "interval_seconds": 300, # 5 min interval → ratio 2.33 "pid": 1, "status": "ok", })) jobs = scan_heartbeats(tmp_path) assert jobs[0].healthy is False assert jobs[0].staleness_ratio > 2.0 def test_missing_heartbeat_file(self, tmp_path): """_read_job_status handles a file that disappears mid-scan.""" ghost_path = tmp_path / "ghost.last" status = _read_job_status("ghost", ghost_path) assert status.healthy is False assert "missing" in status.raw_status def test_corrupt_heartbeat(self, tmp_path): """Corrupt JSON → unhealthy with 'corrupt' status.""" p = tmp_path / "bad.last" p.write_text("{not valid json") status = _read_job_status("bad", p) assert status.healthy is False assert "corrupt" in status.raw_status def test_multiple_jobs(self, tmp_path): """Multiple .last files are all reported.""" for i, name in enumerate(["alpha", "beta", "gamma"]): (tmp_path / f"{name}.last").write_text(json.dumps({ "job": name, "timestamp": time.time() - i * 10, "interval_seconds": 300, "pid": i + 1, "status": "ok", })) jobs = scan_heartbeats(tmp_path) assert len(jobs) == 3 job_names = {j.job for j in jobs} assert job_names == {"alpha", "beta", "gamma"} def test_non_last_files_ignored(self, tmp_path): """.json and other extensions are ignored.""" (tmp_path / "other.json").write_text("{}") (tmp_path / "notes.txt").write_text("hello") assert scan_heartbeats(tmp_path) == [] class TestHeartbeatReport: def _fresh_job(self, name="j"): return JobStatus( job=name, path=Path(f"/tmp/{name}.last"), healthy=True, age_seconds=30, interval_seconds=300, staleness_ratio=0.1, last_timestamp=time.time() - 30, pid=1, raw_status="ok", message="Last beat 30s ago (ratio 0.1x)", ) def _stale_job(self, name="s"): return JobStatus( job=name, path=Path(f"/tmp/{name}.last"), healthy=False, age_seconds=700, interval_seconds=300, staleness_ratio=2.33, last_timestamp=time.time() - 700, pid=1, raw_status="stale", message="Silent for 11m 40s (2.3x interval of 5m 0s)", ) def test_overall_healthy(self): report = HeartbeatReport( timestamp=time.time(), heartbeat_dir=Path("/tmp"), jobs=[self._fresh_job()], ) assert report.overall_healthy is True def test_overall_unhealthy(self): report = HeartbeatReport( timestamp=time.time(), heartbeat_dir=Path("/tmp"), jobs=[self._fresh_job(), self._stale_job()], ) assert report.overall_healthy is False assert len(report.stale_jobs) == 1 def test_panel_markdown_contains_table(self): report = HeartbeatReport( timestamp=time.time(), heartbeat_dir=Path("/tmp"), jobs=[self._fresh_job("alpha"), self._stale_job("beta")], ) panel = report.to_panel_markdown() assert "## Heartbeat Panel" in panel assert "| `alpha` |" in panel assert "| `beta` |" in panel assert "STALE" in panel assert "OK" in panel assert "**Overall:** ALERT" in panel def test_panel_markdown_no_jobs(self): report = HeartbeatReport( timestamp=time.time(), heartbeat_dir=Path("/tmp"), jobs=[], ) panel = report.to_panel_markdown() assert "no heartbeat files found" in panel def test_panel_overall_ok(self): report = HeartbeatReport( timestamp=time.time(), heartbeat_dir=Path("/tmp"), jobs=[self._fresh_job()], ) panel = report.to_panel_markdown() assert "**Overall:** OK" in panel def test_alert_body_lists_stale_jobs(self): report = HeartbeatReport( timestamp=time.time(), heartbeat_dir=Path("/tmp"), jobs=[self._stale_job("slow")], ) body = report.to_alert_body() assert "slow" in body assert "STALE" in body.upper() or "stale" in body.lower() or "silent" in body.lower() assert "crontab" in body.lower() def test_to_json(self): report = HeartbeatReport( timestamp=time.time(), heartbeat_dir=Path("/tmp"), jobs=[self._fresh_job()], ) data = report.to_json() assert data["healthy"] is True assert len(data["jobs"]) == 1 assert data["jobs"][0]["job"] == "j" class TestFmtDuration: def test_seconds(self): assert _fmt_duration(45) == "45s" def test_minutes(self): assert _fmt_duration(90) == "1m 30s" def test_hours(self): assert _fmt_duration(3661) == "1h 1m" class TestBuildReport: def test_build_report_with_dir(self, tmp_path): """build_report() uses the given directory.""" (tmp_path / "myjob.last").write_text(json.dumps({ "job": "myjob", "timestamp": time.time(), "interval_seconds": 300, "pid": 1, "status": "ok", })) report = build_report(directory=tmp_path) assert len(report.jobs) == 1 assert report.overall_healthy is True def test_build_report_empty_dir(self, tmp_path): report = build_report(directory=tmp_path) assert report.jobs == [] assert report.overall_healthy is True # nothing stale = healthy # ── Integration: nexus_watchdog writes its own heartbeat ───────────── class TestWatchdogHeartbeatIntegration: """Verify nexus_watchdog.py writes a cron heartbeat after run_once().""" def test_watchdog_writes_cron_heartbeat(self, tmp_path, monkeypatch): """After run_once, nexus_watchdog writes nexus_watchdog.last.""" monkeypatch.setenv("BEZALEL_HEARTBEAT_DIR", str(tmp_path)) # Load watchdog module spec = importlib.util.spec_from_file_location( "_watchdog_hb_test", PROJECT_ROOT / "bin" / "nexus_watchdog.py", ) wd = importlib.util.module_from_spec(spec) sys.modules["_watchdog_hb_test"] = wd # Patch out network calls with patch("socket.socket") as mock_sock, \ patch("subprocess.run") as mock_run: mock_sock.return_value.connect_ex.return_value = 111 # port closed mock_run.return_value = MagicMock(returncode=1, stdout="") spec.loader.exec_module(wd) args = MagicMock() args.ws_host = "localhost" args.ws_port = 8765 args.heartbeat_path = str(tmp_path / "nexus_heartbeat.json") args.stale_threshold = 300 args.dry_run = True # don't touch Gitea wd.run_once(args) hb_file = tmp_path / "nexus_watchdog.last" assert hb_file.exists(), "nexus_watchdog.last was not written" data = json.loads(hb_file.read_text()) assert data["job"] == "nexus_watchdog" assert data["interval_seconds"] == 300