"""Poka-yoke heartbeat writer for cron jobs. Every scheduled job calls write_cron_heartbeat() on successful completion so the meta-heartbeat checker (bin/check_cron_heartbeats.py) can verify that all jobs are still alive. Absence of a fresh heartbeat = silent failure. Path convention --------------- Primary: /var/run/bezalel/heartbeats/.last Fallback: ~/.bezalel/heartbeats/.last (used when /var/run/bezalel is not writable, e.g. dev machines) Override: BEZALEL_HEARTBEAT_DIR environment variable Heartbeat file format (JSON) ---------------------------- { "job": "nexus_watchdog", "timestamp": 1744000000.0, "interval_seconds": 300, "pid": 12345, "status": "ok" } Usage in a cron job ------------------- from nexus.cron_heartbeat import write_cron_heartbeat def main(): # ... do the work ... write_cron_heartbeat("my_job_name", interval_seconds=300) Zero-dependency shell one-liner (for scripts that can't import Python) ----------------------------------------------------------------------- python -c " from nexus.cron_heartbeat import write_cron_heartbeat write_cron_heartbeat('my_job', interval_seconds=300) " Refs: #1096 """ from __future__ import annotations import json import os import tempfile import time from pathlib import Path PRIMARY_HEARTBEAT_DIR = Path("/var/run/bezalel/heartbeats") FALLBACK_HEARTBEAT_DIR = Path.home() / ".bezalel" / "heartbeats" def _resolve_heartbeat_dir() -> Path: """Return the heartbeat directory, trying primary then fallback. If BEZALEL_HEARTBEAT_DIR is set in the environment that wins outright (useful for tests and non-standard deployments). """ env = os.environ.get("BEZALEL_HEARTBEAT_DIR") if env: return Path(env) # Try to create and write-test the primary path try: PRIMARY_HEARTBEAT_DIR.mkdir(parents=True, exist_ok=True) probe = PRIMARY_HEARTBEAT_DIR / ".write_probe" probe.touch() probe.unlink() return PRIMARY_HEARTBEAT_DIR except (PermissionError, OSError): pass FALLBACK_HEARTBEAT_DIR.mkdir(parents=True, exist_ok=True) return FALLBACK_HEARTBEAT_DIR def heartbeat_path(job: str, directory: Path | None = None) -> Path: """Return the Path where *job*'s heartbeat file lives. Useful for readers (e.g. the Night Watch report) that just need the location without writing anything. """ d = directory if directory is not None else _resolve_heartbeat_dir() return d / f"{job}.last" def write_cron_heartbeat( job: str, interval_seconds: int, status: str = "ok", directory: Path | None = None, ) -> Path: """Write a poka-yoke heartbeat file for a cron job. Call this at the end of your job's main function. The file is written atomically (write-to-temp + rename) so the checker never reads a partial file. Args: job: Unique job name, e.g. ``"nexus_watchdog"``. interval_seconds: Expected run cadence, e.g. ``300`` for every 5 min. status: Completion status: ``"ok"``, ``"warn"``, or ``"error"``. Only ``"ok"`` resets the stale clock. directory: Override the heartbeat directory (mainly for tests). Returns: Path to the written heartbeat file. """ d = directory if directory is not None else _resolve_heartbeat_dir() d.mkdir(parents=True, exist_ok=True) path = d / f"{job}.last" data = { "job": job, "timestamp": time.time(), "interval_seconds": interval_seconds, "pid": os.getpid(), "status": status, } # Atomic write: temp file in same directory + rename. # Guarantees the checker never sees a half-written file. fd, tmp = tempfile.mkstemp(dir=str(d), prefix=f".{job}-", suffix=".tmp") try: with os.fdopen(fd, "w") as f: json.dump(data, f) os.replace(tmp, str(path)) except Exception: # Best-effort — never crash the job over a heartbeat failure try: os.unlink(tmp) except OSError: pass return path