Files
the-nexus/nexus/cron_heartbeat.py
2026-04-07 14:38:55 +00:00

137 lines
4.1 KiB
Python

"""Poka-yoke heartbeat writer for cron jobs.
Every scheduled job calls write_cron_heartbeat() on successful completion so
the meta-heartbeat checker (bin/check_cron_heartbeats.py) can verify that all
jobs are still alive. Absence of a fresh heartbeat = silent failure.
Path convention
---------------
Primary: /var/run/bezalel/heartbeats/<job>.last
Fallback: ~/.bezalel/heartbeats/<job>.last
(used when /var/run/bezalel is not writable, e.g. dev machines)
Override: BEZALEL_HEARTBEAT_DIR environment variable
Heartbeat file format (JSON)
----------------------------
{
"job": "nexus_watchdog",
"timestamp": 1744000000.0,
"interval_seconds": 300,
"pid": 12345,
"status": "ok"
}
Usage in a cron job
-------------------
    from nexus.cron_heartbeat import write_cron_heartbeat

    def main():
        # ... do the work ...
        write_cron_heartbeat("my_job_name", interval_seconds=300)
Zero-dependency shell one-liner (for scripts that can't import Python)
-----------------------------------------------------------------------
python -c "
from nexus.cron_heartbeat import write_cron_heartbeat
write_cron_heartbeat('my_job', interval_seconds=300)
"
Refs: #1096
"""
from __future__ import annotations
import json
import os
import tempfile
import time
from pathlib import Path
# Preferred, system-wide location of the per-job heartbeat files.
PRIMARY_HEARTBEAT_DIR = Path("/var/run/bezalel") / "heartbeats"

# Per-user location under the home directory, used when the primary
# location cannot be created or written to.
FALLBACK_HEARTBEAT_DIR = Path.home().joinpath(".bezalel", "heartbeats")
def _resolve_heartbeat_dir() -> Path:
    """Return the directory heartbeat files should be written to.

    Resolution order:

    1. ``BEZALEL_HEARTBEAT_DIR`` from the environment, if set and non-empty.
       It wins outright and is returned as-is (it is neither created nor
       probed here) -- useful for tests and non-standard deployments.
    2. ``PRIMARY_HEARTBEAT_DIR``, if it can be created and a file can be
       created inside it.
    3. ``FALLBACK_HEARTBEAT_DIR``, which is created if missing.

    Returns:
        The selected heartbeat directory.

    Raises:
        OSError: If the primary directory is unusable and the fallback
            directory cannot be created either.
    """
    override = os.environ.get("BEZALEL_HEARTBEAT_DIR")
    if override:
        return Path(override)

    try:
        PRIMARY_HEARTBEAT_DIR.mkdir(parents=True, exist_ok=True)
        # Write-test with a uniquely named temporary file. A fixed probe
        # name shared by every job lets two concurrent jobs race: one
        # removes the probe, the other's removal then fails, and that job
        # wrongly falls back even though the directory is writable.
        with tempfile.TemporaryFile(dir=str(PRIMARY_HEARTBEAT_DIR)):
            pass
    except OSError:
        # Covers PermissionError and every other filesystem failure:
        # the primary location is unusable, so use the fallback below.
        pass
    else:
        return PRIMARY_HEARTBEAT_DIR

    FALLBACK_HEARTBEAT_DIR.mkdir(parents=True, exist_ok=True)
    return FALLBACK_HEARTBEAT_DIR
def heartbeat_path(job: str, directory: Path | None = None) -> Path:
"""Return the Path where *job*'s heartbeat file lives.
Useful for readers (e.g. the Night Watch report) that just need the
location without writing anything.
"""
d = directory if directory is not None else _resolve_heartbeat_dir()
return d / f"{job}.last"
def write_cron_heartbeat(
job: str,
interval_seconds: int,
status: str = "ok",
directory: Path | None = None,
) -> Path:
"""Write a poka-yoke heartbeat file for a cron job.
Call this at the end of your job's main function. The file is written
atomically (write-to-temp + rename) so the checker never reads a partial
file.
Args:
job: Unique job name, e.g. ``"nexus_watchdog"``.
interval_seconds: Expected run cadence, e.g. ``300`` for every 5 min.
status: Completion status: ``"ok"``, ``"warn"``, or
``"error"``. Only ``"ok"`` resets the stale clock.
directory: Override the heartbeat directory (mainly for tests).
Returns:
Path to the written heartbeat file.
"""
d = directory if directory is not None else _resolve_heartbeat_dir()
d.mkdir(parents=True, exist_ok=True)
path = d / f"{job}.last"
data = {
"job": job,
"timestamp": time.time(),
"interval_seconds": interval_seconds,
"pid": os.getpid(),
"status": status,
}
# Atomic write: temp file in same directory + rename.
# Guarantees the checker never sees a half-written file.
fd, tmp = tempfile.mkstemp(dir=str(d), prefix=f".{job}-", suffix=".tmp")
try:
with os.fdopen(fd, "w") as f:
json.dump(data, f)
os.replace(tmp, str(path))
except Exception:
# Best-effort — never crash the job over a heartbeat failure
try:
os.unlink(tmp)
except OSError:
pass
return path