137 lines
4.1 KiB
Python
137 lines
4.1 KiB
Python
"""Poka-yoke heartbeat writer for cron jobs.
|
|
|
|
Every scheduled job calls write_cron_heartbeat() on successful completion so
|
|
the meta-heartbeat checker (bin/check_cron_heartbeats.py) can verify that all
|
|
jobs are still alive. Absence of a fresh heartbeat = silent failure.
|
|
|
|
Path convention
|
|
---------------
|
|
Primary: /var/run/bezalel/heartbeats/<job>.last
|
|
Fallback: ~/.bezalel/heartbeats/<job>.last
|
|
(used when /var/run/bezalel is not writable, e.g. dev machines)
|
|
Override: BEZALEL_HEARTBEAT_DIR environment variable
|
|
|
|
Heartbeat file format (JSON)
|
|
----------------------------
|
|
{
|
|
"job": "nexus_watchdog",
|
|
"timestamp": 1744000000.0,
|
|
"interval_seconds": 300,
|
|
"pid": 12345,
|
|
"status": "ok"
|
|
}
|
|
|
|
Usage in a cron job
|
|
-------------------
|
|
from nexus.cron_heartbeat import write_cron_heartbeat
|
|
|
|
def main():
|
|
# ... do the work ...
|
|
write_cron_heartbeat("my_job_name", interval_seconds=300)
|
|
|
|
Zero-dependency shell one-liner (for scripts that can't import Python)
|
|
-----------------------------------------------------------------------
|
|
python -c "
|
|
from nexus.cron_heartbeat import write_cron_heartbeat
|
|
write_cron_heartbeat('my_job', interval_seconds=300)
|
|
"
|
|
|
|
Refs: #1096
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
import tempfile
|
|
import time
|
|
from pathlib import Path
|
|
|
|
# System-wide heartbeat location, tried first when no override is set.
PRIMARY_HEARTBEAT_DIR = Path("/var/run/bezalel/heartbeats")

# Per-user location used when the primary directory cannot be written to.
FALLBACK_HEARTBEAT_DIR = Path.home().joinpath(".bezalel", "heartbeats")
|
|
|
|
|
|
def _resolve_heartbeat_dir() -> Path:
    """Return the heartbeat directory, trying primary then fallback.

    Resolution order:

    1. ``BEZALEL_HEARTBEAT_DIR`` from the environment, if set and non-empty.
       It wins outright and is returned without being created or probed
       (useful for tests and non-standard deployments).
    2. ``PRIMARY_HEARTBEAT_DIR``, if it can be created and a file can be
       written inside it.
    3. ``FALLBACK_HEARTBEAT_DIR``, which is created if missing.

    Returns:
        The directory heartbeat files should live in.

    Raises:
        OSError: If the primary directory is unusable and the fallback
            directory cannot be created either.
    """
    env = os.environ.get("BEZALEL_HEARTBEAT_DIR")
    if env:
        return Path(env)

    # Write-test the primary path with a uniquely named, self-deleting temp
    # file. A fixed probe name would let two jobs probing at the same time
    # delete each other's probe (or trip over a stale one), which would
    # wrongly push a job onto the fallback directory.
    try:
        PRIMARY_HEARTBEAT_DIR.mkdir(parents=True, exist_ok=True)
        with tempfile.TemporaryFile(dir=str(PRIMARY_HEARTBEAT_DIR)):
            pass
        return PRIMARY_HEARTBEAT_DIR
    except OSError:
        # PermissionError is an OSError subclass; any failure here simply
        # means the primary location is not usable on this machine.
        pass

    FALLBACK_HEARTBEAT_DIR.mkdir(parents=True, exist_ok=True)
    return FALLBACK_HEARTBEAT_DIR
|
|
|
|
|
|
def heartbeat_path(job: str, directory: Path | None = None) -> Path:
|
|
"""Return the Path where *job*'s heartbeat file lives.
|
|
|
|
Useful for readers (e.g. the Night Watch report) that just need the
|
|
location without writing anything.
|
|
"""
|
|
d = directory if directory is not None else _resolve_heartbeat_dir()
|
|
return d / f"{job}.last"
|
|
|
|
|
|
def write_cron_heartbeat(
|
|
job: str,
|
|
interval_seconds: int,
|
|
status: str = "ok",
|
|
directory: Path | None = None,
|
|
) -> Path:
|
|
"""Write a poka-yoke heartbeat file for a cron job.
|
|
|
|
Call this at the end of your job's main function. The file is written
|
|
atomically (write-to-temp + rename) so the checker never reads a partial
|
|
file.
|
|
|
|
Args:
|
|
job: Unique job name, e.g. ``"nexus_watchdog"``.
|
|
interval_seconds: Expected run cadence, e.g. ``300`` for every 5 min.
|
|
status: Completion status: ``"ok"``, ``"warn"``, or
|
|
``"error"``. Only ``"ok"`` resets the stale clock.
|
|
directory: Override the heartbeat directory (mainly for tests).
|
|
|
|
Returns:
|
|
Path to the written heartbeat file.
|
|
"""
|
|
d = directory if directory is not None else _resolve_heartbeat_dir()
|
|
d.mkdir(parents=True, exist_ok=True)
|
|
path = d / f"{job}.last"
|
|
|
|
data = {
|
|
"job": job,
|
|
"timestamp": time.time(),
|
|
"interval_seconds": interval_seconds,
|
|
"pid": os.getpid(),
|
|
"status": status,
|
|
}
|
|
|
|
# Atomic write: temp file in same directory + rename.
|
|
# Guarantees the checker never sees a half-written file.
|
|
fd, tmp = tempfile.mkstemp(dir=str(d), prefix=f".{job}-", suffix=".tmp")
|
|
try:
|
|
with os.fdopen(fd, "w") as f:
|
|
json.dump(data, f)
|
|
os.replace(tmp, str(path))
|
|
except Exception:
|
|
# Best-effort — never crash the job over a heartbeat failure
|
|
try:
|
|
os.unlink(tmp)
|
|
except OSError:
|
|
pass
|
|
|
|
return path
|