fix(cron): prevent recurring job re-fire on gateway crash/restart loop (#3396)

When a gateway crashes mid-job execution (before mark_job_run can persist the updated next_run_at), the job would fire again on every restart attempt within the grace window. For a daily 6:15 AM job with a 2-hour grace, rapidly restarting the gateway could trigger dozens of duplicate runs. Fix: call advance_next_run() BEFORE run_job() in tick(). For recurring jobs (cron/interval), this preemptively advances next_run_at to the next future occurrence and persists it to disk. If the process then crashes during execution, the job won't be considered due on restart. One-shot jobs are left unchanged — they still retry on restart since there's no future occurrence to advance to. This changes the scheduler from at-least-once to at-most-once semantics for recurring jobs, which is the correct tradeoff: missing one daily message is far better than sending it dozens of times.
2026-03-27 08:02:58 -07:00
parent 5a1e2a307a
commit eb2127c1dc
4 changed files with 158 additions and 1 deletions
--- a/cron/jobs.py
+++ b/cron/jobs.py
@@ -598,6 +598,34 @@ def mark_job_run(job_id: str, success: bool, error: Optional[str] = None):
    save_jobs(jobs)


+def advance_next_run(job_id: str) -> bool:
+    """Preemptively advance next_run_at for a recurring job before execution.
+
+    Call this BEFORE run_job() so that if the process crashes mid-execution,
+    the job won't re-fire on the next gateway restart.  This converts the
+    scheduler from at-least-once to at-most-once for recurring jobs — missing
+    one run is far better than firing dozens of times in a crash loop.
+
+    One-shot jobs are left unchanged so they can still retry on restart.
+
+    Returns True if next_run_at was advanced, False otherwise.
+    """
+    jobs = load_jobs()
+    for job in jobs:
+        if job["id"] == job_id:
+            kind = job.get("schedule", {}).get("kind")
+            if kind not in ("cron", "interval"):
+                return False
+            now = _hermes_now().isoformat()
+            new_next = compute_next_run(job["schedule"], now)
+            if new_next and new_next != job.get("next_run_at"):
+                job["next_run_at"] = new_next
+                save_jobs(jobs)
+                return True
+            return False
+    return False
+
+
 def get_due_jobs() -> List[Dict[str, Any]]:
    """Get all jobs that are due to run now.

--- a/cron/scheduler.py
+++ b/cron/scheduler.py
@@ -35,7 +35,7 @@ logger = logging.getLogger(__name__)
 # Add parent directory to path for imports
 sys.path.insert(0, str(Path(__file__).parent.parent))

-from cron.jobs import get_due_jobs, mark_job_run, save_job_output
+from cron.jobs import get_due_jobs, mark_job_run, save_job_output, advance_next_run

 # Sentinel: when a cron agent has nothing new to report, it can start its
 # response with this marker to suppress delivery.  Output is still saved
@@ -524,6 +524,12 @@ def tick(verbose: bool = True) -> int:
        executed = 0
        for job in due_jobs:
            try:
+                # For recurring jobs (cron/interval), advance next_run_at to the
+                # next future occurrence BEFORE execution.  This way, if the
+                # process crashes mid-run, the job won't re-fire on restart.
+                # One-shot jobs are left alone so they can retry on restart.
+                advance_next_run(job["id"])
+
                success, output, final_response, error = run_job(job)

                output_file = save_job_output(job["id"], output)