From 21ffadc2a614804f7ee152b49501e273a849e1d1 Mon Sep 17 00:00:00 2001 From: Teknium Date: Sun, 22 Mar 2026 04:04:24 -0700 Subject: [PATCH] fix: dynamic grace window for missed cron job catch-up Replace hardcoded 120-second grace period with a dynamic window that scales with the job's scheduling frequency (half the period, clamped to [120s, 2h]). Daily jobs now catch up if missed by up to 2 hours instead of being silently skipped after just 2 minutes. --- cron/jobs.py | 42 +++++++++++++++++++++++++++++++++++++---- tests/cron/test_jobs.py | 18 ++++++++++++------ 2 files changed, 50 insertions(+), 10 deletions(-) diff --git a/cron/jobs.py b/cron/jobs.py index da4382cb0..86a50f3b2 100644 --- a/cron/jobs.py +++ b/cron/jobs.py @@ -248,6 +248,38 @@ def _recoverable_oneshot_run_at( return None +def _compute_grace_seconds(schedule: dict) -> int: + """Compute how late a job can be and still catch up instead of fast-forwarding. + + Uses half the schedule period, clamped between 120 seconds and 2 hours. + This ensures daily jobs can catch up if missed by up to 2 hours, + while frequent jobs (every 5-10 min) still fast-forward quickly. + """ + MIN_GRACE = 120 + MAX_GRACE = 7200 # 2 hours + + kind = schedule.get("kind") + + if kind == "interval": + period_seconds = schedule.get("minutes", 1) * 60 + grace = period_seconds // 2 + return max(MIN_GRACE, min(grace, MAX_GRACE)) + + if kind == "cron" and HAS_CRONITER: + try: + now = _hermes_now() + cron = croniter(schedule["expr"], now) + first = cron.get_next(datetime) + second = cron.get_next(datetime) + period_seconds = int((second - first).total_seconds()) + grace = period_seconds // 2 + return max(MIN_GRACE, min(grace, MAX_GRACE)) + except Exception: + pass + + return MIN_GRACE + + def compute_next_run(schedule: Dict[str, Any], last_run_at: Optional[str] = None) -> Optional[str]: """ Compute the next run time for a schedule. @@ -610,16 +642,18 @@ def get_due_jobs() -> List[Dict[str, Any]]: # For recurring jobs, check if the scheduled time is stale # (gateway was down and missed the window). Fast-forward to # the next future occurrence instead of firing a stale run. - if kind in ("cron", "interval") and (now - next_run_dt).total_seconds() > 120: - # More than 2 minutes late — this is a missed run, not a current one. - # Recompute next_run_at to the next future occurrence. + grace = _compute_grace_seconds(schedule) + if kind in ("cron", "interval") and (now - next_run_dt).total_seconds() > grace: + # Job is past its catch-up grace window — this is a stale missed run. + # Grace scales with schedule period: daily=2h, hourly=30m, 10min=5m. new_next = compute_next_run(schedule, now.isoformat()) if new_next: logger.info( - "Job '%s' missed its scheduled time (%s). " + "Job '%s' missed its scheduled time (%s, grace=%ds). " "Fast-forwarding to next run: %s", job.get("name", job["id"]), next_run, + grace, new_next, ) # Update the job in storage diff --git a/tests/cron/test_jobs.py b/tests/cron/test_jobs.py index e0e80fe89..a0dc8a89b 100644 --- a/tests/cron/test_jobs.py +++ b/tests/cron/test_jobs.py @@ -323,11 +323,14 @@ class TestMarkJobRun: class TestGetDueJobs: def test_past_due_within_window_returned(self, tmp_cron_dir): - """Jobs less than 2 minutes late are still considered due (not stale).""" + """Jobs within the dynamic grace window are still considered due (not stale). + + For an hourly job, grace = 30 min (half the period, clamped to [120s, 2h]). + """ job = create_job(prompt="Due now", schedule="every 1h") - # Force next_run_at to just 1 minute ago (within the 2-min window) + # Force next_run_at to 10 minutes ago (within the 30-min grace for hourly) jobs = load_jobs() - jobs[0]["next_run_at"] = (datetime.now() - timedelta(seconds=60)).isoformat() + jobs[0]["next_run_at"] = (datetime.now() - timedelta(minutes=10)).isoformat() save_jobs(jobs) due = get_due_jobs() @@ -335,11 +338,14 @@ class TestGetDueJobs: assert due[0]["id"] == job["id"] def test_stale_past_due_skipped(self, tmp_cron_dir): - """Recurring jobs more than 2 minutes late are fast-forwarded, not fired.""" + """Recurring jobs past their dynamic grace window are fast-forwarded, not fired. + + For an hourly job, grace = 30 min. Setting 35 min late exceeds the window. + """ job = create_job(prompt="Stale", schedule="every 1h") - # Force next_run_at to 5 minutes ago (beyond the 2-min window) + # Force next_run_at to 35 minutes ago (beyond the 30-min grace for hourly) jobs = load_jobs() - jobs[0]["next_run_at"] = (datetime.now() - timedelta(minutes=5)).isoformat() + jobs[0]["next_run_at"] = (datetime.now() - timedelta(minutes=35)).isoformat() save_jobs(jobs) due = get_due_jobs()