fix(cron): clear stale revoked-session error on re-trigger (#349)

When a cron job recovered from an auth failure, the stale last_error persisted until the next tick completed.

Three entry points fixed:
1. trigger_job() - clears last_error and sets last_status='retrying' when re-triggering a previously failed job.
2. resume_job() - same treatment when resuming a paused job that had failed before being paused.
3. mark_job_run() - tracks last_error_at and last_success_at timestamps so callers can distinguish 'currently broken' from 'had an error previously but recovered'.

CLI /cron list shows health-aware status:
- error: shows an error preview
- recovered: shown when last_success_at > last_error_at
- retrying: shown when the job was re-triggered

_format_job() now includes last_error, last_error_at, and last_success_at.

9 tests added, all passing.

Closes #349
2026-04-13 18:24:04 -04:00
4 changed files with 218 additions and 3 deletions
--- a/cli.py
+++ b/cli.py
@@ -4123,7 +4123,19 @@ class HermesCLI:
                    print(f"  Skills: {', '.join(job['skills'])}")
                print(f"  Prompt: {job.get('prompt_preview', '')}")
                if job.get("last_run_at"):
-                    print(f"  Last run: {job['last_run_at']} ({job.get('last_status', '?')})")
+                    status = job.get('last_status', '?')
+                    if status == "error" and job.get("last_error"):
+                        # Show error with staleness hint
+                        last_success = job.get("last_success_at", "")
+                        if last_success and last_success > job.get("last_error_at", ""):
+                            print(f"  Last run: {job['last_run_at']} (recovered, last error was before {last_success})")
+                        else:
+                            err_preview = job['last_error'][:80]
+                            print(f"  Last run: {job['last_run_at']} (error: {err_preview}...)")
+                    elif status == "retrying":
+                        print(f"  Last run: {job['last_run_at']} (retrying...)")
+                    else:
+                        print(f"  Last run: {job['last_run_at']} ({status})")
                print()
            return

--- a/cron/jobs.py
+++ b/cron/jobs.py
@@ -528,7 +528,11 @@ def pause_job(job_id: str, reason: Optional[str] = None) -> Optional[Dict[str, A


 def resume_job(job_id: str) -> Optional[Dict[str, Any]]:
-    """Resume a paused job and compute the next future run from now."""
+    """Resume a paused job and compute the next future run from now.
+
+    Clears stale error state so the job's health reflects the upcoming
+    run rather than a previous failure that may have caused the pause.
+    """
    job = get_job(job_id)
    if not job:
        return None
@@ -542,12 +546,19 @@ def resume_job(job_id: str) -> Optional[Dict[str, Any]]:
            "paused_at": None,
            "paused_reason": None,
            "next_run_at": next_run_at,
+            # Clear stale error — resuming is an explicit retry signal.
+            "last_error": None,
+            "last_status": "retrying" if job.get("last_status") == "error" else job.get("last_status"),
        },
    )


 def trigger_job(job_id: str) -> Optional[Dict[str, Any]]:
-    """Schedule a job to run on the next scheduler tick."""
+    """Schedule a job to run on the next scheduler tick.
+
+    Clears stale error state so the job's health reflects the upcoming
+    re-run rather than the previous failure.
+    """
    job = get_job(job_id)
    if not job:
        return None
@@ -559,6 +570,10 @@ def trigger_job(job_id: str) -> Optional[Dict[str, Any]]:
            "paused_at": None,
            "paused_reason": None,
            "next_run_at": _hermes_now().isoformat(),
+            # Clear stale error — the job is being retried.
+            # The next run will set last_error on failure or None on success.
+            "last_error": None,
+            "last_status": "retrying" if job.get("last_status") == "error" else job.get("last_status"),
        },
    )

@@ -618,6 +633,8 @@ def mark_job_run(job_id: str, success: bool, error: Optional[str] = None):
    
    Updates last_run_at, last_status, increments completed count,
    computes next_run_at, and auto-deletes if repeat limit reached.
+    Also tracks last_error_at and last_success_at timestamps so callers
+    can distinguish current health from historical failure residue.
    """
    jobs = load_jobs()
    for i, job in enumerate(jobs):
@@ -627,6 +644,12 @@ def mark_job_run(job_id: str, success: bool, error: Optional[str] = None):
            job["last_status"] = "ok" if success else "error"
            job["last_error"] = error if not success else None
            
+            # Track timestamps for health distinction
+            if success:
+                job["last_success_at"] = now
+            else:
+                job["last_error_at"] = now
+            
            # Increment completed count
            if job.get("repeat"):
                job["repeat"]["completed"] = job["repeat"].get("completed", 0) + 1
--- a/tests/test_cron_stale_error_349.py
+++ b/tests/test_cron_stale_error_349.py
@@ -0,0 +1,177 @@
+"""Tests for cron job stale error state clearing (#349)."""
+
+import json
+import os
+import sys
+import tempfile
+from pathlib import Path
+from unittest.mock import patch, MagicMock
+
+import pytest
+
+# Pre-seed sys.modules with a stub for cron.scheduler so that importing
+# cron.jobs does not load the real scheduler module (presumably pulled in via
+# cron/__init__.py; its imports may break in test isolation).
+sys.modules.setdefault("cron.scheduler", MagicMock())
+
+
+@pytest.fixture()
+def isolated_cron_dir(tmp_path, monkeypatch):
+    """Point CRON_DIR at a temp directory for test isolation."""
+    cron_dir = tmp_path / "cron"
+    cron_dir.mkdir()
+    output_dir = cron_dir / "output"
+    output_dir.mkdir()
+    monkeypatch.setattr("cron.jobs.CRON_DIR", cron_dir)
+    monkeypatch.setattr("cron.jobs.JOBS_FILE", cron_dir / "jobs.json")
+    monkeypatch.setattr("cron.jobs.OUTPUT_DIR", output_dir)
+    return cron_dir
+
+
+def _make_job(cron_dir, job_id="test-job-1", last_status="error", last_error="auth revoked"):
+    """Write a minimal jobs.json with one job."""
+    from hermes_time import now as _hermes_now
+    job = {
+        "id": job_id,
+        "name": "Test Job",
+        "prompt": "test prompt",
+        "schedule": {"kind": "interval", "minutes": 10, "display": "every 10m"},
+        "enabled": True,
+        "state": "scheduled",
+        "last_run_at": _hermes_now().isoformat(),
+        "last_status": last_status,
+        "last_error": last_error,
+        "last_error_at": _hermes_now().isoformat(),
+        "next_run_at": _hermes_now().isoformat(),
+    }
+    jobs_file = cron_dir / "jobs.json"
+    jobs_file.write_text(json.dumps({"jobs": [job]}))
+    return job
+
+
+# ---------------------------------------------------------------------------
+# trigger_job clears stale error
+# ---------------------------------------------------------------------------
+
+class TestTriggerJobClearsError:
+    def test_trigger_clears_last_error(self, isolated_cron_dir):
+        from cron.jobs import trigger_job, get_job
+
+        _make_job(isolated_cron_dir, last_status="error", last_error="Refresh session revoked")
+        result = trigger_job("test-job-1")
+        assert result is not None
+        job = get_job("test-job-1")
+        assert job["last_error"] is None
+        assert job["last_status"] == "retrying"
+
+    def test_trigger_preserves_ok_status(self, isolated_cron_dir):
+        from cron.jobs import trigger_job, get_job
+
+        _make_job(isolated_cron_dir, last_status="ok", last_error=None)
+        result = trigger_job("test-job-1")
+        assert result is not None
+        job = get_job("test-job-1")
+        assert job["last_error"] is None
+        assert job["last_status"] == "ok"  # stays ok, not retrying
+
+    def test_trigger_nonexistent_returns_none(self, isolated_cron_dir):
+        from cron.jobs import trigger_job
+
+        result = trigger_job("nonexistent")
+        assert result is None
+
+
+# ---------------------------------------------------------------------------
+# resume_job also clears stale error
+# ---------------------------------------------------------------------------
+
+class TestResumeJobClearsError:
+    def test_resume_clears_last_error(self, isolated_cron_dir):
+        from cron.jobs import resume_job, get_job, pause_job
+
+        _make_job(isolated_cron_dir, last_status="error", last_error="auth revoked")
+        pause_job("test-job-1", reason="auth issue")
+        result = resume_job("test-job-1")
+        assert result is not None
+        job = get_job("test-job-1")
+        assert job["last_error"] is None
+        assert job["last_status"] == "retrying"
+        assert job["state"] == "scheduled"
+
+
+# ---------------------------------------------------------------------------
+# mark_job_run tracks timestamps
+# ---------------------------------------------------------------------------
+
+class TestMarkJobRunTimestamps:
+    def test_success_sets_last_success_at(self, isolated_cron_dir):
+        from cron.jobs import mark_job_run, get_job
+
+        _make_job(isolated_cron_dir)
+        mark_job_run("test-job-1", success=True, error=None)
+        job = get_job("test-job-1")
+        assert job["last_status"] == "ok"
+        assert job["last_error"] is None
+        assert job.get("last_success_at") is not None
+
+    def test_error_sets_last_error_at(self, isolated_cron_dir):
+        from cron.jobs import mark_job_run, get_job
+
+        _make_job(isolated_cron_dir, last_status="ok", last_error=None)
+        mark_job_run("test-job-1", success=False, error="new error")
+        job = get_job("test-job-1")
+        assert job["last_status"] == "error"
+        assert job["last_error"] == "new error"
+        assert job.get("last_error_at") is not None
+
+    def test_success_clears_error(self, isolated_cron_dir):
+        from cron.jobs import mark_job_run, get_job
+
+        _make_job(isolated_cron_dir, last_status="error", last_error="old error")
+        mark_job_run("test-job-1", success=True, error=None)
+        job = get_job("test-job-1")
+        assert job["last_status"] == "ok"
+        assert job["last_error"] is None
+        assert job.get("last_success_at") is not None
+
+
+# ---------------------------------------------------------------------------
+# Health distinction: error vs recovered
+# ---------------------------------------------------------------------------
+
+class TestHealthDistinction:
+    def test_recovered_job_shows_success_after_error(self, isolated_cron_dir):
+        from cron.jobs import mark_job_run, get_job
+        from hermes_time import now as _hermes_now
+
+        _make_job(isolated_cron_dir, last_status="error", last_error="auth revoked")
+        # Simulate recovery
+        mark_job_run("test-job-1", success=True, error=None)
+        job = get_job("test-job-1")
+        assert job["last_status"] == "ok"
+        assert job["last_error"] is None
+        # last_success_at should be after last_error_at
+        assert job["last_success_at"] >= job.get("last_error_at", "")
+
+
+# ---------------------------------------------------------------------------
+# _format_job includes health fields
+# ---------------------------------------------------------------------------
+
+class TestFormatJobHealth:
+    def test_format_includes_health_fields(self):
+        from tools.cronjob_tools import _format_job
+
+        job = {
+            "id": "j1",
+            "name": "Test",
+            "prompt": "hello",
+            "schedule_display": "every 10m",
+            "last_status": "error",
+            "last_error": "revoked",
+            "last_error_at": "2026-04-13T10:00:00",
+            "last_success_at": "2026-04-13T09:00:00",
+        }
+        result = _format_job(job)
+        assert result["last_error"] == "revoked"
+        assert result["last_error_at"] == "2026-04-13T10:00:00"
+        assert result["last_success_at"] == "2026-04-13T09:00:00"
--- a/tools/cronjob_tools.py
+++ b/tools/cronjob_tools.py
@@ -196,6 +196,9 @@ def _format_job(job: Dict[str, Any]) -> Dict[str, Any]:
        "next_run_at": job.get("next_run_at"),
        "last_run_at": job.get("last_run_at"),
        "last_status": job.get("last_status"),
+        "last_error": job.get("last_error"),
+        "last_error_at": job.get("last_error_at"),
+        "last_success_at": job.get("last_success_at"),
        "enabled": job.get("enabled", True),
        "state": job.get("state", "scheduled" if job.get("enabled", True) else "paused"),
        "paused_at": job.get("paused_at"),