feat(cron): add deploy sync guard to catch stale code before cascading failures

When the installed run_agent.py diverges from what scheduler.py expects, every cron job fails with a TypeError on AIAgent.__init__() — a silent total outage that cascades into gateway restarts, asyncio shutdown errors, and auth token expiry.

This commit adds a _validate_agent_interface() guard that:

- Inspects AIAgent.__init__ at runtime via inspect.signature
- Verifies every kwarg the scheduler passes exists in the constructor
- Fails fast with a clear remediation message on mismatch
- Runs once per gateway process (cached, zero per-job overhead)

The guard is called at the top of run_job() before any work begins. It would have caught the tool_choice TypeError that caused 1,199 failures across 55 jobs (meta-issue #343).

Includes 3 tests: pass, fail, and cache verification.
2026-04-13 03:33:48 -04:00
2 changed files with 123 additions and 1 deletion
--- a/cron/scheduler.py
+++ b/cron/scheduler.py
@@ -10,6 +10,7 @@ runs at a time if multiple processes overlap.

 import asyncio
 import concurrent.futures
+import inspect
 import json
 import logging
 import os
@@ -50,6 +51,71 @@ _KNOWN_DELIVERY_PLATFORMS = frozenset({

 from cron.jobs import get_due_jobs, mark_job_run, save_job_output, advance_next_run

+# ---------------------------------------------------------------------------
+# Deploy Sync Guard
+# ---------------------------------------------------------------------------
+# The scheduler passes keyword arguments to AIAgent() that may not exist in
+# older installed versions.  When the installed run_agent.py diverges from the
+# version the scheduler was written against, every cron job fails with a
+# TypeError — a silent total outage.  This guard catches that at the first
+# tick rather than after 1000+ error log lines.
+#
+# The check inspects AIAgent.__init__ and verifies that it accepts every
+# parameter the scheduler passes; on mismatch it raises RuntimeError with a fix
+# command.  Only success is cached (_agent_interface_validated); failures re-run.
+
+_agent_interface_validated = False
+
+# Parameters the scheduler passes to AIAgent() in run_job().
+# If you add a new kwarg to the AIAgent() call below, add it here too.
+_SCHEDULER_AGENT_KWARGS = {
+    "tool_choice": "required",
+    "skip_memory": True,
+    "platform": "cron",
+}
+
+
+def _validate_agent_interface():
+    """Verify AIAgent.__init__ accepts every kwarg the scheduler uses.
+
+    Raises RuntimeError with a remediation message on mismatch.
+    Called once per process from run_job(); subsequent calls are no-ops.
+    """
+    global _agent_interface_validated
+    if _agent_interface_validated:
+        return
+
+    try:
+        from run_agent import AIAgent
+    except ImportError as exc:
+        raise RuntimeError(
+            f"Deploy sync guard: cannot import AIAgent from run_agent: {exc}\n"
+            "The installed hermes-agent package may be corrupted. "
+            "Reinstall: pip install -e ~/.hermes/hermes-agent"
+        ) from exc
+
+    sig = inspect.signature(AIAgent.__init__)
+    params = set(sig.parameters.keys()) - {"self"}
+
+    missing = [kw for kw in _SCHEDULER_AGENT_KWARGS if kw not in params]
+    if missing:
+        raise RuntimeError(
+            "Deploy sync guard: AIAgent.__init__() is missing parameters that "
+            "the cron scheduler requires. This means the installed code is out "
+            "of sync with the scheduler module.\n"
+            f"  Missing parameters: {', '.join(missing)}\n"
+            f"  Expected by: cron/scheduler.py (run_job → AIAgent())\n"
+            f"  Fix: pip install -e ~/.hermes/hermes-agent --force-reinstall\n"
+            f"  Then restart the gateway."
+        )
+
+    _agent_interface_validated = True
+    logger.info(
+        "Deploy sync guard: AIAgent interface OK (%d params, %d scheduler deps)",
+        len(params), len(_SCHEDULER_AGENT_KWARGS),
+    )
+
+
 # Sentinel: when a cron agent has nothing new to report, it can start its
 # response with this marker to suppress delivery.  Output is still saved
 # locally for audit.
@@ -470,7 +536,12 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
        Tuple of (success, full_output_doc, final_response, error_message)
    """
    from run_agent import AIAgent
-    
+
+    # Deploy sync guard: verify the installed AIAgent accepts all kwargs
+    # the scheduler passes.  Catches stale-code-deploy bugs before they
+    # cascade into 1000+ TypeErrors and gateway restart loops.
+    _validate_agent_interface()
+
    # Initialize SQLite session store so cron job messages are persisted
    # and discoverable via session_search (same pattern as gateway/run.py).
    _session_db = None
--- a/tests/cron/test_scheduler.py
+++ b/tests/cron/test_scheduler.py
@@ -863,3 +863,54 @@ class TestTickAdvanceBeforeRun:
        adv_mock.assert_called_once_with("test-advance")
        # advance must happen before run
        assert call_order == [("advance", "test-advance"), ("run", "test-advance")]
+
+
+class TestDeploySyncGuard:
+    """Tests for _validate_agent_interface() — the deploy sync guard."""
+
+    def test_passes_when_all_params_present(self):
+        """Validation passes when AIAgent accepts every scheduler kwarg."""
+        from cron.scheduler import _validate_agent_interface, _agent_interface_validated
+        import cron.scheduler as sched_mod
+
+        # Reset the cached flag so the check actually runs.
+        sched_mod._agent_interface_validated = False
+        # Should not raise.
+        _validate_agent_interface()
+        assert sched_mod._agent_interface_validated is True
+
+    def test_fails_when_param_missing(self):
+        """Validation raises RuntimeError when AIAgent is missing a required param."""
+        import cron.scheduler as sched_mod
+        from unittest.mock import MagicMock
+        import inspect
+
+        # Save and restore.
+        orig_flag = sched_mod._agent_interface_validated
+        try:
+            sched_mod._agent_interface_validated = False
+
+            # Build a fake AIAgent class whose __init__ lacks 'tool_choice'.
+            class FakeAIAgent:
+                def __init__(self, model="", max_iterations=90, quiet_mode=False,
+                             disabled_toolsets=None, skip_memory=False, platform=None,
+                             session_id=None, session_db=None):
+                    pass
+
+            fake_module = MagicMock()
+            fake_module.AIAgent = FakeAIAgent
+
+            with pytest.raises(RuntimeError, match="Missing parameters: tool_choice"):
+                with patch.dict("sys.modules", {"run_agent": fake_module}):
+                    sched_mod._validate_agent_interface()
+        finally:
+            sched_mod._agent_interface_validated = orig_flag
+
+    def test_cached_after_first_run(self):
+        """Second call is a no-op (uses cached flag)."""
+        import cron.scheduler as sched_mod
+
+        sched_mod._agent_interface_validated = True
+        # Should not raise even if we somehow break AIAgent — the flag is set.
+        sched_mod._validate_agent_interface()
+        # No exception = pass.