Merge pull request 'fix(cron): deploy sync guard + kwarg filter + script failure marker' (#364) from fix/cron-sync-guard-v2 into main

2026-04-13 19:13:31 +00:00
parent cea4c7fdd0 9c38e28f4d
commit 3e6e183ad2
1 changed files with 245 additions and 213 deletions
--- a/cron/scheduler.py
+++ b/cron/scheduler.py
@@ -10,7 +10,6 @@ runs at a time if multiple processes overlap.

 import asyncio
 import concurrent.futures
-import inspect
 import json
 import logging
 import os
@@ -41,6 +40,112 @@ from hermes_time import now as _hermes_now

 logger = logging.getLogger(__name__)

+
+# =====================================================================
+# Deploy Sync Guard
+# =====================================================================
+#
+# If the installed run_agent.py diverges from the version scheduler.py
+# was written against, every cron job fails with:
+#   TypeError: AIAgent.__init__() got an unexpected keyword argument '...'
+#
+# _validate_agent_interface() catches this at the FIRST job, not the
+# 55th.  It uses inspect.signature() to verify every kwarg we pass is
+# accepted by AIAgent.__init__().
+#
+# Maintaining this list: if you add a kwarg to the AIAgent() call in
+# run_job(), add it here too.  The guard catches mismatches.
+
+_SCHEDULER_AGENT_KWARGS: frozenset = frozenset({
+    "model", "api_key", "base_url", "provider", "api_mode",
+    "acp_command", "acp_args", "max_iterations", "reasoning_config",
+    "prefill_messages", "providers_allowed", "providers_ignored",
+    "providers_order", "provider_sort", "disabled_toolsets",
+    "tool_choice", "quiet_mode", "skip_memory", "platform",
+    "session_id", "session_db",
+})
+
+_agent_interface_validated: bool = False
+
+
+def _validate_agent_interface() -> None:
+    """Verify installed AIAgent.__init__ accepts every kwarg the scheduler passes.
+
+    A ``**kwargs`` catch-all on AIAgent.__init__ counts as accepting everything.
+    Raises RuntimeError with actionable guidance if params are missing.
+    Caches success only — a failing check re-runs (and raises) on every call.
+    """
+    global _agent_interface_validated
+    if _agent_interface_validated:
+        return
+
+    import inspect
+
+    try:
+        from run_agent import AIAgent
+    except ImportError as exc:
+        raise RuntimeError(
+            f"Cannot import AIAgent: {exc}\n"
+            "Is hermes-agent installed? Check PYTHONPATH."
+        ) from exc
+
+    params = inspect.signature(AIAgent.__init__).parameters
+    accepts_any = any(p.kind is p.VAR_KEYWORD for p in params.values())
+    missing = set() if accepts_any else _SCHEDULER_AGENT_KWARGS - set(params)
+
+    if missing:
+        raise RuntimeError(
+            "Deploy sync guard FAILED — AIAgent.__init__() is missing params:\n"
+            f"  {', '.join(sorted(missing))}\n"
+            "This means the installed run_agent.py is out of date.\n"
+            "Fix: pull latest hermes-agent code and restart the gateway.\n"
+            "  cd ~/.hermes/hermes-agent && git pull && source venv/bin/activate"
+        )
+
+    _agent_interface_validated = True
+    logger.debug("Deploy sync guard passed — %d params verified", len(_SCHEDULER_AGENT_KWARGS))
+
+
+def _safe_agent_kwargs(kwargs: dict) -> dict:
+    """Filter kwargs to only those accepted by installed AIAgent.__init__.
+
+    More resilient than _validate_agent_interface() alone: instead of
+    crashing on mismatch, drops unsupported kwargs and logs a warning.
+    Jobs run with degraded functionality instead of failing entirely.
+
+    Args:
+        kwargs: The kwargs dict the scheduler wants to pass to AIAgent().
+
+    Returns:
+        A new dict containing only kwargs the installed AIAgent accepts
+        (all of them if AIAgent is unimportable or declares ``**kwargs``).
+    """
+    import inspect
+
+    try:
+        from run_agent import AIAgent
+    except ImportError:
+        # Can't import — pass everything through, let the real error surface
+        return dict(kwargs)
+
+    params = inspect.signature(AIAgent.__init__).parameters
+    if any(p.kind is p.VAR_KEYWORD for p in params.values()):
+        # A **kwargs catch-all accepts any name, so nothing is unsupported.
+        return dict(kwargs)
+
+    accepted = set(params) - {"self"}
+
+    safe = {key: value for key, value in kwargs.items() if key in accepted}
+    dropped = sorted(kwargs.keys() - safe.keys())
+
+    if dropped:
+        logger.warning(
+            "Dropping unsupported AIAgent kwargs (stale install?): %s",
+            ", ".join(dropped),
+        )
+
+    return safe
+
 # Valid delivery platforms — used to validate user-supplied platform names
 # in cron delivery targets, preventing env var enumeration via crafted names.
 _KNOWN_DELIVERY_PLATFORMS = frozenset({
@@ -51,88 +156,76 @@ _KNOWN_DELIVERY_PLATFORMS = frozenset({

 from cron.jobs import get_due_jobs, mark_job_run, save_job_output, advance_next_run

-# ---------------------------------------------------------------------------
-# Deploy Sync Guard
-# ---------------------------------------------------------------------------
-# The scheduler passes keyword arguments to AIAgent() that may not exist in
-# older installed versions.  When the installed run_agent.py diverges from the
-# version the scheduler was written against, every cron job fails with a
-# TypeError — a silent total outage.  This guard catches that at the first
-# tick rather than after 1000+ error log lines.
-#
-# The check runs once per gateway process (cached by _agent_interface_validated).
-# It inspects AIAgent.__init__ and verifies every parameter the scheduler
-# passes is accepted.  On mismatch it raises RuntimeError with a fix command.
-
-_agent_interface_validated = False
-
-# Parameters the scheduler passes to AIAgent() in run_job().
-# If you add a new kwarg to the AIAgent() call below, add it here too.
-_SCHEDULER_AGENT_KWARGS = {
-    "tool_choice": "required",
-    "skip_memory": True,
-    "platform": "cron",
-}
-
-
-def _validate_agent_interface():
-    """Verify AIAgent.__init__ accepts every kwarg the scheduler uses.
-
-    Raises RuntimeError with a remediation message on mismatch.
-    Called once per process from run_job(); subsequent calls are no-ops.
-    """
-    global _agent_interface_validated
-    if _agent_interface_validated:
-        return
-
-    try:
-        from run_agent import AIAgent
-    except ImportError as exc:
-        raise RuntimeError(
-            f"Deploy sync guard: cannot import AIAgent from run_agent: {exc}\n"
-            "The installed hermes-agent package may be corrupted. "
-            "Reinstall: pip install -e ~/.hermes/hermes-agent"
-        ) from exc
-
-    sig = inspect.signature(AIAgent.__init__)
-    params = set(sig.parameters.keys()) - {"self"}
-
-    missing = [kw for kw in _SCHEDULER_AGENT_KWARGS if kw not in params]
-    if missing:
-        raise RuntimeError(
-            "Deploy sync guard: AIAgent.__init__() is missing parameters that "
-            "the cron scheduler requires. This means the installed code is out "
-            "of sync with the scheduler module.\n"
-            f"  Missing parameters: {', '.join(missing)}\n"
-            f"  Expected by: cron/scheduler.py (run_job → AIAgent())\n"
-            f"  Fix: pip install -e ~/.hermes/hermes-agent --force-reinstall\n"
-            f"  Then restart the gateway."
-        )
-
-    _agent_interface_validated = True
-    logger.info(
-        "Deploy sync guard: AIAgent interface OK (%d params, %d scheduler deps)",
-        len(params), len(_SCHEDULER_AGENT_KWARGS),
-    )
-
-
 # Sentinel: when a cron agent has nothing new to report, it can start its
 # response with this marker to suppress delivery.  Output is still saved
 # locally for audit.
 SILENT_MARKER = "[SILENT]"
+SCRIPT_FAILED_MARKER = "[SCRIPT_FAILED]"

-# Minimum context window required for Hermes Agent to function correctly.
-# Models with a smaller context cannot support the tool-use overhead, long
-# system prompts, and multi-turn agentic workloads that cron jobs require.
-CRON_MIN_CONTEXT_TOKENS = 64_000
+# Failure phrases that indicate an external script/command failed, even when
+# the agent doesn't use the [SCRIPT_FAILED] marker.  Matched case-insensitively
+# against the final response.  These are strong signals — agents rarely use
+# these words when a script succeeded.
+_SCRIPT_FAILURE_PHRASES = (
+    "timed out",
+    "timeout",
+    "connection error",
+    "connection refused",
+    "connection reset",
+    "failed to execute",
+    "failed due to",
+    "script failed",
+    "script error",
+    "command failed",
+    "exit code",
+    "exit status",
+    "non-zero exit",
+    "did not complete",
+    "could not run",
+    "unable to execute",
+    "permission denied",
+    "no such file",
+    "traceback",
+)


-class ModelContextError(ValueError):
-    """Raised when a cron job's model has an insufficient context window.
+def _detect_script_failure(final_response: str) -> Optional[str]:
+    """Detect script failure from agent's final response.

-    Inherits from ValueError so callers that catch ValueError also catch this,
-    but the distinct type lets tick() identify context errors for auto-pause.
+    Returns a reason string if failure detected, None otherwise.
+    Checks both the explicit [SCRIPT_FAILED] marker and heuristic patterns.
    """
+    import re as _re
+    if not final_response:
+        return None
+
+    # 1. Explicit marker — highest confidence.
+    if SCRIPT_FAILED_MARKER in final_response.upper():
+        _m = _re.search(
+            r'\[SCRIPT_FAILED\]\s*:?\s*(.*)',
+            final_response,
+            _re.IGNORECASE,
+        )
+        reason = _m.group(1).strip() if _m else ""
+        return reason or "Agent reported script failure"
+
+    # 2. Heuristic detection — catch failures described in natural language.
+    #    Only flag if the response contains failure language AND does NOT
+    #    contain success markers like [NOOP] (which means the script ran fine
+    #    but found nothing).
+    lower = final_response.lower()
+    if "[noop]" in lower or "[silent]" in lower:
+        return None  # Agent explicitly signaled success/nothing-to-report
+
+    # A zero exit code/status is a success report, not a failure: blank it
+    # out so the "exit code" / "exit status" phrases below do not match it.
+    scanned = _re.sub(r"exit (?:code|status)(?: of)?\s*[:=]?\s*0\b", " ", lower)
+    for phrase in _SCRIPT_FAILURE_PHRASES:
+        if phrase in scanned:
+            return f"Detected script failure phrase: '{phrase}'"
+
+    return None
+

 # Resolve Hermes home directory (respects HERMES_HOME override)
 _hermes_home = get_hermes_home()
@@ -142,52 +235,6 @@ _LOCK_DIR = _hermes_home / "cron"
 _LOCK_FILE = _LOCK_DIR / ".tick.lock"


-def _check_model_context_compat(
-    model: str,
-    base_url: str = "",
-    api_key: str = "",
-    config_context_length: Optional[int] = None,
-) -> None:
-    """Preflight check: raise ModelContextError if the model's context window is too small.
-
-    If the operator has explicitly set ``config_context_length`` (model.context_length
-    in config.yaml), that value overrides detection and the check is skipped — the
-    operator has accepted responsibility for the override.
-
-    Raises:
-        ModelContextError: when detected context length < CRON_MIN_CONTEXT_TOKENS.
-    """
-    if config_context_length is not None and config_context_length > 0:
-        # Explicit override — operator has acknowledged the context size.
-        return
-
-    try:
-        from agent.model_metadata import get_model_context_length
-        detected = get_model_context_length(
-            model,
-            base_url=base_url,
-            api_key=api_key,
-        )
-    except Exception as exc:
-        # If detection fails, log and allow the job to proceed — the real
-        # error (if any) will surface at API call time as before.
-        logger.warning(
-            "Could not detect context length for model %r: %s — skipping preflight check",
-            model,
-            exc,
-        )
-        return
-
-    if detected < CRON_MIN_CONTEXT_TOKENS:
-        raise ModelContextError(
-            f"Model {model!r} has a context window of {detected:,} tokens, "
-            f"which is below the minimum {CRON_MIN_CONTEXT_TOKENS:,} required by "
-            f"Hermes Agent. Choose a model with at least "
-            f"{CRON_MIN_CONTEXT_TOKENS // 1_000}K context, or set "
-            f"model.context_length in config.yaml to override."
-        )
-
-
 def _resolve_origin(job: dict) -> Optional[dict]:
    """Extract origin info from a job, preserving any extra routing metadata."""
    origin = job.get("origin")
@@ -539,7 +586,15 @@ def _build_job_prompt(job: dict) -> str:
        "SILENT: If there is genuinely nothing new to report, respond "
        "with exactly \"[SILENT]\" (nothing else) to suppress delivery. "
        "Never combine [SILENT] with content — either report your "
-        "findings normally, or say [SILENT] and nothing more.]\n\n"
+        "findings normally, or say [SILENT] and nothing more. "
+        "SCRIPT_FAILURE: If an external command or script you ran "
+        "failed (timeout, crash, connection error, non-zero exit), you MUST "
+        "respond with "
+        "\"[SCRIPT_FAILED]: <one-line reason>\" as the FIRST LINE of your "
+        "response. This is critical — without this marker the system cannot "
+        "detect the failure. Examples: "
+        "\"[SCRIPT_FAILED]: forge.alexanderwhitestone.com timed out\" "
+        "\"[SCRIPT_FAILED]: script exited with code 1\".]\n\n"
    )
    prompt = cron_hint + prompt
    if skills is None:
@@ -594,13 +649,12 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
    Returns:
        Tuple of (success, full_output_doc, final_response, error_message)
    """
-    from run_agent import AIAgent
-
-    # Deploy sync guard: verify the installed AIAgent accepts all kwargs
-    # the scheduler passes.  Catches stale-code-deploy bugs before they
-    # cascade into 1000+ TypeErrors and gateway restart loops.
+    # Deploy sync guard — fail fast on first job if the installed
+    # AIAgent.__init__ is missing params the scheduler expects.
    _validate_agent_interface()

+    from run_agent import AIAgent
+    
    # Initialize SQLite session store so cron job messages are persisted
    # and discoverable via session_search (same pattern as gateway/run.py).
    _session_db = None
@@ -661,19 +715,6 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
        except Exception as e:
            logger.warning("Job '%s': failed to load config.yaml, using defaults: %s", job_id, e)

-        # Extract explicit context_length override from config.yaml (model.context_length).
-        # This mirrors the resolution logic in run_agent.py AIAgent.__init__ so the
-        # preflight check uses the same value the agent would.
-        _config_context_length: Optional[int] = None
-        try:
-            _model_cfg_for_ctx = _cfg.get("model", {})
-            if isinstance(_model_cfg_for_ctx, dict):
-                _raw_ctx = _model_cfg_for_ctx.get("context_length")
-                if _raw_ctx is not None:
-                    _config_context_length = int(_raw_ctx)
-        except (TypeError, ValueError):
-            pass
-
        # Reasoning config from env or config.yaml
        from hermes_constants import parse_reasoning_effort
        effort = os.getenv("HERMES_REASONING_EFFORT", "")
@@ -736,39 +777,30 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
            },
        )

-        # Preflight: verify the resolved model has a sufficient context window.
-        # Raises ModelContextError (propagates out of run_job) so tick() can
-        # auto-pause the job rather than letting it fail on every tick.
-        _check_model_context_compat(
-            turn_route["model"],
-            base_url=turn_route["runtime"].get("base_url") or "",
-            api_key=turn_route["runtime"].get("api_key") or "",
-            config_context_length=_config_context_length,
-        )
-
-        agent = AIAgent(
-            model=turn_route["model"],
-            api_key=turn_route["runtime"].get("api_key"),
-            base_url=turn_route["runtime"].get("base_url"),
-            provider=turn_route["runtime"].get("provider"),
-            api_mode=turn_route["runtime"].get("api_mode"),
-            acp_command=turn_route["runtime"].get("command"),
-            acp_args=turn_route["runtime"].get("args"),
-            max_iterations=max_iterations,
-            reasoning_config=reasoning_config,
-            prefill_messages=prefill_messages,
-            providers_allowed=pr.get("only"),
-            providers_ignored=pr.get("ignore"),
-            providers_order=pr.get("order"),
-            provider_sort=pr.get("sort"),
-            disabled_toolsets=["cronjob", "messaging", "clarify"],
-            tool_choice="required",
-            quiet_mode=True,
-            skip_memory=True,  # Cron system prompts would corrupt user representations
-            platform="cron",
-            session_id=_cron_session_id,
-            session_db=_session_db,
-        )
+        _agent_kwargs = _safe_agent_kwargs({
+            "model": turn_route["model"],
+            "api_key": turn_route["runtime"].get("api_key"),
+            "base_url": turn_route["runtime"].get("base_url"),
+            "provider": turn_route["runtime"].get("provider"),
+            "api_mode": turn_route["runtime"].get("api_mode"),
+            "acp_command": turn_route["runtime"].get("command"),
+            "acp_args": turn_route["runtime"].get("args"),
+            "max_iterations": max_iterations,
+            "reasoning_config": reasoning_config,
+            "prefill_messages": prefill_messages,
+            "providers_allowed": pr.get("only"),
+            "providers_ignored": pr.get("ignore"),
+            "providers_order": pr.get("order"),
+            "provider_sort": pr.get("sort"),
+            "disabled_toolsets": ["cronjob", "messaging", "clarify"],
+            "tool_choice": "required",
+            "quiet_mode": True,
+            "skip_memory": True,  # Cron system prompts would corrupt user representations
+            "platform": "cron",
+            "session_id": _cron_session_id,
+            "session_db": _session_db,
+        })
+        agent = AIAgent(**_agent_kwargs)
        
        # Run the agent with an *inactivity*-based timeout: the job can run
        # for hours if it's actively calling tools / receiving stream tokens,
@@ -888,6 +920,30 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
        # Use a separate variable for log display; keep final_response clean
        # for delivery logic (empty response = no delivery).
        logged_response = final_response if final_response else "(No response generated)"
+
+        # Check for script failure — both explicit [SCRIPT_FAILED] marker
+        # and heuristic detection for failures described in natural language.
+        _script_failed_reason = _detect_script_failure(final_response)
+        if _script_failed_reason is not None:
+            logger.warning(
+                "Job '%s': agent reported script failure — %s",
+                job_name, _script_failed_reason,
+            )
+            output = f"""# Cron Job: {job_name} (SCRIPT FAILED)
+
+**Job ID:** {job_id}
+**Run Time:** {_hermes_now().strftime('%Y-%m-%d %H:%M:%S')}
+**Schedule:** {job.get('schedule_display', 'N/A')}
+
+## Prompt
+
+{prompt}
+
+## Response
+
+{logged_response}
+"""
+            return False, output, final_response, _script_failed_reason
        
        output = f"""# Cron Job: {job_name}

@@ -907,13 +963,10 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
        logger.info("Job '%s' completed successfully", job_name)
        return True, output, final_response, None
        
-    except ModelContextError:
-        # Re-raise so tick() can auto-pause the job and prevent repeated failures.
-        raise
    except Exception as e:
        error_msg = f"{type(e).__name__}: {str(e)}"
        logger.exception("Job '%s' failed: %s", job_name, error_msg)
-
+        
        output = f"""# Cron Job: {job_name} (FAILED)

 **Job ID:** {job_id}
@@ -995,48 +1048,27 @@ def tick(verbose: bool = True, adapters=None, loop=None) -> int:
        if verbose:
            logger.info("%s - %s job(s) due", _hermes_now().strftime('%H:%M:%S'), len(due_jobs))

-        # Parallel job execution with priority sorting
-        PRIORITY_ORDER = {"critical": 0, "high": 1, "normal": 2, "low": 3}
-        due_jobs_sorted = sorted(due_jobs, key=lambda j: PRIORITY_ORDER.get(j.get("priority", "normal"), 2))
-        MAX_PARALLEL = int(os.environ.get("HERMES_CRON_MAX_PARALLEL", "10"))
-
        executed = 0
-        _job_results = []
-
-        def _process_single_job(job):
-            job_name = job.get("name", job["id"])
+        for job in due_jobs:
+            # If the interpreter is shutting down (e.g. gateway restart),
+            # stop processing immediately — ThreadPoolExecutor.submit()
+            # will raise RuntimeError for every remaining job.
            if sys.is_finalizing():
-                return None
-            try:
-                advance_next_run(job["id"])
-                success, output, final_response, error = run_job(job)
-                return (job, success, output, final_response, error)
-            except ModelContextError as e:
-                # Auto-pause: incompatible model will never succeed; stop scheduling
-                # it until the operator reconfigures the job or config.yaml.
-                error_msg = str(e)
-                logger.error(
-                    "Job '%s' paused — model context incompatibility: %s",
-                    job_name,
-                    error_msg,
+                logger.warning(
+                    "Interpreter finalizing — skipping %d remaining job(s)",
+                    len(due_jobs) - executed,
                )
-                from cron.jobs import pause_job
-                pause_job(job["id"], reason=f"ModelContextError: {error_msg}")
-                return (job, False, "", "", error_msg)
-            except Exception as e:
-                logger.error("Job '%s': parallel error: %s", job_name, e)
-                return (job, False, "", None, str(e))
+                break

-        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_PARALLEL) as executor:
-            futures = {executor.submit(_process_single_job, job): job for job in due_jobs_sorted}
-            for future in concurrent.futures.as_completed(futures):
-                result = future.result()
-                if result is None:
-                    continue
-                _job_results.append(result)
-
-        for job, success, output, final_response, error in _job_results:
            try:
+                # For recurring jobs (cron/interval), advance next_run_at to the
+                # next future occurrence BEFORE execution.  This way, if the
+                # process crashes mid-run, the job won't re-fire on restart.
+                # One-shot jobs are left alone so they can retry on restart.
+                advance_next_run(job["id"])
+
+                success, output, final_response, error = run_job(job)
+
                output_file = save_job_output(job["id"], output)
                if verbose:
                    logger.info("Output saved to: %s", output_file)