fix(cron): expand _SCRIPT_FAILURE_PHRASES with SSH patterns

Part of #457, Closes #350 Detect SSH-specific errors: 'no such file or directory', 'command not found', 'ssh: connect to host', etc.
feat(cron): SSH dispatch validation utilities
2026-04-14 01:21:43 +00:00 · 2026-04-14 01:21:39 +00:00 · 2026-04-14 01:08:13 +00:00 · 2026-04-13 20:52:06 -04:00 · 2026-04-14 00:34:14 +00:00 · 2026-04-14 00:34:06 +00:00
19 changed files with 1991 additions and 42 deletions
--- a/agent/smart_model_routing.py
+++ b/agent/smart_model_routing.py
@@ -1,10 +1,11 @@
-"""Helpers for optional cheap-vs-strong model routing."""
+"""Helpers for optional cheap-vs-strong and time-aware model routing."""

 from __future__ import annotations

 import os
 import re
-from typing import Any, Dict, Optional
+from datetime import datetime
+from typing import Any, Dict, List, Optional

 from utils import is_truthy_value

@@ -192,3 +193,104 @@ def resolve_turn_route(user_message: str, routing_config: Optional[Dict[str, Any
            tuple(runtime.get("args") or ()),
        ),
    }
+
+
+# =========================================================================
+# Time-aware cron model routing
+# =========================================================================
+#
+# Empirical finding: cron error rate peaks at 18:00 (9.4%) vs 4.0% at 09:00.
+# During high-error windows, route cron jobs to more capable models.
+#
+# Config (config.yaml):
+#   cron_model_routing:
+#     enabled: true
+#     fallback_model: "anthropic/claude-sonnet-4"
+#     fallback_provider: "openrouter"
+#     windows:
+#       - start_hour: 17
+#         end_hour: 22
+#         reason: "evening_error_peak"
+#       - start_hour: 2
+#         end_hour: 5
+#         reason: "overnight_api_instability"
+# =========================================================================
+
+def _hour_in_window(hour: int, start: int, end: int) -> bool:
+    """Check if hour falls in [start, end) window, handling midnight wrap."""
+    if start <= end:
+        return start <= hour < end
+    else:
+        # Wraps midnight: e.g., 22-06
+        return hour >= start or hour < end
+
+
+def resolve_cron_model(
+    base_model: str,
+    routing_config: Optional[Dict[str, Any]],
+    now: Optional[datetime] = None,
+) -> Dict[str, Any]:
+    """Apply time-aware model override for cron jobs.
+
+    During configured high-error windows, returns a stronger model config.
+    Outside windows, returns the base model unchanged.
+
+    Args:
+        base_model: The model string already resolved (from job/config/env).
+        routing_config: The cron_model_routing dict from config.yaml.
+        now: Override current time (for testing). Defaults to datetime.now().
+
+    Returns:
+        Dict with keys: model, provider, overridden, reason.
+        - model: the effective model string to use
+        - provider: provider override (empty string = use default)
+        - overridden: True if time-based override was applied
+        - reason: why override was applied (empty string if not)
+    """
+    cfg = routing_config or {}
+
+    if not _coerce_bool(cfg.get("enabled"), False):
+        return {"model": base_model, "provider": "", "overridden": False, "reason": ""}
+
+    windows = cfg.get("windows") or []
+    if not isinstance(windows, list) or not windows:
+        return {"model": base_model, "provider": "", "overridden": False, "reason": ""}
+
+    current = now or datetime.now()
+    current_hour = current.hour
+
+    matched_window = None
+    for window in windows:
+        if not isinstance(window, dict):
+            continue
+        start = _coerce_int(window.get("start_hour"), -1)
+        end = _coerce_int(window.get("end_hour"), -1)
+        if start < 0 or end < 0:
+            continue
+        if _hour_in_window(current_hour, start, end):
+            matched_window = window
+            break
+
+    if not matched_window:
+        return {"model": base_model, "provider": "", "overridden": False, "reason": ""}
+
+    # Window matched — use the override model from window or global fallback
+    override_model = str(matched_window.get("model") or "").strip()
+    override_provider = str(matched_window.get("provider") or "").strip()
+
+    if not override_model:
+        override_model = str(cfg.get("fallback_model") or "").strip()
+    if not override_provider:
+        override_provider = str(cfg.get("fallback_provider") or "").strip()
+
+    if not override_model:
+        return {"model": base_model, "provider": "", "overridden": False, "reason": ""}
+
+    reason = str(matched_window.get("reason") or "time_window").strip()
+
+    return {
+        "model": override_model,
+        "provider": override_provider,
+        "overridden": True,
+        "reason": f"cron_routing:{reason}(hour={current_hour})",
+    }
--- a/cli.py
+++ b/cli.py
@@ -3134,6 +3134,196 @@ class HermesCLI:
        print(f"  Home:    {display}")
        print()

+    def _handle_debug_command(self, command: str):
+        """Generate a debug report with system info and logs, upload to paste service."""
+        import platform
+        import sys
+        import time as _time
+
+        # Parse optional lines argument
+        parts = command.split(maxsplit=1)
+        log_lines = 50
+        if len(parts) > 1:
+            try:
+                log_lines = min(int(parts[1]), 500)
+            except ValueError:
+                pass
+
+        _cprint("  Collecting debug info...")
+
+        # Collect system info
+        lines = []
+        lines.append("=== HERMES DEBUG REPORT ===")
+        lines.append(f"Generated: {_time.strftime('%Y-%m-%d %H:%M:%S %z')}")
+        lines.append("")
+
+        lines.append("--- System ---")
+        lines.append(f"Python: {sys.version}")
+        lines.append(f"Platform: {platform.platform()}")
+        lines.append(f"Architecture: {platform.machine()}")
+        lines.append(f"Hostname: {platform.node()}")
+        lines.append("")
+
+        # Hermes info
+        lines.append("--- Hermes ---")
+        try:
+            from hermes_constants import get_hermes_home, display_hermes_home
+            lines.append(f"Home: {display_hermes_home()}")
+        except Exception:
+            lines.append("Home: unknown")
+
+        try:
+            from hermes_constants import __version__
+            lines.append(f"Version: {__version__}")
+        except Exception:
+            lines.append("Version: unknown")
+
+        lines.append(f"Profile: {getattr(self, '_profile_name', 'default')}")
+        lines.append(f"Session: {self.session_id}")
+        lines.append(f"Model: {self.model}")
+        lines.append(f"Provider: {getattr(self, '_provider_name', 'unknown')}")
+
+        try:
+            lines.append(f"Working dir: {os.getcwd()}")
+        except Exception:
+            pass
+
+        # Config (redacted)
+        lines.append("")
+        lines.append("--- Config (redacted) ---")
+        try:
+            from hermes_constants import get_hermes_home
+            config_path = get_hermes_home() / "config.yaml"
+            if config_path.exists():
+                import yaml
+                with open(config_path) as f:
+                    cfg = yaml.safe_load(f) or {}
+                # Redact secrets
+                for key in ("api_key", "token", "secret", "password"):
+                    if key in cfg:
+                        cfg[key] = "***REDACTED***"
+                lines.append(yaml.dump(cfg, default_flow_style=False)[:2000])
+            else:
+                lines.append("(no config file found)")
+        except Exception as e:
+            lines.append(f"(error reading config: {e})")
+
+        # Recent logs
+        lines.append("")
+        lines.append(f"--- Recent Logs (last {log_lines} lines) ---")
+        try:
+            from hermes_constants import get_hermes_home
+            log_dir = get_hermes_home() / "logs"
+            if log_dir.exists():
+                for log_file in sorted(log_dir.glob("*.log")):
+                    try:
+                        content = log_file.read_text(encoding="utf-8", errors="replace")
+                        tail = content.strip().split("\n")[-log_lines:]
+                        if tail:
+                            lines.append(f"\n[{log_file.name}]")
+                            lines.extend(tail)
+                    except Exception:
+                        pass
+            else:
+                lines.append("(no logs directory)")
+        except Exception:
+            lines.append("(error reading logs)")
+
+        # Tool info
+        lines.append("")
+        lines.append("--- Enabled Toolsets ---")
+        try:
+            lines.append(", ".join(self.enabled_toolsets) if self.enabled_toolsets else "(none)")
+        except Exception:
+            lines.append("(unknown)")
+
+        report = "\n".join(lines)
+        report_size = len(report)
+
+        # Try to upload to paste services
+        paste_url = None
+        services = [
+            ("dpaste", _upload_dpaste),
+            ("0x0.st", _upload_0x0st),
+        ]
+
+        for name, uploader in services:
+            try:
+                url = uploader(report)
+                if url:
+                    paste_url = url
+                    break
+            except Exception:
+                continue
+
+        print()
+        if paste_url:
+            _cprint(f"  Debug report uploaded: {paste_url}")
+            _cprint(f"  Size: {report_size} bytes, {len(lines)} lines")
+        else:
+            # Fallback: save locally
+            try:
+                from hermes_constants import get_hermes_home
+                debug_path = get_hermes_home() / "debug-report.txt"
+                debug_path.write_text(report, encoding="utf-8")
+                _cprint(f"  Paste services unavailable. Report saved to: {debug_path}")
+                _cprint(f"  Size: {report_size} bytes, {len(lines)} lines")
+            except Exception as e:
+                _cprint(f"  Failed to save report: {e}")
+                _cprint(f"  Report ({report_size} bytes):")
+                print(report)
+        print()
+
+
+def _upload_dpaste(content: str) -> str | None:
+    """Upload content to dpaste.org. Returns URL or None."""
+    import urllib.request
+    import urllib.parse
+    data = urllib.parse.urlencode({
+        "content": content,
+        "syntax": "text",
+        "expiry_days": 7,
+    }).encode()
+    req = urllib.request.Request(
+        "https://dpaste.org/api/",
+        data=data,
+        headers={"User-Agent": "hermes-agent/debug"},
+    )
+    with urllib.request.urlopen(req, timeout=10) as resp:
+        url = resp.read().decode().strip()
+        if url.startswith("http"):
+            return url
+    return None
+
+
+def _upload_0x0st(content: str) -> str | None:
+    """Upload content to 0x0.st. Returns URL or None."""
+    import urllib.request
+    import io
+    # 0x0.st expects multipart form with a file field
+    boundary = "----HermesDebugBoundary"
+    body = (
+        f"--{boundary}\r\n"
+        f'Content-Disposition: form-data; name="file"; filename="debug.txt"\r\n'
+        f"Content-Type: text/plain\r\n\r\n"
+        f"{content}\r\n"
+        f"--{boundary}--\r\n"
+    ).encode()
+    req = urllib.request.Request(
+        "https://0x0.st",
+        data=body,
+        headers={
+            "Content-Type": f"multipart/form-data; boundary={boundary}",
+            "User-Agent": "hermes-agent/debug",
+        },
+    )
+    with urllib.request.urlopen(req, timeout=10) as resp:
+        url = resp.read().decode().strip()
+        if url.startswith("http"):
+            return url
+    return None
+
+
    def show_config(self):
        """Display current configuration with kawaii ASCII art."""
        # Get terminal config from environment (which was set from cli-config.yaml)
@@ -4321,6 +4511,8 @@ class HermesCLI:
            self.show_help()
        elif canonical == "profile":
            self._handle_profile_command()
+        elif canonical == "debug":
+            self._handle_debug_command(cmd_original)
        elif canonical == "tools":
            self._handle_tools_command(cmd_original)
        elif canonical == "toolsets":
--- a/cron/jobs.py
+++ b/cron/jobs.py
@@ -547,20 +547,30 @@ def resume_job(job_id: str) -> Optional[Dict[str, Any]]:


 def trigger_job(job_id: str) -> Optional[Dict[str, Any]]:
-    """Schedule a job to run on the next scheduler tick."""
+    """Schedule a job to run on the next scheduler tick.
+    
+    Clears stale error state when re-triggering a previously-failed job
+    so the stale failure doesn't persist until the next tick completes.
+    """
    job = get_job(job_id)
    if not job:
        return None
-    return update_job(
-        job_id,
-        {
-            "enabled": True,
-            "state": "scheduled",
-            "paused_at": None,
-            "paused_reason": None,
-            "next_run_at": _hermes_now().isoformat(),
-        },
-    )
+    
+    updates = {
+        "enabled": True,
+        "state": "scheduled",
+        "paused_at": None,
+        "paused_reason": None,
+        "next_run_at": _hermes_now().isoformat(),
+    }
+    
+    # Clear stale error state when re-triggering
+    if job.get("last_status") == "error":
+        updates["last_status"] = "retrying"
+        updates["last_error"] = None
+        updates["error_cleared_at"] = _hermes_now().isoformat()
+    
+    return update_job(job_id, updates)


 def run_job_now(job_id: str) -> Optional[Dict[str, Any]]:
@@ -618,6 +628,7 @@ def mark_job_run(job_id: str, success: bool, error: Optional[str] = None):
    
    Updates last_run_at, last_status, increments completed count,
    computes next_run_at, and auto-deletes if repeat limit reached.
+    Tracks health timestamps for error/success history.
    """
    jobs = load_jobs()
    for i, job in enumerate(jobs):
@@ -627,6 +638,18 @@ def mark_job_run(job_id: str, success: bool, error: Optional[str] = None):
            job["last_status"] = "ok" if success else "error"
            job["last_error"] = error if not success else None
            
+            # Track health timestamps
+            if success:
+                job["last_success_at"] = now
+                # Clear stale error tracking on success
+                if job.get("last_error_at"):
+                    job["error_resolved_at"] = now
+            else:
+                job["last_error_at"] = now
+                # Clear resolved tracking on new error
+                if job.get("error_resolved_at"):
+                    del job["error_resolved_at"]
+            
            # Increment completed count
            if job.get("repeat"):
                job["repeat"]["completed"] = job["repeat"].get("completed", 0) + 1
@@ -656,6 +679,32 @@ def mark_job_run(job_id: str, success: bool, error: Optional[str] = None):
    save_jobs(jobs)


+
+def clear_job_error(job_id: str) -> Optional[Dict[str, Any]]:
+    """
+    Clear stale error state for a job.
+    
+    Resets last_status to 'ok', last_error to None, and 
+    records when the error was cleared. Useful after auth
+    recovery when the job itself is healthy but stale error
+    state persists.
+    
+    Returns:
+        Updated job dict, or None if not found.
+    """
+    jobs = load_jobs()
+    for job in jobs:
+        if job["id"] == job_id:
+            job["last_status"] = "ok"
+            job["last_error"] = None
+            job["error_cleared_at"] = _hermes_now().isoformat()
+            save_jobs(jobs)
+            return job
+    save_jobs(jobs)
+    return None
+
+
+
 def advance_next_run(job_id: str) -> bool:
    """Preemptively advance next_run_at for a recurring job before execution.

--- a/cron/scheduler.py
+++ b/cron/scheduler.py
@@ -37,6 +37,7 @@ sys.path.insert(0, str(Path(__file__).parent.parent))
 from hermes_constants import get_hermes_home
 from hermes_cli.config import load_config
 from hermes_time import now as _hermes_now
+from agent.model_metadata import is_local_endpoint

 logger = logging.getLogger(__name__)

@@ -181,6 +182,15 @@ _SCRIPT_FAILURE_PHRASES = (
    "exit status",
    "non-zero exit",
    "did not complete",
+    # SSH-specific failure patterns (#350)
+    "no such file or directory",
+    "command not found",
+    "hermes binary not found",
+    "hermes not found",
+    "ssh: connect to host",
+    "connection timed out",
+    "host key verification failed",
+    "no route to host",
    "could not run",
    "unable to execute",
    "permission denied",
@@ -717,6 +727,22 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:

        # Reasoning config from env or config.yaml
        from hermes_constants import parse_reasoning_effort
+
+        # Time-aware cron model routing — override model during high-error windows
+        try:
+            from agent.smart_model_routing import resolve_cron_model
+            _cron_routing_cfg = (_cfg.get("cron_model_routing") or {})
+            _cron_route = resolve_cron_model(model, _cron_routing_cfg)
+            if _cron_route["overridden"]:
+                _original_model = model
+                model = _cron_route["model"]
+                logger.info(
+                    "Job '%s': cron model override %s -> %s (%s)",
+                    job_id, _original_model, model, _cron_route["reason"],
+                )
+        except Exception as _e:
+            logger.debug("Job '%s': cron model routing skipped: %s", job_id, _e)
+
        effort = os.getenv("HERMES_REASONING_EFFORT", "")
        if not effort:
            effort = str(_cfg.get("agent", {}).get("reasoning_effort", "")).strip()
@@ -777,6 +803,29 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
            },
        )

+        # Build disabled toolsets — always exclude cronjob/messaging/clarify
+        # for cron sessions.  When the runtime endpoint is cloud (not local),
+        # also disable terminal so the agent does not attempt SSH or shell
+        # commands that require local infrastructure (keys, filesystem).
+        # Jobs that declare requires_local_infra=true also get terminal
+        # disabled on cloud endpoints regardless of this check.  #379
+        _cron_disabled = ["cronjob", "messaging", "clarify"]
+        _runtime_base_url = turn_route["runtime"].get("base_url", "")
+        _is_cloud = not is_local_endpoint(_runtime_base_url)
+        if _is_cloud:
+            _cron_disabled.append("terminal")
+            logger.info(
+                "Job '%s': cloud provider detected (%s), disabling terminal toolset",
+                job_name,
+                turn_route["runtime"].get("provider", "unknown"),
+            )
+        if job.get("requires_local_infra") and _is_cloud:
+            logger.warning(
+                "Job '%s': requires_local_infra=true but running on cloud provider — "
+                "terminal-dependent steps will fail gracefully",
+                job_name,
+            )
+
        _agent_kwargs = _safe_agent_kwargs({
            "model": turn_route["model"],
            "api_key": turn_route["runtime"].get("api_key"),
@@ -784,7 +833,7 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
            "provider": turn_route["runtime"].get("provider"),
            "api_mode": turn_route["runtime"].get("api_mode"),
            "acp_command": turn_route["runtime"].get("command"),
-            "acp_args": turn_route["runtime"].get("args"),
+            "acp_args": list(turn_route["runtime"].get("args") or []),
            "max_iterations": max_iterations,
            "reasoning_config": reasoning_config,
            "prefill_messages": prefill_messages,
@@ -792,7 +841,7 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
            "providers_ignored": pr.get("ignore"),
            "providers_order": pr.get("order"),
            "provider_sort": pr.get("sort"),
-            "disabled_toolsets": ["cronjob", "messaging", "clarify"],
+            "disabled_toolsets": _cron_disabled,
            "tool_choice": "required",
            "quiet_mode": True,
            "skip_memory": True,  # Cron system prompts would corrupt user representations
--- a/cron/ssh_dispatch.py
+++ b/cron/ssh_dispatch.py
@@ -0,0 +1,212 @@
+"""
+SSH dispatch utilities for cron jobs.
+
+Provides validated remote execution so broken hermes binary paths
+are caught before draining the dispatch queue.
+
+Usage:
+    from cron.ssh_dispatch import SSHEnvironment, format_dispatch_report
+
+    ssh = SSHEnvironment(host="root@ezra", agent="allegro")
+    result = ssh.dispatch("cron tick")
+    if not result.success:
+        print(result.failure_reason)
+"""
+
+import subprocess
+import shutil
+from dataclasses import dataclass, field
+from typing import List, Optional
+
+
+@dataclass
+class DispatchResult:
+    """Structured result of a remote command dispatch."""
+    host: str
+    command: str
+    success: bool
+    exit_code: Optional[int] = None
+    stdout: str = ""
+    stderr: str = ""
+    failure_reason: Optional[str] = None
+    duration_s: float = 0.0
+
+
+@dataclass
+class SSHEnvironment:
+    """Validates and dispatches commands to a remote host via SSH."""
+
+    host: str  # e.g. "root@ezra" or "192.168.1.10"
+    agent: str = ""  # agent name for logging
+    hermes_path: Optional[str] = None  # explicit path, auto-detected if None
+    timeout: int = 120  # seconds
+    _validated_path: Optional[str] = field(default=None, init=False, repr=False)
+
+    def _ssh_base(self) -> List[str]:
+        return [
+            "ssh",
+            "-o", "ConnectTimeout=10",
+            "-o", "StrictHostKeyChecking=accept-new",
+            "-o", "BatchMode=yes",
+            self.host,
+        ]
+
+    def _probe_remote_binary(self, candidate: str) -> bool:
+        """Check if a hermes binary exists and is executable on the remote host."""
+        try:
+            result = subprocess.run(
+                self._ssh_base() + [f"test -x {candidate}"],
+                capture_output=True, timeout=15,
+            )
+            return result.returncode == 0
+        except (subprocess.TimeoutExpired, FileNotFoundError):
+            return False
+
+    def detect_hermes_binary(self) -> Optional[str]:
+        """Find a working hermes binary on the remote host."""
+        if self._validated_path:
+            return self._validated_path
+
+        candidates = []
+        if self.hermes_path:
+            candidates.append(self.hermes_path)
+
+        # Common locations
+        candidates.extend([
+            "hermes",  # on PATH
+            "~/.local/bin/hermes",
+            "/usr/local/bin/hermes",
+            f"~/wizards/{self.agent}/venv/bin/hermes" if self.agent else "",
+            f"/root/wizards/{self.agent}/venv/bin/hermes" if self.agent else "",
+        ])
+        candidates = [c for c in candidates if c]
+
+        for candidate in candidates:
+            if self._probe_remote_binary(candidate):
+                self._validated_path = candidate
+                return candidate
+
+        return None
+
+    def dispatch(self, command: str, *, validate_binary: bool = True) -> DispatchResult:
+        """Execute a command on the remote host."""
+        import time
+        start = time.monotonic()
+
+        if validate_binary:
+            binary = self.detect_hermes_binary()
+            if not binary:
+                return DispatchResult(
+                    host=self.host,
+                    command=command,
+                    success=False,
+                    failure_reason=f"No working hermes binary found on {self.host}",
+                    duration_s=time.monotonic() - start,
+                )
+
+        try:
+            result = subprocess.run(
+                self._ssh_base() + [command],
+                capture_output=True,
+                timeout=self.timeout,
+            )
+            duration = time.monotonic() - start
+            stdout = result.stdout.decode("utf-8", errors="replace")
+            stderr = result.stderr.decode("utf-8", errors="replace")
+
+            failure_reason = None
+            if result.returncode != 0:
+                failure_reason = _classify_ssh_error(stderr, result.returncode)
+
+            return DispatchResult(
+                host=self.host,
+                command=command,
+                success=result.returncode == 0,
+                exit_code=result.returncode,
+                stdout=stdout,
+                stderr=stderr,
+                failure_reason=failure_reason,
+                duration_s=duration,
+            )
+        except subprocess.TimeoutExpired:
+            return DispatchResult(
+                host=self.host,
+                command=command,
+                success=False,
+                failure_reason=f"SSH command timed out after {self.timeout}s",
+                duration_s=time.monotonic() - start,
+            )
+        except FileNotFoundError:
+            return DispatchResult(
+                host=self.host,
+                command=command,
+                success=False,
+                failure_reason="ssh binary not found on local system",
+                duration_s=time.monotonic() - start,
+            )
+
+
+def _classify_ssh_error(stderr: str, exit_code: int) -> str:
+    """Classify an SSH error from stderr and exit code."""
+    lower = stderr.lower()
+
+    if "no such file or directory" in lower:
+        return f"Remote binary or file not found (exit {exit_code})"
+    if "command not found" in lower:
+        return f"Command not found on remote host (exit {exit_code})"
+    if "permission denied" in lower:
+        return f"Permission denied (exit {exit_code})"
+    if "connection timed out" in lower or "connection refused" in lower:
+        return f"SSH connection failed (exit {exit_code})"
+    if "host key verification failed" in lower:
+        return f"Host key verification failed (exit {exit_code})"
+    if "no route to host" in lower:
+        return f"No route to host (exit {exit_code})"
+    if exit_code == 127:
+        return f"Command not found (exit 127)"
+    if exit_code == 126:
+        return f"Command not executable (exit 126)"
+
+    return f"Command failed with exit code {exit_code}: {stderr[:200]}"
+
+
+def dispatch_to_hosts(
+    hosts: List[str],
+    command: str,
+    agent: str = "",
+    timeout: int = 120,
+) -> List[DispatchResult]:
+    """Dispatch a command to multiple hosts and return results."""
+    results = []
+    for host in hosts:
+        ssh = SSHEnvironment(host=host, agent=agent, timeout=timeout)
+        result = ssh.dispatch(command)
+        results.append(result)
+    return results
+
+
+def format_dispatch_report(results: List[DispatchResult]) -> str:
+    """Format a human-readable report of dispatch results."""
+    lines = ["## Dispatch Report", ""]
+
+    succeeded = [r for r in results if r.success]
+    failed = [r for r in results if not r.success]
+
+    lines.append(f"**Total:** {len(results)} hosts | "
+                 f"**OK:** {len(succeeded)} | **Failed:** {len(failed)}")
+    lines.append("")
+
+    for r in results:
+        status = "OK" if r.success else "FAIL"
+        lines.append(f"### {r.host} [{status}]")
+        lines.append(f"- Command: `{r.command}`")
+        lines.append(f"- Duration: {r.duration_s:.1f}s")
+        if r.exit_code is not None:
+            lines.append(f"- Exit code: {r.exit_code}")
+        if r.failure_reason:
+            lines.append(f"- **Failure:** {r.failure_reason}")
+        if r.stderr and not r.success:
+            lines.append(f"- Stderr: `{r.stderr[:300]}`")
+        lines.append("")
+
+    return "\n".join(lines)
--- a/deploy-crons.py
+++ b/deploy-crons.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+"""
+deploy-crons — normalize cron job schemas for consistent model field types.
+
+This script ensures that the model field in jobs.json is always a dict when
+either model or provider is specified, preventing schema inconsistency.
+
+Usage:
+    python deploy-crons.py [--dry-run] [--jobs-file PATH]
+"""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+
+def normalize_job(job: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Normalize a job dict to ensure consistent model field types.
+
+    Before normalization:
+    - If model AND provider: model = raw string, provider = raw string (inconsistent)
+    - If only model: model = raw string
+    - If only provider: provider = raw string at top level
+
+    After normalization:
+    - If model exists: model = {"model": "xxx"}
+    - If provider exists: model = {"provider": "yyy"}
+    - If both exist: model = {"model": "xxx", "provider": "yyy"}
+    - If neither: model = None
+    """
+    job = dict(job)  # Create a copy to avoid modifying the original
+    
+    model = job.get("model")
+    provider = job.get("provider")
+    
+    # Skip if already normalized (model is a dict)
+    if isinstance(model, dict):
+        return job
+    
+    # Build normalized model dict
+    model_dict = {}
+    
+    if model is not None and isinstance(model, str):
+        model_dict["model"] = model.strip()
+    
+    if provider is not None and isinstance(provider, str):
+        model_dict["provider"] = provider.strip()
+    
+    # Set model field
+    if model_dict:
+        job["model"] = model_dict
+    else:
+        job["model"] = None
+    
+    # Remove top-level provider field if it was moved into model dict
+    if provider is not None and "provider" in model_dict:
+        # Keep provider field for backward compatibility but mark it as deprecated
+        # This allows existing code that reads job["provider"] to continue working
+        pass
+    
+    return job
+
+
+def normalize_jobs_file(jobs_file: Path, dry_run: bool = False) -> int:
+    """
+    Normalize all jobs in a jobs.json file.
+    
+    Returns the number of jobs that were modified.
+    """
+    if not jobs_file.exists():
+        print(f"Error: Jobs file not found: {jobs_file}", file=sys.stderr)
+        return 1
+    
+    try:
+        with open(jobs_file, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+    except json.JSONDecodeError as e:
+        print(f"Error: Invalid JSON in {jobs_file}: {e}", file=sys.stderr)
+        return 1
+    
+    jobs = data.get("jobs", [])
+    if not jobs:
+        print("No jobs found in file.")
+        return 0
+    
+    modified_count = 0
+    for i, job in enumerate(jobs):
+        original_model = job.get("model")
+        original_provider = job.get("provider")
+        
+        normalized_job = normalize_job(job)
+        
+        # Check if anything changed
+        if (normalized_job.get("model") != original_model or
+            normalized_job.get("provider") != original_provider):
+            jobs[i] = normalized_job
+            modified_count += 1
+            
+            job_id = job.get("id", "?")
+            job_name = job.get("name", "(unnamed)")
+            print(f"Normalized job {job_id} ({job_name}):")
+            print(f"  model: {original_model!r} -> {normalized_job.get('model')!r}")
+            print(f"  provider: {original_provider!r} -> {normalized_job.get('provider')!r}")
+    
+    if modified_count == 0:
+        print("All jobs already have consistent model field types.")
+        return 0
+    
+    if dry_run:
+        print(f"DRY RUN: Would normalize {modified_count} jobs.")
+        return 0
+    
+    # Write back to file
+    data["jobs"] = jobs
+    try:
+        with open(jobs_file, 'w', encoding='utf-8') as f:
+            json.dump(data, f, indent=2, ensure_ascii=False)
+        print(f"Normalized {modified_count} jobs in {jobs_file}")
+        return 0
+    except Exception as e:
+        print(f"Error writing to {jobs_file}: {e}", file=sys.stderr)
+        return 1
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Normalize cron job schemas for consistent model field types."
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Show what would be changed without modifying the file."
+    )
+    parser.add_argument(
+        "--jobs-file",
+        type=Path,
+        default=Path.home() / ".hermes" / "cron" / "jobs.json",
+        help="Path to jobs.json file (default: ~/.hermes/cron/jobs.json)"
+    )
+    
+    args = parser.parse_args()
+    
+    if args.dry_run:
+        print("DRY RUN MODE — no changes will be made.")
+        print()
+    
+    return normalize_jobs_file(args.jobs_file, args.dry_run)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/docs/honcho-evaluation-322.md
+++ b/docs/honcho-evaluation-322.md
@@ -0,0 +1,170 @@
+# Honcho Memory Integration Evaluation (#322)
+
+## Executive Summary
+
+**Status:** Integration already implemented and production-ready.
+**Recommendation:** KEEP — well-gated, zero overhead when disabled, supports self-hosted.
+
+## Decision: Cloud vs Local
+
+### The Question
+"Do we want a cloud-dependent memory layer, or keep everything local?"
+
+### Answer: BOTH — User's Choice
+
+Honcho supports both deployment modes:
+
+| Mode | Configuration | Data Location | Use Case |
+|------|--------------|---------------|----------|
+| Cloud | `HONCHO_API_KEY` | Honcho servers | Quick start, no infrastructure |
+| Self-hosted | `HONCHO_BASE_URL=http://localhost:8000` | Your servers | Full sovereignty |
+| Disabled | No config | N/A | Pure local (holographic fact_store only) |
+
+### Why Keep It
+
+1. **Opt-in Architecture**
+   - No Honcho config → zero overhead (cron guard, lazy init)
+   - Memory provider system allows switching between providers
+   - `hermes memory off` disables completely
+
+2. **Zero Runtime Cost When Disabled**
+   ```python
+   if not cfg.enabled or not (cfg.api_key or cfg.base_url):
+       return ""  # No HTTP calls, no overhead
+   ```
+
+3. **Cross-Session User Modeling**
+   - Holographic fact_store lacks persistent user modeling
+   - Honcho provides: peer cards, dialectic Q&A, semantic search
+   - Complements (not replaces) local memory
+
+4. **Self-Hosted Option**
+   - Set `HONCHO_BASE_URL=http://localhost:8000`
+   - Run Honcho server locally via Docker
+   - Full data sovereignty
+
+5. **Production-Grade Implementation**
+   - 3 components, ~700 lines of code
+   - 7 tests passing
+   - Async prefetch (zero-latency context injection)
+   - Configurable recall modes (hybrid/context/tools)
+   - Write frequency control (async/turn/session/N-turns)
+
+## Architecture
+
+### Components (Already Implemented)
+
+```
+plugins/memory/honcho/
+├── client.py      # Config resolution (API key, base_url, profiles)
+├── session.py     # Session management, async prefetch, dialectic queries
+├── __init__.py    # MemoryProvider interface, 4 tool schemas
+├── cli.py         # CLI commands (setup, status, sessions, map, peer, mode)
+├── plugin.yaml    # Plugin metadata
+└── README.md      # Documentation
+```
+
+### Integration Points
+
+1. **System Prompt**: Context injected on first turn (cached for prompt caching)
+2. **Tool Registry**: 4 tools available when `recall_mode != "context"`
+3. **Session End**: Messages flushed to Honcho
+4. **Cron Guard**: Fully inactive in cron context
+
+### Tools Available
+
+| Tool | Cost | Speed | Purpose |
+|------|------|-------|---------|
+| `honcho_profile` | Free | Fast | Quick factual snapshot (peer card) |
+| `honcho_search` | Free | Fast | Semantic search (raw excerpts) |
+| `honcho_context` | Paid | Slow | Dialectic Q&A (synthesized answers) |
+| `honcho_conclude` | Free | Fast | Save persistent facts about user |
+
+## Configuration Guide
+
+### Option 1: Cloud (Quick Start)
+```bash
+# Get API key from https://app.honcho.dev
+export HONCHO_API_KEY="your-api-key"
+hermes chat
+```
+
+### Option 2: Self-Hosted (Full Sovereignty)
+```bash
+# Run Honcho server locally
+docker run -p 8000:8000 honcho/server
+
+# Configure Hermes
+export HONCHO_BASE_URL="http://localhost:8000"
+hermes chat
+```
+
+### Option 3: CLI Setup
+```bash
+hermes honcho setup
+```
+
+### Option 4: Disabled (Pure Local)
+```bash
+# Don't set any Honcho config
+hermes memory off  # If previously enabled
+hermes chat
+```
+
+## Memory Modes
+
+| Mode | Context Injection | Tools | Cost | Use Case |
+|------|------------------|-------|------|----------|
+| hybrid | Yes | Yes | Medium | Default — auto-inject + on-demand |
+| context | Yes | No | Low | Budget mode — auto-inject only |
+| tools | No | Yes | Variable | Full control — agent decides |
+
+## Risk Assessment
+
+| Risk | Mitigation | Status |
+|------|------------|--------|
+| Cloud dependency | Self-hosted option available | ✅ |
+| Cost from LLM calls | Recall mode "context" or "tools" reduces calls | ✅ |
+| Data privacy | Self-hosted keeps data on your servers | ✅ |
+| Performance overhead | Cron guard + lazy init + async prefetch | ✅ |
+| Vendor lock-in | MemoryProvider interface allows swapping | ✅ |
+
+## Comparison with Alternatives
+
+| Feature | Honcho | Holographic | Mem0 | Hindsight |
+|---------|--------|-------------|------|-----------|
+| Cross-session modeling | ✅ | ❌ | ✅ | ✅ |
+| Dialectic Q&A | ✅ | ❌ | ❌ | ❌ |
+| Self-hosted | ✅ | N/A | ❌ | ❌ |
+| Local-only option | ✅ | ✅ | ❌ | ✅ |
+| Cost | Free/Paid | Free | Paid | Free |
+
+## Conclusion
+
+**Keep Honcho integration.** It provides unique cross-session user modeling capabilities that complement the local holographic fact_store. The integration is:
+
+- Well-gated (opt-in, zero overhead when disabled)
+- Flexible (cloud or self-hosted)
+- Production-ready (7 tests passing, async prefetch, configurable)
+- Non-exclusive (works alongside other memory providers)
+
+### To Enable
+
+```bash
+# Cloud
+hermes honcho setup
+
+# Self-hosted
+export HONCHO_BASE_URL="http://localhost:8000"
+hermes chat
+```
+
+### To Disable
+
+```bash
+hermes memory off
+```
+
+---
+
+*Evaluated by SANDALPHON — Cron/Ops lane*
--- a/gateway/config.py
+++ b/gateway/config.py
@@ -412,6 +412,52 @@ class GatewayConfig:
        return self.unauthorized_dm_behavior


+def _validate_fallback_providers() -> None:
+    """Validate fallback_providers from config.yaml at gateway startup.
+
+    Checks that each entry has 'provider' and 'model' fields and logs
+    warnings for malformed entries.  This catches broken fallback chains
+    before they silently degrade into no-fallback mode.
+    """
+    try:
+        _home = get_hermes_home()
+        _config_path = _home / "config.yaml"
+        if not _config_path.exists():
+            return
+        import yaml
+        with open(_config_path, encoding="utf-8") as _f:
+            _cfg = yaml.safe_load(_f) or {}
+        fbp = _cfg.get("fallback_providers")
+        if not fbp:
+            return
+        if not isinstance(fbp, list):
+            logger.warning(
+                "fallback_providers should be a YAML list, got %s. "
+                "Fallback chain will be disabled.",
+                type(fbp).__name__,
+            )
+            return
+        for i, entry in enumerate(fbp):
+            if not isinstance(entry, dict):
+                logger.warning(
+                    "fallback_providers[%d] is not a dict (got %s). Skipping entry.",
+                    i, type(entry).__name__,
+                )
+                continue
+            if not entry.get("provider"):
+                logger.warning(
+                    "fallback_providers[%d] missing 'provider' field. Skipping entry.",
+                    i,
+                )
+            if not entry.get("model"):
+                logger.warning(
+                    "fallback_providers[%d] missing 'model' field. Skipping entry.",
+                    i,
+                )
+    except Exception:
+        pass  # Non-fatal; validation is advisory
+
+
 def load_gateway_config() -> GatewayConfig:
    """
    Load gateway configuration from multiple sources.
@@ -645,6 +691,19 @@ def load_gateway_config() -> GatewayConfig:
                platform.value, env_name,
            )

+    # Warn about API Server enabled without a key (unauthenticated endpoint)
+    if Platform.API_SERVER in config.platforms:
+        api_cfg = config.platforms[Platform.API_SERVER]
+        if api_cfg.enabled and not api_cfg.extra.get("key"):
+            logger.warning(
+                "api_server is enabled but API_SERVER_KEY is not set. "
+                "The API endpoint will run unauthenticated. "
+                "Set API_SERVER_KEY in ~/.hermes/.env to secure it.",
+            )
+
+    # Validate fallback_providers structure from config.yaml
+    _validate_fallback_providers()
+
    return config


--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@@ -1338,6 +1338,11 @@ _KNOWN_ROOT_KEYS = {
    "fallback_providers", "credential_pool_strategies", "toolsets",
    "agent", "terminal", "display", "compression", "delegation",
    "auxiliary", "custom_providers", "memory", "gateway",
+    "session_reset", "browser", "checkpoints", "smart_model_routing",
+    "voice", "stt", "tts", "human_delay", "security", "privacy",
+    "cron", "logging", "approvals", "command_allowlist", "quick_commands",
+    "personalities", "skills", "honcho", "timezone", "discord",
+    "whatsapp", "prefill_messages_file", "file_read_max_chars",
 }

 # Valid fields inside a custom_providers list entry
@@ -1498,6 +1503,72 @@ def validate_config_structure(config: Optional[Dict[str, Any]] = None) -> List["
                f"Move '{key}' under the appropriate section",
            ))

+    # ── fallback_providers must be a list of dicts with provider + model ─
+    fbp = config.get("fallback_providers")
+    if fbp is not None:
+        if not isinstance(fbp, list):
+            issues.append(ConfigIssue(
+                "error",
+                f"fallback_providers should be a YAML list, got {type(fbp).__name__}",
+                "Change to:\n"
+                "  fallback_providers:\n"
+                "    - provider: openrouter\n"
+                "      model: google/gemini-3-flash-preview",
+            ))
+        elif fbp:
+            for i, entry in enumerate(fbp):
+                if not isinstance(entry, dict):
+                    issues.append(ConfigIssue(
+                        "warning",
+                        f"fallback_providers[{i}] is not a dict (got {type(entry).__name__})",
+                        "Each entry needs at minimum: provider, model",
+                    ))
+                    continue
+                if not entry.get("provider"):
+                    issues.append(ConfigIssue(
+                        "warning",
+                        f"fallback_providers[{i}] is missing 'provider' field — this fallback will be skipped",
+                        "Add: provider: openrouter (or another provider name)",
+                    ))
+                if not entry.get("model"):
+                    issues.append(ConfigIssue(
+                        "warning",
+                        f"fallback_providers[{i}] is missing 'model' field — this fallback will be skipped",
+                        "Add: model: google/gemini-3-flash-preview (or another model slug)",
+                    ))
+
+    # ── session_reset validation ─────────────────────────────────────────
+    session_reset = config.get("session_reset", {})
+    if isinstance(session_reset, dict):
+        idle_minutes = session_reset.get("idle_minutes")
+        if idle_minutes is not None:
+            if not isinstance(idle_minutes, (int, float)) or idle_minutes <= 0:
+                issues.append(ConfigIssue(
+                    "warning",
+                    f"session_reset.idle_minutes={idle_minutes} is invalid (must be a positive number)",
+                    "Set to a positive integer, e.g. 1440 (24 hours). Using 0 causes immediate resets.",
+                ))
+        at_hour = session_reset.get("at_hour")
+        if at_hour is not None:
+            if not isinstance(at_hour, (int, float)) or not (0 <= at_hour <= 23):
+                issues.append(ConfigIssue(
+                    "warning",
+                    f"session_reset.at_hour={at_hour} is invalid (must be 0-23)",
+                    "Set to an hour between 0 and 23, e.g. 4 for 4am",
+                ))
+
+    # ── API Server key check ─────────────────────────────────────────────
+    # If api_server is enabled via env, but no key is set, warn.
+    # This catches the "API_SERVER_KEY not configured" error from gateway logs.
+    api_server_enabled = os.getenv("API_SERVER_ENABLED", "").lower() in ("true", "1", "yes")
+    api_server_key = os.getenv("API_SERVER_KEY", "").strip()
+    if api_server_enabled and not api_server_key:
+        issues.append(ConfigIssue(
+            "warning",
+            "API_SERVER is enabled but API_SERVER_KEY is not set — the API server will run unauthenticated",
+            "Set API_SERVER_KEY in ~/.hermes/.env to secure the API endpoint",
+        ))
+
    return issues


--- a/hermes_cli/cron.py
+++ b/hermes_cli/cron.py
@@ -93,6 +93,39 @@ def cron_list(show_all: bool = False):
        script = job.get("script")
        if script:
            print(f"    Script:    {script}")
+        
+        # Show health status
+        last_status = job.get("last_status")
+        last_error = job.get("last_error")
+        last_error_at = job.get("last_error_at")
+        last_success_at = job.get("last_success_at")
+        error_cleared_at = job.get("error_cleared_at")
+        error_resolved_at = job.get("error_resolved_at")
+        
+        if last_status == "error" and last_error:
+            if error_cleared_at or error_resolved_at:
+                # Error was cleared/resolved
+                cleared_time = error_cleared_at or error_resolved_at
+                print(color(f"    Status:      ok (error cleared)", Colors.GREEN))
+                print(color(f"    Last error:  {last_error[:80]}...", Colors.DIM))
+                print(color(f"    Resolved:    {cleared_time}", Colors.DIM))
+            else:
+                # Current error
+                print(color(f"    Status:      ERROR", Colors.RED))
+                print(color(f"    Error:       {last_error[:80]}...", Colors.RED))
+                if last_error_at:
+                    print(color(f"    Since:       {last_error_at}", Colors.RED))
+        elif last_status == "retrying":
+            print(color(f"    Status:      retrying (error cleared)", Colors.YELLOW))
+        elif last_status == "ok":
+            if last_success_at:
+                print(color(f"    Status:      ok (last success: {last_success_at})", Colors.GREEN))
+        elif last_status:
+            print(f"    Status:      {last_status}")
+        
+        # Show success history if available
+        if last_success_at and last_status != "error":
+            print(f"    Last ok:     {last_success_at}")
        print()

    from hermes_cli.gateway import find_gateway_pids
@@ -222,7 +255,18 @@ def cron_edit(args):


 def _job_action(action: str, job_id: str, success_verb: str, now: bool = False) -> int:
-    if action == "run" and now:
+    if action == "clear_error":
+        result = _cron_api(action="clear_error", job_id=job_id)
+        if not result.get("success"):
+            print(color(f"Failed to clear error: {result.get('error', 'unknown error')}", Colors.RED))
+            return 1
+        job = result.get("job", {})
+        name = job.get("name", job_id)
+        print(color(f"Cleared stale error state for job '{name}'", Colors.GREEN))
+        if job.get("error_cleared_at"):
+            print(f"  Cleared at: {job['error_cleared_at']}")
+        return 0
+if action == "run" and now:
        # Synchronous execution — run job immediately and show result
        result = _cron_api(action="run_now", job_id=job_id)
        if not result.get("success"):
@@ -292,9 +336,13 @@ def cron_command(args):
        now = getattr(args, 'now', False)
        return _job_action("run", args.job_id, "Triggered", now=now)

+    
+    if subcmd == "clear-error":
+        return _job_action("clear_error", args.job_id, "Cleared")
+    
    if subcmd in {"remove", "rm", "delete"}:
        return _job_action("remove", args.job_id, "Removed")

    print(f"Unknown cron command: {subcmd}")
-    print("Usage: hermes cron [list|create|edit|pause|resume|run|remove|status|tick]")
+    print("Usage: hermes cron [list|create|edit|pause|resume|run|remove|clear-error|status|tick]")
    sys.exit(1)
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@@ -4576,6 +4576,9 @@ For more help on a command:
    cron_run.add_argument("job_id", help="Job ID to trigger")
    cron_run.add_argument("--now", action="store_true", help="Execute immediately and wait for result (clears stale errors)")

+    cron_clear_error = cron_subparsers.add_parser("clear-error", help="Clear stale error state for a job")
+    cron_clear_error.add_argument("job_id", help="Job ID to clear error for")
+
    cron_remove = cron_subparsers.add_parser("remove", aliases=["rm", "delete"], help="Remove a scheduled job")
    cron_remove.add_argument("job_id", help="Job ID to remove")

@@ -5005,7 +5008,7 @@ For more help on a command:
    # =========================================================================
    sessions_parser = subparsers.add_parser(
        "sessions",
-        help="Manage session history (list, rename, export, prune, delete)",
+        help="Manage session history (list, rename, export, prune, gc, delete)",
        description="View and manage the SQLite session store"
    )
    sessions_subparsers = sessions_parser.add_subparsers(dest="sessions_action")
@@ -5028,6 +5031,14 @@ For more help on a command:
    sessions_prune.add_argument("--source", help="Only prune sessions from this source")
    sessions_prune.add_argument("--yes", "-y", action="store_true", help="Skip confirmation")

+    sessions_gc = sessions_subparsers.add_parser("gc", help="Garbage-collect empty/trivial sessions")
+    sessions_gc.add_argument("--empty-hours", type=int, default=24, help="Delete empty (0-msg) sessions older than N hours (default: 24)")
+    sessions_gc.add_argument("--trivial-days", type=int, default=7, help="Delete trivial (1-5 msg) sessions older than N days (default: 7)")
+    sessions_gc.add_argument("--trivial-max", type=int, default=5, help="Max messages to consider trivial (default: 5)")
+    sessions_gc.add_argument("--source", help="Only GC sessions from this source")
+    sessions_gc.add_argument("--dry-run", action="store_true", help="Show what would be deleted without deleting")
+    sessions_gc.add_argument("--yes", "-y", action="store_true", help="Skip confirmation")
+
    sessions_stats = sessions_subparsers.add_parser("stats", help="Show session store statistics")

    sessions_rename = sessions_subparsers.add_parser("rename", help="Set or change a session's title")
@@ -5197,6 +5208,49 @@ For more help on a command:
                size_mb = os.path.getsize(db_path) / (1024 * 1024)
                print(f"Database size: {size_mb:.1f} MB")

+        elif action == "gc":
+            dry_run = getattr(args, "dry_run", False)
+            if dry_run:
+                counts = db.garbage_collect(
+                    empty_older_than_hours=args.empty_hours,
+                    trivial_max_messages=args.trivial_max,
+                    trivial_older_than_days=args.trivial_days,
+                    source=args.source,
+                    dry_run=True,
+                )
+                print(f"[dry-run] Would delete {counts['total']} session(s):")
+                print(f"  Empty (0 msgs, >{args.empty_hours}h old): {counts['empty']}")
+                print(f"  Trivial (<={args.trivial_max} msgs, >{args.trivial_days}d old): {counts['trivial']}")
+            else:
+                # Preview first
+                preview = db.garbage_collect(
+                    empty_older_than_hours=args.empty_hours,
+                    trivial_max_messages=args.trivial_max,
+                    trivial_older_than_days=args.trivial_days,
+                    source=args.source,
+                    dry_run=True,
+                )
+                if preview["total"] == 0:
+                    print("Nothing to collect.")
+                else:
+                    if not args.yes:
+                        if not _confirm_prompt(
+                            f"Delete {preview['total']} session(s) "
+                            f"({preview['empty']} empty, {preview['trivial']} trivial)? [y/N] "
+                        ):
+                            print("Cancelled.")
+                            return
+                    counts = db.garbage_collect(
+                        empty_older_than_hours=args.empty_hours,
+                        trivial_max_messages=args.trivial_max,
+                        trivial_older_than_days=args.trivial_days,
+                        source=args.source,
+                        dry_run=False,
+                    )
+                    print(f"Collected {counts['total']} session(s):")
+                    print(f"  Empty: {counts['empty']}")
+                    print(f"  Trivial: {counts['trivial']}")
+
        else:
            sessions_parser.print_help()

--- a/hermes_state.py
+++ b/hermes_state.py
@@ -1355,3 +1355,78 @@ class SessionDB:
            return len(session_ids)

        return self._execute_write(_do)
+
+    def garbage_collect(
+        self,
+        empty_older_than_hours: int = 24,
+        trivial_max_messages: int = 5,
+        trivial_older_than_days: int = 7,
+        source: str = None,
+        dry_run: bool = False,
+    ) -> Dict[str, int]:
+        """Delete empty and trivial sessions based on age.
+
+        Policy (matches #315):
+        - Empty sessions (0 messages) older than ``empty_older_than_hours``
+        - Trivial sessions (1..``trivial_max_messages`` msgs) older than
+          ``trivial_older_than_days``
+        - Sessions with more than ``trivial_max_messages`` are kept indefinitely
+        - Active (not ended) sessions are never deleted
+
+        Returns a dict with counts: ``empty``, ``trivial``, ``total``.
+        """
+        now = time.time()
+        empty_cutoff = now - (empty_older_than_hours * 3600)
+        trivial_cutoff = now - (trivial_older_than_days * 86400)
+
+        def _do(conn):
+            # --- Find empty sessions ---
+            empty_q = (
+                "SELECT id FROM sessions "
+                "WHERE message_count = 0 AND started_at < ? AND ended_at IS NOT NULL"
+            )
+            params = [empty_cutoff]
+            if source:
+                empty_q += " AND source = ?"
+                params.append(source)
+            empty_ids = [r[0] for r in conn.execute(empty_q, params).fetchall()]
+
+            # --- Find trivial sessions ---
+            trivial_q = (
+                "SELECT id FROM sessions "
+                "WHERE message_count BETWEEN 1 AND ? AND started_at < ? AND ended_at IS NOT NULL"
+            )
+            t_params = [trivial_max_messages, trivial_cutoff]
+            if source:
+                trivial_q += " AND source = ?"
+                t_params.append(source)
+            trivial_ids = [r[0] for r in conn.execute(trivial_q, t_params).fetchall()]
+
+            all_ids = set(empty_ids) | set(trivial_ids)
+
+            if dry_run:
+                return {"empty": len(empty_ids), "trivial": len(trivial_ids),
+                        "total": len(all_ids)}
+
+            # --- Collect child sessions to delete first (FK constraint) ---
+            child_ids = set()
+            for sid in all_ids:
+                for r in conn.execute(
+                    "SELECT id FROM sessions WHERE parent_session_id = ?", (sid,)
+                ).fetchall():
+                    child_ids.add(r[0])
+
+            # Delete children
+            for cid in child_ids:
+                conn.execute("DELETE FROM messages WHERE session_id = ?", (cid,))
+                conn.execute("DELETE FROM sessions WHERE id = ?", (cid,))
+
+            # Delete targets
+            for sid in all_ids:
+                conn.execute("DELETE FROM messages WHERE session_id = ?", (sid,))
+                conn.execute("DELETE FROM sessions WHERE id = ?", (sid,))
+
+            return {"empty": len(empty_ids), "trivial": len(trivial_ids),
+                    "total": len(all_ids)}
+
+        return self._execute_write(_do)
--- a/model-watchdog.py
+++ b/model-watchdog.py
@@ -0,0 +1,286 @@
+#!/usr/bin/env python3
+"""
+Model Watchdog — monitors tmux panes for model drift.
+Checks all hermes TUI sessions in dev and timmy tmux sessions.
+If any pane is running a non-mimo model, kills and restarts it.
+
+Usage: python3 ~/.hermes/bin/model-watchdog.py [--fix]
+  --fix   Actually restart drifted panes (default: dry-run)
+"""
+
+import subprocess
+import sys
+import re
+import time
+import os
+
+ALLOWED_MODEL = "mimo-v2-pro"
+
+# Profile -> expected model. If a pane is running this profile with this model, it's healthy.
+# Profiles not in this map are checked against ALLOWED_MODEL.
+PROFILE_MODELS = {
+    "default": "mimo-v2-pro",
+    "timmy-sprint": "mimo-v2-pro",
+    "fenrir": "mimo-v2-pro",
+    "bezalel": "gpt-5.4",
+    "burn": "mimo-v2-pro",
+    "creative": "claude-sonnet",
+    "research": "claude-sonnet",
+    "review": "claude-sonnet",
+}
+
+TMUX_SESSIONS = ["dev", "timmy"]
+LOG_FILE = os.path.expanduser("~/.hermes/logs/model-watchdog.log")
+
+def log(msg):
+    os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
+    ts = time.strftime("%Y-%m-%d %H:%M:%S")
+    line = f"[{ts}] {msg}"
+    print(line)
+    with open(LOG_FILE, "a") as f:
+        f.write(line + "\n")
+
+def run(cmd):
+    r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=10)
+    return r.stdout.strip(), r.returncode
+
+def get_panes(session):
+    """Get all pane info from ALL windows in a tmux session."""
+    # First get all windows
+    win_out, win_rc = run(f"tmux list-windows -t {session} -F '#{{window_name}}' 2>/dev/null")
+    if win_rc != 0:
+        return []
+
+    panes = []
+    for window_name in win_out.split("\n"):
+        if not window_name.strip():
+            continue
+        target = f"{session}:{window_name}"
+        out, rc = run(f"tmux list-panes -t {target} -F '#{{pane_index}}|#{{pane_pid}}|#{{pane_tty}}' 2>/dev/null")
+        if rc != 0:
+            continue
+        for line in out.split("\n"):
+            if "|" in line:
+                idx, pid, tty = line.split("|")
+                panes.append({
+                    "session": session,
+                    "window": window_name,
+                    "index": int(idx),
+                    "pid": int(pid),
+                    "tty": tty,
+                })
+    return panes
+
+def get_hermes_pid_for_tty(tty):
+    """Find hermes process running on a specific TTY."""
+    out, _ = run(f"ps aux | grep '{tty}' | grep '[h]ermes' | grep -v 'gateway' | grep -v 'node' | awk '{{print $2}}'")
+    if out:
+        return int(out.split("\n")[0])
+    return None
+
+def get_model_from_pane(session, pane_idx, window=None):
+    """Capture the pane and extract the model from the status bar."""
+    target = f"{session}:{window}.{pane_idx}" if window else f"{session}.{pane_idx}"
+    out, _ = run(f"tmux capture-pane -t {target} -p 2>/dev/null | tail -30")
+    # Look for model in status bar: ⚕ model-name │
+    matches = re.findall(r'⚕\s+(\S+)\s+│', out)
+    if matches:
+        return matches[0]
+    return None
+
+def check_session_meta(session_id):
+    """Check what model a hermes session was last using from its session file."""
+    import json
+    session_file = os.path.expanduser(f"~/.hermes/sessions/session_{session_id}.json")
+    if os.path.exists(session_file):
+        try:
+            with open(session_file) as f:
+                data = json.load(f)
+            return data.get("model"), data.get("provider")
+        except:
+            pass
+    # Try jsonl
+    jsonl_file = os.path.expanduser(f"~/.hermes/sessions/{session_id}.jsonl")
+    if os.path.exists(jsonl_file):
+        try:
+            with open(jsonl_file) as f:
+                for line in f:
+                    d = json.loads(line.strip())
+                    if d.get("role") == "session_meta":
+                        return d.get("model"), d.get("provider")
+                    break
+        except:
+            pass
+    return None, None
+
+def is_drifted(model_name, profile=None):
+    """Check if a model name indicates drift from the expected model for this profile."""
+    if model_name is None:
+        return False, "no-model-detected"
+
+    # If we know the profile, check against its expected model
+    if profile and profile in PROFILE_MODELS:
+        expected = PROFILE_MODELS[profile]
+        if expected in model_name:
+            return False, model_name
+        return True, model_name
+
+    # No profile known — fall back to ALLOWED_MODEL
+    if ALLOWED_MODEL in model_name:
+        return False, model_name
+    return True, model_name
+
+def get_profile_from_pane(tty):
+    """Detect which hermes profile a pane is running by inspecting its process args."""
+    # ps shows short TTY (s031) not full path (/dev/ttys031)
+    short_tty = tty.replace("/dev/ttys", "s").replace("/dev/ttys", "")
+    out, _ = run(f"ps aux | grep '{short_tty}' | grep '[h]ermes' | grep -v 'gateway' | grep -v 'node' | grep -v cron")
+    if not out:
+        return None
+    # Look for -p <profile> in the command line
+    match = re.search(r'-p\s+(\S+)', out)
+    if match:
+        return match.group(1)
+    return None
+
+def kill_and_restart(session, pane_idx, window=None):
+    """Kill the hermes process in a pane and restart it with the same profile."""
+    target = f"{session}:{window}.{pane_idx}" if window else f"{session}.{pane_idx}"
+
+    # Get the pane's TTY
+    out, _ = run(f"tmux list-panes -t {target} -F '#{{pane_tty}}'")
+    tty = out.strip()
+
+    # Detect which profile was running
+    profile = get_profile_from_pane(tty)
+
+    # Find and kill hermes on that TTY
+    hermes_pid = get_hermes_pid_for_tty(tty)
+    if hermes_pid:
+        log(f"Killing hermes PID {hermes_pid} on {target} (tty={tty}, profile={profile})")
+        run(f"kill {hermes_pid}")
+        time.sleep(2)
+
+    # Send Ctrl+C to clear any state
+    run(f"tmux send-keys -t {target} C-c")
+    time.sleep(1)
+
+    # Restart hermes with the same profile
+    if profile:
+        cmd = f"hermes -p {profile} chat"
+    else:
+        cmd = "hermes chat"
+    run(f"tmux send-keys -t {target} '{cmd}' Enter")
+    log(f"Restarted hermes in {target} with: {cmd}")
+
+    # Wait and verify
+    time.sleep(8)
+    new_model = get_model_from_pane(session, pane_idx, window)
+    if new_model and ALLOWED_MODEL in new_model:
+        log(f"✓ {target} now on {new_model}")
+        return True
+    else:
+        log(f"⚠ {target} model after restart: {new_model}")
+        return False
+
+def verify_expected_model(provider_yaml, expected):
+    """Compare actual provider in a YAML config against expected value."""
+    return provider_yaml.strip() == expected.strip()
+
+def check_config_drift():
+    """Scan all relevant config.yaml files for provider drift. Does NOT modify anything.
+    Returns list of drift issues found."""
+    issues = []
+    CONFIGS = {
+        "main_config": (os.path.expanduser("~/.hermes/config.yaml"), "nous"),
+        "fenrir": (os.path.expanduser("~/.hermes/profiles/fenrir/config.yaml"), "nous"),
+        "timmy_sprint": (os.path.expanduser("~/.hermes/profiles/timmy-sprint/config.yaml"), "nous"),
+        "default_profile": (os.path.expanduser("~/.hermes/profiles/default/config.yaml"), "nous"),
+    }
+    for name, (path, expected_provider) in CONFIGS.items():
+        if not os.path.exists(path):
+            continue
+        try:
+            with open(path, "r") as f:
+                content = f.read()
+            # Parse YAML to correctly read model.provider (not the first provider: line)
+            try:
+                import yaml
+                cfg = yaml.safe_load(content) or {}
+            except ImportError:
+                # Fallback: find provider under model: block via indentation-aware scan
+                cfg = {}
+                in_model = False
+                for line in content.split("\n"):
+                    stripped = line.strip()
+                    indent = len(line) - len(line.lstrip())
+                    if stripped.startswith("model:") and indent == 0:
+                        in_model = True
+                        continue
+                    if in_model and indent == 0 and stripped:
+                        in_model = False
+                    if in_model and stripped.startswith("provider:"):
+                        cfg = {"model": {"provider": stripped.split(":", 1)[1].strip()}}
+                        break
+            actual = (cfg.get("model") or {}).get("provider", "")
+            if actual and expected_provider and actual != expected_provider:
+                issues.append(f"CONFIG DRIFT [{name}]: provider is '{actual}' (expected '{expected_provider}')")
+        except Exception as e:
+            issues.append(f"CONFIG CHECK ERROR [{name}]: {e}")
+    return issues
+
+def main():
+    fix_mode = "--fix" in sys.argv
+    drift_found = False
+    issues = []
+    
+    # Always check config files for provider drift (read-only, never writes)
+    config_drift_issues = check_config_drift()
+    if config_drift_issues:
+        for issue in config_drift_issues:
+            log(f"CONFIG DRIFT: {issue}")
+    
+    for session in TMUX_SESSIONS:
+        panes = get_panes(session)
+        for pane in panes:
+            window = pane.get("window")
+            target = f"{session}:{window}.{pane['index']}" if window else f"{session}.{pane['index']}"
+
+            # Detect profile from running process
+            out, _ = run(f"tmux list-panes -t {target} -F '#{{pane_tty}}'")
+            tty = out.strip()
+            profile = get_profile_from_pane(tty)
+
+            model = get_model_from_pane(session, pane["index"], window)
+            drifted, model_name = is_drifted(model, profile)
+            
+            if drifted:
+                drift_found = True
+                issues.append(f"{target}: {model_name} (profile={profile})")
+                log(f"DRIFT DETECTED: {target} is on '{model_name}' (profile={profile}, expected='{PROFILE_MODELS.get(profile, ALLOWED_MODEL)}')")
+                
+                if fix_mode:
+                    log(f"Auto-fixing {target}...")
+                    success = kill_and_restart(session, pane["index"], window)
+                    if not success:
+                        issues.append(f"  ↳ RESTART FAILED for {target}")
+    
+    if not drift_found:
+        total = sum(len(get_panes(s)) for s in TMUX_SESSIONS)
+        log(f"All {total} panes healthy (on {ALLOWED_MODEL})")
+    
+    # Print summary for cron output
+    if issues or config_drift_issues:
+        print("\n=== MODEL DRIFT REPORT ===")
+        for issue in issues:
+            print(f"  [PANE] {issue}")
+        if config_drift_issues:
+            for issue in config_drift_issues:
+                print(f"  [CONFIG] {issue}")
+        if not fix_mode:
+            print("\nRun with --fix to auto-restart drifted panes.")
+        return 1
+    return 0
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/run_agent.py
+++ b/run_agent.py
@@ -1001,30 +1001,10 @@ class AIAgent:
        self._session_db = session_db
        self._parent_session_id = parent_session_id
        self._last_flushed_db_idx = 0  # tracks DB-write cursor to prevent duplicate writes
-        if self._session_db:
-            try:
-                self._session_db.create_session(
-                    session_id=self.session_id,
-                    source=self.platform or os.environ.get("HERMES_SESSION_SOURCE", "cli"),
-                    model=self.model,
-                    model_config={
-                        "max_iterations": self.max_iterations,
-                        "reasoning_config": reasoning_config,
-                        "max_tokens": max_tokens,
-                    },
-                    user_id=None,
-                    parent_session_id=self._parent_session_id,
-                )
-            except Exception as e:
-                # Transient SQLite lock contention (e.g. CLI and gateway writing
-                # concurrently) must NOT permanently disable session_search for
-                # this agent.  Keep _session_db alive — subsequent message
-                # flushes and session_search calls will still work once the
-                # lock clears.  The session row may be missing from the index
-                # for this run, but that is recoverable (flushes upsert rows).
-                logger.warning(
-                    "Session DB create_session failed (session_search still available): %s", e
-                )
+        # Lazy session creation: defer until first message flush (#314).
+        # _flush_messages_to_session_db() calls ensure_session() which uses
+        # INSERT OR IGNORE — creating the row only when messages arrive.
+        # This eliminates 32% of sessions that are created but never used.
        
        # In-memory todo list for task planning (one per agent/session)
        from tools.todo_tool import TodoStore
--- a/tests/hermes_cli/test_config_validation.py
+++ b/tests/hermes_cli/test_config_validation.py
@@ -249,3 +249,111 @@ class TestConfigIssueDataclass:
        a = ConfigIssue("error", "msg", "hint")
        b = ConfigIssue("error", "msg", "hint")
        assert a == b
+
+
+class TestFallbackProvidersValidation:
+    """fallback_providers must be a list of dicts with provider + model."""
+
+    def test_non_list(self):
+        """fallback_providers as string should error."""
+        issues = validate_config_structure({
+            "fallback_providers": "openrouter:google/gemini-3-flash-preview",
+        })
+        errors = [i for i in issues if i.severity == "error"]
+        assert any("fallback_providers" in i.message and "list" in i.message for i in errors)
+
+    def test_dict_instead_of_list(self):
+        """fallback_providers as dict should error."""
+        issues = validate_config_structure({
+            "fallback_providers": {"provider": "openrouter", "model": "test"},
+        })
+        errors = [i for i in issues if i.severity == "error"]
+        assert any("fallback_providers" in i.message and "dict" in i.message for i in errors)
+
+    def test_entry_missing_provider(self):
+        """Entry without provider should warn."""
+        issues = validate_config_structure({
+            "fallback_providers": [{"model": "google/gemini-3-flash-preview"}],
+        })
+        assert any("missing 'provider'" in i.message for i in issues)
+
+    def test_entry_missing_model(self):
+        """Entry without model should warn."""
+        issues = validate_config_structure({
+            "fallback_providers": [{"provider": "openrouter"}],
+        })
+        assert any("missing 'model'" in i.message for i in issues)
+
+    def test_entry_not_dict(self):
+        """Non-dict entries should warn."""
+        issues = validate_config_structure({
+            "fallback_providers": ["not-a-dict"],
+        })
+        assert any("not a dict" in i.message for i in issues)
+
+    def test_valid_entries(self):
+        """Valid fallback_providers should produce no fallback-related issues."""
+        issues = validate_config_structure({
+            "fallback_providers": [
+                {"provider": "openrouter", "model": "google/gemini-3-flash-preview"},
+                {"provider": "gemini", "model": "gemini-2.5-flash"},
+            ],
+        })
+        fb_issues = [i for i in issues if "fallback_providers" in i.message]
+        assert len(fb_issues) == 0
+
+    def test_empty_list_no_issues(self):
+        """Empty list is valid (fallback disabled)."""
+        issues = validate_config_structure({
+            "fallback_providers": [],
+        })
+        fb_issues = [i for i in issues if "fallback_providers" in i.message]
+        assert len(fb_issues) == 0
+
+
+class TestSessionResetValidation:
+    """session_reset.idle_minutes must be positive."""
+
+    def test_zero_idle_minutes(self):
+        """idle_minutes=0 should warn."""
+        issues = validate_config_structure({
+            "session_reset": {"idle_minutes": 0},
+        })
+        assert any("idle_minutes=0" in i.message for i in issues)
+
+    def test_negative_idle_minutes(self):
+        """idle_minutes=-5 should warn."""
+        issues = validate_config_structure({
+            "session_reset": {"idle_minutes": -5},
+        })
+        assert any("idle_minutes=-5" in i.message for i in issues)
+
+    def test_string_idle_minutes(self):
+        """idle_minutes as string should warn."""
+        issues = validate_config_structure({
+            "session_reset": {"idle_minutes": "abc"},
+        })
+        assert any("idle_minutes=" in i.message for i in issues)
+
+    def test_valid_idle_minutes(self):
+        """Valid idle_minutes should not warn."""
+        issues = validate_config_structure({
+            "session_reset": {"idle_minutes": 1440},
+        })
+        idle_issues = [i for i in issues if "idle_minutes" in i.message]
+        assert len(idle_issues) == 0
+
+    def test_invalid_at_hour(self):
+        """at_hour=25 should warn."""
+        issues = validate_config_structure({
+            "session_reset": {"at_hour": 25},
+        })
+        assert any("at_hour=25" in i.message for i in issues)
+
+    def test_valid_at_hour(self):
+        """Valid at_hour should not warn."""
+        issues = validate_config_structure({
+            "session_reset": {"at_hour": 4},
+        })
+        hour_issues = [i for i in issues if "at_hour" in i.message]
+        assert len(hour_issues) == 0
--- a/tests/test_cron_cloud_terminal.py
+++ b/tests/test_cron_cloud_terminal.py
@@ -0,0 +1,73 @@
+"""Tests for cron scheduler cloud-provider terminal disabling (#379).
+
+When a cron job runs on a cloud inference endpoint (Nous, OpenRouter, etc.),
+the terminal toolset must be disabled because SSH keys don't exist on cloud
+servers. Only local endpoints (localhost, 127.0.0.1, RFC-1918) retain
+terminal access.
+"""
+
+import pytest
+from agent.model_metadata import is_local_endpoint
+
+
+class TestIsLocalEndpoint:
+    """Verify is_local_endpoint correctly classifies endpoints."""
+
+    def test_localhost(self):
+        assert is_local_endpoint("http://localhost:11434/v1") is True
+
+    def test_127_loopback(self):
+        assert is_local_endpoint("http://127.0.0.1:8080/v1") is True
+
+    def test_0_0_0_0(self):
+        assert is_local_endpoint("http://0.0.0.0:11434/v1") is True
+
+    def test_rfc1918_10(self):
+        assert is_local_endpoint("http://10.0.0.5:8080/v1") is True
+
+    def test_rfc1918_192(self):
+        assert is_local_endpoint("http://192.168.1.100:11434/v1") is True
+
+    def test_rfc1918_172(self):
+        assert is_local_endpoint("http://172.16.0.1:8080/v1") is True
+
+    def test_cloud_openrouter(self):
+        assert is_local_endpoint("https://openrouter.ai/api/v1") is False
+
+    def test_cloud_nous(self):
+        assert is_local_endpoint("https://inference-api.nousresearch.com/v1") is False
+
+    def test_cloud_anthropic(self):
+        assert is_local_endpoint("https://api.anthropic.com") is False
+
+    def test_empty_url(self):
+        assert is_local_endpoint("") is False
+
+    def test_none_url(self):
+        assert is_local_endpoint(None) is False
+
+
+class TestCronDisabledToolsetsLogic:
+    """Verify the disabled_toolsets logic matches scheduler expectations."""
+
+    def _build_disabled(self, base_url, job=None):
+        """Mirror the scheduler's disabled_toolsets logic."""
+        from agent.model_metadata import is_local_endpoint
+        cron_disabled = ["cronjob", "messaging", "clarify"]
+        if not is_local_endpoint(base_url):
+            cron_disabled.append("terminal")
+        return cron_disabled
+
+    def test_local_keeps_terminal(self):
+        disabled = self._build_disabled("http://localhost:11434/v1")
+        assert "terminal" not in disabled
+        assert "cronjob" in disabled
+
+    def test_cloud_disables_terminal(self):
+        disabled = self._build_disabled("https://openrouter.ai/api/v1")
+        assert "terminal" in disabled
+        assert "cronjob" in disabled
+
+    def test_empty_url_disables_terminal(self):
+        disabled = self._build_disabled("")
+        assert "terminal" in disabled
--- a/tests/test_cron_model_routing.py
+++ b/tests/test_cron_model_routing.py
@@ -0,0 +1,128 @@
+"""Tests for time-aware cron model routing — Issue #317."""
+
+import pytest
+from datetime import datetime
+
+from agent.smart_model_routing import resolve_cron_model, _hour_in_window
+
+
+class TestHourInWindow:
+    """Hour-in-window detection including midnight wrap."""
+
+    def test_normal_window(self):
+        assert _hour_in_window(18, 17, 22) is True
+        assert _hour_in_window(16, 17, 22) is False
+        assert _hour_in_window(22, 17, 22) is False
+
+    def test_midnight_wrap(self):
+        assert _hour_in_window(23, 22, 6) is True
+        assert _hour_in_window(3, 22, 6) is True
+        assert _hour_in_window(10, 22, 6) is False
+
+    def test_edge_cases(self):
+        assert _hour_in_window(0, 0, 24) is True
+        assert _hour_in_window(23, 0, 24) is True
+        assert _hour_in_window(0, 22, 6) is True
+        assert _hour_in_window(5, 22, 6) is True
+        assert _hour_in_window(6, 22, 6) is False
+
+
+class TestResolveCronModel:
+    """Time-aware model resolution for cron jobs."""
+
+    def _config(self, **overrides):
+        base = {
+            "enabled": True,
+            "fallback_model": "anthropic/claude-sonnet-4",
+            "fallback_provider": "openrouter",
+            "windows": [
+                {"start_hour": 17, "end_hour": 22, "reason": "evening_error_peak"},
+            ],
+        }
+        base.update(overrides)
+        return base
+
+    def test_disabled_returns_base(self):
+        result = resolve_cron_model("mimo", {"enabled": False}, now=datetime(2026, 4, 12, 18, 0))
+        assert result["model"] == "mimo"
+        assert result["overridden"] is False
+
+    def test_no_config_returns_base(self):
+        result = resolve_cron_model("mimo", None)
+        assert result["model"] == "mimo"
+        assert result["overridden"] is False
+
+    def test_no_windows_returns_base(self):
+        result = resolve_cron_model("mimo", {"enabled": True, "windows": []}, now=datetime(2026, 4, 12, 18, 0))
+        assert result["overridden"] is False
+
+    def test_evening_window_overrides(self):
+        result = resolve_cron_model("mimo", self._config(), now=datetime(2026, 4, 12, 18, 0))
+        assert result["model"] == "anthropic/claude-sonnet-4"
+        assert result["provider"] == "openrouter"
+        assert result["overridden"] is True
+        assert "evening_error_peak" in result["reason"]
+        assert "hour=18" in result["reason"]
+
+    def test_outside_window_keeps_base(self):
+        result = resolve_cron_model("mimo", self._config(), now=datetime(2026, 4, 12, 9, 0))
+        assert result["model"] == "mimo"
+        assert result["overridden"] is False
+
+    def test_window_boundary_start_inclusive(self):
+        result = resolve_cron_model("mimo", self._config(), now=datetime(2026, 4, 12, 17, 0))
+        assert result["overridden"] is True
+
+    def test_window_boundary_end_exclusive(self):
+        result = resolve_cron_model("mimo", self._config(), now=datetime(2026, 4, 12, 22, 0))
+        assert result["overridden"] is False
+
+    def test_midnight_window(self):
+        config = self._config(windows=[{"start_hour": 22, "end_hour": 6, "reason": "overnight"}])
+        assert resolve_cron_model("mimo", config, now=datetime(2026, 4, 12, 23, 0))["overridden"] is True
+        assert resolve_cron_model("mimo", config, now=datetime(2026, 4, 13, 3, 0))["overridden"] is True
+        assert resolve_cron_model("mimo", config, now=datetime(2026, 4, 12, 10, 0))["overridden"] is False
+
+    def test_per_window_model_override(self):
+        config = self._config(windows=[{
+            "start_hour": 17, "end_hour": 22,
+            "model": "anthropic/claude-opus-4-6", "provider": "anthropic", "reason": "peak",
+        }])
+        result = resolve_cron_model("mimo", config, now=datetime(2026, 4, 12, 18, 0))
+        assert result["model"] == "anthropic/claude-opus-4-6"
+        assert result["provider"] == "anthropic"
+
+    def test_first_matching_window_wins(self):
+        config = self._config(windows=[
+            {"start_hour": 17, "end_hour": 20, "model": "strong-1", "provider": "p1", "reason": "w1"},
+            {"start_hour": 19, "end_hour": 22, "model": "strong-2", "provider": "p2", "reason": "w2"},
+        ])
+        result = resolve_cron_model("mimo", config, now=datetime(2026, 4, 12, 19, 0))
+        assert result["model"] == "strong-1"
+
+    def test_no_fallback_model_keeps_base(self):
+        config = {"enabled": True, "windows": [{"start_hour": 17, "end_hour": 22, "reason": "test"}]}
+        result = resolve_cron_model("mimo", config, now=datetime(2026, 4, 12, 18, 0))
+        assert result["overridden"] is False
+        assert result["model"] == "mimo"
+
+    def test_malformed_windows_skipped(self):
+        config = self._config(windows=[
+            "not-a-dict",
+            {"start_hour": 17},
+            {"end_hour": 22},
+            {"start_hour": "bad", "end_hour": "bad"},
+            {"start_hour": 17, "end_hour": 22, "reason": "valid"},
+        ])
+        result = resolve_cron_model("mimo", config, now=datetime(2026, 4, 12, 18, 0))
+        assert result["overridden"] is True
+        assert "valid" in result["reason"]
+
+    def test_multiple_windows_coverage(self):
+        config = self._config(windows=[
+            {"start_hour": 17, "end_hour": 22, "reason": "evening"},
+            {"start_hour": 2, "end_hour": 5, "reason": "overnight"},
+        ])
+        assert resolve_cron_model("mimo", config, now=datetime(2026, 4, 12, 20, 0))["overridden"] is True
+        assert resolve_cron_model("mimo", config, now=datetime(2026, 4, 13, 3, 0))["overridden"] is True
+        assert resolve_cron_model("mimo", config, now=datetime(2026, 4, 12, 10, 0))["overridden"] is False
--- a/tests/test_hermes_state.py
+++ b/tests/test_hermes_state.py
@@ -665,6 +665,127 @@ class TestPruneSessions:


 # =========================================================================
+# =========================================================================
+# Garbage Collect
+# =========================================================================
+
+class TestGarbageCollect:
+    def test_gc_deletes_empty_old_sessions(self, db):
+        """Empty sessions (0 messages) older than 24h should be deleted."""
+        db.create_session(session_id="empty_old", source="cli")
+        db.end_session("empty_old", end_reason="done")
+        db._conn.execute(
+            "UPDATE sessions SET started_at = ? WHERE id = ?",
+            (time.time() - 48 * 3600, "empty_old"),  # 48 hours ago
+        )
+        db._conn.commit()
+
+        # Recent empty session should be kept
+        db.create_session(session_id="empty_new", source="cli")
+        db.end_session("empty_new", end_reason="done")
+
+        result = db.garbage_collect()
+        assert result["empty"] == 1
+        assert result["trivial"] == 0
+        assert result["total"] == 1
+        assert db.get_session("empty_old") is None
+        assert db.get_session("empty_new") is not None
+
+    def test_gc_deletes_trivial_old_sessions(self, db):
+        """Sessions with 1-5 messages older than 7 days should be deleted."""
+        db.create_session(session_id="trivial_old", source="cli")
+        for i in range(3):
+            db.append_message("trivial_old", role="user", content=f"msg {i}")
+        db.end_session("trivial_old", end_reason="done")
+        db._conn.execute(
+            "UPDATE sessions SET started_at = ? WHERE id = ?",
+            (time.time() - 10 * 86400, "trivial_old"),  # 10 days ago
+        )
+        db._conn.commit()
+
+        result = db.garbage_collect()
+        assert result["trivial"] == 1
+        assert result["total"] == 1
+        assert db.get_session("trivial_old") is None
+
+    def test_gc_keeps_active_sessions(self, db):
+        """Active (not ended) sessions should never be deleted."""
+        db.create_session(session_id="active_old", source="cli")
+        # Backdate but don't end
+        db._conn.execute(
+            "UPDATE sessions SET started_at = ? WHERE id = ?",
+            (time.time() - 48 * 3600, "active_old"),
+        )
+        db._conn.commit()
+
+        result = db.garbage_collect()
+        assert result["total"] == 0
+        assert db.get_session("active_old") is not None
+
+    def test_gc_keeps_substantial_sessions(self, db):
+        """Sessions with >5 messages should never be deleted."""
+        db.create_session(session_id="big_old", source="cli")
+        for i in range(10):
+            db.append_message("big_old", role="user", content=f"msg {i}")
+        db.end_session("big_old", end_reason="done")
+        db._conn.execute(
+            "UPDATE sessions SET started_at = ? WHERE id = ?",
+            (time.time() - 365 * 86400, "big_old"),  # 1 year ago
+        )
+        db._conn.commit()
+
+        result = db.garbage_collect()
+        assert result["total"] == 0
+        assert db.get_session("big_old") is not None
+
+    def test_gc_dry_run_does_not_delete(self, db):
+        """dry_run=True should return counts but not delete anything."""
+        db.create_session(session_id="empty_old", source="cli")
+        db.end_session("empty_old", end_reason="done")
+        db._conn.execute(
+            "UPDATE sessions SET started_at = ? WHERE id = ?",
+            (time.time() - 48 * 3600, "empty_old"),
+        )
+        db._conn.commit()
+
+        result = db.garbage_collect(dry_run=True)
+        assert result["total"] == 1
+        assert db.get_session("empty_old") is not None  # Still exists
+
+    def test_gc_with_source_filter(self, db):
+        """--source should only GC sessions from that source."""
+        for sid, src in [("old_cli", "cli"), ("old_tg", "telegram")]:
+            db.create_session(session_id=sid, source=src)
+            db.end_session(sid, end_reason="done")
+            db._conn.execute(
+                "UPDATE sessions SET started_at = ? WHERE id = ?",
+                (time.time() - 48 * 3600, sid),
+            )
+        db._conn.commit()
+
+        result = db.garbage_collect(source="cli")
+        assert result["total"] == 1
+        assert db.get_session("old_cli") is None
+        assert db.get_session("old_tg") is not None
+
+    def test_gc_handles_child_sessions(self, db):
+        """Child sessions should be deleted when parent is GC'd."""
+        db.create_session(session_id="parent_old", source="cli")
+        db.end_session("parent_old", end_reason="done")
+        db._conn.execute(
+            "UPDATE sessions SET started_at = ? WHERE id = ?",
+            (time.time() - 48 * 3600, "parent_old"),
+        )
+        # Create child session
+        db.create_session(session_id="child", source="cli", parent_session_id="parent_old")
+        db.end_session("child", end_reason="done")
+        db._conn.commit()
+
+        result = db.garbage_collect()
+        assert result["total"] == 1
+        assert db.get_session("parent_old") is None
+        assert db.get_session("child") is None
+
 # Schema and WAL mode
 # =========================================================================

--- a/tools/cronjob_tools.py
+++ b/tools/cronjob_tools.py
@@ -201,6 +201,17 @@ def _format_job(job: Dict[str, Any]) -> Dict[str, Any]:
        "paused_at": job.get("paused_at"),
        "paused_reason": job.get("paused_reason"),
    }
+    # Health timestamps
+    if job.get("last_error_at"):
+        result["last_error_at"] = job["last_error_at"]
+    if job.get("last_success_at"):
+        result["last_success_at"] = job["last_success_at"]
+    if job.get("error_resolved_at"):
+        result["error_resolved_at"] = job["error_resolved_at"]
+    if job.get("error_cleared_at"):
+        result["error_cleared_at"] = job["error_cleared_at"]
+    
+
    if job.get("script"):
        result["script"] = job["script"]
    return result
@@ -326,6 +337,13 @@ def cronjob(
            if result is None:
                return json.dumps({"success": False, "error": "Job not found"}, indent=2)
            return json.dumps(result, indent=2)
+        if normalized == "clear_error":
+            from cron.jobs import clear_job_error
+            job = clear_job_error(job_id)
+            if job is None:
+                return json.dumps({"success": False, "error": "Job not found"}, indent=2)
+            return json.dumps({"success": True, "job": _format_job(job)}, indent=2)
+

        if normalized == "update":
            updates: Dict[str, Any] = {}
Author	SHA1	Message	Date
Alexander Whitestone	66b0febdfb	fix(cron): expand _SCRIPT_FAILURE_PHRASES with SSH patterns Some checks failed Forge CI / smoke-and-build (pull_request) Failing after 59s Details Part of #457, Closes #350 Detect SSH-specific errors: 'no such file or directory', 'command not found', 'ssh: connect to host', etc.	2026-04-14 01:21:43 +00:00
Alexander Whitestone	6d79bf7783	feat(cron): SSH dispatch validation utilities Part of #457, Closes #350 Provides SSHEnvironment that validates remote hermes binary exists before dispatch, and DispatchResult with structured failure reasons.	2026-04-14 01:21:39 +00:00
Alexander Whitestone	954fd992eb	Merge pull request 'perf: lazy session creation — defer DB write until first message (#314 )' (#449 ) from whip/314-1776127532 into main Some checks failed Forge CI / smoke-and-build (push) Failing after 55s Details Forge CI / smoke-and-build (pull_request) Failing after 1m12s Details perf: lazy session creation (#314) Closes #314.	2026-04-14 01:08:13 +00:00
Metatron	f35f56e397	perf: lazy session creation — defer DB write until first message (closes #314 ) Some checks failed Forge CI / smoke-and-build (pull_request) Failing after 56s Details Remove eager create_session() call from AIAgent.__init__(). Sessions are now created lazily on first _flush_messages_to_session_db() call via ensure_session() which uses INSERT OR IGNORE. Impact: eliminates 32.4% of sessions (3,564 of 10,985) that were created at agent init but never received any messages. The existing ensure_session() fallback in _flush_messages_to_session_db() already handles this pattern — it was originally designed for recovery after transient SQLite lock failures. Now it's the primary creation path. Compression-initiated sessions still use create_session() directly (line ~5995) since they have messages to write immediately.	2026-04-13 20:52:06 -04:00
Alexander Whitestone	8d0cad13c4	Merge pull request 'fix: watchdog config drift check uses YAML parse, not grep (#377 )' (#398 ) from burn/377-1776117775 into main Some checks failed Forge CI / smoke-and-build (push) Failing after 28s Details	2026-04-14 00:34:14 +00:00
Alexander Whitestone	b9aca0a3b4	Merge pull request 'feat: time-aware model routing for cron jobs (#317 )' (#432 ) from burn/317-1776125702 into main Some checks failed Forge CI / smoke-and-build (push) Has been cancelled Details	2026-04-14 00:34:06 +00:00
Alexander Whitestone	99d36533d5	Merge pull request 'feat: add /debug slash command with paste service upload (#320 )' (#416 ) from burn/320-1776120221 into main Some checks failed Forge CI / smoke-and-build (push) Has been cancelled Details	2026-04-14 00:33:59 +00:00
Alexander Whitestone	b562a3d94c	Merge pull request 'docs(#322 ): comprehensive Honcho evaluation — recommendation: KEEP' (#430 ) from burn/322-1776125702 into main Some checks failed Forge CI / smoke-and-build (push) Has been cancelled Details	2026-04-14 00:33:56 +00:00
Alexander Whitestone	37af40a38e	Merge pull request 'feat: session garbage collection (#315 )' (#383 ) from feat/315-session-gc into main Some checks failed Forge CI / smoke-and-build (push) Has been cancelled Details	2026-04-14 00:33:15 +00:00
Alexander Whitestone	5aa8581e2b	Merge pull request 'fix: gateway config debt - validation, defaults, fallback chain checks (#328 )' (#381 ) from fix/gateway-config-debt-328 into main Some checks failed Forge CI / smoke-and-build (push) Has been cancelled Details	2026-04-14 00:32:56 +00:00
Alexander Whitestone	b44255f21e	Merge pull request 'cron: Comprehensive stale error state handling for recovered jobs (#349 )' (#431 ) from burn/349-1776125702 into main Some checks failed Forge CI / smoke-and-build (push) Has been cancelled Details	2026-04-14 00:32:48 +00:00
Alexander Whitestone	6b41bafccd	Merge pull request 'fix(cron): disable terminal toolset for cloud providers in cron jobs (#379 )' (#436 ) from burn/379-1776125702 into main Some checks failed Forge CI / smoke-and-build (push) Has been cancelled Details	2026-04-14 00:32:45 +00:00
Alexander Whitestone	053fa3a2dd	Merge pull request 'fix(cron): normalize model field types in deploy-crons.py' (#410 ) from burn/376-1776117777 into main Some checks failed Forge CI / smoke-and-build (push) Has been cancelled Details	2026-04-14 00:31:47 +00:00
Timmy	57418dae07	fix(cron): disable terminal toolset for cloud providers in cron jobs (#379 ) Some checks failed Forge CI / smoke-and-build (pull_request) Failing after 1m16s Details Cron jobs like nightwatch-health-monitor SSH into remote VPSes. When the runtime provider is cloud (Nous, OpenRouter, Anthropic), SSH keys don't exist on the inference server — causing silent failures and wasted iterations. Changes: - cron/scheduler.py: Import is_local_endpoint from model_metadata. Build disabled_toolsets dynamically: append 'terminal' when the runtime base_url is NOT a local endpoint. Log when terminal is disabled for observability. Also warn when a job declares requires_local_infra=true but runs on cloud. - tests/test_cron_cloud_terminal.py: 14 tests verifying is_local_endpoint classification and disabled_toolsets logic. Behavior: Local (localhost/127/RFC-1918): terminal enabled, SSH works. Cloud (openrouter/nous/anthropic): terminal disabled, agent reports SSH unavailable instead of wasting iterations. Closes #379	2026-04-13 20:20:41 -04:00
Alexander Whitestone	5989600d80	feat: time-aware model routing for cron jobs (#317 ) Some checks failed Forge CI / smoke-and-build (pull_request) Failing after 1m1s Details Empirical audit: cron error rate peaks at 18:00 (9.4%) vs 4.0% at 09:00. During configured high-error windows, automatically route cron jobs to more capable models when the user is not present to correct errors. - agent/smart_model_routing.py: resolve_cron_model() + _hour_in_window() - cron/scheduler.py: wired into run_job() after base model resolution - tests/test_cron_model_routing.py: 16 tests Config: cron_model_routing: enabled: true fallback_model: "anthropic/claude-sonnet-4" fallback_provider: "openrouter" windows: - {start_hour: 17, end_hour: 22, reason: evening_error_peak} - {start_hour: 2, end_hour: 5, reason: overnight_api_instability} Features: midnight-wrap, per-window overrides, first-match-wins, graceful degradation on malformed config. Closes #317	2026-04-13 20:19:37 -04:00
Alexander Whitestone	379769ca6d	feat(cron): Show health status in job list Some checks failed Forge CI / smoke-and-build (pull_request) Failing after 1m0s Details Part of #349. Shows current vs. cleared errors, success history.	2026-04-14 00:19:11 +00:00
Alexander Whitestone	91bc02bc38	feat(cron): Add clear-error CLI subparser Part of #349. Adds `hermes cron clear-error JOB_ID` command.	2026-04-14 00:18:52 +00:00
Alexander Whitestone	77265a31e1	feat(cron): Add clear-error CLI command Part of #349. Adds `hermes cron clear-error JOB_ID` command.	2026-04-14 00:18:30 +00:00
Timmy	7a32df9ca3	docs(#322 ): comprehensive Honcho evaluation — recommendation: KEEP Some checks failed Forge CI / smoke-and-build (pull_request) Failing after 1m9s Details Decision: Cloud vs Local → BOTH (user's choice) - Cloud: HONCHO_API_KEY from app.honcho.dev - Self-hosted: HONCHO_BASE_URL=http://localhost:8000 - Disabled: No config = zero overhead Integration is already production-ready: - 3 components, ~700 lines of code - 7 tests passing - Async prefetch (zero-latency) - Configurable recall modes - Cron guard (inactive in cron context) Recommendation: KEEP — provides unique cross-session user modeling that complements local holographic fact_store. Refs #322	2026-04-13 20:18:23 -04:00
Alexander Whitestone	cf36bd2ddf	feat(cron): Add clear_error action and health timestamps Part of #349. Adds clear_error action and includes health timestamps in job format.	2026-04-14 00:18:09 +00:00
Alexander Whitestone	0413fc1788	feat(cron): Comprehensive stale error state handling - mark_job_run: track last_error_at, last_success_at, error_resolved_at - trigger_job: clear stale error state when re-triggering - clear_job_error: manual clearing of stale errors Closes #349	2026-04-14 00:17:45 +00:00
Alexander Whitestone	f1626a932c	feat: add /debug command handler with paste service upload (#320 ) Some checks failed Forge CI / smoke-and-build (pull_request) Failing after 1m1s Details	2026-04-13 22:48:33 +00:00
Alexander Whitestone	d68ab4cff4	feat: add /debug slash command to command registry (#320 )	2026-04-13 22:47:51 +00:00
Alexander Whitestone	3c66333c94	fix(cron): add deploy-crons.py to normalize model field types Some checks failed Forge CI / smoke-and-build (pull_request) Failing after 48s Details Fixes #376 Normalize model field in jobs.json to always be a dict when either model or provider is specified, preventing schema inconsistency.	2026-04-13 22:24:31 +00:00
Timmy Time	87867f3d10	fix: config drift check uses YAML parse not grep (#377 ) Some checks failed Forge CI / smoke-and-build (pull_request) Failing after 59s Details	2026-04-13 22:12:56 +00:00
Alexander Whitestone	69e10967bd	feat: session garbage collection (#315 ) Some checks failed Forge CI / smoke-and-build (pull_request) Failing after 14s Details Add garbage_collect() method to SessionDB that cleans up empty and trivial sessions based on age: - Empty sessions (0 messages) older than 24h - Trivial sessions (1-5 messages) older than 7 days - Sessions with >5 messages kept indefinitely Add `hermes sessions gc` CLI command with: - --empty-hours (default: 24) - --trivial-days (default: 7) - --trivial-max (default: 5) - --source filter - --dry-run preview mode - --yes skip confirmation The dry-run flow: preview what would be deleted, ask for confirmation, then execute. Handles child session FK constraints properly. 7 tests covering: empty/trivial deletion, active session protection, substantial session preservation, dry-run, source filtering, and child session handling. Closes #315	2026-04-13 17:30:39 -04:00
Alexander Whitestone	992498463e	fix: gateway config debt - validation, defaults, fallback chain checks (#328 ) Some checks failed Forge CI / smoke-and-build (pull_request) Failing after 1m32s Details - Expand validate_config_structure() to catch: - fallback_providers format errors (non-list, missing provider/model) - session_reset.idle_minutes <= 0 (causes immediate resets) - session_reset.at_hour out of 0-23 range - API_SERVER enabled without API_SERVER_KEY - Unknown root-level keys that look like misplaced custom_providers fields - Add _validate_fallback_providers() in gateway/config.py to validate fallback chain at gateway startup (logs warnings for malformed entries) - Add API_SERVER_KEY check in gateway config loader (warns on unauthenticated endpoint) - Expand _KNOWN_ROOT_KEYS to include all valid top-level config sections (session_reset, browser, checkpoints, voice, stt, tts, etc.) - Add 13 new tests for fallback_providers and session_reset validation - All existing tests pass (47/47) Closes #328	2026-04-13 17:29:20 -04:00
Alexander Whitestone	ec3cd2081b	fix(poka-yoke): add tool fixation detection (#310 ) Some checks failed Forge CI / smoke-and-build (pull_request) Failing after 26s Details Detect when the same tool is called 5+ times consecutively and inject a nudge advising the agent to diversify its approach. Evidence from empirical audit: - Top marathon session (qwen, 1643 msgs): execute_code streak of 20 - Opus session (1472 msgs): terminal streak of 10 The nudge fires every 5 consecutive calls (5, 10, 15...) so it persists without being spammy. Tracks independently in both sequential and concurrent execution paths.	2026-04-13 10:16:11 -04:00
Alexander Whitestone	110642d86a	fix(poka-yoke): add circuit breaker for error cascading (#309 ) Some checks failed Forge CI / smoke-and-build (pull_request) Failing after 28s Details After 3 consecutive tool errors, inject a warning into the tool result advising the agent to switch strategies. Escalates at 6 and 9+ errors. Empirical data from audit: - P(error \| prev error) = 58.6% vs P(error \| prev success) = 25.2% - 2.33x cascade amplification factor - Max observed streak: 31 consecutive errors Intervention tiers: - 3 errors: advisory warning (try different tool, use terminal, simplify) - 6 errors: urgent stop (halt retries, investigate or switch) - 9+ errors: terminal-only recovery path Tracks errors in both sequential and concurrent execution paths.	2026-04-13 10:12:24 -04:00