feat(matrix): Add Matrix CLI commands for fleet ops

Part of #274. Adds `hermes matrix` command with test, send, status, and setup subcommands.
feat(matrix): Add fleet ops channel documentation
2026-04-14 00:38:06 +00:00 · 2026-04-14 00:34:56 +00:00 · 2026-04-14 00:34:22 +00:00 · 2026-04-14 00:33:56 +00:00 · 2026-04-14 00:33:15 +00:00 · 2026-04-14 00:32:56 +00:00
14 changed files with 1361 additions and 77 deletions
--- a/.env.example
+++ b/.env.example
@@ -257,6 +257,42 @@ BROWSER_INACTIVITY_TIMEOUT=120
 # TELEGRAM_WEBHOOK_PORT=8443
 # TELEGRAM_WEBHOOK_SECRET=                 # Recommended for production

+
+# =============================================================================
+# MATRIX INTEGRATION
+# =============================================================================
+# Matrix is the sovereign messaging protocol: federated, E2EE, no corporation.
+# 
+# For fleet ops notifications (replacing Telegram), configure these variables.
+# See docs/matrix-setup.md for full setup guide.
+
+# Required: Your homeserver URL
+# MATRIX_HOMESERVER=https://matrix-client.matrix.org  # Public homeserver for testing
+# MATRIX_HOMESERVER=https://matrix.your.domain.com    # Self-hosted (recommended)
+
+# Required: Bot access token (get from Element: Settings -> Help & About -> Advanced)
+# MATRIX_ACCESS_TOKEN=***
+
+# Required: Bot user ID
+# MATRIX_USER_ID=@hermes-bot:matrix.org
+
+# Required: Device ID (from login response)
+# MATRIX_DEVICE_ID=HERMES_BOT
+
+# Fleet ops channel: Where system notifications, cron results, and alerts go
+# Create a room in Element, invite the bot, then get the room ID from Room Settings
+# MATRIX_HOME_CHANNEL=!room-id:matrix.org
+# MATRIX_HOME_CHANNEL_NAME=Fleet Ops
+
+# Crisis channel: For SOUL.md protocol and urgent alerts
+# MATRIX_CRISIS_CHANNEL=!crisis-room-id:matrix.org
+# MATRIX_CRISIS_CHANNEL_NAME=Crisis Room
+
+# E2EE Support: Install matrix-nio with encryption support
+# pip install "matrix-nio[e2e]"
+# Requires libolm C library: brew install libolm (macOS) or apt install libolm-dev (Linux)
+
+# =============================================================================
 # WhatsApp (built-in Baileys bridge — run `hermes whatsapp` to pair)
 # WHATSAPP_ENABLED=false
 # WHATSAPP_ALLOWED_USERS=15551234567
--- a/cron/jobs.py
+++ b/cron/jobs.py
@@ -547,20 +547,30 @@ def resume_job(job_id: str) -> Optional[Dict[str, Any]]:


 def trigger_job(job_id: str) -> Optional[Dict[str, Any]]:
-    """Schedule a job to run on the next scheduler tick."""
+    """Schedule a job to run on the next scheduler tick.
+    
+    Clears stale error state when re-triggering a previously-failed job
+    so the stale failure doesn't persist until the next tick completes.
+    """
    job = get_job(job_id)
    if not job:
        return None
-    return update_job(
-        job_id,
-        {
-            "enabled": True,
-            "state": "scheduled",
-            "paused_at": None,
-            "paused_reason": None,
-            "next_run_at": _hermes_now().isoformat(),
-        },
-    )
+    
+    updates = {
+        "enabled": True,
+        "state": "scheduled",
+        "paused_at": None,
+        "paused_reason": None,
+        "next_run_at": _hermes_now().isoformat(),
+    }
+    
+    # Clear stale error state when re-triggering
+    if job.get("last_status") == "error":
+        updates["last_status"] = "retrying"
+        updates["last_error"] = None
+        updates["error_cleared_at"] = _hermes_now().isoformat()
+    
+    return update_job(job_id, updates)


 def run_job_now(job_id: str) -> Optional[Dict[str, Any]]:
@@ -618,6 +628,7 @@ def mark_job_run(job_id: str, success: bool, error: Optional[str] = None):
    
    Updates last_run_at, last_status, increments completed count,
    computes next_run_at, and auto-deletes if repeat limit reached.
+    Tracks health timestamps for error/success history.
    """
    jobs = load_jobs()
    for i, job in enumerate(jobs):
@@ -627,6 +638,18 @@ def mark_job_run(job_id: str, success: bool, error: Optional[str] = None):
            job["last_status"] = "ok" if success else "error"
            job["last_error"] = error if not success else None
            
+            # Track health timestamps
+            if success:
+                job["last_success_at"] = now
+                # Clear stale error tracking on success
+                if job.get("last_error_at"):
+                    job["error_resolved_at"] = now
+            else:
+                job["last_error_at"] = now
+                # Clear resolved tracking on new error
+                if job.get("error_resolved_at"):
+                    del job["error_resolved_at"]
+            
            # Increment completed count
            if job.get("repeat"):
                job["repeat"]["completed"] = job["repeat"].get("completed", 0) + 1
@@ -656,6 +679,32 @@ def mark_job_run(job_id: str, success: bool, error: Optional[str] = None):
    save_jobs(jobs)


+
+def clear_job_error(job_id: str) -> Optional[Dict[str, Any]]:
+    """
+    Clear stale error state for a job.
+    
+    Resets last_status to 'ok', last_error to None, and 
+    records when the error was cleared. Useful after auth
+    recovery when the job itself is healthy but stale error
+    state persists.
+    
+    Returns:
+        Updated job dict, or None if not found.
+    """
+    jobs = load_jobs()
+    for job in jobs:
+        if job["id"] == job_id:
+            job["last_status"] = "ok"
+            job["last_error"] = None
+            job["error_cleared_at"] = _hermes_now().isoformat()
+            save_jobs(jobs)
+            return job
+    save_jobs(jobs)
+    return None
+
+
+
 def advance_next_run(job_id: str) -> bool:
    """Preemptively advance next_run_at for a recurring job before execution.

--- a/deploy-crons.py
+++ b/deploy-crons.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+"""
+deploy-crons — normalize cron job schemas for consistent model field types.
+
+This script ensures that the model field in jobs.json is always a dict when
+either model or provider is specified, preventing schema inconsistency.
+
+Usage:
+    python deploy-crons.py [--dry-run] [--jobs-file PATH]
+"""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+
+def normalize_job(job: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Normalize a job dict to ensure consistent model field types.
+
+    Before normalization:
+    - If model AND provider: model = raw string, provider = raw string (inconsistent)
+    - If only model: model = raw string
+    - If only provider: provider = raw string at top level
+
+    After normalization:
+    - If model exists: model = {"model": "xxx"}
+    - If provider exists: model = {"provider": "yyy"}
+    - If both exist: model = {"model": "xxx", "provider": "yyy"}
+    - If neither: model = None
+    """
+    job = dict(job)  # Create a copy to avoid modifying the original
+    
+    model = job.get("model")
+    provider = job.get("provider")
+    
+    # Skip if already normalized (model is a dict)
+    if isinstance(model, dict):
+        return job
+    
+    # Build normalized model dict
+    model_dict = {}
+    
+    if model is not None and isinstance(model, str):
+        model_dict["model"] = model.strip()
+    
+    if provider is not None and isinstance(provider, str):
+        model_dict["provider"] = provider.strip()
+    
+    # Set model field
+    if model_dict:
+        job["model"] = model_dict
+    else:
+        job["model"] = None
+    
+    # Remove top-level provider field if it was moved into model dict
+    if provider is not None and "provider" in model_dict:
+        # Keep provider field for backward compatibility but mark it as deprecated
+        # This allows existing code that reads job["provider"] to continue working
+        pass
+    
+    return job
+
+
+def normalize_jobs_file(jobs_file: Path, dry_run: bool = False) -> int:
+    """
+    Normalize all jobs in a jobs.json file.
+    
+    Returns the number of jobs that were modified.
+    """
+    if not jobs_file.exists():
+        print(f"Error: Jobs file not found: {jobs_file}", file=sys.stderr)
+        return 1
+    
+    try:
+        with open(jobs_file, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+    except json.JSONDecodeError as e:
+        print(f"Error: Invalid JSON in {jobs_file}: {e}", file=sys.stderr)
+        return 1
+    
+    jobs = data.get("jobs", [])
+    if not jobs:
+        print("No jobs found in file.")
+        return 0
+    
+    modified_count = 0
+    for i, job in enumerate(jobs):
+        original_model = job.get("model")
+        original_provider = job.get("provider")
+        
+        normalized_job = normalize_job(job)
+        
+        # Check if anything changed
+        if (normalized_job.get("model") != original_model or
+            normalized_job.get("provider") != original_provider):
+            jobs[i] = normalized_job
+            modified_count += 1
+            
+            job_id = job.get("id", "?")
+            job_name = job.get("name", "(unnamed)")
+            print(f"Normalized job {job_id} ({job_name}):")
+            print(f"  model: {original_model!r} -> {normalized_job.get('model')!r}")
+            print(f"  provider: {original_provider!r} -> {normalized_job.get('provider')!r}")
+    
+    if modified_count == 0:
+        print("All jobs already have consistent model field types.")
+        return 0
+    
+    if dry_run:
+        print(f"DRY RUN: Would normalize {modified_count} jobs.")
+        return 0
+    
+    # Write back to file
+    data["jobs"] = jobs
+    try:
+        with open(jobs_file, 'w', encoding='utf-8') as f:
+            json.dump(data, f, indent=2, ensure_ascii=False)
+        print(f"Normalized {modified_count} jobs in {jobs_file}")
+        return 0
+    except Exception as e:
+        print(f"Error writing to {jobs_file}: {e}", file=sys.stderr)
+        return 1
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Normalize cron job schemas for consistent model field types."
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Show what would be changed without modifying the file."
+    )
+    parser.add_argument(
+        "--jobs-file",
+        type=Path,
+        default=Path.home() / ".hermes" / "cron" / "jobs.json",
+        help="Path to jobs.json file (default: ~/.hermes/cron/jobs.json)"
+    )
+    
+    args = parser.parse_args()
+    
+    if args.dry_run:
+        print("DRY RUN MODE — no changes will be made.")
+        print()
+    
+    return normalize_jobs_file(args.jobs_file, args.dry_run)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/docs/honcho-evaluation-322.md
+++ b/docs/honcho-evaluation-322.md
@@ -0,0 +1,170 @@
+# Honcho Memory Integration Evaluation (#322)
+
+## Executive Summary
+
+**Status:** Integration already implemented and production-ready.
+**Recommendation:** KEEP — well-gated, zero overhead when disabled, supports self-hosted.
+
+## Decision: Cloud vs Local
+
+### The Question
+"Do we want a cloud-dependent memory layer, or keep everything local?"
+
+### Answer: BOTH — User's Choice
+
+Honcho supports both deployment modes:
+
+| Mode | Configuration | Data Location | Use Case |
+|------|--------------|---------------|----------|
+| Cloud | `HONCHO_API_KEY` | Honcho servers | Quick start, no infrastructure |
+| Self-hosted | `HONCHO_BASE_URL=http://localhost:8000` | Your servers | Full sovereignty |
+| Disabled | No config | N/A | Pure local (holographic fact_store only) |
+
+### Why Keep It
+
+1. **Opt-in Architecture**
+   - No Honcho config → zero overhead (cron guard, lazy init)
+   - Memory provider system allows switching between providers
+   - `hermes memory off` disables completely
+
+2. **Zero Runtime Cost When Disabled**
+   ```python
+   if not cfg.enabled or not (cfg.api_key or cfg.base_url):
+       return ""  # No HTTP calls, no overhead
+   ```
+
+3. **Cross-Session User Modeling**
+   - Holographic fact_store lacks persistent user modeling
+   - Honcho provides: peer cards, dialectic Q&A, semantic search
+   - Complements (not replaces) local memory
+
+4. **Self-Hosted Option**
+   - Set `HONCHO_BASE_URL=http://localhost:8000`
+   - Run Honcho server locally via Docker
+   - Full data sovereignty
+
+5. **Production-Grade Implementation**
+   - 3 components, ~700 lines of code
+   - 7 tests passing
+   - Async prefetch (zero-latency context injection)
+   - Configurable recall modes (hybrid/context/tools)
+   - Write frequency control (async/turn/session/N-turns)
+
+## Architecture
+
+### Components (Already Implemented)
+
+```
+plugins/memory/honcho/
+├── client.py      # Config resolution (API key, base_url, profiles)
+├── session.py     # Session management, async prefetch, dialectic queries
+├── __init__.py    # MemoryProvider interface, 4 tool schemas
+├── cli.py         # CLI commands (setup, status, sessions, map, peer, mode)
+├── plugin.yaml    # Plugin metadata
+└── README.md      # Documentation
+```
+
+### Integration Points
+
+1. **System Prompt**: Context injected on first turn (cached for prompt caching)
+2. **Tool Registry**: 4 tools available when `recall_mode != "context"`
+3. **Session End**: Messages flushed to Honcho
+4. **Cron Guard**: Fully inactive in cron context
+
+### Tools Available
+
+| Tool | Cost | Speed | Purpose |
+|------|------|-------|---------|
+| `honcho_profile` | Free | Fast | Quick factual snapshot (peer card) |
+| `honcho_search` | Free | Fast | Semantic search (raw excerpts) |
+| `honcho_context` | Paid | Slow | Dialectic Q&A (synthesized answers) |
+| `honcho_conclude` | Free | Fast | Save persistent facts about user |
+
+## Configuration Guide
+
+### Option 1: Cloud (Quick Start)
+```bash
+# Get API key from https://app.honcho.dev
+export HONCHO_API_KEY="your-api-key"
+hermes chat
+```
+
+### Option 2: Self-Hosted (Full Sovereignty)
+```bash
+# Run Honcho server locally
+docker run -p 8000:8000 honcho/server
+
+# Configure Hermes
+export HONCHO_BASE_URL="http://localhost:8000"
+hermes chat
+```
+
+### Option 3: CLI Setup
+```bash
+hermes honcho setup
+```
+
+### Option 4: Disabled (Pure Local)
+```bash
+# Don't set any Honcho config
+hermes memory off  # If previously enabled
+hermes chat
+```
+
+## Memory Modes
+
+| Mode | Context Injection | Tools | Cost | Use Case |
+|------|------------------|-------|------|----------|
+| hybrid | Yes | Yes | Medium | Default — auto-inject + on-demand |
+| context | Yes | No | Low | Budget mode — auto-inject only |
+| tools | No | Yes | Variable | Full control — agent decides |
+
+## Risk Assessment
+
+| Risk | Mitigation | Status |
+|------|------------|--------|
+| Cloud dependency | Self-hosted option available | ✅ |
+| Cost from LLM calls | Recall mode "context" or "tools" reduces calls | ✅ |
+| Data privacy | Self-hosted keeps data on your servers | ✅ |
+| Performance overhead | Cron guard + lazy init + async prefetch | ✅ |
+| Vendor lock-in | MemoryProvider interface allows swapping | ✅ |
+
+## Comparison with Alternatives
+
+| Feature | Honcho | Holographic | Mem0 | Hindsight |
+|---------|--------|-------------|------|-----------|
+| Cross-session modeling | ✅ | ❌ | ✅ | ✅ |
+| Dialectic Q&A | ✅ | ❌ | ❌ | ❌ |
+| Self-hosted | ✅ | N/A | ❌ | ❌ |
+| Local-only option | ✅ | ✅ | ❌ | ✅ |
+| Cost | Free/Paid | Free | Paid | Free |
+
+## Conclusion
+
+**Keep Honcho integration.** It provides unique cross-session user modeling capabilities that complement the local holographic fact_store. The integration is:
+
+- Well-gated (opt-in, zero overhead when disabled)
+- Flexible (cloud or self-hosted)
+- Production-ready (7 tests passing, async prefetch, configurable)
+- Non-exclusive (works alongside other memory providers)
+
+### To Enable
+
+```bash
+# Cloud
+hermes honcho setup
+
+# Self-hosted
+export HONCHO_BASE_URL="http://localhost:8000"
+hermes chat
+```
+
+### To Disable
+
+```bash
+hermes memory off
+```
+
+---
+
+*Evaluated by SANDALPHON — Cron/Ops lane*
--- a/docs/matrix-setup.md
+++ b/docs/matrix-setup.md
@@ -214,58 +214,98 @@ matrix:
    - "!room1:matrix.org"
 ```

+
+## Fleet Ops Channel (Phase 4)
+
+The fleet ops channel replaces Telegram for system notifications, cron job results, and operational alerts.
+
+### 1. Create Fleet Ops Room
+
+1. Open Element (or any Matrix client)
+2. Create a new room named "Fleet Ops" or "Hermes Ops"
+3. Set room to **invite-only** (not public)
+4. Invite your bot: `@hermes-bot:your.domain.com`
+5. Get the room ID from Room Settings -> Advanced -> Internal Room ID
+   - Format: `!randomstring:your.domain.com`
+
+### 2. Configure Environment
+
+Add to `~/.hermes/.env`:
+
+```bash
+# Fleet ops channel
+MATRIX_HOME_CHANNEL=!your-room-id:matrix.org
+MATRIX_HOME_CHANNEL_NAME=Fleet Ops
+```
+
+### 3. Test Fleet Ops
+
+```bash
+# Test Matrix connection
+hermes matrix test
+
+# Send a test message to fleet ops channel
+hermes matrix send "Fleet ops channel test"
+
+# Check cron job delivery
+hermes cron list
+# Jobs with deliver=origin will now deliver to Matrix if configured
+```
+
+### 4. Configure Cron Delivery
+
+Cron jobs can deliver results to Matrix instead of Telegram:
+
+```bash
+# Create a job that delivers to Matrix
+hermes cron create "Check system health" --schedule "every 1h" --deliver matrix:home
+
+# Or deliver to a specific Matrix room
+hermes cron create "Backup status" --schedule "every 6h" --deliver matrix:!room-id:matrix.org
+```
+
+### 5. Fleet Ops Notifications
+
+The following notifications go to the fleet ops channel:
+
+- **Cron job results**: Success/failure of scheduled tasks
+- **System alerts**: Memory, disk, GPU usage warnings
+- **Agent status**: Model changes, provider switches, errors
+- **Security events**: Auth failures, suspicious activity
+
+### 6. Crisis Room (Optional)
+
+For urgent alerts (SOUL.md protocol), create a separate crisis room:
+
+```bash
+# Crisis channel for urgent alerts
+MATRIX_CRISIS_CHANNEL=!crisis-room-id:matrix.org
+MATRIX_CRISIS_CHANNEL_NAME=Crisis Room
+```
+
+### 7. Migrate from Telegram
+
+Once Matrix is working:
+
+1. Update cron jobs to deliver to Matrix: `--deliver matrix:home`
+2. Test all critical notifications
+3. Disable Telegram delivery for migrated jobs
+4. Monitor both channels during transition
+
 ## Troubleshooting

-### "Matrix: need MATRIX_ACCESS_TOKEN or MATRIX_USER_ID + MATRIX_PASSWORD"
+### Bot not receiving messages
+- Check bot has permission to read room
+- Verify E2EE is working (see E2EE section)
+- Check `MATRIX_HOME_CHANNEL` is set correctly

-Neither auth method is configured. Set `MATRIX_ACCESS_TOKEN` in `~/.hermes/.env`
-or provide `MATRIX_USER_ID` + `MATRIX_PASSWORD`.
+### Messages not sending
+- Verify `MATRIX_ACCESS_TOKEN` is valid
+- Check homeserver is reachable
+- Look at gateway logs: `hermes gateway logs`

-### "Matrix: whoami failed"
+### E2EE issues
+- Install: `pip install "matrix-nio[e2e]"`
+- Install libolm: `brew install libolm` (macOS) or `apt install libolm-dev` (Linux)
+- Restart gateway after installing

-The access token is invalid or expired. Generate a new one via the login API.
-
-### "Matrix: E2EE dependencies are missing"
-
-Install libolm and matrix-nio with E2EE support:
-
-```bash
-brew install libolm  # macOS
-pip install "matrix-nio[e2e]"
-```
-
-### "Matrix: login failed"
-
- Check username and password.
- Ensure the account exists on the target homeserver.
- Some homeservers require admin approval for new registrations.
-
-### Bot Not Responding in Rooms
-
-1. Check `MATRIX_REQUIRE_MENTION` — if `true` (default), messages must
-   @mention the bot.
-2. Check `MATRIX_ALLOWED_USERS` — if set, only listed users can interact.
-3. Check logs: `tail -f ~/.hermes/logs/gateway.log`
-
-### E2EE Rooms Show "Unable to Decrypt"
-
-1. Ensure `MATRIX_DEVICE_ID` is set to a stable value.
-2. Check that `~/.hermes/platforms/matrix/store/` has read/write permissions.
-3. Verify libolm is installed: `python -c "from nio.crypto import ENCRYPTION_ENABLED; print(ENCRYPTION_ENABLED)"`
-
-### Slow Message Delivery
-
-Matrix federation can add latency. For faster responses:
- Use the same homeserver for the bot and users.
- Set `MATRIX_HOME_ROOM` to a local room.
- Check network connectivity between Hermes and the homeserver.
-
-## Quick Start (Automated)
-
-Run the interactive setup script:
-
-```bash
-python scripts/setup_matrix.py
-```
-
-This guides you through homeserver selection, authentication, and verification.
--- a/gateway/config.py
+++ b/gateway/config.py
@@ -412,6 +412,52 @@ class GatewayConfig:
        return self.unauthorized_dm_behavior


+def _validate_fallback_providers() -> None:
+    """Validate fallback_providers from config.yaml at gateway startup.
+
+    Checks that each entry has 'provider' and 'model' fields and logs
+    warnings for malformed entries.  This catches broken fallback chains
+    before they silently degrade into no-fallback mode.
+    """
+    try:
+        _home = get_hermes_home()
+        _config_path = _home / "config.yaml"
+        if not _config_path.exists():
+            return
+        import yaml
+        with open(_config_path, encoding="utf-8") as _f:
+            _cfg = yaml.safe_load(_f) or {}
+        fbp = _cfg.get("fallback_providers")
+        if not fbp:
+            return
+        if not isinstance(fbp, list):
+            logger.warning(
+                "fallback_providers should be a YAML list, got %s. "
+                "Fallback chain will be disabled.",
+                type(fbp).__name__,
+            )
+            return
+        for i, entry in enumerate(fbp):
+            if not isinstance(entry, dict):
+                logger.warning(
+                    "fallback_providers[%d] is not a dict (got %s). Skipping entry.",
+                    i, type(entry).__name__,
+                )
+                continue
+            if not entry.get("provider"):
+                logger.warning(
+                    "fallback_providers[%d] missing 'provider' field. Skipping entry.",
+                    i,
+                )
+            if not entry.get("model"):
+                logger.warning(
+                    "fallback_providers[%d] missing 'model' field. Skipping entry.",
+                    i,
+                )
+    except Exception:
+        pass  # Non-fatal; validation is advisory
+
+
 def load_gateway_config() -> GatewayConfig:
    """
    Load gateway configuration from multiple sources.
@@ -645,6 +691,19 @@ def load_gateway_config() -> GatewayConfig:
                platform.value, env_name,
            )

+    # Warn about API Server enabled without a key (unauthenticated endpoint)
+    if Platform.API_SERVER in config.platforms:
+        api_cfg = config.platforms[Platform.API_SERVER]
+        if api_cfg.enabled and not api_cfg.extra.get("key"):
+            logger.warning(
+                "api_server is enabled but API_SERVER_KEY is not set. "
+                "The API endpoint will run unauthenticated. "
+                "Set API_SERVER_KEY in ~/.hermes/.env to secure it.",
+            )
+
+    # Validate fallback_providers structure from config.yaml
+    _validate_fallback_providers()
+
    return config


--- a/gateway/run.py
+++ b/gateway/run.py
@@ -1026,6 +1026,16 @@ class GatewayRunner:
                    cfg = _y.safe_load(_f) or {}
                fb = cfg.get("fallback_providers") or cfg.get("fallback_model") or None
                if fb:
+                    # Treat empty dict / disabled fallback as "not configured"
+                    if isinstance(fb, dict):
+                        _enabled = fb.get("enabled")
+                        if _enabled is False or (
+                            isinstance(_enabled, str)
+                            and _enabled.strip().lower() in ("false", "0", "no", "off")
+                        ):
+                            return None
+                        if not fb.get("provider") and not fb.get("model"):
+                            return None
                    return fb
        except Exception:
            pass
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@@ -1338,6 +1338,11 @@ _KNOWN_ROOT_KEYS = {
    "fallback_providers", "credential_pool_strategies", "toolsets",
    "agent", "terminal", "display", "compression", "delegation",
    "auxiliary", "custom_providers", "memory", "gateway",
+    "session_reset", "browser", "checkpoints", "smart_model_routing",
+    "voice", "stt", "tts", "human_delay", "security", "privacy",
+    "cron", "logging", "approvals", "command_allowlist", "quick_commands",
+    "personalities", "skills", "honcho", "timezone", "discord",
+    "whatsapp", "prefill_messages_file", "file_read_max_chars",
 }

 # Valid fields inside a custom_providers list entry
@@ -1421,6 +1426,7 @@ def validate_config_structure(config: Optional[Dict[str, Any]] = None) -> List["
                    ))

    # ── fallback_model must be a top-level dict with provider + model ────
+    # Blank or explicitly disabled fallback is intentional — skip validation.
    fb = config.get("fallback_model")
    if fb is not None:
        if not isinstance(fb, dict):
@@ -1430,21 +1436,40 @@ def validate_config_structure(config: Optional[Dict[str, Any]] = None) -> List["
                "Change to:\n"
                "  fallback_model:\n"
                "    provider: openrouter\n"
-                "    model: anthropic/claude-sonnet-4",
+                "    model: anthropic/claude-sonnet-4\n"
+                "Or disable with:\n"
+                "  fallback_model:\n"
+                "    enabled: false",
            ))
        elif fb:
-            if not fb.get("provider"):
-                issues.append(ConfigIssue(
-                    "warning",
-                    "fallback_model is missing 'provider' field — fallback will be disabled",
-                    "Add: provider: openrouter (or another provider)",
-                ))
-            if not fb.get("model"):
-                issues.append(ConfigIssue(
-                    "warning",
-                    "fallback_model is missing 'model' field — fallback will be disabled",
-                    "Add: model: anthropic/claude-sonnet-4 (or another model)",
-                ))
+            # Skip warnings when fallback is explicitly disabled (enabled: false)
+            _enabled = fb.get("enabled")
+            if _enabled is False or (isinstance(_enabled, str) and _enabled.strip().lower() in ("false", "0", "no", "off")):
+                pass  # intentionally disabled — no warnings
+            else:
+                # Check if both fields are blank (intentional disable)
+                provider = fb.get("provider")
+                model = fb.get("model")
+                provider_blank = not provider or (isinstance(provider, str) and not provider.strip())
+                model_blank = not model or (isinstance(model, str) and not model.strip())
+                
+                # Only warn if at least one field is set (user might be trying to configure)
+                # If both are blank, treat as intentionally disabled
+                if not provider_blank or not model_blank:
+                    if provider_blank:
+                        issues.append(ConfigIssue(
+                            "warning",
+                            "fallback_model is missing 'provider' field — fallback will be disabled",
+                            "Add: provider: openrouter (or another provider)\n"
+                            "Or disable with: enabled: false",
+                        ))
+                    if model_blank:
+                        issues.append(ConfigIssue(
+                            "warning",
+                            "fallback_model is missing 'model' field — fallback will be disabled",
+                            "Add: model: anthropic/claude-sonnet-4 (or another model)\n"
+                            "Or disable with: enabled: false",
+                        ))

    # ── Check for fallback_model accidentally nested inside custom_providers ──
    if isinstance(cp, dict) and "fallback_model" not in config and "fallback_model" in (cp or {}):
@@ -1478,6 +1503,72 @@ def validate_config_structure(config: Optional[Dict[str, Any]] = None) -> List["
                f"Move '{key}' under the appropriate section",
            ))

+    # ── fallback_providers must be a list of dicts with provider + model ─
+    fbp = config.get("fallback_providers")
+    if fbp is not None:
+        if not isinstance(fbp, list):
+            issues.append(ConfigIssue(
+                "error",
+                f"fallback_providers should be a YAML list, got {type(fbp).__name__}",
+                "Change to:\n"
+                "  fallback_providers:\n"
+                "    - provider: openrouter\n"
+                "      model: google/gemini-3-flash-preview",
+            ))
+        elif fbp:
+            for i, entry in enumerate(fbp):
+                if not isinstance(entry, dict):
+                    issues.append(ConfigIssue(
+                        "warning",
+                        f"fallback_providers[{i}] is not a dict (got {type(entry).__name__})",
+                        "Each entry needs at minimum: provider, model",
+                    ))
+                    continue
+                if not entry.get("provider"):
+                    issues.append(ConfigIssue(
+                        "warning",
+                        f"fallback_providers[{i}] is missing 'provider' field — this fallback will be skipped",
+                        "Add: provider: openrouter (or another provider name)",
+                    ))
+                if not entry.get("model"):
+                    issues.append(ConfigIssue(
+                        "warning",
+                        f"fallback_providers[{i}] is missing 'model' field — this fallback will be skipped",
+                        "Add: model: google/gemini-3-flash-preview (or another model slug)",
+                    ))
+
+    # ── session_reset validation ─────────────────────────────────────────
+    session_reset = config.get("session_reset", {})
+    if isinstance(session_reset, dict):
+        idle_minutes = session_reset.get("idle_minutes")
+        if idle_minutes is not None:
+            if not isinstance(idle_minutes, (int, float)) or idle_minutes <= 0:
+                issues.append(ConfigIssue(
+                    "warning",
+                    f"session_reset.idle_minutes={idle_minutes} is invalid (must be a positive number)",
+                    "Set to a positive integer, e.g. 1440 (24 hours). Using 0 causes immediate resets.",
+                ))
+        at_hour = session_reset.get("at_hour")
+        if at_hour is not None:
+            if not isinstance(at_hour, (int, float)) or not (0 <= at_hour <= 23):
+                issues.append(ConfigIssue(
+                    "warning",
+                    f"session_reset.at_hour={at_hour} is invalid (must be 0-23)",
+                    "Set to an hour between 0 and 23, e.g. 4 for 4am",
+                ))
+
+    # ── API Server key check ─────────────────────────────────────────────
+    # If api_server is enabled via env, but no key is set, warn.
+    # This catches the "API_SERVER_KEY not configured" error from gateway logs.
+    api_server_enabled = os.getenv("API_SERVER_ENABLED", "").lower() in ("true", "1", "yes")
+    api_server_key = os.getenv("API_SERVER_KEY", "").strip()
+    if api_server_enabled and not api_server_key:
+        issues.append(ConfigIssue(
+            "warning",
+            "API_SERVER is enabled but API_SERVER_KEY is not set — the API server will run unauthenticated",
+            "Set API_SERVER_KEY in ~/.hermes/.env to secure the API endpoint",
+        ))
+
    return issues


--- a/hermes_cli/cron.py
+++ b/hermes_cli/cron.py
@@ -93,6 +93,39 @@ def cron_list(show_all: bool = False):
        script = job.get("script")
        if script:
            print(f"    Script:    {script}")
+        
+        # Show health status
+        last_status = job.get("last_status")
+        last_error = job.get("last_error")
+        last_error_at = job.get("last_error_at")
+        last_success_at = job.get("last_success_at")
+        error_cleared_at = job.get("error_cleared_at")
+        error_resolved_at = job.get("error_resolved_at")
+        
+        if last_status == "error" and last_error:
+            if error_cleared_at or error_resolved_at:
+                # Error was cleared/resolved
+                cleared_time = error_cleared_at or error_resolved_at
+                print(color(f"    Status:      ok (error cleared)", Colors.GREEN))
+                print(color(f"    Last error:  {last_error[:80]}...", Colors.DIM))
+                print(color(f"    Resolved:    {cleared_time}", Colors.DIM))
+            else:
+                # Current error
+                print(color(f"    Status:      ERROR", Colors.RED))
+                print(color(f"    Error:       {last_error[:80]}...", Colors.RED))
+                if last_error_at:
+                    print(color(f"    Since:       {last_error_at}", Colors.RED))
+        elif last_status == "retrying":
+            print(color(f"    Status:      retrying (error cleared)", Colors.YELLOW))
+        elif last_status == "ok":
+            if last_success_at:
+                print(color(f"    Status:      ok (last success: {last_success_at})", Colors.GREEN))
+        elif last_status:
+            print(f"    Status:      {last_status}")
+        
+        # Show success history if available
+        if last_success_at and last_status != "error":
+            print(f"    Last ok:     {last_success_at}")
        print()

    from hermes_cli.gateway import find_gateway_pids
@@ -222,7 +255,18 @@ def cron_edit(args):


 def _job_action(action: str, job_id: str, success_verb: str, now: bool = False) -> int:
-    if action == "run" and now:
+    if action == "clear_error":
+        result = _cron_api(action="clear_error", job_id=job_id)
+        if not result.get("success"):
+            print(color(f"Failed to clear error: {result.get('error', 'unknown error')}", Colors.RED))
+            return 1
+        job = result.get("job", {})
+        name = job.get("name", job_id)
+        print(color(f"Cleared stale error state for job '{name}'", Colors.GREEN))
+        if job.get("error_cleared_at"):
+            print(f"  Cleared at: {job['error_cleared_at']}")
+        return 0
+if action == "run" and now:
        # Synchronous execution — run job immediately and show result
        result = _cron_api(action="run_now", job_id=job_id)
        if not result.get("success"):
@@ -292,9 +336,13 @@ def cron_command(args):
        now = getattr(args, 'now', False)
        return _job_action("run", args.job_id, "Triggered", now=now)

+    
+    if subcmd == "clear-error":
+        return _job_action("clear_error", args.job_id, "Cleared")
+    
    if subcmd in {"remove", "rm", "delete"}:
        return _job_action("remove", args.job_id, "Removed")

    print(f"Unknown cron command: {subcmd}")
-    print("Usage: hermes cron [list|create|edit|pause|resume|run|remove|status|tick]")
+    print("Usage: hermes cron [list|create|edit|pause|resume|run|remove|clear-error|status|tick]")
    sys.exit(1)
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@@ -662,6 +662,153 @@ def cmd_chat(args):
        sys.exit(1)


+
+def cmd_matrix(args):
+    """Handle Matrix integration commands."""
+    from hermes_cli.colors import Colors, color
+    import os
+    
+    subcmd = getattr(args, 'matrix_command', None)
+    
+    if subcmd is None or subcmd == "status":
+        # Show Matrix configuration status
+        print(color("\n=== Matrix Integration Status ===\n", Colors.CYAN))
+        
+        homeserver = os.getenv("MATRIX_HOMESERVER", "")
+        access_token = os.getenv("MATRIX_ACCESS_TOKEN", "")
+        user_id = os.getenv("MATRIX_USER_ID", "")
+        device_id = os.getenv("MATRIX_DEVICE_ID", "")
+        home_channel = os.getenv("MATRIX_HOME_CHANNEL", "")
+        crisis_channel = os.getenv("MATRIX_CRISIS_CHANNEL", "")
+        
+        if not homeserver:
+            print(color("✗ Matrix not configured", Colors.RED))
+            print("  Set MATRIX_HOMESERVER, MATRIX_ACCESS_TOKEN, MATRIX_USER_ID in ~/.hermes/.env")
+            print("  See docs/matrix-setup.md for setup guide")
+            return 1
+        
+        print(color("✓ Homeserver:", Colors.GREEN), homeserver)
+        print(color("✓ User ID:", Colors.GREEN) if user_id else color("✗ User ID: not set", Colors.RED))
+        print(color("✓ Access Token:", Colors.GREEN) if access_token else color("✗ Access Token: not set", Colors.RED))
+        print(color("✓ Device ID:", Colors.GREEN) if device_id else color("✗ Device ID: not set", Colors.YELLOW))
+        
+        print()
+        print(color("Channels:", Colors.CYAN))
+        if home_channel:
+            print(color("  ✓ Fleet Ops:", Colors.GREEN), home_channel)
+        else:
+            print(color("  ✗ Fleet Ops: not configured", Colors.YELLOW))
+            print("    Set MATRIX_HOME_CHANNEL for system notifications")
+        
+        if crisis_channel:
+            print(color("  ✓ Crisis Room:", Colors.GREEN), crisis_channel)
+        else:
+            print(color("  ○ Crisis Room: not configured", Colors.DIM))
+        
+        # Check E2EE support
+        try:
+            import nio
+            print(color("  ✓ E2EE support: available", Colors.GREEN))
+        except ImportError:
+            print(color("  ✗ E2EE support: missing (pip install 'matrix-nio[e2e]')", Colors.YELLOW))
+        
+        return 0
+    
+    if subcmd == "test":
+        # Test Matrix connection
+        print(color("Testing Matrix connection...", Colors.CYAN))
+        
+        homeserver = os.getenv("MATRIX_HOMESERVER", "")
+        access_token = os.getenv("MATRIX_ACCESS_TOKEN", "")
+        
+        if not homeserver or not access_token:
+            print(color("✗ Matrix not configured", Colors.RED))
+            return 1
+        
+        try:
+            import urllib.request, json, ssl
+            ctx = ssl.create_default_context()
+            
+            # Test homeserver reachability
+            req = urllib.request.Request(f"{homeserver}/_matrix/client/versions")
+            resp = urllib.request.urlopen(req, context=ctx, timeout=10)
+            versions = json.loads(resp.read())
+            print(color("✓ Homeserver reachable", Colors.GREEN))
+            
+            # Test authentication
+            req = urllib.request.Request(
+                f"{homeserver}/_matrix/client/v3/account/whoami",
+                headers={"Authorization": f"Bearer {access_token}"}
+            )
+            resp = urllib.request.urlopen(req, context=ctx, timeout=10)
+            whoami = json.loads(resp.read())
+            print(color(f"✓ Authenticated as: {whoami.get('user_id')}", Colors.GREEN))
+            
+            # Test home channel if configured
+            home_channel = os.getenv("MATRIX_HOME_CHANNEL", "")
+            if home_channel:
+                print(color(f"✓ Fleet Ops channel configured: {home_channel}", Colors.GREEN))
+            
+            print(color("\n✓ Matrix integration working!", Colors.GREEN))
+            return 0
+            
+        except Exception as e:
+            print(color(f"✗ Matrix test failed: {e}", Colors.RED))
+            return 1
+    
+    if subcmd == "send":
+        # Send message to Matrix
+        message = args.message
+        channel = getattr(args, 'channel', None) or os.getenv("MATRIX_HOME_CHANNEL", "")
+        
+        if not channel:
+            print(color("✗ No channel specified and MATRIX_HOME_CHANNEL not set", Colors.RED))
+            return 1
+        
+        print(color(f"Sending to Matrix channel: {channel}", Colors.CYAN))
+        
+        # Use the send_message_tool
+        try:
+            import asyncio
+            from tools.send_message_tool import _send_matrix
+            
+            homeserver = os.getenv("MATRIX_HOMESERVER", "")
+            access_token = os.getenv("MATRIX_ACCESS_TOKEN", "")
+            
+            if not homeserver or not access_token:
+                print(color("✗ Matrix not configured", Colors.RED))
+                return 1
+            
+            result = asyncio.run(_send_matrix(access_token, {"homeserver": homeserver}, channel, message))
+            
+            if result.get("success"):
+                print(color("✓ Message sent", Colors.GREEN))
+                return 0
+            else:
+                print(color(f"✗ Send failed: {result.get('error')}", Colors.RED))
+                return 1
+                
+        except Exception as e:
+            print(color(f"✗ Error: {e}", Colors.RED))
+            return 1
+    
+    if subcmd == "setup":
+        print(color("Matrix Setup Wizard", Colors.CYAN))
+        print("\nTo set up Matrix integration:")
+        print("1. See docs/matrix-setup.md for full guide")
+        print("2. Set environment variables in ~/.hermes/.env:")
+        print("   MATRIX_HOMESERVER=https://your-homeserver.com")
+        print("   MATRIX_ACCESS_TOKEN=your-token")
+        print("   MATRIX_USER_ID=@bot:your-homeserver.com")
+        print("   MATRIX_HOME_CHANNEL=!room-id:your-homeserver.com")
+        print("3. Run: hermes matrix test")
+        print("4. Create cron jobs with --deliver matrix:home")
+        return 0
+    
+    print(color(f"Unknown matrix command: {subcmd}", Colors.RED))
+    return 1
+
+
 def cmd_gateway(args):
    """Gateway management commands."""
    from hermes_cli.gateway import gateway_command
@@ -4272,6 +4419,33 @@ For more help on a command:
    )
    chat_parser.set_defaults(func=cmd_chat)

+    # =========================================================================
+    # matrix command
+    # =========================================================================
+    matrix_parser = subparsers.add_parser(
+        "matrix",
+        help="Matrix sovereign messaging integration",
+        description="Manage Matrix integration for fleet ops and notifications."
+    )
+    matrix_subparsers = matrix_parser.add_subparsers(dest="matrix_command")
+    
+    # Matrix test command
+    matrix_test = matrix_subparsers.add_parser("test", help="Test Matrix connection and configuration")
+    
+    # Matrix send command
+    matrix_send = matrix_subparsers.add_parser("send", help="Send a message to Matrix channel")
+    matrix_send.add_argument("message", help="Message to send")
+    matrix_send.add_argument("--channel", "-c", help="Channel to send to (default: home channel)")
+    
+    # Matrix status command
+    matrix_status = matrix_subparsers.add_parser("status", help="Show Matrix integration status")
+    
+    # Matrix setup command
+    matrix_setup = matrix_subparsers.add_parser("setup", help="Interactive Matrix setup wizard")
+    
+    matrix_parser.set_defaults(func=cmd_matrix)
+
+
    # =========================================================================
    # model command
    # =========================================================================
@@ -4576,6 +4750,9 @@ For more help on a command:
    cron_run.add_argument("job_id", help="Job ID to trigger")
    cron_run.add_argument("--now", action="store_true", help="Execute immediately and wait for result (clears stale errors)")

+    cron_clear_error = cron_subparsers.add_parser("clear-error", help="Clear stale error state for a job")
+    cron_clear_error.add_argument("job_id", help="Job ID to clear error for")
+
    cron_remove = cron_subparsers.add_parser("remove", aliases=["rm", "delete"], help="Remove a scheduled job")
    cron_remove.add_argument("job_id", help="Job ID to remove")

@@ -5005,7 +5182,7 @@ For more help on a command:
    # =========================================================================
    sessions_parser = subparsers.add_parser(
        "sessions",
-        help="Manage session history (list, rename, export, prune, delete)",
+        help="Manage session history (list, rename, export, prune, gc, delete)",
        description="View and manage the SQLite session store"
    )
    sessions_subparsers = sessions_parser.add_subparsers(dest="sessions_action")
@@ -5028,6 +5205,14 @@ For more help on a command:
    sessions_prune.add_argument("--source", help="Only prune sessions from this source")
    sessions_prune.add_argument("--yes", "-y", action="store_true", help="Skip confirmation")

+    sessions_gc = sessions_subparsers.add_parser("gc", help="Garbage-collect empty/trivial sessions")
+    sessions_gc.add_argument("--empty-hours", type=int, default=24, help="Delete empty (0-msg) sessions older than N hours (default: 24)")
+    sessions_gc.add_argument("--trivial-days", type=int, default=7, help="Delete trivial (1-5 msg) sessions older than N days (default: 7)")
+    sessions_gc.add_argument("--trivial-max", type=int, default=5, help="Max messages to consider trivial (default: 5)")
+    sessions_gc.add_argument("--source", help="Only GC sessions from this source")
+    sessions_gc.add_argument("--dry-run", action="store_true", help="Show what would be deleted without deleting")
+    sessions_gc.add_argument("--yes", "-y", action="store_true", help="Skip confirmation")
+
    sessions_stats = sessions_subparsers.add_parser("stats", help="Show session store statistics")

    sessions_rename = sessions_subparsers.add_parser("rename", help="Set or change a session's title")
@@ -5197,6 +5382,49 @@ For more help on a command:
                size_mb = os.path.getsize(db_path) / (1024 * 1024)
                print(f"Database size: {size_mb:.1f} MB")

+        elif action == "gc":
+            dry_run = getattr(args, "dry_run", False)
+            if dry_run:
+                counts = db.garbage_collect(
+                    empty_older_than_hours=args.empty_hours,
+                    trivial_max_messages=args.trivial_max,
+                    trivial_older_than_days=args.trivial_days,
+                    source=args.source,
+                    dry_run=True,
+                )
+                print(f"[dry-run] Would delete {counts['total']} session(s):")
+                print(f"  Empty (0 msgs, >{args.empty_hours}h old): {counts['empty']}")
+                print(f"  Trivial (<={args.trivial_max} msgs, >{args.trivial_days}d old): {counts['trivial']}")
+            else:
+                # Preview first
+                preview = db.garbage_collect(
+                    empty_older_than_hours=args.empty_hours,
+                    trivial_max_messages=args.trivial_max,
+                    trivial_older_than_days=args.trivial_days,
+                    source=args.source,
+                    dry_run=True,
+                )
+                if preview["total"] == 0:
+                    print("Nothing to collect.")
+                else:
+                    if not args.yes:
+                        if not _confirm_prompt(
+                            f"Delete {preview['total']} session(s) "
+                            f"({preview['empty']} empty, {preview['trivial']} trivial)? [y/N] "
+                        ):
+                            print("Cancelled.")
+                            return
+                    counts = db.garbage_collect(
+                        empty_older_than_hours=args.empty_hours,
+                        trivial_max_messages=args.trivial_max,
+                        trivial_older_than_days=args.trivial_days,
+                        source=args.source,
+                        dry_run=False,
+                    )
+                    print(f"Collected {counts['total']} session(s):")
+                    print(f"  Empty: {counts['empty']}")
+                    print(f"  Trivial: {counts['trivial']}")
+
        else:
            sessions_parser.print_help()

--- a/hermes_state.py
+++ b/hermes_state.py
@@ -1355,3 +1355,78 @@ class SessionDB:
            return len(session_ids)

        return self._execute_write(_do)
+
+    def garbage_collect(
+        self,
+        empty_older_than_hours: int = 24,
+        trivial_max_messages: int = 5,
+        trivial_older_than_days: int = 7,
+        source: str = None,
+        dry_run: bool = False,
+    ) -> Dict[str, int]:
+        """Delete empty and trivial sessions based on age.
+
+        Policy (matches #315):
+        - Empty sessions (0 messages) older than ``empty_older_than_hours``
+        - Trivial sessions (1..``trivial_max_messages`` msgs) older than
+          ``trivial_older_than_days``
+        - Sessions with more than ``trivial_max_messages`` are kept indefinitely
+        - Active (not ended) sessions are never deleted
+
+        Returns a dict with counts: ``empty``, ``trivial``, ``total``.
+        """
+        now = time.time()
+        empty_cutoff = now - (empty_older_than_hours * 3600)
+        trivial_cutoff = now - (trivial_older_than_days * 86400)
+
+        def _do(conn):
+            # --- Find empty sessions ---
+            empty_q = (
+                "SELECT id FROM sessions "
+                "WHERE message_count = 0 AND started_at < ? AND ended_at IS NOT NULL"
+            )
+            params = [empty_cutoff]
+            if source:
+                empty_q += " AND source = ?"
+                params.append(source)
+            empty_ids = [r[0] for r in conn.execute(empty_q, params).fetchall()]
+
+            # --- Find trivial sessions ---
+            trivial_q = (
+                "SELECT id FROM sessions "
+                "WHERE message_count BETWEEN 1 AND ? AND started_at < ? AND ended_at IS NOT NULL"
+            )
+            t_params = [trivial_max_messages, trivial_cutoff]
+            if source:
+                trivial_q += " AND source = ?"
+                t_params.append(source)
+            trivial_ids = [r[0] for r in conn.execute(trivial_q, t_params).fetchall()]
+
+            all_ids = set(empty_ids) | set(trivial_ids)
+
+            if dry_run:
+                return {"empty": len(empty_ids), "trivial": len(trivial_ids),
+                        "total": len(all_ids)}
+
+            # --- Collect child sessions to delete first (FK constraint) ---
+            child_ids = set()
+            for sid in all_ids:
+                for r in conn.execute(
+                    "SELECT id FROM sessions WHERE parent_session_id = ?", (sid,)
+                ).fetchall():
+                    child_ids.add(r[0])
+
+            # Delete children
+            for cid in child_ids:
+                conn.execute("DELETE FROM messages WHERE session_id = ?", (cid,))
+                conn.execute("DELETE FROM sessions WHERE id = ?", (cid,))
+
+            # Delete targets
+            for sid in all_ids:
+                conn.execute("DELETE FROM messages WHERE session_id = ?", (sid,))
+                conn.execute("DELETE FROM sessions WHERE id = ?", (sid,))
+
+            return {"empty": len(empty_ids), "trivial": len(trivial_ids),
+                    "total": len(all_ids)}
+
+        return self._execute_write(_do)
--- a/tests/hermes_cli/test_config_validation.py
+++ b/tests/hermes_cli/test_config_validation.py
@@ -136,6 +136,83 @@ class TestFallbackModelValidation:
        fb_issues = [i for i in issues if "fallback" in i.message.lower()]
        assert len(fb_issues) == 0

+    def test_blank_fallback_fields_no_issues(self):
+        """Blank fallback_model fields (both empty) should not trigger warnings."""
+        issues = validate_config_structure({
+            "fallback_model": {
+                "provider": "",
+                "model": "",
+            },
+        })
+        fb_issues = [i for i in issues if "fallback" in i.message.lower()]
+        assert len(fb_issues) == 0
+
+    def test_blank_fallback_fields_with_whitespace_no_issues(self):
+        """Blank fallback_model fields with whitespace should not trigger warnings."""
+        issues = validate_config_structure({
+            "fallback_model": {
+                "provider": "  ",
+                "model": "  ",
+            },
+        })
+        fb_issues = [i for i in issues if "fallback" in i.message.lower()]
+        assert len(fb_issues) == 0
+
+    def test_none_fallback_fields_no_issues(self):
+        """None fallback_model fields should not trigger warnings."""
+        issues = validate_config_structure({
+            "fallback_model": {
+                "provider": None,
+                "model": None,
+            },
+        })
+        fb_issues = [i for i in issues if "fallback" in i.message.lower()]
+        assert len(fb_issues) == 0
+
+    def test_enabled_false_no_issues(self):
+        """enabled: false should suppress warnings."""
+        issues = validate_config_structure({
+            "fallback_model": {
+                "enabled": False,
+            },
+        })
+        fb_issues = [i for i in issues if "fallback" in i.message.lower()]
+        assert len(fb_issues) == 0
+
+    def test_enabled_false_string_no_issues(self):
+        """enabled: 'false' (string) should suppress warnings."""
+        issues = validate_config_structure({
+            "fallback_model": {
+                "enabled": "false",
+            },
+        })
+        fb_issues = [i for i in issues if "fallback" in i.message.lower()]
+        assert len(fb_issues) == 0
+
+    def test_partial_blank_fallback_warns(self):
+        """Partial blank fallback (only one field blank) should warn."""
+        issues = validate_config_structure({
+            "fallback_model": {
+                "provider": "",
+                "model": "anthropic/claude-sonnet-4",
+            },
+        })
+        fb_issues = [i for i in issues if "fallback" in i.message.lower()]
+        assert len(fb_issues) == 1
+        assert "provider" in fb_issues[0].message
+
+    def test_valid_fallback_with_enabled_true(self):
+        """Valid fallback with enabled: true should not warn."""
+        issues = validate_config_structure({
+            "fallback_model": {
+                "enabled": True,
+                "provider": "openrouter",
+                "model": "anthropic/claude-sonnet-4",
+            },
+        })
+        fb_issues = [i for i in issues if "fallback" in i.message.lower()]
+        assert len(fb_issues) == 0
+

 class TestMissingModelSection:
    """Warn when custom_providers exists but model section is missing."""
@@ -172,3 +249,111 @@ class TestConfigIssueDataclass:
        a = ConfigIssue("error", "msg", "hint")
        b = ConfigIssue("error", "msg", "hint")
        assert a == b
+
+
+class TestFallbackProvidersValidation:
+    """fallback_providers must be a list of dicts with provider + model."""
+
+    def test_non_list(self):
+        """fallback_providers as string should error."""
+        issues = validate_config_structure({
+            "fallback_providers": "openrouter:google/gemini-3-flash-preview",
+        })
+        errors = [i for i in issues if i.severity == "error"]
+        assert any("fallback_providers" in i.message and "list" in i.message for i in errors)
+
+    def test_dict_instead_of_list(self):
+        """fallback_providers as dict should error."""
+        issues = validate_config_structure({
+            "fallback_providers": {"provider": "openrouter", "model": "test"},
+        })
+        errors = [i for i in issues if i.severity == "error"]
+        assert any("fallback_providers" in i.message and "dict" in i.message for i in errors)
+
+    def test_entry_missing_provider(self):
+        """Entry without provider should warn."""
+        issues = validate_config_structure({
+            "fallback_providers": [{"model": "google/gemini-3-flash-preview"}],
+        })
+        assert any("missing 'provider'" in i.message for i in issues)
+
+    def test_entry_missing_model(self):
+        """Entry without model should warn."""
+        issues = validate_config_structure({
+            "fallback_providers": [{"provider": "openrouter"}],
+        })
+        assert any("missing 'model'" in i.message for i in issues)
+
+    def test_entry_not_dict(self):
+        """Non-dict entries should warn."""
+        issues = validate_config_structure({
+            "fallback_providers": ["not-a-dict"],
+        })
+        assert any("not a dict" in i.message for i in issues)
+
+    def test_valid_entries(self):
+        """Valid fallback_providers should produce no fallback-related issues."""
+        issues = validate_config_structure({
+            "fallback_providers": [
+                {"provider": "openrouter", "model": "google/gemini-3-flash-preview"},
+                {"provider": "gemini", "model": "gemini-2.5-flash"},
+            ],
+        })
+        fb_issues = [i for i in issues if "fallback_providers" in i.message]
+        assert len(fb_issues) == 0
+
+    def test_empty_list_no_issues(self):
+        """Empty list is valid (fallback disabled)."""
+        issues = validate_config_structure({
+            "fallback_providers": [],
+        })
+        fb_issues = [i for i in issues if "fallback_providers" in i.message]
+        assert len(fb_issues) == 0
+
+
+class TestSessionResetValidation:
+    """session_reset.idle_minutes must be positive."""
+
+    def test_zero_idle_minutes(self):
+        """idle_minutes=0 should warn."""
+        issues = validate_config_structure({
+            "session_reset": {"idle_minutes": 0},
+        })
+        assert any("idle_minutes=0" in i.message for i in issues)
+
+    def test_negative_idle_minutes(self):
+        """idle_minutes=-5 should warn."""
+        issues = validate_config_structure({
+            "session_reset": {"idle_minutes": -5},
+        })
+        assert any("idle_minutes=-5" in i.message for i in issues)
+
+    def test_string_idle_minutes(self):
+        """idle_minutes as string should warn."""
+        issues = validate_config_structure({
+            "session_reset": {"idle_minutes": "abc"},
+        })
+        assert any("idle_minutes=" in i.message for i in issues)
+
+    def test_valid_idle_minutes(self):
+        """Valid idle_minutes should not warn."""
+        issues = validate_config_structure({
+            "session_reset": {"idle_minutes": 1440},
+        })
+        idle_issues = [i for i in issues if "idle_minutes" in i.message]
+        assert len(idle_issues) == 0
+
+    def test_invalid_at_hour(self):
+        """at_hour=25 should warn."""
+        issues = validate_config_structure({
+            "session_reset": {"at_hour": 25},
+        })
+        assert any("at_hour=25" in i.message for i in issues)
+
+    def test_valid_at_hour(self):
+        """Valid at_hour should not warn."""
+        issues = validate_config_structure({
+            "session_reset": {"at_hour": 4},
+        })
+        hour_issues = [i for i in issues if "at_hour" in i.message]
+        assert len(hour_issues) == 0
--- a/tests/test_hermes_state.py
+++ b/tests/test_hermes_state.py
@@ -665,6 +665,127 @@ class TestPruneSessions:


 # =========================================================================
+# =========================================================================
+# Garbage Collect
+# =========================================================================
+
+class TestGarbageCollect:
+    def test_gc_deletes_empty_old_sessions(self, db):
+        """Empty sessions (0 messages) older than 24h should be deleted."""
+        db.create_session(session_id="empty_old", source="cli")
+        db.end_session("empty_old", end_reason="done")
+        db._conn.execute(
+            "UPDATE sessions SET started_at = ? WHERE id = ?",
+            (time.time() - 48 * 3600, "empty_old"),  # 48 hours ago
+        )
+        db._conn.commit()
+
+        # Recent empty session should be kept
+        db.create_session(session_id="empty_new", source="cli")
+        db.end_session("empty_new", end_reason="done")
+
+        result = db.garbage_collect()
+        assert result["empty"] == 1
+        assert result["trivial"] == 0
+        assert result["total"] == 1
+        assert db.get_session("empty_old") is None
+        assert db.get_session("empty_new") is not None
+
+    def test_gc_deletes_trivial_old_sessions(self, db):
+        """Sessions with 1-5 messages older than 7 days should be deleted."""
+        db.create_session(session_id="trivial_old", source="cli")
+        for i in range(3):
+            db.append_message("trivial_old", role="user", content=f"msg {i}")
+        db.end_session("trivial_old", end_reason="done")
+        db._conn.execute(
+            "UPDATE sessions SET started_at = ? WHERE id = ?",
+            (time.time() - 10 * 86400, "trivial_old"),  # 10 days ago
+        )
+        db._conn.commit()
+
+        result = db.garbage_collect()
+        assert result["trivial"] == 1
+        assert result["total"] == 1
+        assert db.get_session("trivial_old") is None
+
+    def test_gc_keeps_active_sessions(self, db):
+        """Active (not ended) sessions should never be deleted."""
+        db.create_session(session_id="active_old", source="cli")
+        # Backdate but don't end
+        db._conn.execute(
+            "UPDATE sessions SET started_at = ? WHERE id = ?",
+            (time.time() - 48 * 3600, "active_old"),
+        )
+        db._conn.commit()
+
+        result = db.garbage_collect()
+        assert result["total"] == 0
+        assert db.get_session("active_old") is not None
+
+    def test_gc_keeps_substantial_sessions(self, db):
+        """Sessions with >5 messages should never be deleted."""
+        db.create_session(session_id="big_old", source="cli")
+        for i in range(10):
+            db.append_message("big_old", role="user", content=f"msg {i}")
+        db.end_session("big_old", end_reason="done")
+        db._conn.execute(
+            "UPDATE sessions SET started_at = ? WHERE id = ?",
+            (time.time() - 365 * 86400, "big_old"),  # 1 year ago
+        )
+        db._conn.commit()
+
+        result = db.garbage_collect()
+        assert result["total"] == 0
+        assert db.get_session("big_old") is not None
+
+    def test_gc_dry_run_does_not_delete(self, db):
+        """dry_run=True should return counts but not delete anything."""
+        db.create_session(session_id="empty_old", source="cli")
+        db.end_session("empty_old", end_reason="done")
+        db._conn.execute(
+            "UPDATE sessions SET started_at = ? WHERE id = ?",
+            (time.time() - 48 * 3600, "empty_old"),
+        )
+        db._conn.commit()
+
+        result = db.garbage_collect(dry_run=True)
+        assert result["total"] == 1
+        assert db.get_session("empty_old") is not None  # Still exists
+
+    def test_gc_with_source_filter(self, db):
+        """--source should only GC sessions from that source."""
+        for sid, src in [("old_cli", "cli"), ("old_tg", "telegram")]:
+            db.create_session(session_id=sid, source=src)
+            db.end_session(sid, end_reason="done")
+            db._conn.execute(
+                "UPDATE sessions SET started_at = ? WHERE id = ?",
+                (time.time() - 48 * 3600, sid),
+            )
+        db._conn.commit()
+
+        result = db.garbage_collect(source="cli")
+        assert result["total"] == 1
+        assert db.get_session("old_cli") is None
+        assert db.get_session("old_tg") is not None
+
+    def test_gc_handles_child_sessions(self, db):
+        """Child sessions should be deleted when parent is GC'd."""
+        db.create_session(session_id="parent_old", source="cli")
+        db.end_session("parent_old", end_reason="done")
+        db._conn.execute(
+            "UPDATE sessions SET started_at = ? WHERE id = ?",
+            (time.time() - 48 * 3600, "parent_old"),
+        )
+        # Create child session
+        db.create_session(session_id="child", source="cli", parent_session_id="parent_old")
+        db.end_session("child", end_reason="done")
+        db._conn.commit()
+
+        result = db.garbage_collect()
+        assert result["total"] == 1
+        assert db.get_session("parent_old") is None
+        assert db.get_session("child") is None
+
 # Schema and WAL mode
 # =========================================================================

--- a/tools/cronjob_tools.py
+++ b/tools/cronjob_tools.py
@@ -201,6 +201,17 @@ def _format_job(job: Dict[str, Any]) -> Dict[str, Any]:
        "paused_at": job.get("paused_at"),
        "paused_reason": job.get("paused_reason"),
    }
+    # Health timestamps
+    if job.get("last_error_at"):
+        result["last_error_at"] = job["last_error_at"]
+    if job.get("last_success_at"):
+        result["last_success_at"] = job["last_success_at"]
+    if job.get("error_resolved_at"):
+        result["error_resolved_at"] = job["error_resolved_at"]
+    if job.get("error_cleared_at"):
+        result["error_cleared_at"] = job["error_cleared_at"]
+    
+
    if job.get("script"):
        result["script"] = job["script"]
    return result
@@ -326,6 +337,13 @@ def cronjob(
            if result is None:
                return json.dumps({"success": False, "error": "Job not found"}, indent=2)
            return json.dumps(result, indent=2)
+        if normalized == "clear_error":
+            from cron.jobs import clear_job_error
+            job = clear_job_error(job_id)
+            if job is None:
+                return json.dumps({"success": False, "error": "Job not found"}, indent=2)
+            return json.dumps({"success": True, "job": _format_job(job)}, indent=2)
+

        if normalized == "update":
            updates: Dict[str, Any] = {}
Author	SHA1	Message	Date
Alexander Whitestone	3c9a1d9010	feat(matrix): Add Matrix CLI commands for fleet ops Some checks failed Forge CI / smoke-and-build (pull_request) Failing after 31s Details Part of #274. Adds `hermes matrix` command with test, send, status, and setup subcommands.	2026-04-14 00:38:06 +00:00
Alexander Whitestone	56f5df8ade	feat(matrix): Add fleet ops channel documentation Part of #274. Comprehensive guide for setting up Matrix fleet ops channel to replace Telegram notifications.	2026-04-14 00:34:56 +00:00
Alexander Whitestone	b68d6c7259	feat(matrix): Add Matrix configuration to .env.example Part of #274. Adds Matrix environment variables for fleet ops channel setup.	2026-04-14 00:34:22 +00:00
Alexander Whitestone	b562a3d94c	Merge pull request 'docs(#322 ): comprehensive Honcho evaluation — recommendation: KEEP' (#430 ) from burn/322-1776125702 into main Some checks failed Forge CI / smoke-and-build (push) Has been cancelled Details	2026-04-14 00:33:56 +00:00
Alexander Whitestone	37af40a38e	Merge pull request 'feat: session garbage collection (#315 )' (#383 ) from feat/315-session-gc into main Some checks failed Forge CI / smoke-and-build (push) Has been cancelled Details	2026-04-14 00:33:15 +00:00
Alexander Whitestone	5aa8581e2b	Merge pull request 'fix: gateway config debt - validation, defaults, fallback chain checks (#328 )' (#381 ) from fix/gateway-config-debt-328 into main Some checks failed Forge CI / smoke-and-build (push) Has been cancelled Details	2026-04-14 00:32:56 +00:00
Alexander Whitestone	b44255f21e	Merge pull request 'cron: Comprehensive stale error state handling for recovered jobs (#349 )' (#431 ) from burn/349-1776125702 into main Some checks failed Forge CI / smoke-and-build (push) Has been cancelled Details	2026-04-14 00:32:48 +00:00
Alexander Whitestone	6b41bafccd	Merge pull request 'fix(cron): disable terminal toolset for cloud providers in cron jobs (#379 )' (#436 ) from burn/379-1776125702 into main Some checks failed Forge CI / smoke-and-build (push) Has been cancelled Details	2026-04-14 00:32:45 +00:00
Alexander Whitestone	053fa3a2dd	Merge pull request 'fix(cron): normalize model field types in deploy-crons.py' (#410 ) from burn/376-1776117777 into main Some checks failed Forge CI / smoke-and-build (push) Has been cancelled Details	2026-04-14 00:31:47 +00:00
Alexander Whitestone	cda29991e0	Merge pull request 'Fix #373 : fallback_model blank fields no longer trigger gateway warnings' (#433 ) from burn/373-1776125702 into main Some checks failed Forge CI / smoke-and-build (push) Has been cancelled Details	2026-04-14 00:30:10 +00:00
Timmy Time	1899878c27	Fix #373 : fallback_model blank fields no longer trigger gateway warnings Some checks failed Forge CI / smoke-and-build (pull_request) Failing after 1m1s Details When users blank fallback_model fields or set enabled: false, the validation and gateway now treat this as intentionally disabling fallback instead of showing warnings. Changes: - hermes_cli/config.py: Skip warnings when both provider and model are blank or when enabled: false is set - gateway/run.py: Return None for disabled fallback configs - tests: Added 8 new tests for blank/disabled fallback scenarios Behavior: - Both fields blank: no warnings (intentional disable) - enabled: false: no warnings (explicit disable) - One field blank: warning shown (likely misconfiguration) - Valid config: no warnings Fixes #373	2026-04-13 20:19:21 -04:00
Alexander Whitestone	379769ca6d	feat(cron): Show health status in job list Some checks failed Forge CI / smoke-and-build (pull_request) Failing after 1m0s Details Part of #349. Shows current vs. cleared errors, success history.	2026-04-14 00:19:11 +00:00
Alexander Whitestone	91bc02bc38	feat(cron): Add clear-error CLI subparser Part of #349. Adds `hermes cron clear-error JOB_ID` command.	2026-04-14 00:18:52 +00:00
Alexander Whitestone	77265a31e1	feat(cron): Add clear-error CLI command Part of #349. Adds `hermes cron clear-error JOB_ID` command.	2026-04-14 00:18:30 +00:00
Timmy	7a32df9ca3	docs(#322 ): comprehensive Honcho evaluation — recommendation: KEEP Some checks failed Forge CI / smoke-and-build (pull_request) Failing after 1m9s Details Decision: Cloud vs Local → BOTH (user's choice) - Cloud: HONCHO_API_KEY from app.honcho.dev - Self-hosted: HONCHO_BASE_URL=http://localhost:8000 - Disabled: No config = zero overhead Integration is already production-ready: - 3 components, ~700 lines of code - 7 tests passing - Async prefetch (zero-latency) - Configurable recall modes - Cron guard (inactive in cron context) Recommendation: KEEP — provides unique cross-session user modeling that complements local holographic fact_store. Refs #322	2026-04-13 20:18:23 -04:00
Alexander Whitestone	cf36bd2ddf	feat(cron): Add clear_error action and health timestamps Part of #349. Adds clear_error action and includes health timestamps in job format.	2026-04-14 00:18:09 +00:00
Alexander Whitestone	0413fc1788	feat(cron): Comprehensive stale error state handling - mark_job_run: track last_error_at, last_success_at, error_resolved_at - trigger_job: clear stale error state when re-triggering - clear_job_error: manual clearing of stale errors Closes #349	2026-04-14 00:17:45 +00:00
Alexander Whitestone	3c66333c94	fix(cron): add deploy-crons.py to normalize model field types Some checks failed Forge CI / smoke-and-build (pull_request) Failing after 48s Details Fixes #376 Normalize model field in jobs.json to always be a dict when either model or provider is specified, preventing schema inconsistency.	2026-04-13 22:24:31 +00:00
Alexander Whitestone	69e10967bd	feat: session garbage collection (#315 ) Some checks failed Forge CI / smoke-and-build (pull_request) Failing after 14s Details Add garbage_collect() method to SessionDB that cleans up empty and trivial sessions based on age: - Empty sessions (0 messages) older than 24h - Trivial sessions (1-5 messages) older than 7 days - Sessions with >5 messages kept indefinitely Add `hermes sessions gc` CLI command with: - --empty-hours (default: 24) - --trivial-days (default: 7) - --trivial-max (default: 5) - --source filter - --dry-run preview mode - --yes skip confirmation The dry-run flow: preview what would be deleted, ask for confirmation, then execute. Handles child session FK constraints properly. 7 tests covering: empty/trivial deletion, active session protection, substantial session preservation, dry-run, source filtering, and child session handling. Closes #315	2026-04-13 17:30:39 -04:00
Alexander Whitestone	992498463e	fix: gateway config debt - validation, defaults, fallback chain checks (#328 ) Some checks failed Forge CI / smoke-and-build (pull_request) Failing after 1m32s Details - Expand validate_config_structure() to catch: - fallback_providers format errors (non-list, missing provider/model) - session_reset.idle_minutes <= 0 (causes immediate resets) - session_reset.at_hour out of 0-23 range - API_SERVER enabled without API_SERVER_KEY - Unknown root-level keys that look like misplaced custom_providers fields - Add _validate_fallback_providers() in gateway/config.py to validate fallback chain at gateway startup (logs warnings for malformed entries) - Add API_SERVER_KEY check in gateway config loader (warns on unauthenticated endpoint) - Expand _KNOWN_ROOT_KEYS to include all valid top-level config sections (session_reset, browser, checkpoints, voice, stt, tts, etc.) - Add 13 new tests for fallback_providers and session_reset validation - All existing tests pass (47/47) Closes #328	2026-04-13 17:29:20 -04:00
Alexander Whitestone	ec3cd2081b	fix(poka-yoke): add tool fixation detection (#310 ) Some checks failed Forge CI / smoke-and-build (pull_request) Failing after 26s Details Detect when the same tool is called 5+ times consecutively and inject a nudge advising the agent to diversify its approach. Evidence from empirical audit: - Top marathon session (qwen, 1643 msgs): execute_code streak of 20 - Opus session (1472 msgs): terminal streak of 10 The nudge fires every 5 consecutive calls (5, 10, 15...) so it persists without being spammy. Tracks independently in both sequential and concurrent execution paths.	2026-04-13 10:16:11 -04:00
Alexander Whitestone	110642d86a	fix(poka-yoke): add circuit breaker for error cascading (#309 ) Some checks failed Forge CI / smoke-and-build (pull_request) Failing after 28s Details After 3 consecutive tool errors, inject a warning into the tool result advising the agent to switch strategies. Escalates at 6 and 9+ errors. Empirical data from audit: - P(error \| prev error) = 58.6% vs P(error \| prev success) = 25.2% - 2.33x cascade amplification factor - Max observed streak: 31 consecutive errors Intervention tiers: - 3 errors: advisory warning (try different tool, use terminal, simplify) - 6 errors: urgent stop (halt retries, investigate or switch) - 9+ errors: terminal-only recovery path Tracks errors in both sequential and concurrent execution paths.	2026-04-13 10:12:24 -04:00