diff --git a/agent/smart_model_routing.py b/agent/smart_model_routing.py
new file mode 100644
index 000000000..249548701
--- /dev/null
+++ b/agent/smart_model_routing.py
@@ -0,0 +1,184 @@
+"""Helpers for optional cheap-vs-strong model routing."""
+
+from __future__ import annotations
+
+import os
+import re
+from typing import Any, Dict, Optional
+
+# Words that signal code, tooling, debugging, or planning work. A message
+# whose words include any of these is kept on the primary model by
+# choose_cheap_model_route().
+_COMPLEX_KEYWORDS = {
+ "debug",
+ "debugging",
+ "implement",
+ "implementation",
+ "refactor",
+ "patch",
+ "traceback",
+ "stacktrace",
+ "exception",
+ "error",
+ "analyze",
+ "analysis",
+ "investigate",
+ "architecture",
+ "design",
+ "compare",
+ "benchmark",
+ "optimize",
+ "optimise",
+ "review",
+ "terminal",
+ "shell",
+ "tool",
+ "tools",
+ "pytest",
+ "test",
+ "tests",
+ "plan",
+ "planning",
+ "delegate",
+ "subagent",
+ "cron",
+ "docker",
+ "kubernetes",
+}
+
+# Matches an http(s):// scheme or a "www." prefix anywhere in the text
+# (case-insensitive). Messages containing a match are never routed cheap.
+_URL_RE = re.compile(r"https?://|www\.", re.IGNORECASE)
+
+
+def _coerce_bool(value: Any, default: bool = False) -> bool:
+    """Interpret a loosely-typed config value as a boolean.
+
+    ``None`` yields *default*. Strings are true only when, after trimming
+    whitespace and lower-casing, they spell ``1``, ``true``, ``yes`` or
+    ``on``. Every other value (including real booleans) goes through
+    ordinary truthiness.
+    """
+    truthy_words = ("1", "true", "yes", "on")
+    if isinstance(value, str):
+        return value.strip().lower() in truthy_words
+    return default if value is None else bool(value)
+
+
+def _coerce_int(value: Any, default: int) -> int:
+    """Convert *value* to ``int``, returning *default* when that is impossible.
+
+    Covers ``None`` and other unsupported types (``TypeError``), unparsable
+    strings and NaN (``ValueError``), and infinite floats (``OverflowError``),
+    so a malformed config value never raises into the routing decision.
+    """
+    try:
+        return int(value)
+    except (TypeError, ValueError, OverflowError):
+        # OverflowError: int(float("inf")), reachable via YAML ``.inf``.
+        return default
+
+
+def choose_cheap_model_route(user_message: str, routing_config: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
+    """Return the configured cheap-model route when a message looks simple.
+
+    Conservative by design: if the message has signs of code/tool/debugging/
+    long-form work, keep the primary model.
+
+    Args:
+        user_message: Raw text of the user's turn.
+        routing_config: The ``smart_model_routing`` config section. Reads
+            ``enabled``, ``cheap_model`` (needs ``provider`` and ``model``),
+            ``max_simple_chars`` (default 160) and ``max_simple_words``
+            (default 28).
+
+    Returns:
+        A copy of ``cheap_model`` with normalised ``provider``/``model`` and
+        ``routing_reason="simple_turn"``, or ``None`` when routing is
+        disabled, misconfigured, or the message does not look simple.
+    """
+    # A malformed section (e.g. ``smart_model_routing: true``) must disable
+    # routing rather than crash on ``.get``.
+    cfg = routing_config if isinstance(routing_config, dict) else {}
+    if not _coerce_bool(cfg.get("enabled"), False):
+        return None
+
+    cheap_model = cfg.get("cheap_model") or {}
+    if not isinstance(cheap_model, dict):
+        return None
+    provider = str(cheap_model.get("provider") or "").strip().lower()
+    model = str(cheap_model.get("model") or "").strip()
+    if not provider or not model:
+        return None
+
+    # Anything that is not plain text cannot be judged "simple"; keep primary.
+    if not isinstance(user_message, str):
+        return None
+    text = user_message.strip()
+    if not text:
+        return None
+
+    max_chars = _coerce_int(cfg.get("max_simple_chars"), 160)
+    max_words = _coerce_int(cfg.get("max_simple_words"), 28)
+
+    if len(text) > max_chars:
+        return None
+    if len(text.split()) > max_words:
+        return None
+    if text.count("\n") > 1:
+        return None
+    # A single backtick check covers both inline code and fenced blocks.
+    if "`" in text:
+        return None
+    if _URL_RE.search(text):
+        return None
+
+    # Split on every non-alphanumeric run so keywords joined by punctuation
+    # ("docker-compose", "debug/fix", "error's") are still recognised.
+    words = set(re.split(r"[^a-z0-9]+", text.lower()))
+    if words & _COMPLEX_KEYWORDS:
+        return None
+
+    route = dict(cheap_model)
+    route["provider"] = provider
+    route["model"] = model
+    route["routing_reason"] = "simple_turn"
+    return route
+
+
+def _primary_turn_route(primary: Dict[str, Any]) -> Dict[str, Any]:
+    """Build the turn-route dict that keeps the primary model and runtime."""
+    return {
+        "model": primary.get("model"),
+        "runtime": {
+            "api_key": primary.get("api_key"),
+            "base_url": primary.get("base_url"),
+            "provider": primary.get("provider"),
+            "api_mode": primary.get("api_mode"),
+        },
+        "label": None,
+        "signature": (
+            primary.get("model"),
+            primary.get("provider"),
+            primary.get("base_url"),
+            primary.get("api_mode"),
+        ),
+    }
+
+
+def resolve_turn_route(user_message: str, routing_config: Optional[Dict[str, Any]], primary: Dict[str, Any]) -> Dict[str, Any]:
+    """Resolve the effective model/runtime for one turn.
+
+    Args:
+        user_message: Text of the user's turn, used to decide on cheap routing.
+        routing_config: The ``smart_model_routing`` config section (may be None).
+        primary: Primary settings with ``model``, ``api_key``, ``base_url``,
+            ``provider`` and ``api_mode`` keys.
+
+    Returns:
+        A dict with ``model``, ``runtime`` (api_key/base_url/provider/api_mode),
+        ``label`` (``None`` unless the cheap route is used) and ``signature``
+        (model, provider, base_url, api_mode). The primary settings are
+        returned whenever no cheap route applies or its runtime cannot be
+        resolved.
+    """
+    route = choose_cheap_model_route(user_message, routing_config)
+    if not route:
+        return _primary_turn_route(primary)
+
+    from hermes_cli.runtime_provider import resolve_runtime_provider
+
+    # An optional ``api_key_env`` names the environment variable holding the
+    # cheap route's key; unset or empty means "let the resolver decide".
+    explicit_api_key = None
+    api_key_env = str(route.get("api_key_env") or "").strip()
+    if api_key_env:
+        explicit_api_key = os.getenv(api_key_env) or None
+
+    try:
+        runtime = resolve_runtime_provider(
+            requested=route.get("provider"),
+            explicit_api_key=explicit_api_key,
+            explicit_base_url=route.get("base_url"),
+        )
+    except Exception:
+        # Deliberate best-effort: a broken cheap route must never block the
+        # turn, so fall back to the primary model.
+        return _primary_turn_route(primary)
+    if not isinstance(runtime, dict):
+        # An unusable resolver result would otherwise crash on ``.get`` below.
+        return _primary_turn_route(primary)
+
+    return {
+        "model": route.get("model"),
+        "runtime": {
+            "api_key": runtime.get("api_key"),
+            "base_url": runtime.get("base_url"),
+            "provider": runtime.get("provider"),
+            "api_mode": runtime.get("api_mode"),
+        },
+        "label": f"smart route → {route.get('model')} ({runtime.get('provider')})",
+        "signature": (
+            route.get("model"),
+            runtime.get("provider"),
+            runtime.get("base_url"),
+            runtime.get("api_mode"),
+        ),
+    }
diff --git a/cli-config.yaml.example b/cli-config.yaml.example
index 0c947e479..681405c44 100644
--- a/cli-config.yaml.example
+++ b/cli-config.yaml.example
@@ -51,6 +51,20 @@ model:
# # Data policy: "allow" (default) or "deny" to exclude providers that may store data
# # data_collection: "deny"
+# =============================================================================
+# Smart Model Routing (optional)
+# =============================================================================
+# Use a cheaper model for short/simple turns while keeping your main model for
+# more complex requests. Disabled by default.
+#
+# smart_model_routing:
+# enabled: true
+# max_simple_chars: 160
+# max_simple_words: 28
+# cheap_model:
+# provider: openrouter
+# model: google/gemini-2.5-flash
+
# =============================================================================
# Git Worktree Isolation
# =============================================================================
diff --git a/cli.py b/cli.py
index 7716d6f57..eb0d7b101 100755
--- a/cli.py
+++ b/cli.py
@@ -176,6 +176,12 @@ def load_cli_config() -> Dict[str, Any]:
"threshold": 0.50, # Compress at 50% of model's context limit
"summary_model": "google/gemini-3-flash-preview", # Fast/cheap model for summaries
},
+ "smart_model_routing": {
+ "enabled": False,
+ "max_simple_chars": 160,
+ "max_simple_words": 28,
+ "cheap_model": {},
+ },
"agent": {
"max_turns": 90, # Default max tool-calling iterations (shared with subagents)
"verbose": False,
@@ -1126,6 +1132,10 @@ class HermesCLI:
fb = CLI_CONFIG.get("fallback_model") or {}
self._fallback_model = fb if fb.get("provider") and fb.get("model") else None
+ # Optional cheap-vs-strong routing for simple turns
+ self._smart_model_routing = CLI_CONFIG.get("smart_model_routing", {}) or {}
+ self._active_agent_route_signature = None
+
# Agent will be initialized on first use
self.agent: Optional[AIAgent] = None
self._app = None # prompt_toolkit Application (set in run())
@@ -1537,10 +1547,27 @@ class HermesCLI:
# routing, or the effective model changed.
if (credentials_changed or routing_changed or model_changed) and self.agent is not None:
self.agent = None
+ self._active_agent_route_signature = None
return True
- def _init_agent(self) -> bool:
+ def _resolve_turn_agent_config(self, user_message: str) -> dict:
+     """Pick the model and runtime to use for one user turn.
+
+     Passes this session's primary model/credentials and its smart-routing
+     config to ``resolve_turn_route`` and returns that result unchanged.
+     """
+     from agent.smart_model_routing import resolve_turn_route
+
+     primary = {
+         "model": self.model,
+         "api_key": self.api_key,
+         "base_url": self.base_url,
+         "provider": self.provider,
+         "api_mode": self.api_mode,
+     }
+     return resolve_turn_route(user_message, self._smart_model_routing, primary)
+
+ def _init_agent(self, *, model_override: str = None, runtime_override: dict = None, route_label: str = None) -> bool:
"""
Initialize the agent on first use.
When resuming a session, restores conversation history from SQLite.
@@ -1600,12 +1627,19 @@ class HermesCLI:
pass
try:
+ runtime = runtime_override or {
+ "api_key": self.api_key,
+ "base_url": self.base_url,
+ "provider": self.provider,
+ "api_mode": self.api_mode,
+ }
+ effective_model = model_override or self.model
self.agent = AIAgent(
- model=self.model,
- api_key=self.api_key,
- base_url=self.base_url,
- provider=self.provider,
- api_mode=self.api_mode,
+ model=effective_model,
+ api_key=runtime.get("api_key"),
+ base_url=runtime.get("base_url"),
+ provider=runtime.get("provider"),
+ api_mode=runtime.get("api_mode"),
max_iterations=self.max_turns,
enabled_toolsets=self.enabled_toolsets,
verbose_logging=self.verbose,
@@ -1632,7 +1666,13 @@ class HermesCLI:
pass_session_id=self.pass_session_id,
tool_progress_callback=self._on_tool_progress,
)
- # Apply any pending title now that the session exists in the DB
+ self._active_agent_route_signature = (
+ effective_model,
+ runtime.get("provider"),
+ runtime.get("base_url"),
+ runtime.get("api_mode"),
+ )
+
if self._pending_title and self._session_db:
try:
self._session_db.set_session_title(self.session_id, self._pending_title)
@@ -3455,14 +3495,16 @@ class HermesCLI:
_cprint(f" Task ID: {task_id}")
_cprint(f" You can continue chatting — results will appear when done.\n")
+ turn_route = self._resolve_turn_agent_config(prompt)
+
def run_background():
try:
bg_agent = AIAgent(
- model=self.model,
- api_key=self.api_key,
- base_url=self.base_url,
- provider=self.provider,
- api_mode=self.api_mode,
+ model=turn_route["model"],
+ api_key=turn_route["runtime"].get("api_key"),
+ base_url=turn_route["runtime"].get("base_url"),
+ provider=turn_route["runtime"].get("provider"),
+ api_mode=turn_route["runtime"].get("api_mode"),
max_iterations=self.max_turns,
enabled_toolsets=self.enabled_toolsets,
quiet_mode=True,
@@ -4886,8 +4928,16 @@ class HermesCLI:
if not self._ensure_runtime_credentials():
return None
+ turn_route = self._resolve_turn_agent_config(message)
+ if turn_route["signature"] != self._active_agent_route_signature:
+ self.agent = None
+
# Initialize agent if needed
- if not self._init_agent():
+ if not self._init_agent(
+ model_override=turn_route["model"],
+ runtime_override=turn_route["runtime"],
+ route_label=turn_route["label"],
+ ):
return None
# Pre-process images through the vision tool (Gemini Flash) so the
@@ -6616,13 +6666,21 @@ def main(
# Quiet mode: suppress banner, spinner, tool previews.
# Only print the final response and parseable session info.
cli.tool_progress_mode = "off"
- if cli._init_agent():
- cli.agent.quiet_mode = True
- result = cli.agent.run_conversation(query)
- response = result.get("final_response", "") if isinstance(result, dict) else str(result)
- if response:
- print(response)
- print(f"\nsession_id: {cli.session_id}")
+ if cli._ensure_runtime_credentials():
+ turn_route = cli._resolve_turn_agent_config(query)
+ if turn_route["signature"] != cli._active_agent_route_signature:
+ cli.agent = None
+ if cli._init_agent(
+ model_override=turn_route["model"],
+ runtime_override=turn_route["runtime"],
+ route_label=turn_route["label"],
+ ):
+ cli.agent.quiet_mode = True
+ result = cli.agent.run_conversation(query)
+ response = result.get("final_response", "") if isinstance(result, dict) else str(result)
+ if response:
+ print(response)
+ print(f"\nsession_id: {cli.session_id}")
else:
cli.show_banner()
cli.console.print(f"[bold blue]Query:[/] {query}")
diff --git a/cron/scheduler.py b/cron/scheduler.py
index 8d75e1a95..ded88ef53 100644
--- a/cron/scheduler.py
+++ b/cron/scheduler.py
@@ -315,6 +315,7 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
# Provider routing
pr = _cfg.get("provider_routing", {})
+ smart_routing = _cfg.get("smart_model_routing", {}) or {}
from hermes_cli.runtime_provider import (
resolve_runtime_provider,
@@ -331,12 +332,25 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
message = format_runtime_provider_error(exc)
raise RuntimeError(message) from exc
+ from agent.smart_model_routing import resolve_turn_route
+ turn_route = resolve_turn_route(
+ prompt,
+ smart_routing,
+ {
+ "model": model,
+ "api_key": runtime.get("api_key"),
+ "base_url": runtime.get("base_url"),
+ "provider": runtime.get("provider"),
+ "api_mode": runtime.get("api_mode"),
+ },
+ )
+
agent = AIAgent(
- model=model,
- api_key=runtime.get("api_key"),
- base_url=runtime.get("base_url"),
- provider=runtime.get("provider"),
- api_mode=runtime.get("api_mode"),
+ model=turn_route["model"],
+ api_key=turn_route["runtime"].get("api_key"),
+ base_url=turn_route["runtime"].get("base_url"),
+ provider=turn_route["runtime"].get("provider"),
+ api_mode=turn_route["runtime"].get("api_mode"),
max_iterations=max_iterations,
reasoning_config=reasoning_config,
prefill_messages=prefill_messages,
diff --git a/gateway/run.py b/gateway/run.py
index 50e68eaad..5c0b8b48c 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -324,6 +324,7 @@ class GatewayRunner:
self._show_reasoning = self._load_show_reasoning()
self._provider_routing = self._load_provider_routing()
self._fallback_model = self._load_fallback_model()
+ self._smart_model_routing = self._load_smart_model_routing()
# Wire process registry into session store for reset protection
from tools.process_registry import process_registry
@@ -593,6 +594,18 @@ class GatewayRunner:
group_sessions_per_user=getattr(config, "group_sessions_per_user", True),
)
+ def _resolve_turn_agent_config(self, user_message: str, model: str, runtime_kwargs: dict) -> dict:
+     """Resolve the model/runtime for one gateway turn via ``resolve_turn_route``.
+
+     The primary settings are *model* plus the api_key/base_url/provider/
+     api_mode entries of *runtime_kwargs*. The routing config is read from
+     ``self._smart_model_routing`` and defaults to an empty dict when the
+     attribute is missing.
+     """
+     from agent.smart_model_routing import resolve_turn_route
+
+     primary = {"model": model}
+     for field in ("api_key", "base_url", "provider", "api_mode"):
+         primary[field] = runtime_kwargs.get(field)
+     routing_cfg = getattr(self, "_smart_model_routing", {})
+     return resolve_turn_route(user_message, routing_cfg, primary)
+
async def _handle_adapter_fatal_error(self, adapter: BasePlatformAdapter) -> None:
"""React to a non-retryable adapter failure after startup."""
logger.error(
@@ -795,6 +808,20 @@ class GatewayRunner:
pass
return None
+ @staticmethod
+ def _load_smart_model_routing() -> dict:
+     """Load optional smart cheap-vs-strong model routing config.
+
+     Reads the ``smart_model_routing`` section of ``config.yaml`` under the
+     Hermes home directory.
+
+     Returns:
+         The section as a dict, or ``{}`` when the file is missing,
+         unreadable, or the section is absent or not a mapping.
+     """
+     try:
+         import yaml as _y
+         cfg_path = _hermes_home / "config.yaml"
+         if cfg_path.exists():
+             with open(cfg_path, encoding="utf-8") as _f:
+                 cfg = _y.safe_load(_f) or {}
+             section = cfg.get("smart_model_routing") if isinstance(cfg, dict) else None
+             # ``smart_model_routing: true`` (or a list/string) would break
+             # the ``.get`` calls made on this value later, so only a real
+             # mapping is accepted.
+             if isinstance(section, dict):
+                 return section
+     except Exception:
+         # Best-effort: routing is optional, so a bad config file simply
+         # leaves it disabled.
+         pass
+     return {}
+
async def start(self) -> bool:
"""
Start the gateway and all configured platform adapters.
@@ -2931,11 +2958,12 @@ class GatewayRunner:
max_iterations = int(os.getenv("HERMES_MAX_ITERATIONS", "90"))
reasoning_config = self._load_reasoning_config()
self._reasoning_config = reasoning_config
+ turn_route = self._resolve_turn_agent_config(prompt, model, runtime_kwargs)
def run_sync():
agent = AIAgent(
- model=model,
- **runtime_kwargs,
+ model=turn_route["model"],
+ **turn_route["runtime"],
max_iterations=max_iterations,
quiet_mode=True,
verbose_logging=False,
@@ -4169,9 +4197,10 @@ class GatewayRunner:
honcho_manager, honcho_config = self._get_or_create_gateway_honcho(session_key)
reasoning_config = self._load_reasoning_config()
self._reasoning_config = reasoning_config
+ turn_route = self._resolve_turn_agent_config(message, model, runtime_kwargs)
agent = AIAgent(
- model=model,
- **runtime_kwargs,
+ model=turn_route["model"],
+ **turn_route["runtime"],
max_iterations=max_iterations,
quiet_mode=True,
verbose_logging=False,
diff --git a/hermes_cli/config.py b/hermes_cli/config.py
index ee5e54470..1a46a80bb 100644
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@@ -147,6 +147,12 @@ DEFAULT_CONFIG = {
"summary_model": "google/gemini-3-flash-preview",
"summary_provider": "auto",
},
+ "smart_model_routing": {
+ "enabled": False,
+ "max_simple_chars": 160,
+ "max_simple_words": 28,
+ "cheap_model": {},
+ },
# Auxiliary model config — provider:model for each side task.
# Format: provider is the provider name, model is the model slug.
@@ -1010,6 +1016,19 @@ _FALLBACK_COMMENT = """
# fallback_model:
# provider: openrouter
# model: anthropic/claude-sonnet-4
+#
+# ── Smart Model Routing ────────────────────────────────────────────────
+# Optional cheap-vs-strong routing for simple turns.
+# Keeps the primary model for complex work, but can route short/simple
+# messages to a cheaper model across providers.
+#
+# smart_model_routing:
+# enabled: true
+# max_simple_chars: 160
+# max_simple_words: 28
+# cheap_model:
+# provider: openrouter
+# model: google/gemini-2.5-flash
"""
@@ -1040,6 +1059,19 @@ _COMMENTED_SECTIONS = """
# fallback_model:
# provider: openrouter
# model: anthropic/claude-sonnet-4
+#
+# ── Smart Model Routing ────────────────────────────────────────────────
+# Optional cheap-vs-strong routing for simple turns.
+# Keeps the primary model for complex work, but can route short/simple
+# messages to a cheaper model across providers.
+#
+# smart_model_routing:
+# enabled: true
+# max_simple_chars: 160
+# max_simple_words: 28
+# cheap_model:
+# provider: openrouter
+# model: google/gemini-2.5-flash
"""
diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py
index 2399436c4..4bc068c1e 100644
--- a/hermes_cli/gateway.py
+++ b/hermes_cli/gateway.py
@@ -705,6 +705,7 @@ def generate_launchd_plist() -> str:
hermes_cli.main
gateway
run
+ --replace
WorkingDirectory
@@ -728,6 +729,36 @@ def generate_launchd_plist() -> str:
"""
+def launchd_plist_is_current() -> bool:
+    """Report whether the plist on disk equals the one generated right now.
+
+    A missing plist counts as not current. Both texts pass through
+    ``_normalize_service_definition`` before being compared.
+    """
+    installed_path = get_launchd_plist_path()
+    if installed_path.exists():
+        on_disk = installed_path.read_text(encoding="utf-8")
+        generated = generate_launchd_plist()
+        return _normalize_service_definition(on_disk) == _normalize_service_definition(generated)
+    return False
+
+
+def refresh_launchd_plist_if_needed() -> bool:
+ """Rewrite the installed launchd plist when the generated definition has changed.
+
+ Returns ``False`` without touching anything when no plist is installed or
+ the installed one already matches ``generate_launchd_plist()``. Otherwise
+ the file is overwritten, the job is unloaded and loaded again through
+ ``launchctl``, a notice is printed, and ``True`` is returned.
+
+ NOTE(review): the unload/load pair presumably exists because launchd keeps
+ the job definition it read at load time — confirm against the launchctl
+ documentation. Both commands run with ``check=False``, so a failing
+ launchctl call is ignored here.
+ """
+ plist_path = get_launchd_plist_path()
+ if not plist_path.exists() or launchd_plist_is_current():
+ return False
+
+ plist_path.write_text(generate_launchd_plist(), encoding="utf-8")
+ # Unload/reload so launchd picks up the new definition
+ subprocess.run(["launchctl", "unload", str(plist_path)], check=False)
+ subprocess.run(["launchctl", "load", str(plist_path)], check=False)
+ print("↻ Updated gateway launchd service definition to match the current Hermes install")
+ return True
+
+
def launchd_install(force: bool = False):
plist_path = get_launchd_plist_path()
@@ -760,6 +791,7 @@ def launchd_uninstall():
print("✓ Service uninstalled")
def launchd_start():
+ refresh_launchd_plist_if_needed()
subprocess.run(["launchctl", "start", "ai.hermes.gateway"], check=True)
print("✓ Service started")
@@ -768,6 +800,7 @@ def launchd_stop():
print("✓ Service stopped")
def launchd_restart():
+ refresh_launchd_plist_if_needed()
launchd_stop()
launchd_start()
diff --git a/hermes_cli/main.py b/hermes_cli/main.py
index 2cb220df6..845ae207e 100644
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@@ -2306,12 +2306,16 @@ def cmd_update(args):
# installation's gateway — safe with multiple installations.
try:
from gateway.status import get_running_pid, remove_pid_file
- from hermes_cli.gateway import get_service_name
+ from hermes_cli.gateway import (
+ get_service_name, get_launchd_plist_path, is_macos,
+ refresh_launchd_plist_if_needed,
+ )
import signal as _signal
_gw_service_name = get_service_name()
existing_pid = get_running_pid()
has_systemd_service = False
+ has_launchd_service = False
try:
check = subprocess.run(
@@ -2322,23 +2326,36 @@ def cmd_update(args):
except (FileNotFoundError, subprocess.TimeoutExpired):
pass
- if existing_pid or has_systemd_service:
+ # Check for macOS launchd service
+ if is_macos():
+ try:
+ plist_path = get_launchd_plist_path()
+ if plist_path.exists():
+ check = subprocess.run(
+ ["launchctl", "list", "ai.hermes.gateway"],
+ capture_output=True, text=True, timeout=5,
+ )
+ has_launchd_service = check.returncode == 0
+ except (FileNotFoundError, subprocess.TimeoutExpired):
+ pass
+
+ if existing_pid or has_systemd_service or has_launchd_service:
print()
- # Kill the PID-file-tracked process (may be manual or systemd)
- if existing_pid:
- try:
- os.kill(existing_pid, _signal.SIGTERM)
- print(f"→ Stopped gateway process (PID {existing_pid})")
- except ProcessLookupError:
- pass # Already gone
- except PermissionError:
- print(f"⚠ Permission denied killing gateway PID {existing_pid}")
- remove_pid_file()
-
- # Restart the systemd service (starts a fresh process)
+ # When a service manager is handling the gateway, let it
+ # manage the lifecycle — don't manually SIGTERM the PID
+ # (launchd KeepAlive would respawn immediately, causing races).
if has_systemd_service:
import time as _time
+ if existing_pid:
+ try:
+ os.kill(existing_pid, _signal.SIGTERM)
+ print(f"→ Stopped gateway process (PID {existing_pid})")
+ except ProcessLookupError:
+ pass
+ except PermissionError:
+ print(f"⚠ Permission denied killing gateway PID {existing_pid}")
+ remove_pid_file()
_time.sleep(1) # Brief pause for port/socket release
print("→ Restarting gateway service...")
restart = subprocess.run(
@@ -2350,7 +2367,36 @@ def cmd_update(args):
else:
print(f"⚠ Gateway restart failed: {restart.stderr.strip()}")
print(" Try manually: hermes gateway restart")
+ elif has_launchd_service:
+ # Refresh the plist first (picks up --replace and other
+ # changes from the update we just pulled).
+ refresh_launchd_plist_if_needed()
+ # Explicit stop+start — don't rely on KeepAlive respawn
+ # after a manual SIGTERM, which would race with the
+ # PID file cleanup.
+ print("→ Restarting gateway service...")
+ stop = subprocess.run(
+ ["launchctl", "stop", "ai.hermes.gateway"],
+ capture_output=True, text=True, timeout=10,
+ )
+ start = subprocess.run(
+ ["launchctl", "start", "ai.hermes.gateway"],
+ capture_output=True, text=True, timeout=10,
+ )
+ if start.returncode == 0:
+ print("✓ Gateway restarted via launchd.")
+ else:
+ print(f"⚠ Gateway restart failed: {start.stderr.strip()}")
+ print(" Try manually: hermes gateway restart")
elif existing_pid:
+ try:
+ os.kill(existing_pid, _signal.SIGTERM)
+ print(f"→ Stopped gateway process (PID {existing_pid})")
+ except ProcessLookupError:
+ pass # Already gone
+ except PermissionError:
+ print(f"⚠ Permission denied killing gateway PID {existing_pid}")
+ remove_pid_file()
print(" ℹ️ Gateway was running manually (not as a service).")
print(" Restart it with: hermes gateway run")
except Exception as e:
diff --git a/tests/agent/test_smart_model_routing.py b/tests/agent/test_smart_model_routing.py
new file mode 100644
index 000000000..7e9025609
--- /dev/null
+++ b/tests/agent/test_smart_model_routing.py
@@ -0,0 +1,61 @@
+from agent.smart_model_routing import choose_cheap_model_route
+
+
+# Minimal routing config shared by the tests below: routing switched on, with a
+# cheap model given as a provider/model pair. Length limits are left unset.
+_BASE_CONFIG = {
+ "enabled": True,
+ "cheap_model": {
+ "provider": "openrouter",
+ "model": "google/gemini-2.5-flash",
+ },
+}
+
+
+def test_returns_none_when_disabled():
+    """Routing stays off when ``enabled`` is false, even for a trivial prompt."""
+    disabled_cfg = dict(_BASE_CONFIG, enabled=False)
+    route = choose_cheap_model_route("what time is it in tokyo?", disabled_cfg)
+    assert route is None
+
+
+def test_routes_short_simple_prompt():
+    """A short question with no complexity signals gets the cheap route."""
+    route = choose_cheap_model_route("what time is it in tokyo?", _BASE_CONFIG)
+    assert route is not None
+    expected_fields = {
+        "provider": "openrouter",
+        "model": "google/gemini-2.5-flash",
+        "routing_reason": "simple_turn",
+    }
+    for key, value in expected_fields.items():
+        assert route[key] == value
+
+
+def test_skips_long_prompt():
+    """A prompt repeated well past the length limits keeps the primary model."""
+    long_prompt = "".join(["please summarize this carefully "] * 20)
+    route = choose_cheap_model_route(long_prompt, _BASE_CONFIG)
+    assert route is None
+
+
+def test_skips_code_like_prompt():
+    """Backticks alone are enough to keep the primary model.
+
+    The first prompt is the original mixed case (keywords, newlines and a
+    fenced block). The other two contain no routing keywords, no URL and at
+    most one newline, so only the backtick rule can reject them.
+    """
+    prompts = [
+        "debug this traceback: ```python\nraise ValueError('bad')\n```",
+        "what does `ls -la` do?",
+        "explain ```x += 1``` please",
+    ]
+    for prompt in prompts:
+        assert choose_cheap_model_route(prompt, _BASE_CONFIG) is None
+
+
+def test_skips_tool_heavy_prompt_keywords():
+    """A short prompt made of complexity keywords keeps the primary model."""
+    keyword_prompt = "implement a patch for this docker error"
+    route = choose_cheap_model_route(keyword_prompt, _BASE_CONFIG)
+    assert route is None
+
+
+def test_resolve_turn_route_falls_back_to_primary_when_route_runtime_cannot_be_resolved(monkeypatch):
+    """If the cheap route's runtime cannot be resolved, the primary config is used."""
+    from agent.smart_model_routing import resolve_turn_route
+
+    def _always_fails(**kwargs):
+        raise RuntimeError("bad route")
+
+    monkeypatch.setattr(
+        "hermes_cli.runtime_provider.resolve_runtime_provider",
+        _always_fails,
+    )
+    primary = {
+        "model": "anthropic/claude-sonnet-4",
+        "provider": "openrouter",
+        "base_url": "https://openrouter.ai/api/v1",
+        "api_mode": "chat_completions",
+        "api_key": "sk-primary",
+    }
+
+    result = resolve_turn_route("what time is it in tokyo?", _BASE_CONFIG, primary)
+
+    assert result["model"] == "anthropic/claude-sonnet-4"
+    assert result["runtime"]["provider"] == "openrouter"
+    assert result["label"] is None
diff --git a/tests/hermes_cli/test_update_gateway_restart.py b/tests/hermes_cli/test_update_gateway_restart.py
new file mode 100644
index 000000000..b9cdecaa0
--- /dev/null
+++ b/tests/hermes_cli/test_update_gateway_restart.py
@@ -0,0 +1,305 @@
+"""Tests for cmd_update gateway auto-restart — systemd + launchd coverage.
+
+Ensures ``hermes update`` correctly detects running gateways managed by
+systemd (Linux) or launchd (macOS) and restarts/informs the user properly,
+rather than leaving zombie processes or telling users to manually restart
+when launchd will auto-respawn.
+"""
+
+import subprocess
+from types import SimpleNamespace
+from unittest.mock import patch, MagicMock
+
+import pytest
+
+import hermes_cli.gateway as gateway_cli
+from hermes_cli.main import cmd_update
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _make_run_side_effect(
+    branch="main",
+    verify_ok=True,
+    commit_count="3",
+    systemd_active=False,
+    launchctl_loaded=False,
+):
+    """Return a fake ``subprocess.run`` that answers git and service commands.
+
+    The command is joined into one string and matched by substring, in this
+    order: ``rev-parse --abbrev-ref``, ``rev-parse --verify``, ``rev-list``,
+    ``systemctl is-active``, ``systemctl restart``, ``launchctl list``.
+    Anything else succeeds with empty output.
+    """
+
+    def fake_run(cmd, **kwargs):
+        text = " ".join(str(part) for part in cmd)
+
+        def done(returncode, out="", err=""):
+            return subprocess.CompletedProcess(cmd, returncode, stdout=out, stderr=err)
+
+        def mentions(*needles):
+            return all(needle in text for needle in needles)
+
+        # git rev-parse --abbrev-ref HEAD -> current branch name
+        if mentions("rev-parse", "--abbrev-ref"):
+            return done(0, f"{branch}\n")
+
+        # git rev-parse --verify origin/{branch} -> does the remote branch exist
+        if mentions("rev-parse", "--verify"):
+            return done(0 if verify_ok else 128)
+
+        # git rev-list HEAD..origin/{branch} --count -> commits behind
+        if mentions("rev-list"):
+            return done(0, f"{commit_count}\n")
+
+        # systemctl --user is-active
+        if mentions("systemctl", "is-active"):
+            return done(0, "active\n") if systemd_active else done(3, "inactive\n")
+
+        # systemctl --user restart
+        if mentions("systemctl", "restart"):
+            return done(0)
+
+        # launchctl list ai.hermes.gateway
+        if mentions("launchctl", "list"):
+            if launchctl_loaded:
+                return done(0, "PID\tStatus\tLabel\n123\t0\tai.hermes.gateway\n")
+            return done(113, "", "Could not find service")
+
+        return done(0)
+
+    return fake_run
+
+
+@pytest.fixture
+def mock_args():
+ """Empty namespace handed to ``cmd_update`` as its ``args`` argument."""
+ return SimpleNamespace()
+
+
+# ---------------------------------------------------------------------------
+# Launchd plist includes --replace
+# ---------------------------------------------------------------------------
+
+
+class TestLaunchdPlistReplace:
+    """The generated launchd plist must include --replace so respawned
+    gateways kill stale instances."""
+
+    def test_plist_contains_replace_flag(self):
+        """The flag appears somewhere in the generated plist text."""
+        plist = gateway_cli.generate_launchd_plist()
+        assert "--replace" in plist
+
+    def test_plist_program_arguments_order(self):
+        """--replace comes after 'run' in the ProgramArguments."""
+        plist = gateway_cli.generate_launchd_plist()
+        lines = [line.strip() for line in plist.splitlines()]
+        # Keep only the <string>…</string> entries and strip the tags so the
+        # bare argument values can be compared. (With empty-string markers the
+        # filter would keep every line and never yield a bare "run".)
+        string_values = [
+            line.replace("<string>", "").replace("</string>", "")
+            for line in lines
+            if "<string>" in line and "</string>" in line
+        ]
+        assert "run" in string_values
+        assert "--replace" in string_values
+        run_idx = string_values.index("run")
+        replace_idx = string_values.index("--replace")
+        assert replace_idx == run_idx + 1
+
+
+# ---------------------------------------------------------------------------
+# refresh_launchd_plist_if_needed — stale plist rewrite (cmd_update tests follow)
+# ---------------------------------------------------------------------------
+
+
+class TestLaunchdPlistRefresh:
+    """refresh_launchd_plist_if_needed rewrites stale plists (like systemd's
+    refresh_systemd_unit_if_needed)."""
+
+    def test_refresh_rewrites_stale_plist(self, tmp_path, monkeypatch):
+        """A stale plist is regenerated, then the job is unloaded and loaded."""
+        plist_path = tmp_path / "ai.hermes.gateway.plist"
+        plist_path.write_text("old content")
+
+        monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
+
+        calls = []
+
+        def fake_run(cmd, check=False, **kwargs):
+            calls.append(cmd)
+            return SimpleNamespace(returncode=0, stdout="", stderr="")
+
+        monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
+
+        result = gateway_cli.refresh_launchd_plist_if_needed()
+
+        assert result is True
+        # Plist should now contain the generated content (which includes --replace)
+        assert "--replace" in plist_path.read_text()
+        # Compare the launchctl subcommand exactly and in order: a substring
+        # test for "load" would also be satisfied by "unload".
+        launchctl_ops = [cmd[1] for cmd in calls if cmd[0] == "launchctl"]
+        assert launchctl_ops == ["unload", "load"]
+
+    def test_refresh_skips_when_current(self, tmp_path, monkeypatch):
+        """An up-to-date plist is left alone and launchctl is never invoked."""
+        plist_path = tmp_path / "ai.hermes.gateway.plist"
+        monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
+
+        # Write the current expected content
+        plist_path.write_text(gateway_cli.generate_launchd_plist())
+
+        calls = []
+        monkeypatch.setattr(
+            gateway_cli.subprocess, "run",
+            lambda cmd, **kw: calls.append(cmd) or SimpleNamespace(returncode=0),
+        )
+
+        result = gateway_cli.refresh_launchd_plist_if_needed()
+
+        assert result is False
+        assert len(calls) == 0  # No launchctl calls needed
+
+    def test_refresh_skips_when_no_plist(self, tmp_path, monkeypatch):
+        """Without an installed plist there is nothing to refresh."""
+        plist_path = tmp_path / "nonexistent.plist"
+        monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
+
+        result = gateway_cli.refresh_launchd_plist_if_needed()
+        assert result is False
+
+    def test_launchd_start_calls_refresh(self, tmp_path, monkeypatch):
+        """launchd_start refreshes the plist before starting."""
+        plist_path = tmp_path / "ai.hermes.gateway.plist"
+        plist_path.write_text("old")
+        monkeypatch.setattr(gateway_cli, "get_launchd_plist_path", lambda: plist_path)
+
+        calls = []
+
+        def fake_run(cmd, check=False, **kwargs):
+            calls.append(cmd)
+            return SimpleNamespace(returncode=0, stdout="", stderr="")
+
+        monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
+
+        gateway_cli.launchd_start()
+
+        # Refresh (unload, load) must come first, then the start itself.
+        launchctl_ops = [cmd[1] for cmd in calls if cmd[0] == "launchctl"]
+        assert launchctl_ops == ["unload", "load", "start"]
+
+
+class TestCmdUpdateLaunchdRestart:
+    """cmd_update correctly detects and handles launchd on macOS."""
+
+    @patch("shutil.which", return_value=None)
+    @patch("subprocess.run")
+    def test_update_detects_launchd_and_skips_manual_restart_message(
+        self, mock_run, _mock_which, mock_args, capsys, tmp_path, monkeypatch,
+    ):
+        """When launchd is running the gateway, update should print
+        'Gateway restarted via launchd' instead of 'Restart it with: hermes gateway run'."""
+        # Create a fake launchd plist so is_macos + plist.exists() passes
+        plist_path = tmp_path / "ai.hermes.gateway.plist"
+        plist_path.write_text("")
+
+        monkeypatch.setattr(
+            gateway_cli, "is_macos", lambda: True,
+        )
+        monkeypatch.setattr(
+            gateway_cli, "get_launchd_plist_path", lambda: plist_path,
+        )
+
+        mock_run.side_effect = _make_run_side_effect(
+            commit_count="3",
+            launchctl_loaded=True,
+        )
+
+        # Mock get_running_pid to return a PID. os.kill is patched as well so
+        # a regression can never signal a real process with that PID.
+        with patch("gateway.status.get_running_pid", return_value=12345), \
+             patch("gateway.status.remove_pid_file"), \
+             patch("os.kill"):
+            cmd_update(mock_args)
+
+        captured = capsys.readouterr().out
+        assert "Gateway restarted via launchd" in captured
+        assert "Restart it with: hermes gateway run" not in captured
+        # launchd owns the lifecycle: the manual SIGTERM path (which prints
+        # "Stopped gateway process") must not have run.
+        assert "Stopped gateway process" not in captured
+        # Verify launchctl stop + start were called
+        launchctl_calls = [
+            c for c in mock_run.call_args_list
+            if len(c.args[0]) > 0 and c.args[0][0] == "launchctl"
+        ]
+        stop_calls = [c for c in launchctl_calls if "stop" in c.args[0]]
+        start_calls = [c for c in launchctl_calls if "start" in c.args[0]]
+        assert len(stop_calls) >= 1
+        assert len(start_calls) >= 1
+
+    @patch("shutil.which", return_value=None)
+    @patch("subprocess.run")
+    def test_update_without_launchd_shows_manual_restart(
+        self, mock_run, _mock_which, mock_args, capsys, tmp_path, monkeypatch,
+    ):
+        """When no service manager is running, update should show the manual restart hint."""
+        monkeypatch.setattr(
+            gateway_cli, "is_macos", lambda: True,
+        )
+        plist_path = tmp_path / "ai.hermes.gateway.plist"
+        # plist does NOT exist — no launchd service
+        monkeypatch.setattr(
+            gateway_cli, "get_launchd_plist_path", lambda: plist_path,
+        )
+
+        mock_run.side_effect = _make_run_side_effect(
+            commit_count="3",
+            launchctl_loaded=False,
+        )
+
+        with patch("gateway.status.get_running_pid", return_value=12345), \
+             patch("gateway.status.remove_pid_file"), \
+             patch("os.kill"):
+            cmd_update(mock_args)
+
+        captured = capsys.readouterr().out
+        assert "Restart it with: hermes gateway run" in captured
+        assert "Gateway restarted via launchd" not in captured
+
+    @patch("shutil.which", return_value=None)
+    @patch("subprocess.run")
+    def test_update_with_systemd_still_restarts_via_systemd(
+        self, mock_run, _mock_which, mock_args, capsys, monkeypatch,
+    ):
+        """On Linux with systemd active, update should restart via systemctl."""
+        monkeypatch.setattr(
+            gateway_cli, "is_macos", lambda: False,
+        )
+
+        mock_run.side_effect = _make_run_side_effect(
+            commit_count="3",
+            systemd_active=True,
+        )
+
+        with patch("gateway.status.get_running_pid", return_value=12345), \
+             patch("gateway.status.remove_pid_file"), \
+             patch("os.kill"):
+            cmd_update(mock_args)
+
+        captured = capsys.readouterr().out
+        assert "Gateway restarted" in captured
+        # Verify systemctl restart was called
+        restart_calls = [
+            c for c in mock_run.call_args_list
+            if "restart" in " ".join(str(a) for a in c.args[0])
+            and "systemctl" in " ".join(str(a) for a in c.args[0])
+        ]
+        assert len(restart_calls) == 1
+
+    @patch("shutil.which", return_value=None)
+    @patch("subprocess.run")
+    def test_update_no_gateway_running_skips_restart(
+        self, mock_run, _mock_which, mock_args, capsys, monkeypatch,
+    ):
+        """When no gateway is running, update should skip the restart section entirely."""
+        monkeypatch.setattr(
+            gateway_cli, "is_macos", lambda: False,
+        )
+
+        mock_run.side_effect = _make_run_side_effect(
+            commit_count="3",
+            systemd_active=False,
+        )
+
+        with patch("gateway.status.get_running_pid", return_value=None):
+            cmd_update(mock_args)
+
+        captured = capsys.readouterr().out
+        assert "Stopped gateway" not in captured
+        assert "Gateway restarted" not in captured
+        assert "Gateway restarted via launchd" not in captured
diff --git a/tests/test_cli_provider_resolution.py b/tests/test_cli_provider_resolution.py
index 3144bed80..99d8830fa 100644
--- a/tests/test_cli_provider_resolution.py
+++ b/tests/test_cli_provider_resolution.py
@@ -162,6 +162,57 @@ def test_runtime_resolution_rebuilds_agent_on_routing_change(monkeypatch):
assert shell.api_mode == "codex_responses"
+def test_cli_turn_routing_uses_primary_when_disabled(monkeypatch):
+ """With smart model routing disabled, a simple turn stays on the primary model."""
+ # NOTE(review): the monkeypatch fixture is requested but not used here.
+ cli = _import_cli()
+ shell = cli.HermesCLI(model="gpt-5", compact=True, max_turns=1)
+ # Primary runtime settings that the resolved turn config should keep.
+ shell.provider = "openrouter"
+ shell.api_mode = "chat_completions"
+ shell.base_url = "https://openrouter.ai/api/v1"
+ shell.api_key = "sk-primary"
+ # Routing is explicitly switched off; no cheap_model is configured.
+ shell._smart_model_routing = {"enabled": False}
+
+ result = shell._resolve_turn_agent_config("what time is it in tokyo?")
+
+ # Primary model and provider are returned, with no routing label.
+ assert result["model"] == "gpt-5"
+ assert result["runtime"]["provider"] == "openrouter"
+ assert result["label"] is None
+
+
+def test_cli_turn_routing_uses_cheap_model_when_simple(monkeypatch):
+ """With routing enabled, a short simple turn resolves to the cheap model's runtime."""
+ cli = _import_cli()
+
+ # Stub for resolve_runtime_provider: checks that the cheap provider ("zai")
+ # is the one requested and returns a fixed runtime for it.
+ def _runtime_resolve(**kwargs):
+ assert kwargs["requested"] == "zai"
+ return {
+ "provider": "zai",
+ "api_mode": "chat_completions",
+ "base_url": "https://open.z.ai/api/v1",
+ "api_key": "cheap-key",
+ "source": "env/config",
+ }
+
+ monkeypatch.setattr("hermes_cli.runtime_provider.resolve_runtime_provider", _runtime_resolve)
+
+ shell = cli.HermesCLI(model="anthropic/claude-sonnet-4", compact=True, max_turns=1)
+ # Primary runtime settings; the assertions below expect these to be
+ # replaced by the cheap route for this turn.
+ shell.provider = "openrouter"
+ shell.api_mode = "chat_completions"
+ shell.base_url = "https://openrouter.ai/api/v1"
+ shell.api_key = "primary-key"
+ # The test message below is 25 characters / 6 words, well under both
+ # limits -- presumably these are the thresholds the router compares against.
+ shell._smart_model_routing = {
+ "enabled": True,
+ "cheap_model": {"provider": "zai", "model": "glm-5-air"},
+ "max_simple_chars": 160,
+ "max_simple_words": 28,
+ }
+
+ result = shell._resolve_turn_agent_config("what time is it in tokyo?")
+
+ # Model, provider and API key all come from the cheap route, and a
+ # routing label is set.
+ assert result["model"] == "glm-5-air"
+ assert result["runtime"]["provider"] == "zai"
+ assert result["runtime"]["api_key"] == "cheap-key"
+ assert result["label"] is not None
+
+
def test_cli_prefers_config_provider_over_stale_env_override(monkeypatch):
cli = _import_cli()
diff --git a/website/docs/user-guide/configuration.md b/website/docs/user-guide/configuration.md
index 1df9c491f..56a2ab125 100644
--- a/website/docs/user-guide/configuration.md
+++ b/website/docs/user-guide/configuration.md
@@ -441,6 +441,39 @@ Supported providers: `openrouter`, `nous`, `openai-codex`, `anthropic`, `zai`, `
Fallback is configured exclusively through `config.yaml` — there are no environment variables for it. For full details on when it triggers, supported providers, and how it interacts with auxiliary tasks and delegation, see [Fallback Providers](/docs/user-guide/features/fallback-providers).
:::
+## Smart Model Routing
+
+Optional cheap-vs-strong routing lets Hermes keep your main model for complex work while sending very short/simple turns to a cheaper model.
+
+```yaml
+smart_model_routing:
+ enabled: true
+ max_simple_chars: 160
+ max_simple_words: 28
+ cheap_model:
+ provider: openrouter
+ model: google/gemini-2.5-flash
+ # base_url: http://localhost:8000/v1 # optional custom endpoint
+ # api_key_env: MY_CUSTOM_KEY # optional env var name for that endpoint's API key
+```
+
+How it works:
+- If a turn is short, single-line, and does not look code-, tool-, or debug-heavy, Hermes may route it to `cheap_model`
+- If the turn looks complex, Hermes stays on your primary model/provider
+- If the cheap route cannot be resolved cleanly, Hermes falls back to the primary model automatically
+
+This is intentionally conservative. It is meant for quick, low-stakes turns like:
+- short factual questions
+- quick rewrites
+- lightweight summaries
+
+It will avoid routing prompts that look like:
+- coding/debugging work
+- tool-heavy requests
+- long or multi-line analysis requests
+
+Use this when you want lower latency or cost without fully changing your default model.
+
## Terminal Backend Configuration
Configure which environment the agent uses for terminal commands: