diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py
index 957452fc3..255efe076 100644
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@@ -480,9 +480,44 @@ def _read_main_model() -> str:
     return ""
 
 
+def _resolve_custom_runtime() -> Tuple[Optional[str], Optional[str]]:
+    """Resolve the active custom/main endpoint the same way the main CLI does.
+
+    This covers both env-driven OPENAI_BASE_URL setups and config-saved custom
+    endpoints where the base URL lives in config.yaml instead of the live
+    environment.
+    """
+    try:
+        from hermes_cli.runtime_provider import resolve_runtime_provider
+
+        runtime = resolve_runtime_provider(requested="custom")
+    except Exception as exc:
+        logger.debug("Auxiliary client: custom runtime resolution failed: %s", exc)
+        return None, None
+
+    custom_base = runtime.get("base_url")
+    custom_key = runtime.get("api_key")
+    if not isinstance(custom_base, str) or not custom_base.strip():
+        return None, None
+    if not isinstance(custom_key, str) or not custom_key.strip():
+        return None, None
+
+    custom_base = custom_base.strip().rstrip("/")
+    if "openrouter.ai" in custom_base.lower():
+        # requested='custom' falls back to OpenRouter when no custom endpoint is
+        # configured. Treat that as "no custom endpoint" for auxiliary routing.
+        return None, None
+
+    return custom_base, custom_key.strip()
+
+
+def _current_custom_base_url() -> str:
+    custom_base, _ = _resolve_custom_runtime()
+    return custom_base or ""
+
+
 def _try_custom_endpoint() -> Tuple[Optional[OpenAI], Optional[str]]:
-    custom_base = os.getenv("OPENAI_BASE_URL")
-    custom_key = os.getenv("OPENAI_API_KEY")
+    custom_base, custom_key = _resolve_custom_runtime()
     if not custom_base or not custom_key:
         return None, None
     model = _read_main_model() or "gpt-4o-mini"
@@ -947,7 +982,7 @@ def auxiliary_max_tokens_param(value: int) -> dict:
     The Codex adapter translates max_tokens internally, so we use max_tokens
     for it as well.
     """
-    custom_base = os.getenv("OPENAI_BASE_URL", "")
+    custom_base = _current_custom_base_url()
     or_key = os.getenv("OPENROUTER_API_KEY")
     # Only use max_completion_tokens for direct OpenAI custom endpoints
     if (not or_key
@@ -1097,7 +1132,7 @@ def _build_call_kwargs(
         # Codex adapter handles max_tokens internally; OpenRouter/Nous use max_tokens.
         # Direct OpenAI api.openai.com with newer models needs max_completion_tokens.
         if provider == "custom":
-            custom_base = base_url or os.getenv("OPENAI_BASE_URL", "")
+            custom_base = base_url or _current_custom_base_url()
             if "api.openai.com" in custom_base.lower():
                 kwargs["max_completion_tokens"] = max_tokens
             else:
diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py
index 6e75c9b51..df9694843 100644
--- a/hermes_cli/gateway.py
+++ b/hermes_cli/gateway.py
@@ -141,6 +141,37 @@ def _service_scope_label(system: bool = False) -> str:
     return "system" if system else "user"
 
 
+def get_installed_systemd_scopes() -> list[str]:
+    scopes = []
+    seen_paths: set[Path] = set()
+    for system, label in ((False, "user"), (True, "system")):
+        unit_path = get_systemd_unit_path(system=system)
+        if unit_path in seen_paths:
+            continue
+        if unit_path.exists():
+            scopes.append(label)
+            seen_paths.add(unit_path)
+    return scopes
+
+
+def has_conflicting_systemd_units() -> bool:
+    return len(get_installed_systemd_scopes()) > 1
+
+
+def print_systemd_scope_conflict_warning() -> None:
+    scopes = get_installed_systemd_scopes()
+    if len(scopes) < 2:
+        return
+
+    rendered_scopes = " + ".join(scopes)
+    print_warning(f"Both user and system gateway services are installed ({rendered_scopes}).")
+    print_info("  This is confusing and can make start/stop/status behavior ambiguous.")
+    print_info("  Default gateway commands target the user service unless you pass --system.")
+    print_info("  Keep one of these:")
+    print_info("    hermes gateway uninstall")
+    print_info("    sudo hermes gateway uninstall --system")
+
+
 def _require_root_for_system_service(action: str) -> None:
     if os.geteuid() != 0:
         print(f"System gateway {action} requires root. Re-run with sudo.")
@@ -178,6 +209,57 @@ def _read_systemd_user_from_unit(unit_path: Path) -> str | None:
     return None
 
 
+def _default_system_service_user() -> str | None:
+    for candidate in (os.getenv("SUDO_USER"), os.getenv("USER"), os.getenv("LOGNAME")):
+        if candidate and candidate.strip() and candidate.strip() != "root":
+            return candidate.strip()
+    return None
+
+
+def prompt_linux_gateway_install_scope() -> str | None:
+    choice = prompt_choice(
+        "  Choose how the gateway should run in the background:",
+        [
+            "User service (no sudo; best for laptops/dev boxes; may need linger after logout)",
+            "System service (starts on boot; requires sudo; still runs as your user)",
+            "Skip service install for now",
+        ],
+        default=0,
+    )
+    return {0: "user", 1: "system", 2: None}[choice]
+
+
+def install_linux_gateway_from_setup(force: bool = False) -> tuple[str | None, bool]:
+    scope = prompt_linux_gateway_install_scope()
+    if scope is None:
+        return None, False
+
+    if scope == "system":
+        run_as_user = _default_system_service_user()
+        if os.geteuid() != 0:
+            print_warning("  System service install requires sudo, so Hermes can't create it from this user session.")
+            if run_as_user:
+                print_info(f"  After setup, run: sudo hermes gateway install --system --run-as-user {run_as_user}")
+            else:
+                print_info("  After setup, run: sudo hermes gateway install --system --run-as-user <your-user>")
+            print_info("  Then start it with: sudo hermes gateway start --system")
+            return scope, False
+
+        if not run_as_user:
+            while True:
+                run_as_user = prompt("  Run the system gateway service as which user?", default="")
+                run_as_user = (run_as_user or "").strip()
+                if run_as_user and run_as_user != "root":
+                    break
+                print_error("  Enter a non-root username.")
+
+        systemd_install(force=force, system=True, run_as_user=run_as_user)
+        return scope, True
+
+    systemd_install(force=force, system=False)
+    return scope, True
+
+
 def get_systemd_linger_status() -> tuple[bool | None, str]:
     """Return whether systemd user lingering is enabled for the current user.
 
@@ -462,6 +544,8 @@ def systemd_install(force: bool = False, system: bool = False, run_as_user: str
     else:
         _ensure_linger_enabled()
 
+    print_systemd_scope_conflict_warning()
+
 
 def systemd_uninstall(system: bool = False):
     system = _select_systemd_scope(system)
@@ -519,6 +603,10 @@ def systemd_status(deep: bool = False, system: bool = False):
         print(f"  Run: {'sudo ' if system else ''}hermes gateway install{scope_flag}")
         return
 
+    if has_conflicting_systemd_units():
+        print_systemd_scope_conflict_warning()
+        print()
+
     if not systemd_unit_is_current(system=system):
         print("⚠ Installed gateway service definition is outdated")
         print(f"  Run: {'sudo ' if system else ''}hermes gateway restart{scope_flag}  # auto-refreshes the unit")
@@ -1025,18 +1113,26 @@ def _is_service_installed() -> bool:
 def _is_service_running() -> bool:
     """Check if the gateway service is currently running."""
     if is_linux():
-        if get_systemd_unit_path(system=False).exists():
+        user_unit_exists = get_systemd_unit_path(system=False).exists()
+        system_unit_exists = get_systemd_unit_path(system=True).exists()
+
+        if user_unit_exists:
             result = subprocess.run(
                 _systemctl_cmd(False) + ["is-active", SERVICE_NAME],
                 capture_output=True, text=True
             )
-            return result.stdout.strip() == "active"
-        if get_systemd_unit_path(system=True).exists():
+            if result.stdout.strip() == "active":
+                return True
+
+        if system_unit_exists:
             result = subprocess.run(
                 _systemctl_cmd(True) + ["is-active", SERVICE_NAME],
                 capture_output=True, text=True
             )
-            return result.stdout.strip() == "active"
+            if result.stdout.strip() == "active":
+                return True
+
+        return False
     elif is_macos() and get_launchd_plist_path().exists():
         result = subprocess.run(
             ["launchctl", "list", "ai.hermes.gateway"],
@@ -1178,6 +1274,10 @@ def gateway_setup():
     service_installed = _is_service_installed()
     service_running = _is_service_running()
 
+    if is_linux() and has_conflicting_systemd_units():
+        print_systemd_scope_conflict_warning()
+        print()
+
     if service_installed and service_running:
         print_success("Gateway service is installed and running.")
     elif service_installed:
@@ -1259,16 +1359,18 @@ def gateway_setup():
                 platform_name = "systemd" if is_linux() else "launchd"
                 if prompt_yes_no(f"  Install the gateway as a {platform_name} service? (runs in background, starts on boot)", True):
                     try:
-                        force = False
+                        installed_scope = None
+                        did_install = False
                         if is_linux():
-                            systemd_install(force)
+                            installed_scope, did_install = install_linux_gateway_from_setup(force=False)
                         else:
-                            launchd_install(force)
+                            launchd_install(force=False)
+                            did_install = True
                         print()
-                        if prompt_yes_no("  Start the service now?", True):
+                        if did_install and prompt_yes_no("  Start the service now?", True):
                             try:
                                 if is_linux():
-                                    systemd_start()
+                                    systemd_start(system=installed_scope == "system")
                                 else:
                                     launchd_start()
                             except subprocess.CalledProcessError as e:
@@ -1278,6 +1380,8 @@ def gateway_setup():
                         print_info("  You can try manually: hermes gateway install")
                 else:
                     print_info("  You can install later: hermes gateway install")
+                    if is_linux():
+                        print_info("  Or as a boot-time service: sudo hermes gateway install --system")
                     print_info("  Or run in foreground:  hermes gateway")
             else:
                 print_info("  Service install not supported on this platform.")
diff --git a/hermes_cli/runtime_provider.py b/hermes_cli/runtime_provider.py
index fead68000..e0535357a 100644
--- a/hermes_cli/runtime_provider.py
+++ b/hermes_cli/runtime_provider.py
@@ -144,10 +144,16 @@ def _resolve_openrouter_runtime(
     env_openrouter_base_url = os.getenv("OPENROUTER_BASE_URL", "").strip()
 
     use_config_base_url = False
-    if requested_norm == "auto":
-        if cfg_base_url.strip() and not explicit_base_url and not env_openai_base_url:
+    if cfg_base_url.strip() and not explicit_base_url and not env_openai_base_url:
+        if requested_norm == "auto":
             if not cfg_provider or cfg_provider == "auto":
                 use_config_base_url = True
+        elif requested_norm == "custom":
+            # Persisted custom endpoints store their base URL in config.yaml.
+            # If OPENAI_BASE_URL is not currently set in the environment, keep
+            # honoring that saved endpoint instead of falling back to OpenRouter.
+            if cfg_provider == "custom":
+                use_config_base_url = True
 
     # When the user explicitly requested the openrouter provider, skip
     # OPENAI_BASE_URL — it typically points to a custom / non-OpenRouter
diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py
index 051de13c1..ef5f0969f 100644
--- a/hermes_cli/setup.py
+++ b/hermes_cli/setup.py
@@ -2240,7 +2240,9 @@ def setup_gateway(config: dict):
         from hermes_cli.gateway import (
             _is_service_installed,
             _is_service_running,
-            systemd_install,
+            has_conflicting_systemd_units,
+            install_linux_gateway_from_setup,
+            print_systemd_scope_conflict_warning,
             systemd_start,
             systemd_restart,
             launchd_install,
@@ -2252,6 +2254,10 @@ def setup_gateway(config: dict):
         service_running = _is_service_running()
 
         print()
+        if _is_linux and has_conflicting_systemd_units():
+            print_systemd_scope_conflict_warning()
+            print()
+
         if service_running:
             if prompt_yes_no("  Restart the gateway to pick up changes?", True):
                 try:
@@ -2277,15 +2283,18 @@ def setup_gateway(config: dict):
                 True,
             ):
                 try:
+                    installed_scope = None
+                    did_install = False
                     if _is_linux:
-                        systemd_install(force=False)
+                        installed_scope, did_install = install_linux_gateway_from_setup(force=False)
                     else:
                         launchd_install(force=False)
+                        did_install = True
                     print()
-                    if prompt_yes_no("  Start the service now?", True):
+                    if did_install and prompt_yes_no("  Start the service now?", True):
                         try:
                             if _is_linux:
-                                systemd_start()
+                                systemd_start(system=installed_scope == "system")
                             elif _is_macos:
                                 launchd_start()
                         except Exception as e:
@@ -2295,6 +2304,8 @@ def setup_gateway(config: dict):
                     print_info("  You can try manually: hermes gateway install")
             else:
                 print_info("  You can install later: hermes gateway install")
+                if _is_linux:
+                    print_info("  Or as a boot-time service: sudo hermes gateway install --system")
                 print_info("  Or run in foreground:  hermes gateway")
         else:
             print_info("Start the gateway to bring your bots online:")
diff --git a/tests/agent/test_auxiliary_client.py b/tests/agent/test_auxiliary_client.py
index d60e3c813..3ddb6d7c6 100644
--- a/tests/agent/test_auxiliary_client.py
+++ b/tests/agent/test_auxiliary_client.py
@@ -165,6 +165,29 @@ class TestGetTextAuxiliaryClient:
         assert model is None
         mock_openai.assert_not_called()
 
+    def test_custom_endpoint_uses_config_saved_base_url(self, monkeypatch):
+        config = {
+            "model": {
+                "provider": "custom",
+                "base_url": "http://localhost:1234/v1",
+                "default": "my-local-model",
+            }
+        }
+        monkeypatch.setenv("OPENAI_API_KEY", "lm-studio-key")
+        monkeypatch.setattr("hermes_cli.config.load_config", lambda: config)
+        monkeypatch.setattr("hermes_cli.runtime_provider.load_config", lambda: config)
+
+        with patch("agent.auxiliary_client._read_nous_auth", return_value=None), \
+             patch("agent.auxiliary_client._read_codex_access_token", return_value=None), \
+             patch("agent.auxiliary_client._resolve_api_key_provider", return_value=(None, None)), \
+             patch("agent.auxiliary_client.OpenAI") as mock_openai:
+            client, model = get_text_auxiliary_client()
+
+        assert client is not None
+        assert model == "my-local-model"
+        call_kwargs = mock_openai.call_args
+        assert call_kwargs.kwargs["base_url"] == "http://localhost:1234/v1"
+
     def test_codex_fallback_when_nothing_else(self, codex_auth_dir):
         with patch("agent.auxiliary_client._read_nous_auth", return_value=None), \
              patch("agent.auxiliary_client.OpenAI") as mock_openai:
@@ -364,6 +387,27 @@ class TestResolveForcedProvider:
             client, model = _resolve_forced_provider("main")
         assert model == "my-local-model"
 
+    def test_forced_main_uses_config_saved_custom_endpoint(self, monkeypatch):
+        config = {
+            "model": {
+                "provider": "custom",
+                "base_url": "http://local:8080/v1",
+                "default": "my-local-model",
+            }
+        }
+        monkeypatch.setenv("OPENAI_API_KEY", "local-key")
+        monkeypatch.setattr("hermes_cli.config.load_config", lambda: config)
+        monkeypatch.setattr("hermes_cli.runtime_provider.load_config", lambda: config)
+        with patch("agent.auxiliary_client._read_nous_auth", return_value=None), \
+             patch("agent.auxiliary_client._read_codex_access_token", return_value=None), \
+             patch("agent.auxiliary_client._resolve_api_key_provider", return_value=(None, None)), \
+             patch("agent.auxiliary_client.OpenAI") as mock_openai:
+            client, model = _resolve_forced_provider("main")
+        assert client is not None
+        assert model == "my-local-model"
+        call_kwargs = mock_openai.call_args
+        assert call_kwargs.kwargs["base_url"] == "http://local:8080/v1"
+
     def test_forced_main_skips_openrouter_nous(self, monkeypatch):
         """Even if OpenRouter key is set, 'main' skips it."""
         monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
diff --git a/tests/hermes_cli/test_gateway.py b/tests/hermes_cli/test_gateway.py
index d3f4bb9e8..29da657e2 100644
--- a/tests/hermes_cli/test_gateway.py
+++ b/tests/hermes_cli/test_gateway.py
@@ -115,3 +115,57 @@ def test_systemd_install_system_scope_skips_linger_and_uses_systemctl(monkeypatc
     assert helper_calls == []
     assert "Configured to run as: alice" not in out  # generated test unit has no User= line
     assert "System service installed and enabled" in out
+
+
+def test_conflicting_systemd_units_warning(monkeypatch, tmp_path, capsys):
+    user_unit = tmp_path / "user" / "hermes-gateway.service"
+    system_unit = tmp_path / "system" / "hermes-gateway.service"
+    user_unit.parent.mkdir(parents=True)
+    system_unit.parent.mkdir(parents=True)
+    user_unit.write_text("[Unit]\n", encoding="utf-8")
+    system_unit.write_text("[Unit]\n", encoding="utf-8")
+
+    monkeypatch.setattr(
+        gateway,
+        "get_systemd_unit_path",
+        lambda system=False: system_unit if system else user_unit,
+    )
+
+    gateway.print_systemd_scope_conflict_warning()
+
+    out = capsys.readouterr().out
+    assert "Both user and system gateway services are installed" in out
+    assert "hermes gateway uninstall" in out
+    assert "--system" in out
+
+
+def test_install_linux_gateway_from_setup_system_choice_without_root_prints_followup(monkeypatch, capsys):
+    monkeypatch.setattr(gateway, "prompt_linux_gateway_install_scope", lambda: "system")
+    monkeypatch.setattr(gateway.os, "geteuid", lambda: 1000)
+    monkeypatch.setattr(gateway, "_default_system_service_user", lambda: "alice")
+    monkeypatch.setattr(gateway, "systemd_install", lambda *args, **kwargs: (_ for _ in ()).throw(AssertionError("should not install")))
+
+    scope, did_install = gateway.install_linux_gateway_from_setup(force=False)
+
+    out = capsys.readouterr().out
+    assert (scope, did_install) == ("system", False)
+    assert "sudo hermes gateway install --system --run-as-user alice" in out
+    assert "sudo hermes gateway start --system" in out
+
+
+def test_install_linux_gateway_from_setup_system_choice_as_root_installs(monkeypatch):
+    monkeypatch.setattr(gateway, "prompt_linux_gateway_install_scope", lambda: "system")
+    monkeypatch.setattr(gateway.os, "geteuid", lambda: 0)
+    monkeypatch.setattr(gateway, "_default_system_service_user", lambda: "alice")
+
+    calls = []
+    monkeypatch.setattr(
+        gateway,
+        "systemd_install",
+        lambda force=False, system=False, run_as_user=None: calls.append((force, system, run_as_user)),
+    )
+
+    scope, did_install = gateway.install_linux_gateway_from_setup(force=True)
+
+    assert (scope, did_install) == ("system", True)
+    assert calls == [(True, True, "alice")]
diff --git a/tests/hermes_cli/test_gateway_service.py b/tests/hermes_cli/test_gateway_service.py
index 1cc0968da..ce41a57a1 100644
--- a/tests/hermes_cli/test_gateway_service.py
+++ b/tests/hermes_cli/test_gateway_service.py
@@ -78,6 +78,31 @@ class TestGatewayStopCleanup:
         assert kill_calls == [False]
 
 
+class TestGatewayServiceDetection:
+    def test_is_service_running_checks_system_scope_when_user_scope_is_inactive(self, monkeypatch):
+        user_unit = SimpleNamespace(exists=lambda: True)
+        system_unit = SimpleNamespace(exists=lambda: True)
+
+        monkeypatch.setattr(gateway_cli, "is_linux", lambda: True)
+        monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
+        monkeypatch.setattr(
+            gateway_cli,
+            "get_systemd_unit_path",
+            lambda system=False: system_unit if system else user_unit,
+        )
+
+        def fake_run(cmd, capture_output=True, text=True, **kwargs):
+            if cmd == ["systemctl", "--user", "is-active", gateway_cli.SERVICE_NAME]:
+                return SimpleNamespace(returncode=0, stdout="inactive\n", stderr="")
+            if cmd == ["systemctl", "is-active", gateway_cli.SERVICE_NAME]:
+                return SimpleNamespace(returncode=0, stdout="active\n", stderr="")
+            raise AssertionError(f"Unexpected command: {cmd}")
+
+        monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
+
+        assert gateway_cli._is_service_running() is True
+
+
 class TestGatewaySystemServiceRouting:
     def test_gateway_install_passes_system_flags(self, monkeypatch):
         monkeypatch.setattr(gateway_cli, "is_linux", lambda: True)
diff --git a/tests/test_runtime_provider_resolution.py b/tests/test_runtime_provider_resolution.py
index a53c716a3..52d4a1d4f 100644
--- a/tests/test_runtime_provider_resolution.py
+++ b/tests/test_runtime_provider_resolution.py
@@ -131,13 +131,36 @@ def test_custom_endpoint_prefers_openai_key(monkeypatch):
     monkeypatch.setattr(rp, "_get_model_config", lambda: {})
     monkeypatch.setenv("OPENAI_BASE_URL", "https://api.z.ai/api/coding/paas/v4")
     monkeypatch.delenv("OPENROUTER_BASE_URL", raising=False)
-    monkeypatch.setenv("OPENAI_API_KEY", "sk-zai-correct-key")
-    monkeypatch.setenv("OPENROUTER_API_KEY", "sk-or-wrong-key-for-zai")
+    monkeypatch.setenv("OPENAI_API_KEY", "zai-key")
+    monkeypatch.setenv("OPENROUTER_API_KEY", "openrouter-key")
 
     resolved = rp.resolve_runtime_provider(requested="custom")
 
     assert resolved["base_url"] == "https://api.z.ai/api/coding/paas/v4"
-    assert resolved["api_key"] == "sk-zai-correct-key"
+    assert resolved["api_key"] == "zai-key"
+
+
+def test_custom_endpoint_uses_saved_config_base_url_when_env_missing(monkeypatch):
+    """Persisted custom endpoints in config.yaml must still resolve when
+    OPENAI_BASE_URL is absent from the current environment."""
+    monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "openrouter")
+    monkeypatch.setattr(
+        rp,
+        "_get_model_config",
+        lambda: {
+            "provider": "custom",
+            "base_url": "http://127.0.0.1:1234/v1",
+        },
+    )
+    monkeypatch.delenv("OPENAI_BASE_URL", raising=False)
+    monkeypatch.delenv("OPENROUTER_BASE_URL", raising=False)
+    monkeypatch.setenv("OPENAI_API_KEY", "local-key")
+    monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
+
+    resolved = rp.resolve_runtime_provider(requested="custom")
+
+    assert resolved["base_url"] == "http://127.0.0.1:1234/v1"
+    assert resolved["api_key"] == "local-key"
 
 
 def test_custom_endpoint_auto_provider_prefers_openai_key(monkeypatch):
diff --git a/website/docs/developer-guide/provider-runtime.md b/website/docs/developer-guide/provider-runtime.md
index 08f950509..77832fc92 100644
--- a/website/docs/developer-guide/provider-runtime.md
+++ b/website/docs/developer-guide/provider-runtime.md
@@ -27,10 +27,12 @@ If you are trying to add a new first-class inference provider, read [Adding Prov
 At a high level, provider resolution uses:
 
 1. explicit CLI/runtime request
-2. environment variables
-3. `config.yaml` model/provider config
+2. `config.yaml` model/provider config
+3. environment variables
 4. provider-specific defaults or auto resolution
 
+That ordering matters because Hermes treats the saved model/provider choice as the source of truth for normal runs. This prevents a stale shell export from silently overriding the endpoint a user last selected in `hermes model`.
+
 ## Providers
 
 Current provider families include:
@@ -70,11 +72,17 @@ This resolver is the main reason Hermes can share auth/runtime logic between:
 
 Hermes contains logic to avoid leaking the wrong API key to a custom endpoint when both `OPENROUTER_API_KEY` and `OPENAI_API_KEY` exist.
 
+It also distinguishes between:
+
+- a real custom endpoint selected by the user
+- the OpenRouter fallback path used when no custom endpoint is configured
+
 That distinction is especially important for:
 
 - local model servers
 - non-OpenRouter OpenAI-compatible APIs
 - switching providers without re-running setup
+- config-saved custom endpoints that should keep working even when `OPENAI_BASE_URL` is not exported in the current shell
 
 ## Native Anthropic path
 
@@ -114,6 +122,12 @@ Auxiliary tasks such as:
 
 can use their own provider/model routing rather than the main conversational model.
 
+When an auxiliary task is configured with provider `main`, Hermes resolves that through the same shared runtime path as normal chat. In practice that means:
+
+- env-driven custom endpoints still work
+- custom endpoints saved via `hermes model` / `config.yaml` also work
+- auxiliary routing can tell the difference between a real saved custom endpoint and the OpenRouter fallback
+
 ## Fallback models
 
 Hermes also supports a configured fallback model/provider, allowing runtime failover in supported error paths.
diff --git a/website/docs/reference/faq.md b/website/docs/reference/faq.md
index 02a82dce7..4d7be7aa0 100644
--- a/website/docs/reference/faq.md
+++ b/website/docs/reference/faq.md
@@ -50,6 +50,8 @@ hermes config set OPENAI_API_KEY ollama                       # Any non-empty va
 hermes config set HERMES_MODEL llama3.1
 ```
 
+You can also save the endpoint interactively with `hermes model`. Hermes persists that custom endpoint in `config.yaml`, and auxiliary tasks configured with provider `main` follow the same saved endpoint.
+
 This works with Ollama, vLLM, llama.cpp server, SGLang, LocalAI, and others. See the [Configuration guide](../user-guide/configuration.md) for details.
 
 ### How much does it cost?
diff --git a/website/docs/user-guide/configuration.md b/website/docs/user-guide/configuration.md
index 0a1c50cb0..7e368ecf7 100644
--- a/website/docs/user-guide/configuration.md
+++ b/website/docs/user-guide/configuration.md
@@ -69,7 +69,7 @@ You need at least one way to connect to an LLM. Use `hermes model` to switch pro
 | **Kimi / Moonshot** | `KIMI_API_KEY` in `~/.hermes/.env` (provider: `kimi-coding`) |
 | **MiniMax** | `MINIMAX_API_KEY` in `~/.hermes/.env` (provider: `minimax`) |
 | **MiniMax China** | `MINIMAX_CN_API_KEY` in `~/.hermes/.env` (provider: `minimax-cn`) |
-| **Custom Endpoint** | `OPENAI_BASE_URL` + `OPENAI_API_KEY` in `~/.hermes/.env` |
+| **Custom Endpoint** | `hermes model` (saved in `config.yaml`) or `OPENAI_BASE_URL` + `OPENAI_API_KEY` in `~/.hermes/.env` |
 
 :::info Codex Note
 The OpenAI Codex provider authenticates via device code (open a URL, enter a code). Hermes stores the resulting credentials in its own auth store under `~/.hermes/auth.json` and can import existing Codex CLI credentials from `~/.codex/auth.json` when present. No Codex CLI installation is required.
@@ -163,10 +163,12 @@ hermes model
 ```bash
 # Add to ~/.hermes/.env
 OPENAI_BASE_URL=http://localhost:8000/v1
-OPENAI_API_KEY=your-key-or-dummy
+OPENAI_API_KEY=***
 LLM_MODEL=your-model-name
 ```
 
+`hermes model` and the manual `.env` approach end up in the same runtime path. If you save a custom endpoint through `hermes model`, Hermes persists the provider + base URL in `config.yaml` so later sessions keep using that endpoint even if `OPENAI_BASE_URL` is not exported in your current shell.
+
 Everything below follows this same pattern — just change the URL, key, and model name.
 
 ---
@@ -604,7 +606,7 @@ AUXILIARY_VISION_MODEL=openai/gpt-4o
 | `"openrouter"` | Force OpenRouter — routes to any model (Gemini, GPT-4o, Claude, etc.) | `OPENROUTER_API_KEY` |
 | `"nous"` | Force Nous Portal | `hermes login` |
 | `"codex"` | Force Codex OAuth (ChatGPT account). Supports vision (gpt-5.3-codex). | `hermes model` → Codex |
-| `"main"` | Use your custom endpoint (`OPENAI_BASE_URL` + `OPENAI_API_KEY`). Works with OpenAI, local models, or any OpenAI-compatible API. | `OPENAI_BASE_URL` + `OPENAI_API_KEY` |
+| `"main"` | Use your active custom/main endpoint. This can come from `OPENAI_BASE_URL` + `OPENAI_API_KEY` or from a custom endpoint saved via `hermes model` / `config.yaml`. Works with OpenAI, local models, or any OpenAI-compatible API. | Custom endpoint credentials + base URL |
 
 ### Common Setups
 
@@ -651,10 +653,12 @@ auxiliary:
 ```yaml
 auxiliary:
   vision:
-    provider: "main"      # uses your OPENAI_BASE_URL endpoint
+    provider: "main"      # uses your active custom endpoint
     model: "my-local-model"
 ```
 
+`provider: "main"` follows the same custom endpoint Hermes uses for normal chat. That endpoint can be set directly with `OPENAI_BASE_URL`, or saved once through `hermes model` and persisted in `config.yaml`.
+
 :::tip
 If you use Codex OAuth as your main model provider, vision works automatically — no extra configuration needed. Codex is included in the auto-detection chain for vision.
 :::