diff --git a/cli.py b/cli.py index 978b36091..e5f88e752 100644 --- a/cli.py +++ b/cli.py @@ -2192,7 +2192,31 @@ class HermesCLI: # Show tool availability warnings if any tools are disabled self._show_tool_availability_warnings() - + + # Warn about very low context lengths (common with local servers) + if ctx_len and ctx_len <= 8192: + self.console.print() + self.console.print( + f"[yellow]⚠️ Context length is only {ctx_len:,} tokens — " + f"this is likely too low for agent use with tools.[/]" + ) + self.console.print( + "[dim] Hermes needs 16k–32k minimum. Tool schemas + system prompt alone use ~4k–8k.[/]" + ) + base_url = getattr(self, "base_url", "") or "" + if "11434" in base_url or "ollama" in base_url.lower(): + self.console.print( + "[dim] Ollama fix: OLLAMA_CONTEXT_LENGTH=32768 ollama serve[/]" + ) + elif "1234" in base_url: + self.console.print( + "[dim] LM Studio fix: Set context length in model settings → reload model[/]" + ) + else: + self.console.print( + "[dim] Fix: Set model.context_length in config.yaml, or increase your server's context setting[/]" + ) + self.console.print() def _preload_resumed_session(self) -> bool: diff --git a/tests/test_cli_context_warning.py b/tests/test_cli_context_warning.py new file mode 100644 index 000000000..fa0305a27 --- /dev/null +++ b/tests/test_cli_context_warning.py @@ -0,0 +1,147 @@ +"""Tests for the low context length warning in the CLI banner.""" + +import os +from types import SimpleNamespace +from unittest.mock import MagicMock, patch + +import pytest + + +@pytest.fixture +def _isolate(tmp_path, monkeypatch): + """Isolate HERMES_HOME so tests don't touch real config.""" + home = tmp_path / ".hermes" + home.mkdir() + monkeypatch.setenv("HERMES_HOME", str(home)) + + +@pytest.fixture +def cli_obj(_isolate): + """Create a minimal HermesCLI instance for banner testing.""" + with patch("cli.load_cli_config", return_value={ + "display": {"tool_progress": "new"}, + "terminal": {}, + }), 
patch("cli.get_tool_definitions", return_value=[]), \ + patch("cli.build_welcome_banner"): + from cli import HermesCLI + obj = HermesCLI.__new__(HermesCLI) + obj.model = "test-model" + obj.enabled_toolsets = ["hermes-core"] + obj.compact = False + obj.console = MagicMock() + obj.session_id = None + obj.api_key = "test" + obj.base_url = "" + # Mock agent with context compressor + obj.agent = SimpleNamespace( + context_compressor=SimpleNamespace(context_length=None) + ) + return obj + + +class TestLowContextWarning: + """Tests that the CLI warns about low context lengths.""" + + def test_no_warning_for_normal_context(self, cli_obj): + """No warning when context is 32k+.""" + cli_obj.agent.context_compressor.context_length = 32768 + with patch("cli.get_tool_definitions", return_value=[]), \ + patch("cli.build_welcome_banner"): + cli_obj.show_banner() + + # Check that no yellow warning was printed + calls = [str(c) for c in cli_obj.console.print.call_args_list] + warning_calls = [c for c in calls if "too low" in c] + assert len(warning_calls) == 0 + + def test_warning_for_low_context(self, cli_obj): + """Warning shown when context is 4096 (Ollama default).""" + cli_obj.agent.context_compressor.context_length = 4096 + with patch("cli.get_tool_definitions", return_value=[]), \ + patch("cli.build_welcome_banner"): + cli_obj.show_banner() + + calls = [str(c) for c in cli_obj.console.print.call_args_list] + warning_calls = [c for c in calls if "too low" in c] + assert len(warning_calls) == 1 + assert "4,096" in warning_calls[0] + + def test_warning_for_2048_context(self, cli_obj): + """Warning shown for 2048 tokens (common LM Studio default).""" + cli_obj.agent.context_compressor.context_length = 2048 + with patch("cli.get_tool_definitions", return_value=[]), \ + patch("cli.build_welcome_banner"): + cli_obj.show_banner() + + calls = [str(c) for c in cli_obj.console.print.call_args_list] + warning_calls = [c for c in calls if "too low" in c] + assert len(warning_calls) == 1 
+ + def test_warning_at_boundary(self, cli_obj): + """Warning shown at exactly 8192 — the low-context threshold is inclusive (<= 8192).""" + cli_obj.agent.context_compressor.context_length = 8192 + with patch("cli.get_tool_definitions", return_value=[]), \ + patch("cli.build_welcome_banner"): + cli_obj.show_banner() + + calls = [str(c) for c in cli_obj.console.print.call_args_list] + warning_calls = [c for c in calls if "too low" in c] + assert len(warning_calls) == 1 # 8192 is still warned about + + def test_no_warning_above_boundary(self, cli_obj): + """No warning at 16384.""" + cli_obj.agent.context_compressor.context_length = 16384 + with patch("cli.get_tool_definitions", return_value=[]), \ + patch("cli.build_welcome_banner"): + cli_obj.show_banner() + + calls = [str(c) for c in cli_obj.console.print.call_args_list] + warning_calls = [c for c in calls if "too low" in c] + assert len(warning_calls) == 0 + + def test_ollama_specific_hint(self, cli_obj): + """Ollama-specific fix shown when port 11434 detected.""" + cli_obj.agent.context_compressor.context_length = 4096 + cli_obj.base_url = "http://localhost:11434/v1" + with patch("cli.get_tool_definitions", return_value=[]), \ + patch("cli.build_welcome_banner"): + cli_obj.show_banner() + + calls = [str(c) for c in cli_obj.console.print.call_args_list] + ollama_hints = [c for c in calls if "OLLAMA_CONTEXT_LENGTH" in c] + assert len(ollama_hints) == 1 + + def test_lm_studio_specific_hint(self, cli_obj): + """LM Studio-specific fix shown when port 1234 detected.""" + cli_obj.agent.context_compressor.context_length = 2048 + cli_obj.base_url = "http://localhost:1234/v1" + with patch("cli.get_tool_definitions", return_value=[]), \ + patch("cli.build_welcome_banner"): + cli_obj.show_banner() + + calls = [str(c) for c in cli_obj.console.print.call_args_list] + lms_hints = [c for c in calls if "LM Studio" in c] + assert len(lms_hints) == 1 + + def test_generic_hint_for_other_servers(self, cli_obj): + """Generic fix shown for
unknown servers.""" + cli_obj.agent.context_compressor.context_length = 4096 + cli_obj.base_url = "http://localhost:8080/v1" + with patch("cli.get_tool_definitions", return_value=[]), \ + patch("cli.build_welcome_banner"): + cli_obj.show_banner() + + calls = [str(c) for c in cli_obj.console.print.call_args_list] + generic_hints = [c for c in calls if "config.yaml" in c] + assert len(generic_hints) == 1 + + def test_no_warning_when_no_context_length(self, cli_obj): + """No warning when context length is not yet known.""" + cli_obj.agent.context_compressor.context_length = None + with patch("cli.get_tool_definitions", return_value=[]), \ + patch("cli.build_welcome_banner"): + cli_obj.show_banner() + + calls = [str(c) for c in cli_obj.console.print.call_args_list] + warning_calls = [c for c in calls if "too low" in c] + assert len(warning_calls) == 0 diff --git a/website/docs/integrations/providers.md b/website/docs/integrations/providers.md index ab4c8f354..7740e36db 100644 --- a/website/docs/integrations/providers.md +++ b/website/docs/integrations/providers.md @@ -218,15 +218,11 @@ model: api_key: your-key-or-leave-empty-for-local ``` -**Environment variables (`.env` file):** -```bash -# Add to ~/.hermes/.env -OPENAI_BASE_URL=http://localhost:8000/v1 -OPENAI_API_KEY=your-key # Any non-empty string for local servers -LLM_MODEL=your-model-name -``` +:::warning Legacy env vars +`OPENAI_BASE_URL` and `LLM_MODEL` in `.env` are **deprecated**. The CLI ignores `LLM_MODEL` entirely (only the gateway reads it). Use `hermes model` or edit `config.yaml` directly — both persist correctly across restarts and Docker containers. +::: -All three approaches end up in the same runtime path. `hermes model` persists provider, model, and base URL to `config.yaml` so later sessions keep using that endpoint even if env vars are not set. +Both approaches persist to `config.yaml`, which is the source of truth for model, provider, and base URL. 
### Switching Models with `/model` @@ -257,23 +253,73 @@ Everything below follows this same pattern — just change the URL, key, and mod ### Ollama — Local Models, Zero Config -[Ollama](https://ollama.com/) runs open-weight models locally with one command. Best for: quick local experimentation, privacy-sensitive work, offline use. +[Ollama](https://ollama.com/) runs open-weight models locally with one command. Best for: quick local experimentation, privacy-sensitive work, offline use. Supports tool calling via the OpenAI-compatible API. ```bash # Install and run a model -ollama pull llama3.1:70b +ollama pull qwen2.5-coder:32b ollama serve # Starts on port 11434 - -# Configure Hermes -OPENAI_BASE_URL=http://localhost:11434/v1 -OPENAI_API_KEY=ollama # Any non-empty string -LLM_MODEL=llama3.1:70b ``` -Ollama's OpenAI-compatible endpoint supports chat completions, streaming, and tool calling (for supported models). No GPU required for smaller models — Ollama handles CPU inference automatically. +Then configure Hermes: + +```bash +hermes model +# Select "Custom endpoint (self-hosted / VLLM / etc.)" +# Enter URL: http://localhost:11434/v1 +# Skip API key (Ollama doesn't need one) +# Enter model name (e.g. qwen2.5-coder:32b) +``` + +Or configure `config.yaml` directly: + +```yaml +model: + default: qwen2.5-coder:32b + provider: custom + base_url: http://localhost:11434/v1 + context_length: 32768 # See warning below +``` + +:::caution Ollama defaults to very low context lengths +Ollama does **not** use your model's full context window by default. Depending on your VRAM, the default is: + +| Available VRAM | Default context | +|----------------|----------------| +| Less than 24 GB | **4,096 tokens** | +| 24–48 GB | 32,768 tokens | +| 48+ GB | 256,000 tokens | + +For agent use with tools, **you need at least 16k–32k context**. At 4k, the system prompt + tool schemas alone can fill the window, leaving no room for conversation. 
+ + **How to increase it** (pick one): + + ```bash + # Option 1: Set server-wide via environment variable (recommended) + OLLAMA_CONTEXT_LENGTH=32768 ollama serve + + # Option 2: For systemd-managed Ollama + sudo systemctl edit ollama.service + # Add: Environment="OLLAMA_CONTEXT_LENGTH=32768" + # Then: sudo systemctl daemon-reload && sudo systemctl restart ollama + + # Option 3: Bake it into a custom model (persistent per-model) + echo -e "FROM qwen2.5-coder:32b\nPARAMETER num_ctx 32768" > Modelfile + ollama create qwen2.5-coder-32k -f Modelfile + ``` + + **You cannot set context length through the OpenAI-compatible API** (`/v1/chat/completions`). It must be configured server-side or via a Modelfile. This is the #1 source of confusion when integrating Ollama with tools like Hermes. + ::: + + **Verify your context is set correctly:** + + ```bash + ollama ps + # Look at the CONTEXT column — it should show your configured value + ``` :::tip -List available models with `ollama list`. Pull any model from the [Ollama library](https://ollama.com/library) with `ollama pull <model-name>`. +List available models with `ollama list`. Pull any model from the [Ollama library](https://ollama.com/library) with `ollama pull <model-name>`. Ollama handles GPU offloading automatically — no configuration needed for most setups. ::: --- @@ -283,19 +329,39 @@ List available models with `ollama list`. Pull any model from the [Ollama librar [vLLM](https://docs.vllm.ai/) is the standard for production LLM serving. Best for: maximum throughput on GPU hardware, serving large models, continuous batching. 
 ```bash -# Start vLLM server pip install vllm vllm serve meta-llama/Llama-3.1-70B-Instruct \ --port 8000 \ - --tensor-parallel-size 2 # Multi-GPU - -# Configure Hermes -OPENAI_BASE_URL=http://localhost:8000/v1 -OPENAI_API_KEY=dummy -LLM_MODEL=meta-llama/Llama-3.1-70B-Instruct + --max-model-len 65536 \ + --tensor-parallel-size 2 \ + --enable-auto-tool-choice \ + --tool-call-parser hermes ``` -vLLM supports tool calling, structured output, and multi-modal models. Use `--enable-auto-tool-choice` and `--tool-call-parser hermes` for Hermes-format tool calling with NousResearch models. +Then configure Hermes: + +```bash +hermes model +# Select "Custom endpoint (self-hosted / VLLM / etc.)" +# Enter URL: http://localhost:8000/v1 +# Skip API key (or enter one if you configured vLLM with --api-key) +# Enter model name: meta-llama/Llama-3.1-70B-Instruct +``` + +**Context length:** vLLM reads the model's `max_position_embeddings` by default. If that exceeds your GPU memory, it errors and asks you to set `--max-model-len` lower. You can also use `--max-model-len auto` to automatically find the maximum that fits. Set `--gpu-memory-utilization 0.95` (default 0.9) to squeeze more context into VRAM. + +**Tool calling requires explicit flags:** + +| Flag | Purpose | +|------|---------| +| `--enable-auto-tool-choice` | Required for `tool_choice: "auto"` (the default in Hermes) | +| `--tool-call-parser <parser>` | Parser for the model's tool call format | + +Supported parsers: `hermes` (Qwen 2.5, Hermes 2/3), `llama3_json` (Llama 3.x), `mistral`, `deepseek_v3`, `deepseek_v31`, `xlam`, `pythonic`. Without these flags, tool calls won't work — the model will output tool calls as text. + +:::tip +vLLM supports human-readable sizes: `--max-model-len 64k` (lowercase k = 1000, uppercase K = 1024). +::: --- @@ -304,19 +370,32 @@ vLLM supports tool calling, structured output, and multi-modal models. 
Use `--en [SGLang](https://github.com/sgl-project/sglang) is an alternative to vLLM with RadixAttention for KV cache reuse. Best for: multi-turn conversations (prefix caching), constrained decoding, structured output. ```bash -# Start SGLang server pip install "sglang[all]" python -m sglang.launch_server \ --model meta-llama/Llama-3.1-70B-Instruct \ - --port 8000 \ - --tp 2 - -# Configure Hermes -OPENAI_BASE_URL=http://localhost:8000/v1 -OPENAI_API_KEY=dummy -LLM_MODEL=meta-llama/Llama-3.1-70B-Instruct + --port 30000 \ + --context-length 65536 \ + --tp 2 \ + --tool-call-parser qwen ``` +Then configure Hermes: + +```bash +hermes model +# Select "Custom endpoint (self-hosted / VLLM / etc.)" +# Enter URL: http://localhost:30000/v1 +# Enter model name: meta-llama/Llama-3.1-70B-Instruct +``` + +**Context length:** SGLang reads from the model's config by default. Use `--context-length` to override. If you need to exceed the model's declared maximum, set `SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1`. + +**Tool calling:** Use `--tool-call-parser` with the appropriate parser for your model family: `qwen` (Qwen 2.5), `llama3`, `llama4`, `deepseekv3`, `mistral`, `glm`. Without this flag, tool calls come back as plain text. + +:::caution SGLang defaults to 128 max output tokens +If responses seem truncated, add `max_tokens` to your requests or set `--default-max-tokens` on the server. SGLang's default is only 128 tokens per response if not specified in the request. 
+::: + --- ### llama.cpp / llama-server — CPU & Metal Inference @@ -327,21 +406,136 @@ LLM_MODEL=meta-llama/Llama-3.1-70B-Instruct # Build and start llama-server cmake -B build && cmake --build build --config Release ./build/bin/llama-server \ - -m models/llama-3.1-8b-instruct-Q4_K_M.gguf \ + --jinja -fa \ + -c 32768 \ + -ngl 99 \ + -m models/qwen2.5-coder-32b-instruct-Q4_K_M.gguf \ --port 8080 --host 0.0.0.0 - -# Configure Hermes -OPENAI_BASE_URL=http://localhost:8080/v1 -OPENAI_API_KEY=dummy -LLM_MODEL=llama-3.1-8b-instruct ``` +**Context length (`-c`):** Recent builds default to `0` which reads the model's training context from the GGUF metadata. For models with 128k+ training context, this can OOM trying to allocate the full KV cache. Set `-c` explicitly to what you need (32k–64k is a good range for agent use). If using parallel slots (`-np`), the total context is divided among slots — with `-c 32768 -np 4`, each slot only gets 8k. + +Then configure Hermes to point at it: + +```bash +hermes model +# Select "Custom endpoint (self-hosted / VLLM / etc.)" +# Enter URL: http://localhost:8080/v1 +# Skip API key (local servers don't need one) +# Enter model name — or leave blank to auto-detect if only one model is loaded +``` + +This saves the endpoint to `config.yaml` so it persists across sessions. + +:::caution `--jinja` is required for tool calling +Without `--jinja`, llama-server ignores the `tools` parameter entirely. The model will try to call tools by writing JSON in its response text, but Hermes won't recognize it as a tool call — you'll see raw JSON like `{"name": "web_search", ...}` printed as a message instead of an actual search. + +Native tool calling support (best performance): Llama 3.x, Qwen 2.5 (including Coder), Hermes 2/3, Mistral, DeepSeek, Functionary. All other models use a generic handler that works but may be less efficient. 
See the [llama.cpp function calling docs](https://github.com/ggml-org/llama.cpp/blob/master/docs/function-calling.md) for the full list. + +You can verify tool support is active by checking `http://localhost:8080/props` — the `chat_template` field should be present. +::: + :::tip Download GGUF models from [Hugging Face](https://huggingface.co/models?library=gguf). Q4_K_M quantization offers the best balance of quality vs. memory usage. ::: --- +### LM Studio — Desktop App with Local Models + +[LM Studio](https://lmstudio.ai/) is a desktop app for running local models with a GUI. Best for: users who prefer a visual interface, quick model testing, developers on macOS/Windows/Linux. + +Start the server from the LM Studio app (Developer tab → Start Server), or use the CLI: + +```bash +lms server start # Starts on port 1234 +lms load qwen2.5-coder --context-length 32768 +``` + +Then configure Hermes: + +```bash +hermes model +# Select "Custom endpoint (self-hosted / VLLM / etc.)" +# Enter URL: http://localhost:1234/v1 +# Skip API key (LM Studio doesn't require one) +# Enter model name +``` + +:::caution Context length often defaults to 2048 +LM Studio reads context length from the model's metadata, but many GGUF models report low defaults (2048 or 4096). **Always set context length explicitly** in the LM Studio model settings: + +1. Click the gear icon next to the model picker +2. Set "Context Length" to at least 16384 (preferably 32768) +3. Reload the model for the change to take effect + +Alternatively, use the CLI: `lms load model-name --context-length 32768` + +To set persistent per-model defaults: My Models tab → gear icon on the model → set context size. +::: + +**Tool calling:** Supported since LM Studio 0.3.6. Models with native tool-calling training (Qwen 2.5, Llama 3.x, Mistral, Hermes) are auto-detected and shown with a tool badge. Other models use a generic fallback that may be less reliable. 
+ +--- + +### Troubleshooting Local Models + +These issues affect **all** local inference servers when used with Hermes. + +#### Tool calls appear as text instead of executing + +The model outputs something like `{"name": "web_search", "arguments": {...}}` as a message instead of actually calling the tool. + +**Cause:** Your server doesn't have tool calling enabled, or the model doesn't support it through the server's tool calling implementation. + +| Server | Fix | +|--------|-----| +| **llama.cpp** | Add `--jinja` to the startup command | +| **vLLM** | Add `--enable-auto-tool-choice --tool-call-parser hermes` | +| **SGLang** | Add `--tool-call-parser qwen` (or appropriate parser) | +| **Ollama** | Tool calling is enabled by default — make sure your model supports it (check with `ollama show model-name`) | +| **LM Studio** | Update to 0.3.6+ and use a model with native tool support | + +#### Model seems to forget context or give incoherent responses + +**Cause:** Context window is too small. When the conversation exceeds the context limit, most servers silently drop older messages. Hermes's system prompt + tool schemas alone can use 4k–8k tokens. + +**Diagnosis:** + +```bash +# Check what Hermes thinks the context is +# Look at startup line: "Context limit: X tokens" + +# Check your server's actual context +# Ollama: ollama ps (CONTEXT column) +# llama.cpp: curl http://localhost:8080/props | jq '.default_generation_settings.n_ctx' +# vLLM: check --max-model-len in startup args +``` + +**Fix:** Set context to at least **32,768 tokens** for agent use. See each server's section above for the specific flag. + +#### "Context limit: 2048 tokens" at startup + +Hermes auto-detects context length from your server's `/v1/models` endpoint. If the server reports a low value (or doesn't report one at all), Hermes uses the model's declared limit which may be wrong. 
+ +**Fix:** Set it explicitly in `config.yaml`: + +```yaml +model: + default: your-model + provider: custom + base_url: http://localhost:11434/v1 + context_length: 32768 +``` + +#### Responses get cut off mid-sentence + +**Possible causes:** +1. **Low `max_tokens` on the server** — SGLang defaults to 128 tokens per response. Set `--default-max-tokens` on the server or configure Hermes with `model.max_tokens` in config.yaml. +2. **Context exhaustion** — The model filled its context window. Increase context length or enable [context compression](/docs/user-guide/configuration#context-compression) in Hermes. + +--- + ### LiteLLM Proxy — Multi-Provider Gateway [LiteLLM](https://docs.litellm.ai/) is an OpenAI-compatible proxy that unifies 100+ LLM providers behind a single API. Best for: switching between providers without config changes, load balancing, fallback chains, budget controls. @@ -353,13 +547,10 @@ litellm --model anthropic/claude-sonnet-4 --port 4000 # Or with a config file for multiple models: litellm --config litellm_config.yaml --port 4000 - -# Configure Hermes -OPENAI_BASE_URL=http://localhost:4000/v1 -OPENAI_API_KEY=sk-your-litellm-key -LLM_MODEL=anthropic/claude-sonnet-4 ``` +Then configure Hermes with `hermes model` → Custom endpoint → `http://localhost:4000/v1`. + Example `litellm_config.yaml` with fallback: ```yaml model_list: @@ -384,13 +575,10 @@ router_settings: ```bash # Install and start npx @blockrun/clawrouter # Starts on port 8402 - -# Configure Hermes -OPENAI_BASE_URL=http://localhost:8402/v1 -OPENAI_API_KEY=dummy -LLM_MODEL=blockrun/auto # or: blockrun/eco, blockrun/premium, blockrun/agentic ``` +Then configure Hermes with `hermes model` → Custom endpoint → `http://localhost:8402/v1` → model name `blockrun/auto`. + Routing profiles: | Profile | Strategy | Savings | |---------|----------|---------| @@ -423,11 +611,14 @@ Any service with an OpenAI-compatible API works. 
Some popular options: | [LocalAI](https://localai.io) | `http://localhost:8080/v1` | Self-hosted, multi-model | | [Jan](https://jan.ai) | `http://localhost:1337/v1` | Desktop app with local models | -```bash -# Example: Together AI -OPENAI_BASE_URL=https://api.together.xyz/v1 -OPENAI_API_KEY=your-together-key -LLM_MODEL=meta-llama/Llama-3.1-70B-Instruct-Turbo +Configure any of these with `hermes model` → Custom endpoint, or in `config.yaml`: + +```yaml +model: + default: meta-llama/Llama-3.1-70B-Instruct-Turbo + provider: custom + base_url: https://api.together.xyz/v1 + api_key: your-together-key ``` ---