docs+feat: comprehensive local LLM provider guides and context length warning (#4294)
* docs: update llama.cpp section with --jinja flag and tool calling guide The llama.cpp docs were missing the --jinja flag which is required for tool calling to work. Without it, models output tool calls as raw JSON text instead of structured API responses, making Hermes unable to execute them. Changes: - Add --jinja and -fa flags to the server startup example - Replace deprecated env vars (OPENAI_BASE_URL, LLM_MODEL) with hermes model interactive setup - Add caution block explaining the --jinja requirement and symptoms - List models with native tool calling support - Add /props endpoint verification tip * docs+feat: comprehensive local LLM provider guides and context length warning Docs (providers.md): - Rewrote Ollama section with context length warning (defaults to 4k on <24GB VRAM), three methods to increase it, and verification steps - Rewrote vLLM section with --max-model-len, tool calling flags (--enable-auto-tool-choice, --tool-call-parser), and context guidance - Rewrote SGLang section with --context-length, --tool-call-parser, and warning about 128-token default max output - Added LM Studio section (port 1234, context length defaults to 2048, tool calling since 0.3.6) - Added llama.cpp context length flag (-c) and GPU offload (-ngl) - Added Troubleshooting Local Models section covering: - Tool calls appearing as text (with per-server fix table) - Silent context truncation and diagnosis commands - Low detected context at startup - Truncated responses - Replaced all deprecated env vars (OPENAI_BASE_URL, LLM_MODEL) with hermes model interactive setup and config.yaml examples - Added deprecation warning for legacy env vars in General Setup Code (cli.py): - Added context length warning in show_banner() when detected context is <= 8192 tokens, with server-specific fix hints: - Ollama (port 11434): suggests OLLAMA_CONTEXT_LENGTH env var - LM Studio (port 1234): suggests model settings adjustment - Other servers: suggests config.yaml override Tests: - 9 new tests 
covering warning thresholds, server-specific hints, and no-warning cases
This commit is contained in:
26
cli.py
26
cli.py
@@ -2192,7 +2192,31 @@ class HermesCLI:
|
||||
|
||||
# Show tool availability warnings if any tools are disabled
|
||||
self._show_tool_availability_warnings()
|
||||
|
||||
|
||||
# Warn about very low context lengths (common with local servers)
|
||||
if ctx_len and ctx_len <= 8192:
|
||||
self.console.print()
|
||||
self.console.print(
|
||||
f"[yellow]⚠️ Context length is only {ctx_len:,} tokens — "
|
||||
f"this is likely too low for agent use with tools.[/]"
|
||||
)
|
||||
self.console.print(
|
||||
"[dim] Hermes needs 16k–32k minimum. Tool schemas + system prompt alone use ~4k–8k.[/]"
|
||||
)
|
||||
base_url = getattr(self, "base_url", "") or ""
|
||||
if "11434" in base_url or "ollama" in base_url.lower():
|
||||
self.console.print(
|
||||
"[dim] Ollama fix: OLLAMA_CONTEXT_LENGTH=32768 ollama serve[/]"
|
||||
)
|
||||
elif "1234" in base_url:
|
||||
self.console.print(
|
||||
"[dim] LM Studio fix: Set context length in model settings → reload model[/]"
|
||||
)
|
||||
else:
|
||||
self.console.print(
|
||||
"[dim] Fix: Set model.context_length in config.yaml, or increase your server's context setting[/]"
|
||||
)
|
||||
|
||||
self.console.print()
|
||||
|
||||
def _preload_resumed_session(self) -> bool:
|
||||
|
||||
147
tests/test_cli_context_warning.py
Normal file
147
tests/test_cli_context_warning.py
Normal file
@@ -0,0 +1,147 @@
|
||||
"""Tests for the low context length warning in the CLI banner."""
|
||||
|
||||
import os
|
||||
from types import SimpleNamespace
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture
def _isolate(tmp_path, monkeypatch):
    """Point HERMES_HOME at a throwaway directory so tests never touch real config."""
    hermes_home = tmp_path / ".hermes"
    hermes_home.mkdir()
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))
|
||||
|
||||
|
||||
@pytest.fixture
def cli_obj(_isolate):
    """Build a bare HermesCLI suitable for banner tests, bypassing real __init__."""
    cli_config = {
        "display": {"tool_progress": "new"},
        "terminal": {},
    }
    with patch("cli.load_cli_config", return_value=cli_config), \
            patch("cli.get_tool_definitions", return_value=[]), \
            patch("cli.build_welcome_banner"):
        from cli import HermesCLI

        cli = HermesCLI.__new__(HermesCLI)
        cli.model = "test-model"
        cli.enabled_toolsets = ["hermes-core"]
        cli.compact = False
        cli.console = MagicMock()
        cli.session_id = None
        cli.api_key = "test"
        cli.base_url = ""
        # Stand-in agent exposing only the context compressor the banner reads.
        cli.agent = SimpleNamespace(
            context_compressor=SimpleNamespace(context_length=None)
        )
        return cli
|
||||
|
||||
|
||||
class TestLowContextWarning:
    """Tests that the CLI warns about low context lengths.

    The warning fires for any detected context length <= 8192 tokens and
    appends a server-specific fix hint based on the configured base URL
    (Ollama on port 11434, LM Studio on port 1234, generic otherwise).
    """

    def _banner_print_calls(self, cli_obj):
        """Render the banner with external deps patched out; return console.print calls as strings."""
        with patch("cli.get_tool_definitions", return_value=[]), \
                patch("cli.build_welcome_banner"):
            cli_obj.show_banner()
        return [str(c) for c in cli_obj.console.print.call_args_list]

    def _warning_calls(self, cli_obj):
        """Return only the low-context warning lines printed during the banner."""
        return [c for c in self._banner_print_calls(cli_obj) if "too low" in c]

    def test_no_warning_for_normal_context(self, cli_obj):
        """No warning when context is 32k+."""
        cli_obj.agent.context_compressor.context_length = 32768
        assert len(self._warning_calls(cli_obj)) == 0

    def test_warning_for_low_context(self, cli_obj):
        """Warning shown when context is 4096 (Ollama default)."""
        cli_obj.agent.context_compressor.context_length = 4096
        warning_calls = self._warning_calls(cli_obj)
        assert len(warning_calls) == 1
        # The message formats the value with a thousands separator.
        assert "4,096" in warning_calls[0]

    def test_warning_for_2048_context(self, cli_obj):
        """Warning shown for 2048 tokens (common LM Studio default)."""
        cli_obj.agent.context_compressor.context_length = 2048
        assert len(self._warning_calls(cli_obj)) == 1

    def test_no_warning_at_boundary(self, cli_obj):
        """Warning still fires at exactly 8192 — the threshold is inclusive (ctx_len <= 8192)."""
        cli_obj.agent.context_compressor.context_length = 8192
        assert len(self._warning_calls(cli_obj)) == 1

    def test_no_warning_above_boundary(self, cli_obj):
        """No warning at 16384."""
        cli_obj.agent.context_compressor.context_length = 16384
        assert len(self._warning_calls(cli_obj)) == 0

    def test_ollama_specific_hint(self, cli_obj):
        """Ollama-specific fix shown when port 11434 detected."""
        cli_obj.agent.context_compressor.context_length = 4096
        cli_obj.base_url = "http://localhost:11434/v1"
        calls = self._banner_print_calls(cli_obj)
        ollama_hints = [c for c in calls if "OLLAMA_CONTEXT_LENGTH" in c]
        assert len(ollama_hints) == 1

    def test_lm_studio_specific_hint(self, cli_obj):
        """LM Studio-specific fix shown when port 1234 detected."""
        cli_obj.agent.context_compressor.context_length = 2048
        cli_obj.base_url = "http://localhost:1234/v1"
        calls = self._banner_print_calls(cli_obj)
        lms_hints = [c for c in calls if "LM Studio" in c]
        assert len(lms_hints) == 1

    def test_generic_hint_for_other_servers(self, cli_obj):
        """Generic fix shown for unknown servers."""
        cli_obj.agent.context_compressor.context_length = 4096
        cli_obj.base_url = "http://localhost:8080/v1"
        calls = self._banner_print_calls(cli_obj)
        generic_hints = [c for c in calls if "config.yaml" in c]
        assert len(generic_hints) == 1

    def test_no_warning_when_no_context_length(self, cli_obj):
        """No warning when context length is not yet known."""
        cli_obj.agent.context_compressor.context_length = None
        assert len(self._warning_calls(cli_obj)) == 0
|
||||
@@ -218,15 +218,11 @@ model:
|
||||
api_key: your-key-or-leave-empty-for-local
|
||||
```
|
||||
|
||||
**Environment variables (`.env` file):**
|
||||
```bash
|
||||
# Add to ~/.hermes/.env
|
||||
OPENAI_BASE_URL=http://localhost:8000/v1
|
||||
OPENAI_API_KEY=your-key # Any non-empty string for local servers
|
||||
LLM_MODEL=your-model-name
|
||||
```
|
||||
:::warning Legacy env vars
|
||||
`OPENAI_BASE_URL` and `LLM_MODEL` in `.env` are **deprecated**. The CLI ignores `LLM_MODEL` entirely (only the gateway reads it). Use `hermes model` or edit `config.yaml` directly — both persist correctly across restarts and Docker containers.
|
||||
:::
|
||||
|
||||
All three approaches end up in the same runtime path. `hermes model` persists provider, model, and base URL to `config.yaml` so later sessions keep using that endpoint even if env vars are not set.
|
||||
Both approaches persist to `config.yaml`, which is the source of truth for model, provider, and base URL.
|
||||
|
||||
### Switching Models with `/model`
|
||||
|
||||
@@ -257,23 +253,73 @@ Everything below follows this same pattern — just change the URL, key, and mod
|
||||
|
||||
### Ollama — Local Models, Zero Config
|
||||
|
||||
[Ollama](https://ollama.com/) runs open-weight models locally with one command. Best for: quick local experimentation, privacy-sensitive work, offline use.
|
||||
[Ollama](https://ollama.com/) runs open-weight models locally with one command. Best for: quick local experimentation, privacy-sensitive work, offline use. Supports tool calling via the OpenAI-compatible API.
|
||||
|
||||
```bash
|
||||
# Install and run a model
|
||||
ollama pull llama3.1:70b
|
||||
ollama pull qwen2.5-coder:32b
|
||||
ollama serve # Starts on port 11434
|
||||
|
||||
# Configure Hermes
|
||||
OPENAI_BASE_URL=http://localhost:11434/v1
|
||||
OPENAI_API_KEY=ollama # Any non-empty string
|
||||
LLM_MODEL=llama3.1:70b
|
||||
```
|
||||
|
||||
Ollama's OpenAI-compatible endpoint supports chat completions, streaming, and tool calling (for supported models). No GPU required for smaller models — Ollama handles CPU inference automatically.
|
||||
Then configure Hermes:
|
||||
|
||||
```bash
|
||||
hermes model
|
||||
# Select "Custom endpoint (self-hosted / VLLM / etc.)"
|
||||
# Enter URL: http://localhost:11434/v1
|
||||
# Skip API key (Ollama doesn't need one)
|
||||
# Enter model name (e.g. qwen2.5-coder:32b)
|
||||
```
|
||||
|
||||
Or configure `config.yaml` directly:
|
||||
|
||||
```yaml
|
||||
model:
|
||||
default: qwen2.5-coder:32b
|
||||
provider: custom
|
||||
base_url: http://localhost:11434/v1
|
||||
context_length: 32768 # See warning below
|
||||
```
|
||||
|
||||
:::caution Ollama defaults to very low context lengths
|
||||
Ollama does **not** use your model's full context window by default. Depending on your VRAM, the default is:
|
||||
|
||||
| Available VRAM | Default context |
|
||||
|----------------|----------------|
|
||||
| Less than 24 GB | **4,096 tokens** |
|
||||
| 24–48 GB | 32,768 tokens |
|
||||
| 48+ GB | 256,000 tokens |
|
||||
|
||||
For agent use with tools, **you need at least 16k–32k context**. At 4k, the system prompt + tool schemas alone can fill the window, leaving no room for conversation.
|
||||
|
||||
**How to increase it** (pick one):
|
||||
|
||||
```bash
|
||||
# Option 1: Set server-wide via environment variable (recommended)
|
||||
OLLAMA_CONTEXT_LENGTH=32768 ollama serve
|
||||
|
||||
# Option 2: For systemd-managed Ollama
|
||||
sudo systemctl edit ollama.service
|
||||
# Add: Environment="OLLAMA_CONTEXT_LENGTH=32768"
|
||||
# Then: sudo systemctl daemon-reload && sudo systemctl restart ollama
|
||||
|
||||
# Option 3: Bake it into a custom model (persistent per-model)
|
||||
echo -e "FROM qwen2.5-coder:32b\nPARAMETER num_ctx 32768" > Modelfile
|
||||
ollama create qwen2.5-coder-32k -f Modelfile
|
||||
```
|
||||
|
||||
**You cannot set context length through the OpenAI-compatible API** (`/v1/chat/completions`). It must be configured server-side or via a Modelfile. This is the #1 source of confusion when integrating Ollama with tools like Hermes.
|
||||
:::
|
||||
|
||||
**Verify your context is set correctly:**
|
||||
|
||||
```bash
|
||||
ollama ps
|
||||
# Look at the CONTEXT column — it should show your configured value
|
||||
```
|
||||
|
||||
:::tip
|
||||
List available models with `ollama list`. Pull any model from the [Ollama library](https://ollama.com/library) with `ollama pull <model>`.
|
||||
List available models with `ollama list`. Pull any model from the [Ollama library](https://ollama.com/library) with `ollama pull <model>`. Ollama handles GPU offloading automatically — no configuration needed for most setups.
|
||||
:::
|
||||
|
||||
---
|
||||
@@ -283,19 +329,39 @@ List available models with `ollama list`. Pull any model from the [Ollama librar
|
||||
[vLLM](https://docs.vllm.ai/) is the standard for production LLM serving. Best for: maximum throughput on GPU hardware, serving large models, continuous batching.
|
||||
|
||||
```bash
|
||||
# Start vLLM server
|
||||
pip install vllm
|
||||
vllm serve meta-llama/Llama-3.1-70B-Instruct \
|
||||
--port 8000 \
|
||||
--tensor-parallel-size 2 # Multi-GPU
|
||||
|
||||
# Configure Hermes
|
||||
OPENAI_BASE_URL=http://localhost:8000/v1
|
||||
OPENAI_API_KEY=dummy
|
||||
LLM_MODEL=meta-llama/Llama-3.1-70B-Instruct
|
||||
--max-model-len 65536 \
|
||||
--tensor-parallel-size 2 \
|
||||
--enable-auto-tool-choice \
|
||||
--tool-call-parser hermes
|
||||
```
|
||||
|
||||
vLLM supports tool calling, structured output, and multi-modal models. Use `--enable-auto-tool-choice` and `--tool-call-parser hermes` for Hermes-format tool calling with NousResearch models.
|
||||
Then configure Hermes:
|
||||
|
||||
```bash
|
||||
hermes model
|
||||
# Select "Custom endpoint (self-hosted / VLLM / etc.)"
|
||||
# Enter URL: http://localhost:8000/v1
|
||||
# Skip API key (or enter one if you configured vLLM with --api-key)
|
||||
# Enter model name: meta-llama/Llama-3.1-70B-Instruct
|
||||
```
|
||||
|
||||
**Context length:** vLLM reads the model's `max_position_embeddings` by default. If that exceeds your GPU memory, it errors and asks you to set `--max-model-len` lower. You can also use `--max-model-len auto` to automatically find the maximum that fits. Set `--gpu-memory-utilization 0.95` (default 0.9) to squeeze more context into VRAM.
|
||||
|
||||
**Tool calling requires explicit flags:**
|
||||
|
||||
| Flag | Purpose |
|
||||
|------|---------|
|
||||
| `--enable-auto-tool-choice` | Required for `tool_choice: "auto"` (the default in Hermes) |
|
||||
| `--tool-call-parser <name>` | Parser for the model's tool call format |
|
||||
|
||||
Supported parsers: `hermes` (Qwen 2.5, Hermes 2/3), `llama3_json` (Llama 3.x), `mistral`, `deepseek_v3`, `deepseek_v31`, `xlam`, `pythonic`. Without these flags, tool calls won't work — the model will output tool calls as text.
|
||||
|
||||
:::tip
|
||||
vLLM supports human-readable sizes: `--max-model-len 64k` (lowercase k = 1000, uppercase K = 1024).
|
||||
:::
|
||||
|
||||
---
|
||||
|
||||
@@ -304,19 +370,32 @@ vLLM supports tool calling, structured output, and multi-modal models. Use `--en
|
||||
[SGLang](https://github.com/sgl-project/sglang) is an alternative to vLLM with RadixAttention for KV cache reuse. Best for: multi-turn conversations (prefix caching), constrained decoding, structured output.
|
||||
|
||||
```bash
|
||||
# Start SGLang server
|
||||
pip install "sglang[all]"
|
||||
python -m sglang.launch_server \
|
||||
--model meta-llama/Llama-3.1-70B-Instruct \
|
||||
--port 8000 \
|
||||
--tp 2
|
||||
|
||||
# Configure Hermes
|
||||
OPENAI_BASE_URL=http://localhost:8000/v1
|
||||
OPENAI_API_KEY=dummy
|
||||
LLM_MODEL=meta-llama/Llama-3.1-70B-Instruct
|
||||
--port 30000 \
|
||||
--context-length 65536 \
|
||||
--tp 2 \
|
||||
--tool-call-parser qwen
|
||||
```
|
||||
|
||||
Then configure Hermes:
|
||||
|
||||
```bash
|
||||
hermes model
|
||||
# Select "Custom endpoint (self-hosted / VLLM / etc.)"
|
||||
# Enter URL: http://localhost:30000/v1
|
||||
# Enter model name: meta-llama/Llama-3.1-70B-Instruct
|
||||
```
|
||||
|
||||
**Context length:** SGLang reads from the model's config by default. Use `--context-length` to override. If you need to exceed the model's declared maximum, set `SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1`.
|
||||
|
||||
**Tool calling:** Use `--tool-call-parser` with the appropriate parser for your model family: `qwen` (Qwen 2.5), `llama3`, `llama4`, `deepseekv3`, `mistral`, `glm`. Without this flag, tool calls come back as plain text.
|
||||
|
||||
:::caution SGLang defaults to 128 max output tokens
|
||||
If responses seem truncated, add `max_tokens` to your requests or set `--default-max-tokens` on the server. SGLang's default is only 128 tokens per response if not specified in the request.
|
||||
:::
|
||||
|
||||
---
|
||||
|
||||
### llama.cpp / llama-server — CPU & Metal Inference
|
||||
@@ -327,21 +406,136 @@ LLM_MODEL=meta-llama/Llama-3.1-70B-Instruct
|
||||
# Build and start llama-server
|
||||
cmake -B build && cmake --build build --config Release
|
||||
./build/bin/llama-server \
|
||||
-m models/llama-3.1-8b-instruct-Q4_K_M.gguf \
|
||||
--jinja -fa \
|
||||
-c 32768 \
|
||||
-ngl 99 \
|
||||
-m models/qwen2.5-coder-32b-instruct-Q4_K_M.gguf \
|
||||
--port 8080 --host 0.0.0.0
|
||||
|
||||
# Configure Hermes
|
||||
OPENAI_BASE_URL=http://localhost:8080/v1
|
||||
OPENAI_API_KEY=dummy
|
||||
LLM_MODEL=llama-3.1-8b-instruct
|
||||
```
|
||||
|
||||
**Context length (`-c`):** Recent builds default to `0` which reads the model's training context from the GGUF metadata. For models with 128k+ training context, this can OOM trying to allocate the full KV cache. Set `-c` explicitly to what you need (32k–64k is a good range for agent use). If using parallel slots (`-np`), the total context is divided among slots — with `-c 32768 -np 4`, each slot only gets 8k.
|
||||
|
||||
Then configure Hermes to point at it:
|
||||
|
||||
```bash
|
||||
hermes model
|
||||
# Select "Custom endpoint (self-hosted / VLLM / etc.)"
|
||||
# Enter URL: http://localhost:8080/v1
|
||||
# Skip API key (local servers don't need one)
|
||||
# Enter model name — or leave blank to auto-detect if only one model is loaded
|
||||
```
|
||||
|
||||
This saves the endpoint to `config.yaml` so it persists across sessions.
|
||||
|
||||
:::caution `--jinja` is required for tool calling
|
||||
Without `--jinja`, llama-server ignores the `tools` parameter entirely. The model will try to call tools by writing JSON in its response text, but Hermes won't recognize it as a tool call — you'll see raw JSON like `{"name": "web_search", ...}` printed as a message instead of an actual search.
|
||||
|
||||
Native tool calling support (best performance): Llama 3.x, Qwen 2.5 (including Coder), Hermes 2/3, Mistral, DeepSeek, Functionary. All other models use a generic handler that works but may be less efficient. See the [llama.cpp function calling docs](https://github.com/ggml-org/llama.cpp/blob/master/docs/function-calling.md) for the full list.
|
||||
|
||||
You can verify tool support is active by checking `http://localhost:8080/props` — the `chat_template` field should be present.
|
||||
:::
|
||||
|
||||
:::tip
|
||||
Download GGUF models from [Hugging Face](https://huggingface.co/models?library=gguf). Q4_K_M quantization offers the best balance of quality vs. memory usage.
|
||||
:::
|
||||
|
||||
---
|
||||
|
||||
### LM Studio — Desktop App with Local Models
|
||||
|
||||
[LM Studio](https://lmstudio.ai/) is a desktop app for running local models with a GUI. Best for: users who prefer a visual interface, quick model testing, developers on macOS/Windows/Linux.
|
||||
|
||||
Start the server from the LM Studio app (Developer tab → Start Server), or use the CLI:
|
||||
|
||||
```bash
|
||||
lms server start # Starts on port 1234
|
||||
lms load qwen2.5-coder --context-length 32768
|
||||
```
|
||||
|
||||
Then configure Hermes:
|
||||
|
||||
```bash
|
||||
hermes model
|
||||
# Select "Custom endpoint (self-hosted / VLLM / etc.)"
|
||||
# Enter URL: http://localhost:1234/v1
|
||||
# Skip API key (LM Studio doesn't require one)
|
||||
# Enter model name
|
||||
```
|
||||
|
||||
:::caution Context length often defaults to 2048
|
||||
LM Studio reads context length from the model's metadata, but many GGUF models report low defaults (2048 or 4096). **Always set context length explicitly** in the LM Studio model settings:
|
||||
|
||||
1. Click the gear icon next to the model picker
|
||||
2. Set "Context Length" to at least 16384 (preferably 32768)
|
||||
3. Reload the model for the change to take effect
|
||||
|
||||
Alternatively, use the CLI: `lms load model-name --context-length 32768`
|
||||
|
||||
To set persistent per-model defaults: My Models tab → gear icon on the model → set context size.
|
||||
:::
|
||||
|
||||
**Tool calling:** Supported since LM Studio 0.3.6. Models with native tool-calling training (Qwen 2.5, Llama 3.x, Mistral, Hermes) are auto-detected and shown with a tool badge. Other models use a generic fallback that may be less reliable.
|
||||
|
||||
---
|
||||
|
||||
### Troubleshooting Local Models
|
||||
|
||||
These issues affect **all** local inference servers when used with Hermes.
|
||||
|
||||
#### Tool calls appear as text instead of executing
|
||||
|
||||
The model outputs something like `{"name": "web_search", "arguments": {...}}` as a message instead of actually calling the tool.
|
||||
|
||||
**Cause:** Your server doesn't have tool calling enabled, or the model doesn't support it through the server's tool calling implementation.
|
||||
|
||||
| Server | Fix |
|
||||
|--------|-----|
|
||||
| **llama.cpp** | Add `--jinja` to the startup command |
|
||||
| **vLLM** | Add `--enable-auto-tool-choice --tool-call-parser hermes` |
|
||||
| **SGLang** | Add `--tool-call-parser qwen` (or appropriate parser) |
|
||||
| **Ollama** | Tool calling is enabled by default — make sure your model supports it (check with `ollama show model-name`) |
|
||||
| **LM Studio** | Update to 0.3.6+ and use a model with native tool support |
|
||||
|
||||
#### Model seems to forget context or give incoherent responses
|
||||
|
||||
**Cause:** Context window is too small. When the conversation exceeds the context limit, most servers silently drop older messages. Hermes's system prompt + tool schemas alone can use 4k–8k tokens.
|
||||
|
||||
**Diagnosis:**
|
||||
|
||||
```bash
|
||||
# Check what Hermes thinks the context is
|
||||
# Look at startup line: "Context limit: X tokens"
|
||||
|
||||
# Check your server's actual context
|
||||
# Ollama: ollama ps (CONTEXT column)
|
||||
# llama.cpp: curl http://localhost:8080/props | jq '.default_generation_settings.n_ctx'
|
||||
# vLLM: check --max-model-len in startup args
|
||||
```
|
||||
|
||||
**Fix:** Set context to at least **32,768 tokens** for agent use. See each server's section above for the specific flag.
|
||||
|
||||
#### "Context limit: 2048 tokens" at startup
|
||||
|
||||
Hermes auto-detects context length from your server's `/v1/models` endpoint. If the server reports a low value (or doesn't report one at all), Hermes uses the model's declared limit which may be wrong.
|
||||
|
||||
**Fix:** Set it explicitly in `config.yaml`:
|
||||
|
||||
```yaml
|
||||
model:
|
||||
default: your-model
|
||||
provider: custom
|
||||
base_url: http://localhost:11434/v1
|
||||
context_length: 32768
|
||||
```
|
||||
|
||||
#### Responses get cut off mid-sentence
|
||||
|
||||
**Possible causes:**
|
||||
1. **Low `max_tokens` on the server** — SGLang defaults to 128 tokens per response. Set `--default-max-tokens` on the server or configure Hermes with `model.max_tokens` in config.yaml.
|
||||
2. **Context exhaustion** — The model filled its context window. Increase context length or enable [context compression](/docs/user-guide/configuration#context-compression) in Hermes.
|
||||
|
||||
---
|
||||
|
||||
### LiteLLM Proxy — Multi-Provider Gateway
|
||||
|
||||
[LiteLLM](https://docs.litellm.ai/) is an OpenAI-compatible proxy that unifies 100+ LLM providers behind a single API. Best for: switching between providers without config changes, load balancing, fallback chains, budget controls.
|
||||
@@ -353,13 +547,10 @@ litellm --model anthropic/claude-sonnet-4 --port 4000
|
||||
|
||||
# Or with a config file for multiple models:
|
||||
litellm --config litellm_config.yaml --port 4000
|
||||
|
||||
# Configure Hermes
|
||||
OPENAI_BASE_URL=http://localhost:4000/v1
|
||||
OPENAI_API_KEY=sk-your-litellm-key
|
||||
LLM_MODEL=anthropic/claude-sonnet-4
|
||||
```
|
||||
|
||||
Then configure Hermes with `hermes model` → Custom endpoint → `http://localhost:4000/v1`.
|
||||
|
||||
Example `litellm_config.yaml` with fallback:
|
||||
```yaml
|
||||
model_list:
|
||||
@@ -384,13 +575,10 @@ router_settings:
|
||||
```bash
|
||||
# Install and start
|
||||
npx @blockrun/clawrouter # Starts on port 8402
|
||||
|
||||
# Configure Hermes
|
||||
OPENAI_BASE_URL=http://localhost:8402/v1
|
||||
OPENAI_API_KEY=dummy
|
||||
LLM_MODEL=blockrun/auto # or: blockrun/eco, blockrun/premium, blockrun/agentic
|
||||
```
|
||||
|
||||
Then configure Hermes with `hermes model` → Custom endpoint → `http://localhost:8402/v1` → model name `blockrun/auto`.
|
||||
|
||||
Routing profiles:
|
||||
| Profile | Strategy | Savings |
|
||||
|---------|----------|---------|
|
||||
@@ -423,11 +611,14 @@ Any service with an OpenAI-compatible API works. Some popular options:
|
||||
| [LocalAI](https://localai.io) | `http://localhost:8080/v1` | Self-hosted, multi-model |
|
||||
| [Jan](https://jan.ai) | `http://localhost:1337/v1` | Desktop app with local models |
|
||||
|
||||
```bash
|
||||
# Example: Together AI
|
||||
OPENAI_BASE_URL=https://api.together.xyz/v1
|
||||
OPENAI_API_KEY=your-together-key
|
||||
LLM_MODEL=meta-llama/Llama-3.1-70B-Instruct-Turbo
|
||||
Configure any of these with `hermes model` → Custom endpoint, or in `config.yaml`:
|
||||
|
||||
```yaml
|
||||
model:
|
||||
default: meta-llama/Llama-3.1-70B-Instruct-Turbo
|
||||
provider: custom
|
||||
base_url: https://api.together.xyz/v1
|
||||
api_key: your-together-key
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
Reference in New Issue
Block a user