diff --git a/cli.py b/cli.py index 978b36091..e5f88e752 100644 --- a/cli.py +++ b/cli.py @@ -2192,7 +2192,31 @@ class HermesCLI: # Show tool availability warnings if any tools are disabled self._show_tool_availability_warnings() - + + # Warn about very low context lengths (common with local servers) + if ctx_len and ctx_len <= 8192: + self.console.print() + self.console.print( + f"[yellow]⚠️ Context length is only {ctx_len:,} tokens — " + f"this is likely too low for agent use with tools.[/]" + ) + self.console.print( + "[dim] Hermes needs 16k–32k minimum. Tool schemas + system prompt alone use ~4k–8k.[/]" + ) + base_url = getattr(self, "base_url", "") or "" + if "11434" in base_url or "ollama" in base_url.lower(): + self.console.print( + "[dim] Ollama fix: OLLAMA_CONTEXT_LENGTH=32768 ollama serve[/]" + ) + elif "1234" in base_url: + self.console.print( + "[dim] LM Studio fix: Set context length in model settings → reload model[/]" + ) + else: + self.console.print( + "[dim] Fix: Set model.context_length in config.yaml, or increase your server's context setting[/]" + ) + self.console.print() def _preload_resumed_session(self) -> bool: diff --git a/tests/test_cli_context_warning.py b/tests/test_cli_context_warning.py new file mode 100644 index 000000000..fa0305a27 --- /dev/null +++ b/tests/test_cli_context_warning.py @@ -0,0 +1,147 @@ +"""Tests for the low context length warning in the CLI banner.""" + +import os +from types import SimpleNamespace +from unittest.mock import MagicMock, patch + +import pytest + + +@pytest.fixture +def _isolate(tmp_path, monkeypatch): + """Isolate HERMES_HOME so tests don't touch real config.""" + home = tmp_path / ".hermes" + home.mkdir() + monkeypatch.setenv("HERMES_HOME", str(home)) + + +@pytest.fixture +def cli_obj(_isolate): + """Create a minimal HermesCLI instance for banner testing.""" + with patch("cli.load_cli_config", return_value={ + "display": {"tool_progress": "new"}, + "terminal": {}, + }), 
patch("cli.get_tool_definitions", return_value=[]), \ + patch("cli.build_welcome_banner"): + from cli import HermesCLI + obj = HermesCLI.__new__(HermesCLI) + obj.model = "test-model" + obj.enabled_toolsets = ["hermes-core"] + obj.compact = False + obj.console = MagicMock() + obj.session_id = None + obj.api_key = "test" + obj.base_url = "" + # Mock agent with context compressor + obj.agent = SimpleNamespace( + context_compressor=SimpleNamespace(context_length=None) + ) + return obj + + +class TestLowContextWarning: + """Tests that the CLI warns about low context lengths.""" + + def test_no_warning_for_normal_context(self, cli_obj): + """No warning when context is 32k+.""" + cli_obj.agent.context_compressor.context_length = 32768 + with patch("cli.get_tool_definitions", return_value=[]), \ + patch("cli.build_welcome_banner"): + cli_obj.show_banner() + + # Check that no yellow warning was printed + calls = [str(c) for c in cli_obj.console.print.call_args_list] + warning_calls = [c for c in calls if "too low" in c] + assert len(warning_calls) == 0 + + def test_warning_for_low_context(self, cli_obj): + """Warning shown when context is 4096 (Ollama default).""" + cli_obj.agent.context_compressor.context_length = 4096 + with patch("cli.get_tool_definitions", return_value=[]), \ + patch("cli.build_welcome_banner"): + cli_obj.show_banner() + + calls = [str(c) for c in cli_obj.console.print.call_args_list] + warning_calls = [c for c in calls if "too low" in c] + assert len(warning_calls) == 1 + assert "4,096" in warning_calls[0] + + def test_warning_for_2048_context(self, cli_obj): + """Warning shown for 2048 tokens (common LM Studio default).""" + cli_obj.agent.context_compressor.context_length = 2048 + with patch("cli.get_tool_definitions", return_value=[]), \ + patch("cli.build_welcome_banner"): + cli_obj.show_banner() + + calls = [str(c) for c in cli_obj.console.print.call_args_list] + warning_calls = [c for c in calls if "too low" in c] + assert len(warning_calls) == 1 
+ + def test_warning_at_boundary(self, cli_obj): + """Warning shown at exactly 8192 — the low-context threshold is inclusive (<= 8192).""" + cli_obj.agent.context_compressor.context_length = 8192 + with patch("cli.get_tool_definitions", return_value=[]), \ + patch("cli.build_welcome_banner"): + cli_obj.show_banner() + + calls = [str(c) for c in cli_obj.console.print.call_args_list] + warning_calls = [c for c in calls if "too low" in c] + assert len(warning_calls) == 1 # 8192 is still warned about + + def test_no_warning_above_boundary(self, cli_obj): + """No warning at 16384.""" + cli_obj.agent.context_compressor.context_length = 16384 + with patch("cli.get_tool_definitions", return_value=[]), \ + patch("cli.build_welcome_banner"): + cli_obj.show_banner() + + calls = [str(c) for c in cli_obj.console.print.call_args_list] + warning_calls = [c for c in calls if "too low" in c] + assert len(warning_calls) == 0 + + def test_ollama_specific_hint(self, cli_obj): + """Ollama-specific fix shown when port 11434 detected.""" + cli_obj.agent.context_compressor.context_length = 4096 + cli_obj.base_url = "http://localhost:11434/v1" + with patch("cli.get_tool_definitions", return_value=[]), \ + patch("cli.build_welcome_banner"): + cli_obj.show_banner() + + calls = [str(c) for c in cli_obj.console.print.call_args_list] + ollama_hints = [c for c in calls if "OLLAMA_CONTEXT_LENGTH" in c] + assert len(ollama_hints) == 1 + + def test_lm_studio_specific_hint(self, cli_obj): + """LM Studio-specific fix shown when port 1234 detected.""" + cli_obj.agent.context_compressor.context_length = 2048 + cli_obj.base_url = "http://localhost:1234/v1" + with patch("cli.get_tool_definitions", return_value=[]), \ + patch("cli.build_welcome_banner"): + cli_obj.show_banner() + + calls = [str(c) for c in cli_obj.console.print.call_args_list] + lms_hints = [c for c in calls if "LM Studio" in c] + assert len(lms_hints) == 1 + + def test_generic_hint_for_other_servers(self, cli_obj): + """Generic fix shown for
unknown servers.""" + cli_obj.agent.context_compressor.context_length = 4096 + cli_obj.base_url = "http://localhost:8080/v1" + with patch("cli.get_tool_definitions", return_value=[]), \ + patch("cli.build_welcome_banner"): + cli_obj.show_banner() + + calls = [str(c) for c in cli_obj.console.print.call_args_list] + generic_hints = [c for c in calls if "config.yaml" in c] + assert len(generic_hints) == 1 + + def test_no_warning_when_no_context_length(self, cli_obj): + """No warning when context length is not yet known.""" + cli_obj.agent.context_compressor.context_length = None + with patch("cli.get_tool_definitions", return_value=[]), \ + patch("cli.build_welcome_banner"): + cli_obj.show_banner() + + calls = [str(c) for c in cli_obj.console.print.call_args_list] + warning_calls = [c for c in calls if "too low" in c] + assert len(warning_calls) == 0 diff --git a/website/docs/integrations/providers.md b/website/docs/integrations/providers.md index ab4c8f354..7740e36db 100644 --- a/website/docs/integrations/providers.md +++ b/website/docs/integrations/providers.md @@ -218,15 +218,11 @@ model: api_key: your-key-or-leave-empty-for-local ``` -**Environment variables (`.env` file):** -```bash -# Add to ~/.hermes/.env -OPENAI_BASE_URL=http://localhost:8000/v1 -OPENAI_API_KEY=your-key # Any non-empty string for local servers -LLM_MODEL=your-model-name -``` +:::warning Legacy env vars +`OPENAI_BASE_URL` and `LLM_MODEL` in `.env` are **deprecated**. The CLI ignores `LLM_MODEL` entirely (only the gateway reads it). Use `hermes model` or edit `config.yaml` directly — both persist correctly across restarts and Docker containers. +::: -All three approaches end up in the same runtime path. `hermes model` persists provider, model, and base URL to `config.yaml` so later sessions keep using that endpoint even if env vars are not set. +Both approaches persist to `config.yaml`, which is the source of truth for model, provider, and base URL. 
### Switching Models with `/model` @@ -257,23 +253,73 @@ Everything below follows this same pattern — just change the URL, key, and mod ### Ollama — Local Models, Zero Config -[Ollama](https://ollama.com/) runs open-weight models locally with one command. Best for: quick local experimentation, privacy-sensitive work, offline use. +[Ollama](https://ollama.com/) runs open-weight models locally with one command. Best for: quick local experimentation, privacy-sensitive work, offline use. Supports tool calling via the OpenAI-compatible API. ```bash # Install and run a model -ollama pull llama3.1:70b +ollama pull qwen2.5-coder:32b ollama serve # Starts on port 11434 - -# Configure Hermes -OPENAI_BASE_URL=http://localhost:11434/v1 -OPENAI_API_KEY=ollama # Any non-empty string -LLM_MODEL=llama3.1:70b ``` -Ollama's OpenAI-compatible endpoint supports chat completions, streaming, and tool calling (for supported models). No GPU required for smaller models — Ollama handles CPU inference automatically. +Then configure Hermes: + +```bash +hermes model +# Select "Custom endpoint (self-hosted / VLLM / etc.)" +# Enter URL: http://localhost:11434/v1 +# Skip API key (Ollama doesn't need one) +# Enter model name (e.g. qwen2.5-coder:32b) +``` + +Or configure `config.yaml` directly: + +```yaml +model: + default: qwen2.5-coder:32b + provider: custom + base_url: http://localhost:11434/v1 + context_length: 32768 # See warning below +``` + +:::caution Ollama defaults to very low context lengths +Ollama does **not** use your model's full context window by default. Depending on your VRAM, the default is: + +| Available VRAM | Default context | +|----------------|----------------| +| Less than 24 GB | **4,096 tokens** | +| 24–48 GB | 32,768 tokens | +| 48+ GB | 256,000 tokens | + +For agent use with tools, **you need at least 16k–32k context**. At 4k, the system prompt + tool schemas alone can fill the window, leaving no room for conversation. 
+ + **How to increase it** (pick one): + + ```bash + # Option 1: Set server-wide via environment variable (recommended) + OLLAMA_CONTEXT_LENGTH=32768 ollama serve + + # Option 2: For systemd-managed Ollama + sudo systemctl edit ollama.service + # Add: Environment="OLLAMA_CONTEXT_LENGTH=32768" + # Then: sudo systemctl daemon-reload && sudo systemctl restart ollama + + # Option 3: Bake it into a custom model (persistent per-model) + echo -e "FROM qwen2.5-coder:32b\nPARAMETER num_ctx 32768" > Modelfile + ollama create qwen2.5-coder-32k -f Modelfile + ``` + + **You cannot set context length through the OpenAI-compatible API** (`/v1/chat/completions`). It must be configured server-side or via a Modelfile. This is the #1 source of confusion when integrating Ollama with tools like Hermes. + ::: + + **Verify your context is set correctly:** + + ```bash + ollama ps + # Look at the CONTEXT column — it should show your configured value + ``` :::tip -List available models with `ollama list`. Pull any model from the [Ollama library](https://ollama.com/library) with `ollama pull <model-name>`. +List available models with `ollama list`. Pull any model from the [Ollama library](https://ollama.com/library) with `ollama pull <model-name>`. Ollama handles GPU offloading automatically — no configuration needed for most setups. ::: --- @@ -283,19 +329,39 @@ List available models with `ollama list`. Pull any model from the [Ollama librar [vLLM](https://docs.vllm.ai/) is the standard for production LLM serving. Best for: maximum throughput on GPU hardware, serving large models, continuous batching. 
 ```bash -# Start vLLM server pip install vllm vllm serve meta-llama/Llama-3.1-70B-Instruct \ --port 8000 \ - --tensor-parallel-size 2 # Multi-GPU - -# Configure Hermes -OPENAI_BASE_URL=http://localhost:8000/v1 -OPENAI_API_KEY=dummy -LLM_MODEL=meta-llama/Llama-3.1-70B-Instruct + --max-model-len 65536 \ + --tensor-parallel-size 2 \ + --enable-auto-tool-choice \ + --tool-call-parser hermes ``` -vLLM supports tool calling, structured output, and multi-modal models. Use `--enable-auto-tool-choice` and `--tool-call-parser hermes` for Hermes-format tool calling with NousResearch models. +Then configure Hermes: + +```bash +hermes model +# Select "Custom endpoint (self-hosted / VLLM / etc.)" +# Enter URL: http://localhost:8000/v1 +# Skip API key (or enter one if you configured vLLM with --api-key) +# Enter model name: meta-llama/Llama-3.1-70B-Instruct +``` + +**Context length:** vLLM reads the model's `max_position_embeddings` by default. If that exceeds your GPU memory, it errors and asks you to set `--max-model-len` lower. You can also use `--max-model-len auto` to automatically find the maximum that fits. Set `--gpu-memory-utilization 0.95` (default 0.9) to squeeze more context into VRAM. + +**Tool calling requires explicit flags:** + +| Flag | Purpose | +|------|---------| +| `--enable-auto-tool-choice` | Required for `tool_choice: "auto"` (the default in Hermes) | +| `--tool-call-parser <parser>` | Parser for the model's tool call format | + +Supported parsers: `hermes` (Qwen 2.5, Hermes 2/3), `llama3_json` (Llama 3.x), `mistral`, `deepseek_v3`, `deepseek_v31`, `xlam`, `pythonic`. Without these flags, tool calls won't work — the model will output tool calls as text. + +:::tip +vLLM supports human-readable sizes: `--max-model-len 64k` (lowercase k = 1000, uppercase K = 1024). +::: --- @@ -304,19 +370,32 @@ vLLM supports tool calling, structured output, and multi-modal models. 
Use `--en [SGLang](https://github.com/sgl-project/sglang) is an alternative to vLLM with RadixAttention for KV cache reuse. Best for: multi-turn conversations (prefix caching), constrained decoding, structured output. ```bash -# Start SGLang server pip install "sglang[all]" python -m sglang.launch_server \ --model meta-llama/Llama-3.1-70B-Instruct \ - --port 8000 \ - --tp 2 - -# Configure Hermes -OPENAI_BASE_URL=http://localhost:8000/v1 -OPENAI_API_KEY=dummy -LLM_MODEL=meta-llama/Llama-3.1-70B-Instruct + --port 30000 \ + --context-length 65536 \ + --tp 2 \ + --tool-call-parser qwen ``` +Then configure Hermes: + +```bash +hermes model +# Select "Custom endpoint (self-hosted / VLLM / etc.)" +# Enter URL: http://localhost:30000/v1 +# Enter model name: meta-llama/Llama-3.1-70B-Instruct +``` + +**Context length:** SGLang reads from the model's config by default. Use `--context-length` to override. If you need to exceed the model's declared maximum, set `SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1`. + +**Tool calling:** Use `--tool-call-parser` with the appropriate parser for your model family: `qwen` (Qwen 2.5), `llama3`, `llama4`, `deepseekv3`, `mistral`, `glm`. Without this flag, tool calls come back as plain text. + +:::caution SGLang defaults to 128 max output tokens +If responses seem truncated, add `max_tokens` to your requests or set `--default-max-tokens` on the server. SGLang's default is only 128 tokens per response if not specified in the request. 
+::: + --- ### llama.cpp / llama-server — CPU & Metal Inference @@ -327,21 +406,136 @@ LLM_MODEL=meta-llama/Llama-3.1-70B-Instruct # Build and start llama-server cmake -B build && cmake --build build --config Release ./build/bin/llama-server \ - -m models/llama-3.1-8b-instruct-Q4_K_M.gguf \ + --jinja -fa \ + -c 32768 \ + -ngl 99 \ + -m models/qwen2.5-coder-32b-instruct-Q4_K_M.gguf \ --port 8080 --host 0.0.0.0 - -# Configure Hermes -OPENAI_BASE_URL=http://localhost:8080/v1 -OPENAI_API_KEY=dummy -LLM_MODEL=llama-3.1-8b-instruct ``` +**Context length (`-c`):** Recent builds default to `0` which reads the model's training context from the GGUF metadata. For models with 128k+ training context, this can OOM trying to allocate the full KV cache. Set `-c` explicitly to what you need (32k–64k is a good range for agent use). If using parallel slots (`-np`), the total context is divided among slots — with `-c 32768 -np 4`, each slot only gets 8k. + +Then configure Hermes to point at it: + +```bash +hermes model +# Select "Custom endpoint (self-hosted / VLLM / etc.)" +# Enter URL: http://localhost:8080/v1 +# Skip API key (local servers don't need one) +# Enter model name — or leave blank to auto-detect if only one model is loaded +``` + +This saves the endpoint to `config.yaml` so it persists across sessions. + +:::caution `--jinja` is required for tool calling +Without `--jinja`, llama-server ignores the `tools` parameter entirely. The model will try to call tools by writing JSON in its response text, but Hermes won't recognize it as a tool call — you'll see raw JSON like `{"name": "web_search", ...}` printed as a message instead of an actual search. + +Native tool calling support (best performance): Llama 3.x, Qwen 2.5 (including Coder), Hermes 2/3, Mistral, DeepSeek, Functionary. All other models use a generic handler that works but may be less efficient. 
See the [llama.cpp function calling docs](https://github.com/ggml-org/llama.cpp/blob/master/docs/function-calling.md) for the full list. + +You can verify tool support is active by checking `http://localhost:8080/props` — the `chat_template` field should be present. +::: + :::tip Download GGUF models from [Hugging Face](https://huggingface.co/models?library=gguf). Q4_K_M quantization offers the best balance of quality vs. memory usage. ::: --- +### LM Studio — Desktop App with Local Models + +[LM Studio](https://lmstudio.ai/) is a desktop app for running local models with a GUI. Best for: users who prefer a visual interface, quick model testing, developers on macOS/Windows/Linux. + +Start the server from the LM Studio app (Developer tab → Start Server), or use the CLI: + +```bash +lms server start # Starts on port 1234 +lms load qwen2.5-coder --context-length 32768 +``` + +Then configure Hermes: + +```bash +hermes model +# Select "Custom endpoint (self-hosted / VLLM / etc.)" +# Enter URL: http://localhost:1234/v1 +# Skip API key (LM Studio doesn't require one) +# Enter model name +``` + +:::caution Context length often defaults to 2048 +LM Studio reads context length from the model's metadata, but many GGUF models report low defaults (2048 or 4096). **Always set context length explicitly** in the LM Studio model settings: + +1. Click the gear icon next to the model picker +2. Set "Context Length" to at least 16384 (preferably 32768) +3. Reload the model for the change to take effect + +Alternatively, use the CLI: `lms load model-name --context-length 32768` + +To set persistent per-model defaults: My Models tab → gear icon on the model → set context size. +::: + +**Tool calling:** Supported since LM Studio 0.3.6. Models with native tool-calling training (Qwen 2.5, Llama 3.x, Mistral, Hermes) are auto-detected and shown with a tool badge. Other models use a generic fallback that may be less reliable. 
+ +--- + +### Troubleshooting Local Models + +These issues affect **all** local inference servers when used with Hermes. + +#### Tool calls appear as text instead of executing + +The model outputs something like `{"name": "web_search", "arguments": {...}}` as a message instead of actually calling the tool. + +**Cause:** Your server doesn't have tool calling enabled, or the model doesn't support it through the server's tool calling implementation. + +| Server | Fix | +|--------|-----| +| **llama.cpp** | Add `--jinja` to the startup command | +| **vLLM** | Add `--enable-auto-tool-choice --tool-call-parser hermes` | +| **SGLang** | Add `--tool-call-parser qwen` (or appropriate parser) | +| **Ollama** | Tool calling is enabled by default — make sure your model supports it (check with `ollama show model-name`) | +| **LM Studio** | Update to 0.3.6+ and use a model with native tool support | + +#### Model seems to forget context or give incoherent responses + +**Cause:** Context window is too small. When the conversation exceeds the context limit, most servers silently drop older messages. Hermes's system prompt + tool schemas alone can use 4k–8k tokens. + +**Diagnosis:** + +```bash +# Check what Hermes thinks the context is +# Look at startup line: "Context limit: X tokens" + +# Check your server's actual context +# Ollama: ollama ps (CONTEXT column) +# llama.cpp: curl http://localhost:8080/props | jq '.default_generation_settings.n_ctx' +# vLLM: check --max-model-len in startup args +``` + +**Fix:** Set context to at least **32,768 tokens** for agent use. See each server's section above for the specific flag. + +#### "Context limit: 2048 tokens" at startup + +Hermes auto-detects context length from your server's `/v1/models` endpoint. If the server reports a low value (or doesn't report one at all), Hermes uses the model's declared limit which may be wrong. 
+ +**Fix:** Set it explicitly in `config.yaml`: + +```yaml +model: + default: your-model + provider: custom + base_url: http://localhost:11434/v1 + context_length: 32768 +``` + +#### Responses get cut off mid-sentence + +**Possible causes:** +1. **Low `max_tokens` on the server** — SGLang defaults to 128 tokens per response. Set `--default-max-tokens` on the server or configure Hermes with `model.max_tokens` in config.yaml. +2. **Context exhaustion** — The model filled its context window. Increase context length or enable [context compression](/docs/user-guide/configuration#context-compression) in Hermes. + +--- + ### LiteLLM Proxy — Multi-Provider Gateway [LiteLLM](https://docs.litellm.ai/) is an OpenAI-compatible proxy that unifies 100+ LLM providers behind a single API. Best for: switching between providers without config changes, load balancing, fallback chains, budget controls. @@ -353,13 +547,10 @@ litellm --model anthropic/claude-sonnet-4 --port 4000 # Or with a config file for multiple models: litellm --config litellm_config.yaml --port 4000 - -# Configure Hermes -OPENAI_BASE_URL=http://localhost:4000/v1 -OPENAI_API_KEY=sk-your-litellm-key -LLM_MODEL=anthropic/claude-sonnet-4 ``` +Then configure Hermes with `hermes model` → Custom endpoint → `http://localhost:4000/v1`. + Example `litellm_config.yaml` with fallback: ```yaml model_list: @@ -384,13 +575,10 @@ router_settings: ```bash # Install and start npx @blockrun/clawrouter # Starts on port 8402 - -# Configure Hermes -OPENAI_BASE_URL=http://localhost:8402/v1 -OPENAI_API_KEY=dummy -LLM_MODEL=blockrun/auto # or: blockrun/eco, blockrun/premium, blockrun/agentic ``` +Then configure Hermes with `hermes model` → Custom endpoint → `http://localhost:8402/v1` → model name `blockrun/auto`. + Routing profiles: | Profile | Strategy | Savings | |---------|----------|---------| @@ -423,11 +611,14 @@ Any service with an OpenAI-compatible API works. 
Some popular options: | [LocalAI](https://localai.io) | `http://localhost:8080/v1` | Self-hosted, multi-model | | [Jan](https://jan.ai) | `http://localhost:1337/v1` | Desktop app with local models | -```bash -# Example: Together AI -OPENAI_BASE_URL=https://api.together.xyz/v1 -OPENAI_API_KEY=your-together-key -LLM_MODEL=meta-llama/Llama-3.1-70B-Instruct-Turbo +Configure any of these with `hermes model` → Custom endpoint, or in `config.yaml`: + +```yaml +model: + default: meta-llama/Llama-3.1-70B-Instruct-Turbo + provider: custom + base_url: https://api.together.xyz/v1 + api_key: your-together-key ``` ---