* docs: update llama.cpp section with --jinja flag and tool calling guide The llama.cpp docs were missing the --jinja flag which is required for tool calling to work. Without it, models output tool calls as raw JSON text instead of structured API responses, making Hermes unable to execute them. Changes: - Add --jinja and -fa flags to the server startup example - Replace deprecated env vars (OPENAI_BASE_URL, LLM_MODEL) with hermes model interactive setup - Add caution block explaining the --jinja requirement and symptoms - List models with native tool calling support - Add /props endpoint verification tip * docs+feat: comprehensive local LLM provider guides and context length warning Docs (providers.md): - Rewrote Ollama section with context length warning (defaults to 4k on <24GB VRAM), three methods to increase it, and verification steps - Rewrote vLLM section with --max-model-len, tool calling flags (--enable-auto-tool-choice, --tool-call-parser), and context guidance - Rewrote SGLang section with --context-length, --tool-call-parser, and warning about 128-token default max output - Added LM Studio section (port 1234, context length defaults to 2048, tool calling since 0.3.6) - Added llama.cpp context length flag (-c) and GPU offload (-ngl) - Added Troubleshooting Local Models section covering: - Tool calls appearing as text (with per-server fix table) - Silent context truncation and diagnosis commands - Low detected context at startup - Truncated responses - Replaced all deprecated env vars (OPENAI_BASE_URL, LLM_MODEL) with hermes model interactive setup and config.yaml examples - Added deprecation warning for legacy env vars in General Setup Code (cli.py): - Added context length warning in show_banner() when detected context is <= 8192 tokens, with server-specific fix hints: - Ollama (port 11434): suggests OLLAMA_CONTEXT_LENGTH env var - LM Studio (port 1234): suggests model settings adjustment - Other servers: suggests config.yaml override Tests: - 9 new tests 
covering warning thresholds, server-specific hints, and no-warning cases
148 lines · 6.0 KiB · Python
"""Tests for the low context length warning in the CLI banner."""
import os

from types import SimpleNamespace
from unittest.mock import MagicMock, patch

import pytest

@pytest.fixture
|
|
def _isolate(tmp_path, monkeypatch):
|
|
"""Isolate HERMES_HOME so tests don't touch real config."""
|
|
home = tmp_path / ".hermes"
|
|
home.mkdir()
|
|
monkeypatch.setenv("HERMES_HOME", str(home))
|
|
|
|
|
|
@pytest.fixture
|
|
def cli_obj(_isolate):
|
|
"""Create a minimal HermesCLI instance for banner testing."""
|
|
with patch("cli.load_cli_config", return_value={
|
|
"display": {"tool_progress": "new"},
|
|
"terminal": {},
|
|
}), patch("cli.get_tool_definitions", return_value=[]), \
|
|
patch("cli.build_welcome_banner"):
|
|
from cli import HermesCLI
|
|
obj = HermesCLI.__new__(HermesCLI)
|
|
obj.model = "test-model"
|
|
obj.enabled_toolsets = ["hermes-core"]
|
|
obj.compact = False
|
|
obj.console = MagicMock()
|
|
obj.session_id = None
|
|
obj.api_key = "test"
|
|
obj.base_url = ""
|
|
# Mock agent with context compressor
|
|
obj.agent = SimpleNamespace(
|
|
context_compressor=SimpleNamespace(context_length=None)
|
|
)
|
|
return obj
|
|
|
|
|
|
class TestLowContextWarning:
|
|
"""Tests that the CLI warns about low context lengths."""
|
|
|
|
def test_no_warning_for_normal_context(self, cli_obj):
|
|
"""No warning when context is 32k+."""
|
|
cli_obj.agent.context_compressor.context_length = 32768
|
|
with patch("cli.get_tool_definitions", return_value=[]), \
|
|
patch("cli.build_welcome_banner"):
|
|
cli_obj.show_banner()
|
|
|
|
# Check that no yellow warning was printed
|
|
calls = [str(c) for c in cli_obj.console.print.call_args_list]
|
|
warning_calls = [c for c in calls if "too low" in c]
|
|
assert len(warning_calls) == 0
|
|
|
|
def test_warning_for_low_context(self, cli_obj):
|
|
"""Warning shown when context is 4096 (Ollama default)."""
|
|
cli_obj.agent.context_compressor.context_length = 4096
|
|
with patch("cli.get_tool_definitions", return_value=[]), \
|
|
patch("cli.build_welcome_banner"):
|
|
cli_obj.show_banner()
|
|
|
|
calls = [str(c) for c in cli_obj.console.print.call_args_list]
|
|
warning_calls = [c for c in calls if "too low" in c]
|
|
assert len(warning_calls) == 1
|
|
assert "4,096" in warning_calls[0]
|
|
|
|
def test_warning_for_2048_context(self, cli_obj):
|
|
"""Warning shown for 2048 tokens (common LM Studio default)."""
|
|
cli_obj.agent.context_compressor.context_length = 2048
|
|
with patch("cli.get_tool_definitions", return_value=[]), \
|
|
patch("cli.build_welcome_banner"):
|
|
cli_obj.show_banner()
|
|
|
|
calls = [str(c) for c in cli_obj.console.print.call_args_list]
|
|
warning_calls = [c for c in calls if "too low" in c]
|
|
assert len(warning_calls) == 1
|
|
|
|
def test_no_warning_at_boundary(self, cli_obj):
|
|
"""No warning at exactly 8192 — 8192 is borderline but included in warning."""
|
|
cli_obj.agent.context_compressor.context_length = 8192
|
|
with patch("cli.get_tool_definitions", return_value=[]), \
|
|
patch("cli.build_welcome_banner"):
|
|
cli_obj.show_banner()
|
|
|
|
calls = [str(c) for c in cli_obj.console.print.call_args_list]
|
|
warning_calls = [c for c in calls if "too low" in c]
|
|
assert len(warning_calls) == 1 # 8192 is still warned about
|
|
|
|
def test_no_warning_above_boundary(self, cli_obj):
|
|
"""No warning at 16384."""
|
|
cli_obj.agent.context_compressor.context_length = 16384
|
|
with patch("cli.get_tool_definitions", return_value=[]), \
|
|
patch("cli.build_welcome_banner"):
|
|
cli_obj.show_banner()
|
|
|
|
calls = [str(c) for c in cli_obj.console.print.call_args_list]
|
|
warning_calls = [c for c in calls if "too low" in c]
|
|
assert len(warning_calls) == 0
|
|
|
|
def test_ollama_specific_hint(self, cli_obj):
|
|
"""Ollama-specific fix shown when port 11434 detected."""
|
|
cli_obj.agent.context_compressor.context_length = 4096
|
|
cli_obj.base_url = "http://localhost:11434/v1"
|
|
with patch("cli.get_tool_definitions", return_value=[]), \
|
|
patch("cli.build_welcome_banner"):
|
|
cli_obj.show_banner()
|
|
|
|
calls = [str(c) for c in cli_obj.console.print.call_args_list]
|
|
ollama_hints = [c for c in calls if "OLLAMA_CONTEXT_LENGTH" in c]
|
|
assert len(ollama_hints) == 1
|
|
|
|
def test_lm_studio_specific_hint(self, cli_obj):
|
|
"""LM Studio-specific fix shown when port 1234 detected."""
|
|
cli_obj.agent.context_compressor.context_length = 2048
|
|
cli_obj.base_url = "http://localhost:1234/v1"
|
|
with patch("cli.get_tool_definitions", return_value=[]), \
|
|
patch("cli.build_welcome_banner"):
|
|
cli_obj.show_banner()
|
|
|
|
calls = [str(c) for c in cli_obj.console.print.call_args_list]
|
|
lms_hints = [c for c in calls if "LM Studio" in c]
|
|
assert len(lms_hints) == 1
|
|
|
|
def test_generic_hint_for_other_servers(self, cli_obj):
|
|
"""Generic fix shown for unknown servers."""
|
|
cli_obj.agent.context_compressor.context_length = 4096
|
|
cli_obj.base_url = "http://localhost:8080/v1"
|
|
with patch("cli.get_tool_definitions", return_value=[]), \
|
|
patch("cli.build_welcome_banner"):
|
|
cli_obj.show_banner()
|
|
|
|
calls = [str(c) for c in cli_obj.console.print.call_args_list]
|
|
generic_hints = [c for c in calls if "config.yaml" in c]
|
|
assert len(generic_hints) == 1
|
|
|
|
def test_no_warning_when_no_context_length(self, cli_obj):
|
|
"""No warning when context length is not yet known."""
|
|
cli_obj.agent.context_compressor.context_length = None
|
|
with patch("cli.get_tool_definitions", return_value=[]), \
|
|
patch("cli.build_welcome_banner"):
|
|
cli_obj.show_banner()
|
|
|
|
calls = [str(c) for c in cli_obj.console.print.call_args_list]
|
|
warning_calls = [c for c in calls if "too low" in c]
|
|
assert len(warning_calls) == 0
|