From 471ea81a7d4ae230837ced723faca511ba89839c Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Fri, 20 Mar 2026 03:19:31 -0700 Subject: [PATCH] fix: preserve Ollama model:tag colons in context length detection (#2149) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The colon-split logic in get_model_context_length() and _query_local_context_length() assumed any colon meant provider:model format (e.g. "local:my-model"). But Ollama uses model:tag format (e.g. "qwen3.5:27b"), so the split turned "qwen3.5:27b" into just "27b" — which matches nothing, causing a fallback to the 2M token probe tier. Now only recognised provider prefixes (local, openrouter, anthropic, etc.) are stripped. Ollama model:tag names pass through intact. Co-authored-by: kshitijk4poor <82637225+kshitijk4poor@users.noreply.github.com> --- agent/model_metadata.py | 40 ++++++++++++++++++++++----- tests/agent/test_model_metadata.py | 44 ++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 7 deletions(-) diff --git a/agent/model_metadata.py b/agent/model_metadata.py index 6e14d9d99..3dc3e9e13 100644 --- a/agent/model_metadata.py +++ b/agent/model_metadata.py @@ -19,6 +19,34 @@ from hermes_constants import OPENROUTER_MODELS_URL logger = logging.getLogger(__name__) +# Provider names that can appear as a "provider:" prefix before a model ID. +# Only these are stripped — Ollama-style "model:tag" colons (e.g. "qwen3.5:27b") +# are preserved so the full model name reaches cache lookups and server queries. +_PROVIDER_PREFIXES: frozenset[str] = frozenset({ + "openrouter", "nous", "openai-codex", "copilot", "copilot-acp", + "zai", "kimi-coding", "minimax", "minimax-cn", "anthropic", "deepseek", + "opencode-zen", "opencode-go", "ai-gateway", "kilocode", "alibaba", + "custom", "local", + # Common aliases + "glm", "z-ai", "z.ai", "zhipu", "github", "github-copilot", + "github-models", "kimi", "moonshot", "claude", "deep-seek", + "opencode", "zen", "go", "vercel", "kilo", "dashscope", "aliyun", "qwen", +}) + + +def _strip_provider_prefix(model: str) -> str: + """Strip a recognised provider prefix from a model string. + + ``"local:my-model"`` → ``"my-model"`` + ``"qwen3.5:27b"`` → ``"qwen3.5:27b"`` (unchanged — not a provider prefix) + """ + if ":" not in model or model.startswith("http"): + return model + prefix = model.split(":", 1)[0].strip().lower() + if prefix in _PROVIDER_PREFIXES: + return model.split(":", 1)[1] + return model + _model_metadata_cache: Dict[str, Dict[str, Any]] = {} _model_metadata_cache_time: float = 0 _MODEL_CACHE_TTL = 3600 @@ -579,10 +607,9 @@ def _query_local_context_length(model: str, base_url: str) -> Optional[int]: """Query a local server for the model's context length.""" import httpx - # Strip provider prefix (e.g., "local:model-name" → "model-name"). - # LM Studio and Ollama don't use provider prefixes in their model IDs. - if ":" in model and not model.startswith("http"): - model = model.split(":", 1)[1] + # Strip recognised provider prefix (e.g., "local:model-name" → "model-name"). + # Ollama "model:tag" colons (e.g. "qwen3.5:27b") are intentionally preserved. + model = _strip_provider_prefix(model) # Strip /v1 suffix to get the server root server_url = base_url.rstrip("/") @@ -689,9 +716,8 @@ def get_model_context_length( # Normalise provider-prefixed model names (e.g. "local:model-name" → # "model-name") so cache lookups and server queries use the bare ID that - # local servers actually know about. - if ":" in model and not model.startswith("http"): - model = model.split(":", 1)[1] + # local servers actually know about. Ollama "model:tag" colons are preserved. + model = _strip_provider_prefix(model) # 1. Check persistent cache (model+provider) if base_url: diff --git a/tests/agent/test_model_metadata.py b/tests/agent/test_model_metadata.py index a733a03c6..75770ce31 100644 --- a/tests/agent/test_model_metadata.py +++ b/tests/agent/test_model_metadata.py @@ -22,6 +22,7 @@ from unittest.mock import patch, MagicMock from agent.model_metadata import ( CONTEXT_PROBE_TIERS, DEFAULT_CONTEXT_LENGTHS, + _strip_provider_prefix, estimate_tokens_rough, estimate_messages_tokens_rough, get_model_context_length, @@ -292,6 +293,49 @@ class TestGetModelContextLength: assert result == 200000 +# ========================================================================= +# _strip_provider_prefix — Ollama model:tag vs provider:model +# ========================================================================= + +class TestStripProviderPrefix: + def test_known_provider_prefix_is_stripped(self): + assert _strip_provider_prefix("local:my-model") == "my-model" + assert _strip_provider_prefix("openrouter:anthropic/claude-sonnet-4") == "anthropic/claude-sonnet-4" + assert _strip_provider_prefix("anthropic:claude-sonnet-4") == "claude-sonnet-4" + + def test_ollama_model_tag_preserved(self): + """Ollama model:tag format must NOT be stripped.""" + assert _strip_provider_prefix("qwen3.5:27b") == "qwen3.5:27b" + assert _strip_provider_prefix("llama3.3:70b") == "llama3.3:70b" + assert _strip_provider_prefix("gemma2:9b") == "gemma2:9b" + assert _strip_provider_prefix("codellama:13b-instruct-q4_0") == "codellama:13b-instruct-q4_0" + + def test_http_urls_preserved(self): + assert _strip_provider_prefix("http://example.com") == "http://example.com" + assert _strip_provider_prefix("https://example.com") == "https://example.com" + + def test_no_colon_returns_unchanged(self): + assert _strip_provider_prefix("gpt-4o") == "gpt-4o" + assert _strip_provider_prefix("anthropic/claude-sonnet-4") == "anthropic/claude-sonnet-4" + + @patch("agent.model_metadata.fetch_model_metadata") + def test_ollama_model_tag_not_mangled_in_context_lookup(self, mock_fetch): + """Ensure 'qwen3.5:27b' is NOT reduced to '27b' during context length lookup. + + We mock a custom endpoint that knows 'qwen3.5:27b' — the full name + must reach the endpoint metadata lookup intact. + """ + mock_fetch.return_value = {} + with patch("agent.model_metadata.fetch_endpoint_model_metadata") as mock_ep, \ + patch("agent.model_metadata._is_custom_endpoint", return_value=True): + mock_ep.return_value = {"qwen3.5:27b": {"context_length": 32768}} + result = get_model_context_length( + "qwen3.5:27b", + base_url="http://localhost:11434/v1", + ) + assert result == 32768 + + # ========================================================================= # fetch_model_metadata — caching, TTL, slugs, failures # =========================================================================