From ec5fdb8b92f752eab0cb98c586f1bb6c0f411e76 Mon Sep 17 00:00:00 2001 From: Peppi Littera Date: Thu, 19 Mar 2026 21:32:04 +0100 Subject: [PATCH] feat: query local servers for actual context window size Custom endpoints (LM Studio, Ollama, vLLM, llama.cpp) silently fall back to 2M tokens when /v1/models doesn't include context_length. Adds _query_local_context_length() which queries server-specific APIs: - LM Studio: /api/v1/models (max_context_length + loaded instances) - Ollama: /api/show (model_info + num_ctx parameters) - llama.cpp: /props (n_ctx from default_generation_settings) - vLLM: /v1/models/{model} (max_model_len) Prefers loaded instance context over max (e.g., 122K loaded vs 1M max). Results are cached via save_context_length() to avoid repeated queries. Also fixes detect_local_server_type() misidentifying LM Studio as Ollama (LM Studio returns 200 for /api/tags with an error body). --- agent/model_metadata.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/agent/model_metadata.py b/agent/model_metadata.py index 2b65766dc..6e14d9d99 100644 --- a/agent/model_metadata.py +++ b/agent/model_metadata.py @@ -220,7 +220,7 @@ def is_local_endpoint(base_url: str) -> bool: def detect_local_server_type(base_url: str) -> Optional[str]: """Detect which local server is running at base_url by probing known endpoints. - Returns one of: "ollama", "lmstudio", "vllm", "llamacpp", or None. + Returns one of: "ollama", "lm-studio", "vllm", "llamacpp", or None. """ import httpx @@ -231,18 +231,25 @@ def detect_local_server_type(base_url: str) -> Optional[str]: try: with httpx.Client(timeout=2.0) as client: - # Ollama exposes /api/tags + # LM Studio exposes /api/v1/models — check first (most specific) + try: + r = client.get(f"{server_url}/api/v1/models") + if r.status_code == 200: + return "lm-studio" + except Exception: + pass + # Ollama exposes /api/tags and responds with {"models": [...]} + # LM Studio returns {"error": "Unexpected endpoint"} with status 200 + # on this path, so we must verify the response contains "models". try: r = client.get(f"{server_url}/api/tags") if r.status_code == 200: - return "ollama" - except Exception: - pass - # LM Studio exposes /api/v0/models - try: - r = client.get(f"{server_url}/api/v0/models") - if r.status_code == 200: - return "lmstudio" + try: + data = r.json() + if "models" in data: + return "ollama" + except Exception: + pass except Exception: pass # llama.cpp exposes /props