From ec5fdb8b92f752eab0cb98c586f1bb6c0f411e76 Mon Sep 17 00:00:00 2001
From: Peppi Littera <giuseppe.littera@gmail.com>
Date: Thu, 19 Mar 2026 21:32:04 +0100
Subject: [PATCH] feat: query local servers for actual context window size

Custom endpoints (LM Studio, Ollama, vLLM, llama.cpp) silently fall
back to 2M tokens when /v1/models doesn't include context_length.

Adds _query_local_context_length(), which queries server-specific APIs:
- LM Studio: /api/v1/models (max_context_length + loaded instances)
- Ollama: /api/show (model_info + num_ctx parameters)
- llama.cpp: /props (n_ctx from default_generation_settings)
- vLLM: /v1/models/{model} (max_model_len)

Prefers loaded instance context over max (e.g., 122K loaded vs 1M max).
Results are cached via save_context_length() to avoid repeated queries.

Also fixes detect_local_server_type() misidentifying LM Studio as
Ollama (LM Studio returns 200 for /api/tags with an error body).
---
 agent/model_metadata.py | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/agent/model_metadata.py b/agent/model_metadata.py
index 2b65766dc..6e14d9d99 100644
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@@ -220,7 +220,7 @@ def is_local_endpoint(base_url: str) -> bool:
 def detect_local_server_type(base_url: str) -> Optional[str]:
     """Detect which local server is running at base_url by probing known endpoints.
 
-    Returns one of: "ollama", "lmstudio", "vllm", "llamacpp", or None.
+    Returns one of: "ollama", "lm-studio", "vllm", "llamacpp", or None.
     """
     import httpx
 
@@ -231,18 +231,25 @@ def detect_local_server_type(base_url: str) -> Optional[str]:
 
     try:
         with httpx.Client(timeout=2.0) as client:
-            # Ollama exposes /api/tags
+            # LM Studio exposes /api/v1/models — check first (most specific)
+            try:
+                r = client.get(f"{server_url}/api/v1/models")
+                if r.status_code == 200:
+                    return "lm-studio"
+            except Exception:
+                pass
+            # Ollama exposes /api/tags and responds with {"models": [...]}
+            # LM Studio returns {"error": "Unexpected endpoint"} with status 200
+            # on this path, so we must verify the response contains "models".
             try:
                 r = client.get(f"{server_url}/api/tags")
                 if r.status_code == 200:
-                    return "ollama"
-            except Exception:
-                pass
-            # LM Studio exposes /api/v0/models
-            try:
-                r = client.get(f"{server_url}/api/v0/models")
-                if r.status_code == 200:
-                    return "lmstudio"
+                    try:
+                        data = r.json()
+                        if "models" in data:
+                            return "ollama"
+                    except Exception:
+                        pass
             except Exception:
                 pass
             # llama.cpp exposes /props