fix: detect context length for custom model endpoints via fuzzy matching + config override (#2051)
* fix: detect context length for custom model endpoints via fuzzy matching + config override Custom model endpoints (non-OpenRouter, non-known-provider) were silently falling back to 2M tokens when the model name didn't exactly match what the endpoint's /v1/models reported. This happened because: 1. Endpoint metadata lookup used exact match only — model name mismatches (e.g. 'qwen3.5:9b' vs 'Qwen3.5-9B-Q4_K_M.gguf') caused a miss 2. Single-model servers (common for local inference) required exact name match even though only one model was loaded 3. No user escape hatch to manually set context length Changes: - Add fuzzy matching for endpoint model metadata: single-model servers use the only available model regardless of name; multi-model servers try substring matching in both directions - Add model.context_length config override (highest priority) so users can explicitly set their model's context length in config.yaml - Log an informative message when falling back to 2M probe, telling users about the config override option - Thread config_context_length through ContextCompressor and AIAgent init Tests: 6 new tests covering fuzzy match, single-model fallback, config override (including zero/None edge cases). * fix: auto-detect local model name and context length for local servers Cherry-picked from PR #2043 by sudoingX. - Auto-detect model name from local server's /v1/models when only one model is loaded (no manual model name config needed) - Add n_ctx_train and n_ctx to context length detection keys for llama.cpp - Query llama.cpp /props endpoint for actual allocated context (not just training context from GGUF metadata) - Strip .gguf suffix from display in banner and status bar - _auto_detect_local_model() in runtime_provider.py for CLI init Co-authored-by: sudo <sudoingx@users.noreply.github.com> * fix: revert accidental summary_target_tokens change + add docs for context_length config - Revert summary_target_tokens from 2500 back to 500 (accidental change during patching) - Add 'Context Length Detection' section to Custom & Self-Hosted docs explaining model.context_length config override --------- Co-authored-by: Test <test@test.com> Co-authored-by: sudo <sudoingx@users.noreply.github.com>
This commit is contained in:
@@ -289,6 +289,8 @@ def build_welcome_banner(console: Console, model: str, cwd: str,
|
||||
_hero = HERMES_CADUCEUS
|
||||
left_lines = ["", _hero, ""]
|
||||
model_short = model.split("/")[-1] if "/" in model else model
|
||||
if model_short.endswith(".gguf"):
|
||||
model_short = model_short[:-5]
|
||||
if len(model_short) > 28:
|
||||
model_short = model_short[:25] + "..."
|
||||
ctx_str = f" [dim {dim}]·[/] [dim {dim}]{_format_context_length(context_length)} context[/]" if context_length else ""
|
||||
|
||||
@@ -24,11 +24,41 @@ def _normalize_custom_provider_name(value: str) -> str:
|
||||
return value.strip().lower().replace(" ", "-")
|
||||
|
||||
|
||||
def _auto_detect_local_model(base_url: str) -> str:
|
||||
"""Query a local server for its model name when only one model is loaded."""
|
||||
if not base_url:
|
||||
return ""
|
||||
try:
|
||||
import requests
|
||||
url = base_url.rstrip("/")
|
||||
if not url.endswith("/v1"):
|
||||
url += "/v1"
|
||||
resp = requests.get(url + "/models", timeout=5)
|
||||
if resp.ok:
|
||||
models = resp.json().get("data", [])
|
||||
if len(models) == 1:
|
||||
model_id = models[0].get("id", "")
|
||||
if model_id:
|
||||
return model_id
|
||||
except Exception:
|
||||
pass
|
||||
return ""
|
||||
|
||||
|
||||
def _get_model_config() -> Dict[str, Any]:
|
||||
config = load_config()
|
||||
model_cfg = config.get("model")
|
||||
if isinstance(model_cfg, dict):
|
||||
return dict(model_cfg)
|
||||
cfg = dict(model_cfg)
|
||||
default = cfg.get("default", "").strip()
|
||||
base_url = cfg.get("base_url", "").strip()
|
||||
is_local = "localhost" in base_url or "127.0.0.1" in base_url
|
||||
is_fallback = not default or default == "anthropic/claude-opus-4.6"
|
||||
if is_local and is_fallback and base_url:
|
||||
detected = _auto_detect_local_model(base_url)
|
||||
if detected:
|
||||
cfg["default"] = detected
|
||||
return cfg
|
||||
if isinstance(model_cfg, str) and model_cfg.strip():
|
||||
return {"default": model_cfg.strip()}
|
||||
return {}
|
||||
|
||||
Reference in New Issue
Block a user