From 344239c2dbfe6c03c9020a4faa9552c8769be20a Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Tue, 31 Mar 2026 03:29:00 -0700
Subject: [PATCH] feat: auto-detect models from server probe in custom endpoint
 setup (#4218)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Custom endpoint setup (_model_flow_custom) now probes the server first
and presents detected models instead of asking users to type blind:

- Single model: auto-confirms with Y/n prompt
- Multiple models: numbered list picker, or type a name
- No models / probe failed: falls back to manual input

Context length prompt also moved after model selection so the user sees
the verified endpoint before being asked for details.

All recent fixes preserved: config dict sync (#4172), api_key
persistence (#4182), no save_env_value for URLs (#4165).

Inspired by PR #4194 by sudoingX — re-implemented against current main.

Co-authored-by: Xpress AI (Dip KD) <200180104+sudoingX@users.noreply.github.com>
---
 hermes_cli/main.py                    | 50 ++++++++++++++++++++-------
 tests/test_cli_provider_resolution.py |  5 ++-
 2 files changed, 42 insertions(+), 13 deletions(-)

diff --git a/hermes_cli/main.py b/hermes_cli/main.py
index 3c7142b5e..9b4b3ccac 100644
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@@ -1242,22 +1242,10 @@ def _model_flow_custom(config):
     try:
         base_url = input(f"API base URL [{current_url or 'e.g. https://api.example.com/v1'}]: ").strip()
         api_key = input(f"API key [{current_key[:8] + '...' if current_key else 'optional'}]: ").strip()
-        model_name = input("Model name (e.g. gpt-4, llama-3-70b): ").strip()
-        context_length_str = input("Context length in tokens [leave blank for auto-detect]: ").strip()
     except (KeyboardInterrupt, EOFError):
         print("\nCancelled.")
         return
 
-    context_length = None
-    if context_length_str:
-        try:
-            context_length = int(context_length_str.replace(",", "").replace("k", "000").replace("K", "000"))
-            if context_length <= 0:
-                context_length = None
-        except ValueError:
-            print(f"Invalid context length: {context_length_str} — will auto-detect.")
-            context_length = None
-
     if not base_url and not current_url:
         print("No URL provided. Cancelled.")
         return
@@ -1294,6 +1282,44 @@ def _model_flow_custom(config):
         if probe.get("suggested_base_url"):
             print(f"  If this server expects /v1, try base URL: {probe['suggested_base_url']}")
 
+    # Select model — use probe results when available, fall back to manual input
+    model_name = ""
+    detected_models = probe.get("models") or []
+    try:
+        if len(detected_models) == 1:
+            print(f"  Detected model: {detected_models[0]}")
+            confirm = input("  Use this model? [Y/n]: ").strip().lower()
+            if confirm in ("", "y", "yes"):
+                model_name = detected_models[0]
+            else:
+                model_name = input("Model name (e.g. gpt-4, llama-3-70b): ").strip()
+        elif len(detected_models) > 1:
+            print("  Available models:")
+            for i, m in enumerate(detected_models, 1):
+                print(f"    {i}. {m}")
+            pick = input(f"  Select model [1-{len(detected_models)}] or type name: ").strip()
+            if pick.isdigit() and 1 <= int(pick) <= len(detected_models):
+                model_name = detected_models[int(pick) - 1]
+            elif pick:
+                model_name = pick
+        else:
+            model_name = input("Model name (e.g. gpt-4, llama-3-70b): ").strip()
+
+        context_length_str = input("Context length in tokens [leave blank for auto-detect]: ").strip()
+    except (KeyboardInterrupt, EOFError):
+        print("\nCancelled.")
+        return
+
+    context_length = None
+    if context_length_str:
+        try:
+            context_length = int(context_length_str.replace(",", "").replace("k", "000").replace("K", "000"))
+            if context_length <= 0:
+                context_length = None
+        except ValueError:
+            print(f"Invalid context length: {context_length_str} — will auto-detect.")
+            context_length = None
+
     if model_name:
         _save_model_choice(model_name)
 
diff --git a/tests/test_cli_provider_resolution.py b/tests/test_cli_provider_resolution.py
index 943a45a55..3c9b31f5f 100644
--- a/tests/test_cli_provider_resolution.py
+++ b/tests/test_cli_provider_resolution.py
@@ -460,13 +460,16 @@ def test_model_flow_custom_saves_verified_v1_base_url(monkeypatch, capsys):
     )
     monkeypatch.setattr("hermes_cli.config.save_config", lambda cfg: None)
 
-    answers = iter(["http://localhost:8000", "local-key", "llm", ""])
+    # After the probe detects a single model ("llm"), the flow asks
+    # "Use this model? [Y/n]:" — confirm with Enter, then context length.
+    answers = iter(["http://localhost:8000", "local-key", "", ""])
     monkeypatch.setattr("builtins.input", lambda _prompt="": next(answers))
 
     hermes_main._model_flow_custom({})
 
     output = capsys.readouterr().out
     assert "Saving the working base URL instead" in output
+    assert "Detected model: llm" in output
     # OPENAI_BASE_URL is no longer saved to .env — config.yaml is authoritative
     assert "OPENAI_BASE_URL" not in saved_env
     assert saved_env["MODEL"] == "llm"
\ No newline at end of file