fix(kimi): purge kimi-for-coding from model lists, tests, docs (#lazzyPit)

kimi-for-coding triggers 401/403 access-terminated errors. Apply the workaround consistently:
- Remove it from _PROVIDER_MODELS['kimi-coding'] and the coding plan selection
- Update tests to expect kimi-k2.5 instead
- Update docs and reports
- Live config on Beta VPS also corrected
feat(provider): first-class Ollama support + Gemma 4 defaults (#169)
2026-04-07 16:13:12 +00:00 · 2026-04-07 15:55:50 +00:00 · 2026-04-07 15:40:00 +00:00
9 changed files with 43 additions and 14 deletions
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@@ -922,6 +922,7 @@ def _resolve_forced_provider(forced: str) -> Tuple[Optional[OpenAI], Optional[st
 _AUTO_PROVIDER_LABELS = {
    "_try_openrouter": "openrouter",
    "_try_nous": "nous",
+    "_try_ollama": "ollama",
    "_try_custom_endpoint": "local/custom",
    "_try_codex": "openai-codex",
    "_resolve_api_key_provider": "api-key",
@@ -930,6 +931,18 @@ _AUTO_PROVIDER_LABELS = {
 _AGGREGATOR_PROVIDERS = frozenset({"openrouter", "nous"})


+def _try_ollama() -> Tuple[Optional[OpenAI], Optional[str]]:
+    """Detect and return an Ollama client if the server is reachable."""
+    base_url = (os.getenv("OLLAMA_BASE_URL", "") or "http://localhost:11434").strip().rstrip("/")
+    base_url = base_url + "/v1" if not base_url.endswith("/v1") else base_url
+    from agent.model_metadata import detect_local_server_type
+    if detect_local_server_type(base_url) != "ollama":
+        return None, None
+    api_key = (os.getenv("OLLAMA_API_KEY", "") or "ollama").strip()
+    model = _read_main_model() or "gemma4:12b"
+    return OpenAI(api_key=api_key, base_url=base_url), model
+
+
 def _get_provider_chain() -> List[tuple]:
    """Return the ordered provider detection chain.

@@ -939,6 +952,7 @@ def _get_provider_chain() -> List[tuple]:
    return [
        ("openrouter", _try_openrouter),
        ("nous", _try_nous),
+        ("ollama", _try_ollama),
        ("local/custom", _try_custom_endpoint),
        ("openai-codex", _try_codex),
        ("api-key", _resolve_api_key_provider),
@@ -988,6 +1002,7 @@ def _try_payment_fallback(
    # Map common resolved_provider values back to chain labels.
    _alias_to_label = {"openrouter": "openrouter", "nous": "nous",
                       "openai-codex": "openai-codex", "codex": "openai-codex",
+                       "ollama": "ollama",
                       "custom": "local/custom", "local/custom": "local/custom"}
    skip_chain_labels = {_alias_to_label.get(s, s) for s in skip_labels}

@@ -1195,6 +1210,15 @@ def resolve_provider_client(
        return (_to_async_client(client, final_model) if async_mode
                else (client, final_model))

+    # ── Ollama (first-class local provider) ──────────────────────────
+    if provider == "ollama":
+        base_url = (explicit_base_url or os.getenv("OLLAMA_BASE_URL", "") or "http://localhost:11434").strip().rstrip("/")
+        base_url = base_url + "/v1" if not base_url.endswith("/v1") else base_url
+        api_key = (explicit_api_key or os.getenv("OLLAMA_API_KEY", "") or "ollama").strip()
+        final_model = model or _read_main_model() or "gemma4:12b"
+        client = OpenAI(api_key=api_key, base_url=base_url)
+        return (_to_async_client(client, final_model) if async_mode else (client, final_model))
+
    # ── Custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY) ───────────
    if provider == "custom":
        if explicit_base_url:
@@ -1335,6 +1359,7 @@ def get_async_text_auxiliary_client(task: str = ""):
 _VISION_AUTO_PROVIDER_ORDER = (
    "openrouter",
    "nous",
+    "ollama",
    "openai-codex",
    "anthropic",
    "custom",
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@@ -26,7 +26,7 @@ _PROVIDER_PREFIXES: frozenset[str] = frozenset({
    "openrouter", "nous", "openai-codex", "copilot", "copilot-acp",
    "gemini", "zai", "kimi-coding", "minimax", "minimax-cn", "anthropic", "deepseek",
    "opencode-zen", "opencode-go", "ai-gateway", "kilocode", "alibaba",
-    "custom", "local",
+    "ollama", "custom", "local",
    # Common aliases
    "google", "google-gemini", "google-ai-studio",
    "glm", "z-ai", "z.ai", "zhipu", "github", "github-copilot",
@@ -102,9 +102,12 @@ DEFAULT_CONTEXT_LENGTHS = {
    "gpt-4": 128000,
    # Google
    "gemini": 1048576,
-    # Gemma (open models served via AI Studio)
+    # Gemma (open models — Ollama / AI Studio)
    "gemma-4-31b": 256000,
    "gemma-4-26b": 256000,
+    "gemma-4-12b": 256000,
+    "gemma-4-4b": 256000,
+    "gemma-4-1b": 256000,
    "gemma-3": 131072,
    "gemma": 8192,  # fallback for older gemma models
    # DeepSeek
@@ -187,6 +190,8 @@ _URL_TO_PROVIDER: Dict[str, str] = {
    "api.githubcopilot.com": "copilot",
    "models.github.ai": "copilot",
    "api.fireworks.ai": "fireworks",
+    "localhost": "ollama",
+    "127.0.0.1": "ollama",
 }


--- a/config/fallback-config.yaml
+++ b/config/fallback-config.yaml
@@ -6,7 +6,7 @@ model: anthropic/claude-opus-4.6
 # Fallback chain: Anthropic -> Kimi -> Ollama (local)
 fallback_providers:
  - provider: kimi-coding
-    model: kimi-for-coding
+    model: kimi-k2.5
    timeout: 60
    reason: "Primary fallback when Anthropic quota limited"
  
--- a/hermes_cli/auth.py
+++ b/hermes_cli/auth.py
@@ -820,10 +820,11 @@ def resolve_provider(
        "hf": "huggingface", "hugging-face": "huggingface", "huggingface-hub": "huggingface",
        "go": "opencode-go", "opencode-go-sub": "opencode-go",
        "kilo": "kilocode", "kilo-code": "kilocode", "kilo-gateway": "kilocode",
-        # Local server aliases — route through the generic custom provider
+        # Local server aliases
        "lmstudio": "custom", "lm-studio": "custom", "lm_studio": "custom",
-        "ollama": "custom", "vllm": "custom", "llamacpp": "custom",
+        "vllm": "custom", "llamacpp": "custom",
        "llama.cpp": "custom", "llama-cpp": "custom",
+        "ollama": "ollama",
    }
    normalized = _PROVIDER_ALIASES.get(normalized, normalized)

--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@@ -2126,9 +2126,8 @@ def _model_flow_kimi(config, current_model=""):

    # Step 3: Model selection — show appropriate models for the endpoint
    if is_coding_plan:
-        # Coding Plan models (kimi-for-coding first)
+        # Coding Plan models (kimi-k2.5 first — kimi-for-coding retired due to 403)
        model_list = [
-            "kimi-for-coding",
            "kimi-k2.5",
            "kimi-k2-thinking",
            "kimi-k2-thinking-turbo",
@@ -4206,7 +4205,7 @@ For more help on a command:
    )
    chat_parser.add_argument(
        "--provider",
-        choices=["auto", "openrouter", "nous", "openai-codex", "copilot-acp", "copilot", "anthropic", "gemini", "huggingface", "zai", "kimi-coding", "minimax", "minimax-cn", "kilocode"],
+        choices=["auto", "openrouter", "nous", "openai-codex", "copilot-acp", "copilot", "anthropic", "gemini", "huggingface", "zai", "kimi-coding", "minimax", "minimax-cn", "kilocode", "ollama"],
        default=None,
        help="Inference provider (default: auto)"
    )
--- a/hermes_cli/models.py
+++ b/hermes_cli/models.py
@@ -130,7 +130,6 @@ _PROVIDER_MODELS: dict[str, list[str]] = {
        "glm-4.5-flash",
    ],
    "kimi-coding": [
-        "kimi-for-coding",
        "kimi-k2.5",
        "kimi-k2-thinking",
        "kimi-k2-thinking-turbo",
@@ -568,7 +567,7 @@ def list_available_providers() -> list[dict[str, str]]:
        "gemini", "huggingface",
        "zai", "kimi-coding", "minimax", "minimax-cn", "kilocode", "anthropic", "alibaba",
        "opencode-zen", "opencode-go",
-        "ai-gateway", "deepseek", "custom",
+        "ai-gateway", "deepseek", "ollama", "custom",
    ]
    # Build reverse alias map
    aliases_for: dict[str, list[str]] = {}
--- a/reports/ezra-quarterly-report-april-2026.md
+++ b/reports/ezra-quarterly-report-april-2026.md
@@ -235,7 +235,7 @@ The Hermes Agent framework serves as both the delivery platform and the portfoli

 | House | Host | Model / Provider | Gateway Status |
 |-------|------|------------------|----------------|
-| Ezra | Hermes VPS | `kimi-for-coding` (Kimi K2.5) | API `8658`, webhook `8648` — Active |
+| Ezra | Hermes VPS | `kimi-k2.5` (Kimi K2.5) | API `8658`, webhook `8648` — Active |
 | Bezalel | Hermes VPS | Claude Opus 4.6 (Anthropic) | Port `8645` — Active |
 | Allegro-Primus | Hermes VPS | Kimi K2.5 | Port `8644` — Requires restart |
 | Bilbo | External | Gemma 4B (local) | Telegram dual-mode — Active |
--- a/tests/test_api_key_providers.py
+++ b/tests/test_api_key_providers.py
@@ -895,7 +895,7 @@ class TestKimiMoonshotModelListIsolation:
    def test_moonshot_list_excludes_coding_plan_only_models(self):
        from hermes_cli.main import _PROVIDER_MODELS
        moonshot_models = _PROVIDER_MODELS["moonshot"]
-        coding_plan_only = {"kimi-for-coding", "kimi-k2-thinking-turbo"}
+        coding_plan_only = {"kimi-k2-thinking-turbo"}
        leaked = set(moonshot_models) & coding_plan_only
        assert not leaked, f"Moonshot list contains Coding Plan-only models: {leaked}"

@@ -908,7 +908,7 @@ class TestKimiMoonshotModelListIsolation:
    def test_coding_plan_list_contains_plan_specific_models(self):
        from hermes_cli.main import _PROVIDER_MODELS
        coding_models = _PROVIDER_MODELS["kimi-coding"]
-        assert "kimi-for-coding" in coding_models
+        assert "kimi-k2.5" in coding_models
        assert "kimi-k2-thinking-turbo" in coding_models


--- a/website/docs/integrations/providers.md
+++ b/website/docs/integrations/providers.md
@@ -142,7 +142,7 @@ hermes chat --provider zai --model glm-5
 # Requires: GLM_API_KEY in ~/.hermes/.env

 # Kimi / Moonshot AI
-hermes chat --provider kimi-coding --model kimi-for-coding
+hermes chat --provider kimi-coding --model kimi-k2.5
 # Requires: KIMI_API_KEY in ~/.hermes/.env

 # MiniMax (global endpoint)