[claude] Qwen3 two-model strategy: 14B primary + 8B fast router (#1063) (#1143)

2026-03-23 18:35:57 +00:00
parent 128aa4427f
commit ed63877f75
4 changed files with 409 additions and 11 deletions

@@ -30,25 +30,36 @@ class Settings(BaseSettings):
         return normalize_ollama_url(self.ollama_url)
     # LLM model passed to Agno/Ollama — override with OLLAMA_MODEL
-    # qwen3:30b is the primary model — better reasoning and tool calling
-    # than llama3.1:8b-instruct while still running locally on modest hardware.
-    # Fallback: llama3.1:8b-instruct if qwen3:30b not available.
-    # llama3.2 (3B) hallucinated tool output consistently in testing.
-    ollama_model: str = "qwen3:30b"
+    # qwen3:14b (Q5_K_M) is the primary model: tool calling F1 0.971, ~17.5 GB
+    # at 32K context — optimal for M3 Max 36 GB (Issue #1063).
+    # qwen3:30b exceeded memory budget at 32K+ context on 36 GB hardware.
+    ollama_model: str = "qwen3:14b"
+    # Fast routing model — override with OLLAMA_FAST_MODEL
+    # qwen3:8b (Q6_K): tool calling F1 0.933 at ~45-55 tok/s (2x speed of 14B).
+    # Use for routine tasks: simple tool calls, file reads, status checks.
+    # Combined memory with qwen3:14b: ~17 GB — both can stay loaded simultaneously.
+    ollama_fast_model: str = "qwen3:8b"
+    # Maximum concurrently loaded Ollama models — override with OLLAMA_MAX_LOADED_MODELS
+    # Set to 2 to keep qwen3:8b (fast) + qwen3:14b (primary) both hot.
+    # Requires setting OLLAMA_MAX_LOADED_MODELS=2 in the Ollama server environment.
+    ollama_max_loaded_models: int = 2
     # Context window size for Ollama inference — override with OLLAMA_NUM_CTX
-    # qwen3:30b with default context eats 45GB on a 39GB Mac.
-    # 4096 keeps memory at ~19GB. Set to 0 to use model defaults.
-    ollama_num_ctx: int = 4096
+    # qwen3:14b at 32K: ~17.5 GB total (weights + KV cache) on M3 Max 36 GB.
+    # Set to 0 to use model defaults.
+    ollama_num_ctx: int = 32768
     # Fallback model chains — override with FALLBACK_MODELS / VISION_FALLBACK_MODELS
-    # as comma-separated strings, e.g. FALLBACK_MODELS="qwen3:30b,llama3.1"
+    # as comma-separated strings, e.g. FALLBACK_MODELS="qwen3:8b,qwen2.5:14b"
     # Or edit config/providers.yaml → fallback_chains for the canonical source.
     fallback_models: list[str] = [
-        "llama3.1:8b-instruct",
-        "llama3.1",
+        "qwen3:8b",
+        "qwen2.5:14b",
+        "qwen2.5:7b",
+        "llama3.1:8b-instruct",
+        "llama3.1",
+        "llama3.2:3b",
     ]
     vision_fallback_models: list[str] = [
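
The diff configures two models but not the routing between them. As a minimal sketch of how a caller might use the two fields, the helper below routes an assumed task taxonomy: the `ROUTINE_TASKS` set, the `pick_model` helper, and the trimmed `Settings` stub are hypothetical, not code from this commit. Note that the client-side `ollama_max_loaded_models` field only records intent; actually keeping both models resident requires `OLLAMA_MAX_LOADED_MODELS=2` in the Ollama server's environment, as the comment in the diff says.

```python
# Hypothetical routing sketch for the fast/primary split; not from this commit.
from pydantic_settings import BaseSettings


class Settings(BaseSettings):
    # Trimmed stub of the Settings class in the diff above.
    ollama_model: str = "qwen3:14b"      # primary: multi-step reasoning, hard tool calls
    ollama_fast_model: str = "qwen3:8b"  # router: simple tool calls, file reads, status checks


# Task kinds treated as "routine"; an assumed taxonomy for illustration.
ROUTINE_TASKS = {"simple_tool_call", "file_read", "status_check"}


def pick_model(settings: Settings, task_kind: str) -> str:
    """Route routine work to qwen3:8b (2x faster); escalate the rest to qwen3:14b."""
    if task_kind in ROUTINE_TASKS:
        return settings.ollama_fast_model
    return settings.ollama_model


print(pick_model(Settings(), "file_read"))      # qwen3:8b
print(pick_model(Settings(), "plan_refactor"))  # qwen3:14b
```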
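The ~17.5 GB figure for qwen3:14b at 32K context can be sanity-checked with back-of-the-envelope arithmetic. The sketch below assumes Qwen3-14B's published shape (roughly 14.8B parameters, 40 layers, 8 grouped-query KV heads of dimension 128) and an fp16 KV cache; every constant is an estimate for illustration, not a spec taken from this repo.

```python
# Back-of-the-envelope memory estimate for qwen3:14b (Q5_K_M) at 32K context.
# Shape constants are assumptions based on published Qwen3-14B specs.
N_LAYERS = 40      # transformer layers
N_KV_HEADS = 8     # grouped-query key/value heads
HEAD_DIM = 128     # per-head dimension
CTX = 32_768       # matches ollama_num_ctx above
BYTES_FP16 = 2     # bytes per cached element

# K and V tensors: one entry per layer, KV head, head dim, and position.
kv_cache = 2 * N_LAYERS * N_KV_HEADS * HEAD_DIM * CTX * BYTES_FP16
# ~14.8B parameters at roughly 5.5 bits/weight for Q5_K_M quantization.
weights = 14.8e9 * 5.5 / 8

print(f"KV cache: {kv_cache / 2**30:.1f} GiB")  # ~5.0 GiB
print(f"Weights:  {weights / 2**30:.1f} GiB")   # ~9.5 GiB
# ~14.5 GiB before activations and runtime overhead, which lands in the
# same ballpark as the ~17.5 GB cited in the diff's comment.
```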
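`fallback_models` is ordered from most to least capable, and a consumer presumably walks the chain until it finds a model that is actually installed. A sketch of that walk, assuming a recent `ollama` Python client whose `ollama.list()` response exposes a `.models` list with `.model` name attributes; the `first_available` helper is illustrative, not this repo's implementation.

```python
# Hypothetical fallback walk; illustrative, not this repo's implementation.
import ollama


def first_available(chain: list[str]) -> str:
    """Return the first model in the chain that is installed locally."""
    installed = {m.model for m in ollama.list().models}
    for name in chain:
        # Untagged names are stored as ":latest" by Ollama, so check both forms.
        if name in installed or f"{name}:latest" in installed:
            return name
    raise RuntimeError(f"no fallback model installed, tried: {chain}")


# e.g. first_available(["qwen3:8b", "qwen2.5:14b", "qwen2.5:7b",
#                       "llama3.1:8b-instruct", "llama3.1", "llama3.2:3b"])
```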