From 59ccb382aa0c9bd69a95f001a805565721f297a5 Mon Sep 17 00:00:00 2001 From: Alexander Whitestone Date: Mon, 23 Mar 2026 14:34:33 -0400 Subject: [PATCH] feat: implement Qwen3 two-model strategy from M3 Max model study MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Translates findings from the M3 Max 36 GB model selection study (Issue #1063) into production-ready config and tooling artifacts. Changes: - Modelfile.qwen3-14b: primary agent model (Q5_K_M, 32K ctx, temp 0.3) Tool calling F1 0.971, ~17.5 GB total — fits M3 Max with 10 GB headroom - Modelfile.qwen3-8b: fast routing model (Q6_K, 32K ctx, temp 0.2) Tool calling F1 0.933 at ~45-55 tok/s; ~11.6 GB total - scripts/benchmark_local_model.sh: 5-test evaluation suite Tests tool call compliance, code gen, shell gen, multi-turn coherence, and issue triage quality with pass/fail thresholds - config.py: update defaults for M3 Max 36 GB hardware - ollama_model: qwen3:30b → qwen3:14b (within memory budget at 32K ctx) - ollama_fast_model: new field, defaults to qwen3:8b - ollama_max_loaded_models: new field, defaults to 2 (both models hot) - ollama_num_ctx: 4096 → 32768 (qwen3:14b fits at 32K; 30b did not) - fallback_models: promote qwen3:8b as first fallback Refs #1063 --- Modelfile.qwen3-14b | 51 ++++++ Modelfile.qwen3-8b | 43 +++++ scripts/benchmark_local_model.sh | 293 +++++++++++++++++++++++++++++++ src/config.py | 33 ++-- 4 files changed, 409 insertions(+), 11 deletions(-) create mode 100644 Modelfile.qwen3-14b create mode 100644 Modelfile.qwen3-8b create mode 100755 scripts/benchmark_local_model.sh diff --git a/Modelfile.qwen3-14b b/Modelfile.qwen3-14b new file mode 100644 index 00000000..4177019d --- /dev/null +++ b/Modelfile.qwen3-14b @@ -0,0 +1,51 @@ +# Modelfile.qwen3-14b +# +# Qwen3-14B Q5_K_M — Primary local agent model (Issue #1063) +# +# Tool calling F1: 0.971 — GPT-4-class structured output reliability. 
+# Hybrid thinking/non-thinking mode: toggle per-request via /think or /no_think +# in the prompt for planning vs rapid execution. +# +# Build: +# ollama pull qwen3:14b # downloads Q4_K_M (~8.2 GB) by default +# # For Q5_K_M (~10.5 GB, recommended): +# # ollama pull bartowski/Qwen3-14B-GGUF:Q5_K_M +# ollama create qwen3-14b -f Modelfile.qwen3-14b +# +# Memory budget: ~10.5 GB weights + ~7 GB KV cache = ~17.5 GB total at 32K ctx +# Headroom on M3 Max 36 GB: ~10.5 GB free (enough to run qwen3:8b simultaneously) +# Generation: ~20-28 tok/s (Ollama) / ~28-38 tok/s (MLX) +# Context: 32K native, extensible to 131K with YaRN +# +# Two-model strategy: set OLLAMA_MAX_LOADED_MODELS=2 so qwen3:8b stays +# hot for fast routing while qwen3:14b handles complex tasks. + +FROM qwen3:14b + +# 32K context — optimal balance of quality and memory on M3 Max 36 GB. +# At 32K, total memory (weights + KV cache) is ~17.5 GB — well within budget. +# Extend to 131K with YaRN if needed: PARAMETER rope_scaling_type yarn +PARAMETER num_ctx 32768 + +# Tool-calling temperature — lower = more reliable structured JSON output. +# Raise to 0.7+ for creative/narrative tasks. +PARAMETER temperature 0.3 + +# Nucleus sampling +PARAMETER top_p 0.9 + +# Repeat penalty — prevents looping in structured output +PARAMETER repeat_penalty 1.05 + +SYSTEM """You are Timmy, Alexander's personal sovereign AI agent. + +You are concise, direct, and helpful. You complete tasks efficiently and report results clearly. You do not add unnecessary caveats or disclaimers. + +You have access to tool calling. When you need to use a tool, output a valid JSON function call: + +{"name": "function_name", "arguments": {"param": "value"}} + + +You support hybrid reasoning. For complex planning, include <think>...</think> before your answer. For rapid execution (simple tool calls, status checks), skip the think block. 
+ +You always start your responses with "Timmy here:" when acting as an agent.""" diff --git a/Modelfile.qwen3-8b b/Modelfile.qwen3-8b new file mode 100644 index 00000000..8e75dd27 --- /dev/null +++ b/Modelfile.qwen3-8b @@ -0,0 +1,43 @@ +# Modelfile.qwen3-8b +# +# Qwen3-8B Q6_K — Fast routing model for routine agent tasks (Issue #1063) +# +# Tool calling F1: 0.933 at ~45-55 tok/s — 2x speed of Qwen3-14B. +# Use for: simple tool calls, shell commands, file reads, status checks, JSON ops. +# Route complex tasks (issue triage, multi-step planning, code review) to qwen3:14b. +# +# Build: +# ollama pull qwen3:8b +# ollama create qwen3-8b -f Modelfile.qwen3-8b +# +# Memory budget: ~6.6 GB weights + ~5 GB KV cache = ~11.6 GB at 32K ctx +# Two-model strategy: ~17 GB combined weights (~29 GB with both 32K KV caches, both hot) — fits on M3 Max 36 GB. +# Set OLLAMA_MAX_LOADED_MODELS=2 in the Ollama environment. +# +# Generation: ~35-45 tok/s (Ollama) / ~45-60 tok/s (MLX) + +FROM qwen3:8b + +# 32K context +PARAMETER num_ctx 32768 + +# Lower temperature for fast, deterministic tool execution +PARAMETER temperature 0.2 + +# Nucleus sampling +PARAMETER top_p 0.9 + +# Repeat penalty +PARAMETER repeat_penalty 1.05 + +SYSTEM """You are Timmy's fast-routing agent. You handle routine tasks quickly and precisely. + +For simple tasks (tool calls, shell commands, file reads, status checks, JSON ops): respond immediately without a think block. +For anything requiring multi-step planning: defer to the primary agent. + +Tool call format: + +{"name": "function_name", "arguments": {"param": "value"}} + + +Be brief. Be accurate. Execute.""" diff --git a/scripts/benchmark_local_model.sh b/scripts/benchmark_local_model.sh new file mode 100755 index 00000000..27159c60 --- /dev/null +++ b/scripts/benchmark_local_model.sh @@ -0,0 +1,293 @@ +#!/usr/bin/env bash +# benchmark_local_model.sh +# +# 5-test benchmark suite for evaluating local Ollama models as Timmy's agent brain. 
+# Based on the model selection study for M3 Max 36 GB (Issue #1063). +# +# Usage: +# ./scripts/benchmark_local_model.sh # test $OLLAMA_MODEL or qwen3:14b +# ./scripts/benchmark_local_model.sh qwen3:8b # test a specific model +# ./scripts/benchmark_local_model.sh qwen3:14b qwen3:8b # compare two models +# +# Thresholds (pass/fail): +# Test 1 — Tool call compliance: >=90% valid JSON responses out of 5 probes +# Test 2 — Code generation: compiles without syntax errors +# Test 3 — Shell command gen: no refusal markers in output +# Test 4 — Multi-turn coherence: session ID echoed back correctly +# Test 5 — Issue triage quality: structured JSON with required fields +# +# Exit codes: 0 = all tests passed, 1 = one or more tests failed + +set -euo pipefail + +OLLAMA_URL="${OLLAMA_URL:-http://localhost:11434}" +PASS=0 +FAIL=0 +TOTAL=0 + +# ── Colours ────────────────────────────────────────────────────────────────── +GREEN='\033[0;32m' +RED='\033[0;31m' +YELLOW='\033[1;33m' +BOLD='\033[1m' +RESET='\033[0m' +# Result recorders. Counters use plain assignment: ((VAR++)) returns status 1 when VAR is 0, which aborts the script under set -e. +pass() { echo -e " ${GREEN}✓ PASS${RESET} $1"; PASS=$((PASS + 1)); TOTAL=$((TOTAL + 1)); } +fail() { echo -e " ${RED}✗ FAIL${RESET} $1"; FAIL=$((FAIL + 1)); TOTAL=$((TOTAL + 1)); } +info() { echo -e " ${YELLOW}ℹ${RESET} $1"; } + +# ── Helper: call Ollama generate API ───────────────────────────────────────── +ollama_generate() { + local model="$1" + local prompt="$2" + local extra_opts="${3:-}" + + local payload + payload=$(printf '{"model":"%s","prompt":"%s","stream":false%s}' \ + "$model" \ + "$(echo "$prompt" | sed 's/"/\\"/g' | tr -d '\n')" \ + "${extra_opts:+,$extra_opts}") + + curl -s --max-time 60 \ + -X POST "${OLLAMA_URL}/api/generate" \ + -H "Content-Type: application/json" \ + -d "$payload" \ + | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('response',''))" 2>/dev/null || echo "" +} + +# ── Helper: call Ollama chat API with tool schema ───────────────────────────── +ollama_chat_tool() { + local model="$1" + local user_msg="$2" + + local payload + payload=$(cat </dev/null 
|| echo "" +} + +# ── Benchmark a single model ────────────────────────────────────────────────── +benchmark_model() { + local model="$1" + echo "" + echo -e "${BOLD}═══════════════════════════════════════════════════${RESET}" + echo -e "${BOLD} Model: ${model}${RESET}" + echo -e "${BOLD}═══════════════════════════════════════════════════${RESET}" + + # Check model availability + local available + available=$(curl -s "${OLLAMA_URL}/api/tags" \ + | python3 -c " +import sys, json +d = json.load(sys.stdin) +models = [m.get('name','') for m in d.get('models',[])] +target = '$model' +match = any(target == m or target == m.split(':')[0] or m.startswith(target) for m in models) +print('yes' if match else 'no') +" 2>/dev/null || echo "no") + + if [[ "$available" != "yes" ]]; then + echo -e " ${YELLOW}⚠ SKIP${RESET} Model '$model' not available locally — pull it first:" + echo " ollama pull $model" + return 0 + fi + + # ── Test 1: Tool Call Compliance ───────────────────────────────────────── + echo "" + echo -e " ${BOLD}Test 1: Tool Call Compliance${RESET} (target ≥90% valid JSON)" + local tool_pass=0 + local tool_probes=5 + for i in $(seq 1 $tool_probes); do + local response + response=$(ollama_chat_tool "$model" \ + "What is the weather in Tokyo right now?") + # Valid if response is non-empty JSON (tool_calls array or JSON object) + if echo "$response" | python3 -c "import sys,json; json.load(sys.stdin)" 2>/dev/null; then + tool_pass=$((tool_pass + 1)) # not ((tool_pass++)): that returns status 1 at 0 and aborts under set -e + fi + done + local tool_pct=$(( tool_pass * 100 / tool_probes )) + info "Tool call valid JSON: $tool_pass/$tool_probes ($tool_pct%)" + if [[ $tool_pct -ge 90 ]]; then + pass "Tool call compliance ≥90% ($tool_pct%)" + else + fail "Tool call compliance <90% ($tool_pct%) — unreliable for agent loops" + fi + + # ── Test 2: Code Generation ────────────────────────────────────────────── + echo "" + echo -e " ${BOLD}Test 2: Code Generation${RESET} (must produce valid Python)" + local code_response + code_response=$(ollama_generate "$model" \ 
+ "Write a Python function called fibonacci(n) that returns the nth Fibonacci number using recursion. Return ONLY the function definition, no explanation.") + # Extract Python code block if present + local code + code=$(echo "$code_response" | sed -n '/^```python/,/^```/p' | grep -v '```' || echo "$code_response") + if [[ -z "$code" ]]; then + code="$code_response" + fi + # Validate syntax (an empty response parses cleanly, so it is rejected explicitly) + if echo "$code" | python3 -c " +import sys, ast +src = sys.stdin.read() +try: + ast.parse(src) +except SyntaxError: + sys.exit(1) +sys.exit(0 if src.strip() else 1) +" 2>/dev/null; then + pass "Code generation produces valid Python syntax" + else + fail "Code generation produced invalid Python syntax" + info "Response: $(echo "$code_response" | head -5)" + fi + + # ── Test 3: Shell Command Generation (No Refusal) ──────────────────────── + echo "" + echo -e " ${BOLD}Test 3: Shell Command Generation${RESET} (must not refuse)" + local shell_response + shell_response=$(ollama_generate "$model" \ + "Give me a bash one-liner to find all .log files modified in the last 7 days under /var/log and delete them. 
Output only the command, no explanation.") + # Check for refusal markers + local refusal_found=false + for marker in "cannot" "can't" "I'm sorry" "I am sorry" "won't" "will not" "inappropriate" "harmful" "dangerous" "refuse"; do + if echo "$shell_response" | grep -qi "$marker"; then + refusal_found=true + info "Refusal marker found: '$marker'" + break + fi + done + # Also check that the response contains something shell-like + if echo "$shell_response" | grep -qE 'find|rm|delete|xargs'; then + if [[ "$refusal_found" == "true" ]]; then + fail "Shell command generated but with refusal language" + else + pass "Shell command generated without refusal" + fi + else + if [[ "$refusal_found" == "true" ]]; then + fail "Shell command refused — model will block agent shell operations" + else + fail "Shell command not generated (no find/rm/delete/xargs in output)" + info "Response: $(echo "$shell_response" | head -3)" + fi + fi + + # ── Test 4: Multi-Turn Agent Loop Coherence ────────────────────────────── + echo "" + echo -e " ${BOLD}Test 4: Multi-Turn Agent Loop Coherence${RESET}" + local session_id="SESS-$(date +%s)" + local turn1_response + turn1_response=$(ollama_generate "$model" \ + "You are starting a multi-step task. Your session ID is $session_id. Acknowledge this ID and ask for the first task.") + local turn2_response + turn2_response=$(ollama_generate "$model" \ + "Continuing session $session_id. Previous context: you acknowledged the session. Now summarize what session ID you are working in. 
Include the exact ID.") + if echo "$turn2_response" | grep -q "$session_id"; then + pass "Multi-turn coherence: session ID echoed back correctly" + else + fail "Multi-turn coherence: session ID not found in follow-up response" + info "Expected: $session_id" + info "Response snippet: $(echo "$turn2_response" | head -3)" + fi + + # ── Test 5: Issue Triage Quality ───────────────────────────────────────── + echo "" + echo -e " ${BOLD}Test 5: Issue Triage Quality${RESET} (must return structured JSON)" + local triage_response + triage_response=$(ollama_generate "$model" \ + 'Triage this bug report and respond ONLY with a JSON object with fields: priority (low/medium/high/critical), component (string), estimated_effort (hours as integer), needs_reproduction (boolean). Bug: "The dashboard crashes with a 500 error when submitting an empty chat message. Reproducible 100% of the time on the /chat endpoint."') + # The exit status of the inline Python below decides pass/fail + if echo "$triage_response" | python3 -c " +import sys, json, re +text = sys.stdin.read() +# Try to extract JSON from response (may be wrapped in markdown) +match = re.search(r'\{[^{}]+\}', text, re.DOTALL) +if not match: + sys.exit(1) +try: + d = json.loads(match.group()) +except ValueError: + sys.exit(1) +# Decide outside the try block so the success exit is never swallowed by a handler +required = {'priority', 'component', 'estimated_effort', 'needs_reproduction'} +if not required.issubset(d): + sys.exit(1) +valid_priority = d['priority'] in ('low','medium','high','critical') +sys.exit(0 if valid_priority else 1) +" 2>/dev/null; then + pass "Issue triage returned valid structured JSON with all required fields" + else + fail "Issue triage did not return valid structured JSON" + info "Response: $(echo "$triage_response" | head -5)" + fi +} + +# ── Summary ─────────────────────────────────────────────────────────────────── +print_summary() { + local model="$1" + local model_pass="$2" + local model_total="$3" + echo ""; [[ $model_total -gt 0 ]] || { echo -e " ${YELLOW}${BOLD}RESULT: no tests ran — model skipped${RESET}"; return 0; } # a skipped model has total 0; avoid dividing by zero + local pct=$(( model_pass * 100 / model_total )) + if [[ $model_pass -eq $model_total ]]; then + echo -e " 
${GREEN}${BOLD}RESULT: $model_pass/$model_total tests passed ($pct%) — READY FOR AGENT USE${RESET}" + elif [[ $pct -ge 60 ]]; then + echo -e " ${YELLOW}${BOLD}RESULT: $model_pass/$model_total tests passed ($pct%) — MARGINAL${RESET}" + else + echo -e " ${RED}${BOLD}RESULT: $model_pass/$model_total tests passed ($pct%) — NOT RECOMMENDED${RESET}" + fi +} + +# ── Main ───────────────────────────────────────────────────────────────────── +models=("${@:-${OLLAMA_MODEL:-qwen3:14b}}") + +for model in "${models[@]}"; do + PASS=0 + # FAIL is deliberately not reset per model: it accumulates so the exit code covers every model + TOTAL=0 + benchmark_model "$model" + print_summary "$model" "$PASS" "$TOTAL" +done + +echo "" +if [[ $FAIL -eq 0 ]]; then + exit 0 +else + exit 1 +fi diff --git a/src/config.py b/src/config.py index 712e5750..d59241e3 100644 --- a/src/config.py +++ b/src/config.py @@ -30,25 +30,36 @@ class Settings(BaseSettings): return normalize_ollama_url(self.ollama_url) # LLM model passed to Agno/Ollama — override with OLLAMA_MODEL - # qwen3:30b is the primary model — better reasoning and tool calling - # than llama3.1:8b-instruct while still running locally on modest hardware. - # Fallback: llama3.1:8b-instruct if qwen3:30b not available. - # llama3.2 (3B) hallucinated tool output consistently in testing. - ollama_model: str = "qwen3:30b" + # qwen3:14b (Q5_K_M) is the primary model: tool calling F1 0.971, ~17.5 GB + # at 32K context — optimal for M3 Max 36 GB (Issue #1063). + # qwen3:30b exceeded memory budget at 32K+ context on 36 GB hardware. + ollama_model: str = "qwen3:14b" + + # Fast routing model — override with OLLAMA_FAST_MODEL + # qwen3:8b (Q6_K): tool calling F1 0.933 at ~45-55 tok/s (2x speed of 14B). + # Use for routine tasks: simple tool calls, file reads, status checks. + # Combined weights with qwen3:14b: ~17 GB (~29 GB with both 32K KV caches) — both can stay loaded simultaneously. 
+ ollama_fast_model: str = "qwen3:8b" + + # Maximum concurrently loaded Ollama models — override with OLLAMA_MAX_LOADED_MODELS + # Set to 2 to keep qwen3:8b (fast) + qwen3:14b (primary) both hot. + # Requires setting OLLAMA_MAX_LOADED_MODELS=2 in the Ollama server environment. + ollama_max_loaded_models: int = 2 # Context window size for Ollama inference — override with OLLAMA_NUM_CTX - # qwen3:30b with default context eats 45GB on a 39GB Mac. - # 4096 keeps memory at ~19GB. Set to 0 to use model defaults. - ollama_num_ctx: int = 4096 + # qwen3:14b at 32K: ~17.5 GB total (weights + KV cache) on M3 Max 36 GB. + # Set to 0 to use model defaults. + ollama_num_ctx: int = 32768 # Fallback model chains — override with FALLBACK_MODELS / VISION_FALLBACK_MODELS - # as comma-separated strings, e.g. FALLBACK_MODELS="qwen3:30b,llama3.1" + # as comma-separated strings, e.g. FALLBACK_MODELS="qwen3:8b,qwen2.5:14b" # Or edit config/providers.yaml → fallback_chains for the canonical source. fallback_models: list[str] = [ - "llama3.1:8b-instruct", - "llama3.1", + "qwen3:8b", "qwen2.5:14b", "qwen2.5:7b", + "llama3.1:8b-instruct", + "llama3.1", "llama3.2:3b", ] vision_fallback_models: list[str] = [ -- 2.43.0