diff --git a/Modelfile.qwen3-14b b/Modelfile.qwen3-14b
new file mode 100644
index 00000000..4177019d
--- /dev/null
+++ b/Modelfile.qwen3-14b
@@ -0,0 +1,51 @@
+# Modelfile.qwen3-14b
+#
+# Qwen3-14B Q5_K_M — Primary local agent model (Issue #1063)
+#
+# Tool calling F1: 0.971 — GPT-4-class structured output reliability.
+# Hybrid thinking/non-thinking mode: toggle per-request via /think or /no_think
+# in the prompt for planning vs rapid execution.
+#
+# Build:
+# ollama pull qwen3:14b # downloads Q4_K_M (~8.2 GB) by default
+# # For Q5_K_M (~10.5 GB, recommended):
+# # ollama pull bartowski/Qwen3-14B-GGUF:Q5_K_M
+# ollama create qwen3-14b -f Modelfile.qwen3-14b
+#
+# Memory budget: ~10.5 GB weights + ~7 GB KV cache = ~17.5 GB total at 32K ctx
+# Headroom on M3 Max 36 GB: ~10.5 GB free (enough to run qwen3:8b simultaneously)
+# Generation: ~20-28 tok/s (Ollama) / ~28-38 tok/s (MLX)
+# Context: 32K native, extensible to 131K with YaRN
+#
+# Two-model strategy: set OLLAMA_MAX_LOADED_MODELS=2 so qwen3:8b stays
+# hot for fast routing while qwen3:14b handles complex tasks.
+
+FROM qwen3:14b
+
+# 32K context — optimal balance of quality and memory on M3 Max 36 GB.
+# At 32K, total memory (weights + KV cache) is ~17.5 GB — well within budget.
+# Extend to 131K with YaRN if needed: PARAMETER rope_scaling_type yarn
+PARAMETER num_ctx 32768
+
+# Tool-calling temperature — lower = more reliable structured JSON output.
+# Raise to 0.7+ for creative/narrative tasks.
+PARAMETER temperature 0.3
+
+# Nucleus sampling
+PARAMETER top_p 0.9
+
+# Repeat penalty — prevents looping in structured output
+PARAMETER repeat_penalty 1.05
+
+SYSTEM """You are Timmy, Alexander's personal sovereign AI agent.
+
+You are concise, direct, and helpful. You complete tasks efficiently and report results clearly. You do not add unnecessary caveats or disclaimers.
+
+You have access to tool calling. When you need to use a tool, output a valid JSON function call:
+<tool_call>
+{"name": "function_name", "arguments": {"param": "value"}}
+</tool_call>
+
+You support hybrid reasoning. For complex planning, include <think>...</think> before your answer. For rapid execution (simple tool calls, status checks), skip the think block.
+
+You always start your responses with "Timmy here:" when acting as an agent."""
diff --git a/Modelfile.qwen3-8b b/Modelfile.qwen3-8b
new file mode 100644
index 00000000..8e75dd27
--- /dev/null
+++ b/Modelfile.qwen3-8b
@@ -0,0 +1,43 @@
+# Modelfile.qwen3-8b
+#
+# Qwen3-8B Q6_K — Fast routing model for routine agent tasks (Issue #1063)
+#
+# Tool calling F1: 0.933 at ~45-55 tok/s — 2x speed of Qwen3-14B.
+# Use for: simple tool calls, shell commands, file reads, status checks, JSON ops.
+# Route complex tasks (issue triage, multi-step planning, code review) to qwen3:14b.
+#
+# Build:
+# ollama pull qwen3:8b
+# ollama create qwen3-8b -f Modelfile.qwen3-8b
+#
+# Memory budget: ~6.6 GB weights + ~5 GB KV cache = ~11.6 GB at 32K ctx
+# Two-model strategy: ~17 GB combined weights (10.5 + 6.6); ~29 GB with both 32K KV caches (17.5 + 11.6) on M3 Max 36 GB.
+# Set OLLAMA_MAX_LOADED_MODELS=2 in the Ollama environment.
+#
+# Generation: ~35-45 tok/s (Ollama) / ~45-60 tok/s (MLX)
+
+FROM qwen3:8b
+
+# 32K context
+PARAMETER num_ctx 32768
+
+# Lower temperature for fast, deterministic tool execution
+PARAMETER temperature 0.2
+
+# Nucleus sampling
+PARAMETER top_p 0.9
+
+# Repeat penalty
+PARAMETER repeat_penalty 1.05
+
+SYSTEM """You are Timmy's fast-routing agent. You handle routine tasks quickly and precisely.
+
+For simple tasks (tool calls, shell commands, file reads, status checks, JSON ops): respond immediately without a think block.
+For anything requiring multi-step planning: defer to the primary agent.
+
+Tool call format:
+<tool_call>
+{"name": "function_name", "arguments": {"param": "value"}}
+</tool_call>
+
+Be brief. Be accurate. Execute."""
diff --git a/scripts/benchmark_local_model.sh b/scripts/benchmark_local_model.sh
new file mode 100755
index 00000000..27159c60
--- /dev/null
+++ b/scripts/benchmark_local_model.sh
@@ -0,0 +1,293 @@
+#!/usr/bin/env bash
+# benchmark_local_model.sh
+#
+# 5-test benchmark suite for evaluating local Ollama models as Timmy's agent brain.
+# Based on the model selection study for M3 Max 36 GB (Issue #1063).
+#
+# Usage:
+# ./scripts/benchmark_local_model.sh # test $OLLAMA_MODEL or qwen3:14b
+# ./scripts/benchmark_local_model.sh qwen3:8b # test a specific model
+# ./scripts/benchmark_local_model.sh qwen3:14b qwen3:8b # compare two models
+#
+# Thresholds (pass/fail):
+# Test 1 — Tool call compliance: >=90% valid JSON responses out of 5 probes
+# Test 2 — Code generation: compiles without syntax errors
+# Test 3 — Shell command gen: no refusal markers in output
+# Test 4 — Multi-turn coherence: session ID echoed back correctly
+# Test 5 — Issue triage quality: structured JSON with required fields
+#
+# Exit codes: 0 = all tests passed, 1 = one or more tests failed
+
+set -euo pipefail
+
+OLLAMA_URL="${OLLAMA_URL:-http://localhost:11434}"
+PASS=0
+FAIL=0
+TOTAL=0
+
+# ── Colours ──────────────────────────────────────────────────────────────────
+GREEN='\033[0;32m'
+RED='\033[0;31m'
+YELLOW='\033[1;33m'
+BOLD='\033[1m'
+RESET='\033[0m'
+
+pass() { echo -e " ${GREEN}✓ PASS${RESET} $1"; ((PASS++)); ((TOTAL++)); }
+fail() { echo -e " ${RED}✗ FAIL${RESET} $1"; ((FAIL++)); ((TOTAL++)); }
+info() { echo -e " ${YELLOW}ℹ${RESET} $1"; }
+
+# ── Helper: call Ollama generate API ─────────────────────────────────────────
+ollama_generate() {
+ local model="$1"
+ local prompt="$2"
+ local extra_opts="${3:-}"
+
+ local payload
+ payload=$(printf '{"model":"%s","prompt":"%s","stream":false%s}' \
+ "$model" \
+ "$(echo "$prompt" | sed 's/"/\\"/g' | tr -d '\n')" \
+ "${extra_opts:+,$extra_opts}")
+
+ curl -s --max-time 60 \
+ -X POST "${OLLAMA_URL}/api/generate" \
+ -H "Content-Type: application/json" \
+ -d "$payload" \
+ | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('response',''))" 2>/dev/null || echo ""
+}
+
+# ── Helper: call Ollama chat API with tool schema ─────────────────────────────
+ollama_chat_tool() {
+ local model="$1"
+ local user_msg="$2"
+
+ local payload
+ payload=$(cat </dev/null || echo ""
+}
+
+# ── Benchmark a single model ──────────────────────────────────────────────────
+benchmark_model() {
+ local model="$1"
+ echo ""
+ echo -e "${BOLD}═══════════════════════════════════════════════════${RESET}"
+ echo -e "${BOLD} Model: ${model}${RESET}"
+ echo -e "${BOLD}═══════════════════════════════════════════════════${RESET}"
+
+ # Check model availability
+ local available
+ available=$(curl -s "${OLLAMA_URL}/api/tags" \
+ | python3 -c "
+import sys, json
+d = json.load(sys.stdin)
+models = [m.get('name','') for m in d.get('models',[])]
+target = '$model'
+match = any(target == m or target == m.split(':')[0] or m.startswith(target) for m in models)
+print('yes' if match else 'no')
+" 2>/dev/null || echo "no")
+
+ if [[ "$available" != "yes" ]]; then
+ echo -e " ${YELLOW}⚠ SKIP${RESET} Model '$model' not available locally — pull it first:"
+ echo " ollama pull $model"
+ return 0
+ fi
+
+ # ── Test 1: Tool Call Compliance ─────────────────────────────────────────
+ echo ""
+ echo -e " ${BOLD}Test 1: Tool Call Compliance${RESET} (target ≥90% valid JSON)"
+ local tool_pass=0
+ local tool_probes=5
+ for i in $(seq 1 $tool_probes); do
+ local response
+ response=$(ollama_chat_tool "$model" \
+ "What is the weather in Tokyo right now?")
+ # Valid if response is non-empty JSON (tool_calls array or JSON object)
+ if echo "$response" | python3 -c "import sys,json; json.load(sys.stdin)" 2>/dev/null; then
+ ((tool_pass++))
+ fi
+ done
+ local tool_pct=$(( tool_pass * 100 / tool_probes ))
+ info "Tool call valid JSON: $tool_pass/$tool_probes ($tool_pct%)"
+ if [[ $tool_pct -ge 90 ]]; then
+ pass "Tool call compliance ≥90% ($tool_pct%)"
+ else
+ fail "Tool call compliance <90% ($tool_pct%) — unreliable for agent loops"
+ fi
+
+ # ── Test 2: Code Generation ──────────────────────────────────────────────
+ echo ""
+ echo -e " ${BOLD}Test 2: Code Generation${RESET} (must produce valid Python)"
+ local code_response
+ code_response=$(ollama_generate "$model" \
+ "Write a Python function called fibonacci(n) that returns the nth Fibonacci number using recursion. Return ONLY the function definition, no explanation.")
+ # Extract Python code block if present
+ local code
+ code=$(echo "$code_response" | sed -n '/^```python/,/^```/p' | grep -v '```' || echo "$code_response")
+ if [[ -z "$code" ]]; then
+ code="$code_response"
+ fi
+ # Validate syntax
+ if echo "$code" | python3 -c "
+import sys, ast
+src = sys.stdin.read()
+try:
+ ast.parse(src)
+ sys.exit(0)
+except SyntaxError as e:
+ sys.exit(1)
+" 2>/dev/null; then
+ pass "Code generation produces valid Python syntax"
+ else
+ fail "Code generation produced invalid Python syntax"
+ info "Response: $(echo "$code_response" | head -5)"
+ fi
+
+ # ── Test 3: Shell Command Generation (No Refusal) ────────────────────────
+ echo ""
+ echo -e " ${BOLD}Test 3: Shell Command Generation${RESET} (must not refuse)"
+ local shell_response
+ shell_response=$(ollama_generate "$model" \
+ "Give me a bash one-liner to find all .log files modified in the last 7 days under /var/log and delete them. Output only the command, no explanation.")
+ # Check for refusal markers
+ local refusal_found=false
+ for marker in "cannot" "can't" "I'm sorry" "I am sorry" "won't" "will not" "inappropriate" "harmful" "dangerous" "refuse"; do
+ if echo "$shell_response" | grep -qi "$marker"; then
+ refusal_found=true
+ info "Refusal marker found: '$marker'"
+ break
+ fi
+ done
+ # Also check that the response contains something shell-like
+ if echo "$shell_response" | grep -qE 'find|rm|delete|xargs'; then
+ if [[ "$refusal_found" == "true" ]]; then
+ fail "Shell command generated but with refusal language"
+ else
+ pass "Shell command generated without refusal"
+ fi
+ else
+ if [[ "$refusal_found" == "true" ]]; then
+ fail "Shell command refused — model will block agent shell operations"
+ else
+ fail "Shell command not generated (no find/rm/delete/xargs in output)"
+ info "Response: $(echo "$shell_response" | head -3)"
+ fi
+ fi
+
+ # ── Test 4: Multi-Turn Agent Loop Coherence ──────────────────────────────
+ echo ""
+ echo -e " ${BOLD}Test 4: Multi-Turn Agent Loop Coherence${RESET}"
+ local session_id="SESS-$(date +%s)"
+ local turn1_response
+ turn1_response=$(ollama_generate "$model" \
+ "You are starting a multi-step task. Your session ID is $session_id. Acknowledge this ID and ask for the first task.")
+ local turn2_response
+ turn2_response=$(ollama_generate "$model" \
+ "Continuing session $session_id. Previous context: you acknowledged the session. Now summarize what session ID you are working in. Include the exact ID.")
+ if echo "$turn2_response" | grep -q "$session_id"; then
+ pass "Multi-turn coherence: session ID echoed back correctly"
+ else
+ fail "Multi-turn coherence: session ID not found in follow-up response"
+ info "Expected: $session_id"
+ info "Response snippet: $(echo "$turn2_response" | head -3)"
+ fi
+
+ # ── Test 5: Issue Triage Quality ─────────────────────────────────────────
+ echo ""
+ echo -e " ${BOLD}Test 5: Issue Triage Quality${RESET} (must return structured JSON)"
+ local triage_response
+ triage_response=$(ollama_generate "$model" \
+ 'Triage this bug report and respond ONLY with a JSON object with fields: priority (low/medium/high/critical), component (string), estimated_effort (hours as integer), needs_reproduction (boolean). Bug: "The dashboard crashes with a 500 error when submitting an empty chat message. Reproducible 100% of the time on the /chat endpoint."')
+ local triage_valid=false
+ if echo "$triage_response" | python3 -c "
+import sys, json, re
+text = sys.stdin.read()
+# Try to extract JSON from response (may be wrapped in markdown)
+match = re.search(r'\{[^{}]+\}', text, re.DOTALL)
+if not match:
+ sys.exit(1)
+try:
+ d = json.loads(match.group())
+ required = {'priority', 'component', 'estimated_effort', 'needs_reproduction'}
+ if required.issubset(d.keys()):
+ valid_priority = d['priority'] in ('low','medium','high','critical')
+ if valid_priority:
+ sys.exit(0)
+sys.exit(1)
+except:
+ sys.exit(1)
+" 2>/dev/null; then
+ pass "Issue triage returned valid structured JSON with all required fields"
+ else
+ fail "Issue triage did not return valid structured JSON"
+ info "Response: $(echo "$triage_response" | head -5)"
+ fi
+}
+
+# ── Summary ───────────────────────────────────────────────────────────────────
+print_summary() {
+ local model="$1"
+ local model_pass="$2"
+ local model_total="$3"
+ echo ""
+ local pct=$(( model_pass * 100 / model_total ))
+ if [[ $model_pass -eq $model_total ]]; then
+ echo -e " ${GREEN}${BOLD}RESULT: $model_pass/$model_total tests passed ($pct%) — READY FOR AGENT USE${RESET}"
+ elif [[ $pct -ge 60 ]]; then
+ echo -e " ${YELLOW}${BOLD}RESULT: $model_pass/$model_total tests passed ($pct%) — MARGINAL${RESET}"
+ else
+ echo -e " ${RED}${BOLD}RESULT: $model_pass/$model_total tests passed ($pct%) — NOT RECOMMENDED${RESET}"
+ fi
+}
+
+# ── Main ─────────────────────────────────────────────────────────────────────
+# Models to benchmark: all command-line arguments, or $OLLAMA_MODEL, or qwen3:14b.
+models=("${@:-${OLLAMA_MODEL:-qwen3:14b}}")
+
+# Sticky failure flag across all models. PASS/FAIL/TOTAL are reset for every
+# model so each summary is per-model; without this flag the exit code would
+# reflect only the last model benchmarked.
+ANY_FAILED=0
+
+for model in "${models[@]}"; do
+    PASS=0
+    FAIL=0
+    TOTAL=0
+    benchmark_model "$model"
+    print_summary "$model" "$PASS" "$TOTAL"
+    if [[ $FAIL -gt 0 ]]; then
+        ANY_FAILED=1
+    fi
+done
+
+echo ""
+# Exit 0 only when no test failed for any model; 1 otherwise.
+exit "$ANY_FAILED"
diff --git a/src/config.py b/src/config.py
index 712e5750..d59241e3 100644
--- a/src/config.py
+++ b/src/config.py
@@ -30,25 +30,36 @@ class Settings(BaseSettings):
return normalize_ollama_url(self.ollama_url)
# LLM model passed to Agno/Ollama — override with OLLAMA_MODEL
- # qwen3:30b is the primary model — better reasoning and tool calling
- # than llama3.1:8b-instruct while still running locally on modest hardware.
- # Fallback: llama3.1:8b-instruct if qwen3:30b not available.
- # llama3.2 (3B) hallucinated tool output consistently in testing.
- ollama_model: str = "qwen3:30b"
+ # qwen3:14b (Q5_K_M) is the primary model: tool calling F1 0.971, ~17.5 GB
+ # at 32K context — optimal for M3 Max 36 GB (Issue #1063).
+ # qwen3:30b exceeded memory budget at 32K+ context on 36 GB hardware.
+ ollama_model: str = "qwen3:14b"
+
+ # Fast routing model — override with OLLAMA_FAST_MODEL
+ # qwen3:8b (Q6_K): tool calling F1 0.933 at ~45-55 tok/s (2x speed of 14B).
+ # Use for routine tasks: simple tool calls, file reads, status checks.
+ # Combined weights with qwen3:14b: ~17 GB (~29 GB with both 32K KV caches) — both can stay loaded simultaneously.
+ ollama_fast_model: str = "qwen3:8b"
+
+ # Maximum concurrently loaded Ollama models — override with OLLAMA_MAX_LOADED_MODELS
+ # Set to 2 to keep qwen3:8b (fast) + qwen3:14b (primary) both hot.
+ # Requires setting OLLAMA_MAX_LOADED_MODELS=2 in the Ollama server environment.
+ ollama_max_loaded_models: int = 2
# Context window size for Ollama inference — override with OLLAMA_NUM_CTX
- # qwen3:30b with default context eats 45GB on a 39GB Mac.
- # 4096 keeps memory at ~19GB. Set to 0 to use model defaults.
- ollama_num_ctx: int = 4096
+ # qwen3:14b at 32K: ~17.5 GB total (weights + KV cache) on M3 Max 36 GB.
+ # Set to 0 to use model defaults.
+ ollama_num_ctx: int = 32768
# Fallback model chains — override with FALLBACK_MODELS / VISION_FALLBACK_MODELS
- # as comma-separated strings, e.g. FALLBACK_MODELS="qwen3:30b,llama3.1"
+ # as comma-separated strings, e.g. FALLBACK_MODELS="qwen3:8b,qwen2.5:14b"
# Or edit config/providers.yaml → fallback_chains for the canonical source.
fallback_models: list[str] = [
- "llama3.1:8b-instruct",
- "llama3.1",
+ "qwen3:8b",
"qwen2.5:14b",
"qwen2.5:7b",
+ "llama3.1:8b-instruct",
+ "llama3.1",
"llama3.2:3b",
]
vision_fallback_models: list[str] = [