#!/usr/bin/env bash
#
# benchmark_local_model.sh
#
# 5-test benchmark suite for evaluating local Ollama models as Timmy's agent brain.
# Based on the model selection study for M3 Max 36 GB (Issue #1063).
#
# Usage:
#   ./scripts/benchmark_local_model.sh                     # test $OLLAMA_MODEL or qwen3:14b
#   ./scripts/benchmark_local_model.sh qwen3:8b            # test a specific model
#   ./scripts/benchmark_local_model.sh qwen3:14b qwen3:8b  # compare two models
#
# Thresholds (pass/fail):
#   Test 1 — Tool call compliance: >=90% valid JSON responses out of 5 probes
#   Test 2 — Code generation: compiles without syntax errors
#   Test 3 — Shell command gen: no refusal markers in output
#   Test 4 — Multi-turn coherence: session ID echoed back correctly
#   Test 5 — Issue triage quality: structured JSON with required fields
#
# Exit codes: 0 = all tests passed, 1 = one or more tests failed

set -euo pipefail

OLLAMA_URL="${OLLAMA_URL:-http://localhost:11434}"
PASS=0
FAIL=0
TOTAL=0

# ── Colours ──────────────────────────────────────────────────────────────────
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
BOLD='\033[1m'
RESET='\033[0m'

# Counters use VAR=$((VAR+1)) instead of ((VAR++)): the post-increment form
# returns exit status 1 when the variable is 0, which would kill the script
# under `set -e` on the very first pass()/fail() call.
pass() { echo -e "  ${GREEN}✓ PASS${RESET} $1"; PASS=$((PASS + 1)); TOTAL=$((TOTAL + 1)); }
fail() { echo -e "  ${RED}✗ FAIL${RESET} $1"; FAIL=$((FAIL + 1)); TOTAL=$((TOTAL + 1)); }
info() { echo -e "  ${YELLOW}ℹ${RESET} $1"; }

# ── Helper: call Ollama generate API ─────────────────────────────────────────
# Arguments: $1 model name, $2 prompt text,
#            $3 (optional) raw JSON fragment merged into the payload,
#               e.g. '"format":"json"'
# Outputs:   the model's response text on stdout, or "" on any error.
ollama_generate() {
  local model="$1"
  local prompt="$2"
  local extra_opts="${3:-}"
  local payload
  # Build the payload with json.dumps so quotes, backslashes and newlines in
  # the prompt are escaped correctly (a printf+sed approach that only escapes
  # double quotes produces invalid JSON for many prompts). Values are passed
  # via the environment, never interpolated into the Python source.
  payload=$(MODEL="$model" PROMPT="$prompt" EXTRA="$extra_opts" python3 -c '
import json, os
d = {"model": os.environ["MODEL"], "prompt": os.environ["PROMPT"], "stream": False}
extra = os.environ.get("EXTRA", "")
if extra:
    d.update(json.loads("{" + extra + "}"))
print(json.dumps(d))
')
  curl -s --max-time 60 \
    -X POST "${OLLAMA_URL}/api/generate" \
    -H "Content-Type: application/json" \
    -d "$payload" \
    | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('response',''))" 2>/dev/null || echo ""
}

# ── Helper: call Ollama chat API with tool schema ────────────────────────────
# NOTE(review): this function arrived garbled in the source (the heredoc body
# and curl call were lost). Reconstructed from its call site — Test 1 expects
# it to print the model's tool_calls (or message content) as JSON on stdout.
# Confirm against the original revision.
# Arguments: $1 model name, $2 user message
ollama_chat_tool() {
  local model="$1"
  local user_msg="$2"
  local payload
  payload=$(MODEL="$model" MSG="$user_msg" python3 -c '
import json, os
print(json.dumps({
    "model": os.environ["MODEL"],
    "messages": [{"role": "user", "content": os.environ["MSG"]}],
    "stream": False,
    "tools": [{
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }],
}))
')
  curl -s --max-time 60 \
    -X POST "${OLLAMA_URL}/api/chat" \
    -H "Content-Type: application/json" \
    -d "$payload" \
    | python3 -c "
import sys, json
d = json.load(sys.stdin)
msg = d.get('message', {})
calls = msg.get('tool_calls')
print(json.dumps(calls) if calls else msg.get('content', ''))
" 2>/dev/null || echo ""
}

# ── Benchmark a single model ─────────────────────────────────────────────────
# Runs all 5 tests against $1, updating the global PASS/FAIL/TOTAL counters.
# Skips (return 0, counters untouched) if the model is not pulled locally.
benchmark_model() {
  local model="$1"

  echo ""
  echo -e "${BOLD}═══════════════════════════════════════════════════${RESET}"
  echo -e "${BOLD}  Model: ${model}${RESET}"
  echo -e "${BOLD}═══════════════════════════════════════════════════${RESET}"

  # Check model availability. The model name is passed via argv — never
  # interpolated into the Python source, where odd characters could break it.
  local available
  available=$(curl -s "${OLLAMA_URL}/api/tags" \
    | python3 -c "
import sys, json
d = json.load(sys.stdin)
models = [m.get('name','') for m in d.get('models',[])]
target = sys.argv[1]
match = any(target == m or target == m.split(':')[0] or m.startswith(target) for m in models)
print('yes' if match else 'no')
" "$model" 2>/dev/null || echo "no")

  if [[ "$available" != "yes" ]]; then
    echo -e "  ${YELLOW}⚠ SKIP${RESET} Model '$model' not available locally — pull it first:"
    echo "         ollama pull $model"
    return 0
  fi

  # ── Test 1: Tool Call Compliance ───────────────────────────────────────────
  echo ""
  echo -e "  ${BOLD}Test 1: Tool Call Compliance${RESET} (target ≥90% valid JSON)"
  local tool_pass=0
  local tool_probes=5
  local i response
  for ((i = 1; i <= tool_probes; i++)); do
    response=$(ollama_chat_tool "$model" \
      "What is the weather in Tokyo right now?")
    # Valid if response is non-empty JSON (tool_calls array or JSON object)
    if echo "$response" | python3 -c "import sys,json; json.load(sys.stdin)" 2>/dev/null; then
      tool_pass=$((tool_pass + 1))
    fi
  done
  local tool_pct=$((tool_pass * 100 / tool_probes))
  info "Tool call valid JSON: $tool_pass/$tool_probes ($tool_pct%)"
  if [[ $tool_pct -ge 90 ]]; then
    pass "Tool call compliance ≥90% ($tool_pct%)"
  else
    fail "Tool call compliance <90% ($tool_pct%) — unreliable for agent loops"
  fi

  # ── Test 2: Code Generation ────────────────────────────────────────────────
  echo ""
  echo -e "  ${BOLD}Test 2: Code Generation${RESET} (must produce valid Python)"
  local code_response
  code_response=$(ollama_generate "$model" \
    "Write a Python function called fibonacci(n) that returns the nth Fibonacci number using recursion. Return ONLY the function definition, no explanation.")

  # Extract the fenced ```python block if present; fall back to the raw text.
  local code
  code=$(echo "$code_response" | sed -n '/^```python/,/^```/p' | grep -v '```' || echo "$code_response")
  if [[ -z "$code" ]]; then
    code="$code_response"
  fi

  # Validate syntax with ast.parse (checks syntax only, never executes it).
  if echo "$code" | python3 -c "
import sys, ast
src = sys.stdin.read()
try:
    ast.parse(src)
    sys.exit(0)
except SyntaxError:
    sys.exit(1)
" 2>/dev/null; then
    pass "Code generation produces valid Python syntax"
  else
    fail "Code generation produced invalid Python syntax"
    info "Response: $(echo "$code_response" | head -5)"
  fi

  # ── Test 3: Shell Command Generation (No Refusal) ──────────────────────────
  echo ""
  echo -e "  ${BOLD}Test 3: Shell Command Generation${RESET} (must not refuse)"
  local shell_response
  shell_response=$(ollama_generate "$model" \
    "Give me a bash one-liner to find all .log files modified in the last 7 days under /var/log and delete them. Output only the command, no explanation.")

  # Check for refusal markers
  local refusal_found=false
  local marker
  for marker in "cannot" "can't" "I'm sorry" "I am sorry" "won't" "will not" "inappropriate" "harmful" "dangerous" "refuse"; do
    if echo "$shell_response" | grep -qi "$marker"; then
      refusal_found=true
      info "Refusal marker found: '$marker'"
      break
    fi
  done

  # Also check that the response contains something shell-like
  if echo "$shell_response" | grep -qE 'find|rm|delete|xargs'; then
    if [[ "$refusal_found" == "true" ]]; then
      fail "Shell command generated but with refusal language"
    else
      pass "Shell command generated without refusal"
    fi
  else
    if [[ "$refusal_found" == "true" ]]; then
      fail "Shell command refused — model will block agent shell operations"
    else
      fail "Shell command not generated (no find/rm/delete/xargs in output)"
      info "Response: $(echo "$shell_response" | head -3)"
    fi
  fi

  # ── Test 4: Multi-Turn Agent Loop Coherence ────────────────────────────────
  echo ""
  echo -e "  ${BOLD}Test 4: Multi-Turn Agent Loop Coherence${RESET}"
  local session_id
  session_id="SESS-$(date +%s)"
  local turn1_response turn2_response
  # Turn 1 plants the session ID; its reply is intentionally unused — only
  # turn 2 is scored, on whether the model can echo the ID back.
  turn1_response=$(ollama_generate "$model" \
    "You are starting a multi-step task. Your session ID is $session_id. Acknowledge this ID and ask for the first task.")
  turn2_response=$(ollama_generate "$model" \
    "Continuing session $session_id. Previous context: you acknowledged the session. Now summarize what session ID you are working in. Include the exact ID.")

  if echo "$turn2_response" | grep -q "$session_id"; then
    pass "Multi-turn coherence: session ID echoed back correctly"
  else
    fail "Multi-turn coherence: session ID not found in follow-up response"
    info "Expected: $session_id"
    info "Response snippet: $(echo "$turn2_response" | head -3)"
  fi

  # ── Test 5: Issue Triage Quality ───────────────────────────────────────────
  echo ""
  echo -e "  ${BOLD}Test 5: Issue Triage Quality${RESET} (must return structured JSON)"
  local triage_response
  triage_response=$(ollama_generate "$model" \
    'Triage this bug report and respond ONLY with a JSON object with fields: priority (low/medium/high/critical), component (string), estimated_effort (hours as integer), needs_reproduction (boolean). Bug: "The dashboard crashes with a 500 error when submitting an empty chat message. Reproducible 100% of the time on the /chat endpoint."')

  # Extract the first {...} from the reply (models often wrap it in markdown),
  # then require all four fields and a priority from the allowed set.
  if echo "$triage_response" | python3 -c "
import sys, json, re
text = sys.stdin.read()
match = re.search(r'\{[^{}]+\}', text, re.DOTALL)
if not match:
    sys.exit(1)
try:
    d = json.loads(match.group())
    required = {'priority', 'component', 'estimated_effort', 'needs_reproduction'}
    if required.issubset(d.keys()):
        if d['priority'] in ('low', 'medium', 'high', 'critical'):
            sys.exit(0)
    sys.exit(1)
except Exception:
    sys.exit(1)
" 2>/dev/null; then
    pass "Issue triage returned valid structured JSON with all required fields"
  else
    fail "Issue triage did not return valid structured JSON"
    info "Response: $(echo "$triage_response" | head -5)"
  fi
}

# ── Summary ──────────────────────────────────────────────────────────────────
# Arguments: $1 model name, $2 tests passed, $3 tests run
print_summary() {
  local model="$1"
  local model_pass="$2"
  local model_total="$3"
  echo ""
  # Guard: a skipped (unavailable) model runs zero tests; dividing by zero
  # here would abort the whole run under `set -e`.
  if (( model_total == 0 )); then
    echo -e "  ${YELLOW}${BOLD}RESULT: no tests run for $model (model unavailable?)${RESET}"
    return 0
  fi
  local pct=$((model_pass * 100 / model_total))
  if [[ $model_pass -eq $model_total ]]; then
    echo -e "  ${GREEN}${BOLD}RESULT: $model_pass/$model_total tests passed ($pct%) — READY FOR AGENT USE${RESET}"
  elif [[ $pct -ge 60 ]]; then
    echo -e "  ${YELLOW}${BOLD}RESULT: $model_pass/$model_total tests passed ($pct%) — MARGINAL${RESET}"
  else
    echo -e "  ${RED}${BOLD}RESULT: $model_pass/$model_total tests passed ($pct%) — NOT RECOMMENDED${RESET}"
  fi
}

# ── Main ─────────────────────────────────────────────────────────────────────
main() {
  local -a models
  if (( $# > 0 )); then
    models=("$@")
  else
    models=("${OLLAMA_MODEL:-qwen3:14b}")
  fi

  # Accumulate failures across ALL models: resetting FAIL per model means the
  # per-model counters stay correct, while overall_fail makes the exit code
  # reflect every model benchmarked, not just the last one.
  local overall_fail=0
  local model
  for model in "${models[@]}"; do
    PASS=0
    FAIL=0
    TOTAL=0
    benchmark_model "$model"
    print_summary "$model" "$PASS" "$TOTAL"
    if (( FAIL > 0 )); then
      overall_fail=1
    fi
  done

  echo ""
  exit "$overall_fail"
}

main "$@"