#!/usr/bin/env bash
#
# benchmark_local_model.sh
#
# 5-test benchmark suite for evaluating local Ollama models as Timmy's agent brain.
# Based on the model selection study for M3 Max 36 GB (Issue #1063).
#
# Usage:
#   ./scripts/benchmark_local_model.sh                      # test $OLLAMA_MODEL or qwen3:14b
#   ./scripts/benchmark_local_model.sh qwen3:8b             # test a specific model
#   ./scripts/benchmark_local_model.sh qwen3:14b qwen3:8b   # compare two models
#
# Thresholds (pass/fail):
#   Test 1 — Tool call compliance: >=90% valid JSON responses out of 5 probes
#   Test 2 — Code generation: compiles without syntax errors
#   Test 3 — Shell command gen: no refusal markers in output
#   Test 4 — Multi-turn coherence: session ID echoed back correctly
#   Test 5 — Issue triage quality: structured JSON with required fields
#
# Exit codes: 0 = all tests passed, 1 = one or more tests failed
set -euo pipefail
|
|||
|
|
|
|||
|
|
OLLAMA_URL="${OLLAMA_URL:-http://localhost:11434}"
|
|||
|
|
PASS=0
|
|||
|
|
FAIL=0
|
|||
|
|
TOTAL=0
|
|||
|
|
|
|||
|
|
# ── Colours ──────────────────────────────────────────────────────────────────
|
|||
|
|
GREEN='\033[0;32m'
|
|||
|
|
RED='\033[0;31m'
|
|||
|
|
YELLOW='\033[1;33m'
|
|||
|
|
BOLD='\033[1m'
|
|||
|
|
RESET='\033[0m'
|
|||
|
|
|
|||
|
|
pass() { echo -e " ${GREEN}✓ PASS${RESET} $1"; ((PASS++)); ((TOTAL++)); }
|
|||
|
|
fail() { echo -e " ${RED}✗ FAIL${RESET} $1"; ((FAIL++)); ((TOTAL++)); }
|
|||
|
|
info() { echo -e " ${YELLOW}ℹ${RESET} $1"; }
|
|||
|
|
|
|||
|
|
# ── Helper: call Ollama generate API ─────────────────────────────────────────
|
|||
|
|
ollama_generate() {
|
|||
|
|
local model="$1"
|
|||
|
|
local prompt="$2"
|
|||
|
|
local extra_opts="${3:-}"
|
|||
|
|
|
|||
|
|
local payload
|
|||
|
|
payload=$(printf '{"model":"%s","prompt":"%s","stream":false%s}' \
|
|||
|
|
"$model" \
|
|||
|
|
"$(echo "$prompt" | sed 's/"/\\"/g' | tr -d '\n')" \
|
|||
|
|
"${extra_opts:+,$extra_opts}")
|
|||
|
|
|
|||
|
|
curl -s --max-time 60 \
|
|||
|
|
-X POST "${OLLAMA_URL}/api/generate" \
|
|||
|
|
-H "Content-Type: application/json" \
|
|||
|
|
-d "$payload" \
|
|||
|
|
| python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('response',''))" 2>/dev/null || echo ""
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
# ── Helper: call Ollama chat API with tool schema ─────────────────────────────
|
|||
|
|
ollama_chat_tool() {
|
|||
|
|
local model="$1"
|
|||
|
|
local user_msg="$2"
|
|||
|
|
|
|||
|
|
local payload
|
|||
|
|
payload=$(cat <<EOF
|
|||
|
|
{
|
|||
|
|
"model": "$model",
|
|||
|
|
"messages": [{"role": "user", "content": "$user_msg"}],
|
|||
|
|
"tools": [{
|
|||
|
|
"type": "function",
|
|||
|
|
"function": {
|
|||
|
|
"name": "get_current_weather",
|
|||
|
|
"description": "Get the current weather for a location",
|
|||
|
|
"parameters": {
|
|||
|
|
"type": "object",
|
|||
|
|
"properties": {
|
|||
|
|
"location": {"type": "string", "description": "City name"},
|
|||
|
|
"unit": {"type": "string", "enum": ["celsius","fahrenheit"]}
|
|||
|
|
},
|
|||
|
|
"required": ["location"]
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}],
|
|||
|
|
"stream": false
|
|||
|
|
}
|
|||
|
|
EOF
|
|||
|
|
)
|
|||
|
|
curl -s --max-time 60 \
|
|||
|
|
-X POST "${OLLAMA_URL}/api/chat" \
|
|||
|
|
-H "Content-Type: application/json" \
|
|||
|
|
-d "$payload" \
|
|||
|
|
| python3 -c "
|
|||
|
|
import sys, json
|
|||
|
|
d = json.load(sys.stdin)
|
|||
|
|
msg = d.get('message', {})
|
|||
|
|
# Return tool_calls JSON if present, else content
|
|||
|
|
calls = msg.get('tool_calls')
|
|||
|
|
if calls:
|
|||
|
|
print(json.dumps(calls))
|
|||
|
|
else:
|
|||
|
|
print(msg.get('content', ''))
|
|||
|
|
" 2>/dev/null || echo ""
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
# ── Benchmark a single model ──────────────────────────────────────────────────
|
|||
|
|
benchmark_model() {
|
|||
|
|
local model="$1"
|
|||
|
|
echo ""
|
|||
|
|
echo -e "${BOLD}═══════════════════════════════════════════════════${RESET}"
|
|||
|
|
echo -e "${BOLD} Model: ${model}${RESET}"
|
|||
|
|
echo -e "${BOLD}═══════════════════════════════════════════════════${RESET}"
|
|||
|
|
|
|||
|
|
# Check model availability
|
|||
|
|
local available
|
|||
|
|
available=$(curl -s "${OLLAMA_URL}/api/tags" \
|
|||
|
|
| python3 -c "
|
|||
|
|
import sys, json
|
|||
|
|
d = json.load(sys.stdin)
|
|||
|
|
models = [m.get('name','') for m in d.get('models',[])]
|
|||
|
|
target = '$model'
|
|||
|
|
match = any(target == m or target == m.split(':')[0] or m.startswith(target) for m in models)
|
|||
|
|
print('yes' if match else 'no')
|
|||
|
|
" 2>/dev/null || echo "no")
|
|||
|
|
|
|||
|
|
if [[ "$available" != "yes" ]]; then
|
|||
|
|
echo -e " ${YELLOW}⚠ SKIP${RESET} Model '$model' not available locally — pull it first:"
|
|||
|
|
echo " ollama pull $model"
|
|||
|
|
return 0
|
|||
|
|
fi
|
|||
|
|
|
|||
|
|
# ── Test 1: Tool Call Compliance ─────────────────────────────────────────
|
|||
|
|
echo ""
|
|||
|
|
echo -e " ${BOLD}Test 1: Tool Call Compliance${RESET} (target ≥90% valid JSON)"
|
|||
|
|
local tool_pass=0
|
|||
|
|
local tool_probes=5
|
|||
|
|
for i in $(seq 1 $tool_probes); do
|
|||
|
|
local response
|
|||
|
|
response=$(ollama_chat_tool "$model" \
|
|||
|
|
"What is the weather in Tokyo right now?")
|
|||
|
|
# Valid if response is non-empty JSON (tool_calls array or JSON object)
|
|||
|
|
if echo "$response" | python3 -c "import sys,json; json.load(sys.stdin)" 2>/dev/null; then
|
|||
|
|
((tool_pass++))
|
|||
|
|
fi
|
|||
|
|
done
|
|||
|
|
local tool_pct=$(( tool_pass * 100 / tool_probes ))
|
|||
|
|
info "Tool call valid JSON: $tool_pass/$tool_probes ($tool_pct%)"
|
|||
|
|
if [[ $tool_pct -ge 90 ]]; then
|
|||
|
|
pass "Tool call compliance ≥90% ($tool_pct%)"
|
|||
|
|
else
|
|||
|
|
fail "Tool call compliance <90% ($tool_pct%) — unreliable for agent loops"
|
|||
|
|
fi
|
|||
|
|
|
|||
|
|
# ── Test 2: Code Generation ──────────────────────────────────────────────
|
|||
|
|
echo ""
|
|||
|
|
echo -e " ${BOLD}Test 2: Code Generation${RESET} (must produce valid Python)"
|
|||
|
|
local code_response
|
|||
|
|
code_response=$(ollama_generate "$model" \
|
|||
|
|
"Write a Python function called fibonacci(n) that returns the nth Fibonacci number using recursion. Return ONLY the function definition, no explanation.")
|
|||
|
|
# Extract Python code block if present
|
|||
|
|
local code
|
|||
|
|
code=$(echo "$code_response" | sed -n '/^```python/,/^```/p' | grep -v '```' || echo "$code_response")
|
|||
|
|
if [[ -z "$code" ]]; then
|
|||
|
|
code="$code_response"
|
|||
|
|
fi
|
|||
|
|
# Validate syntax
|
|||
|
|
if echo "$code" | python3 -c "
|
|||
|
|
import sys, ast
|
|||
|
|
src = sys.stdin.read()
|
|||
|
|
try:
|
|||
|
|
ast.parse(src)
|
|||
|
|
sys.exit(0)
|
|||
|
|
except SyntaxError as e:
|
|||
|
|
sys.exit(1)
|
|||
|
|
" 2>/dev/null; then
|
|||
|
|
pass "Code generation produces valid Python syntax"
|
|||
|
|
else
|
|||
|
|
fail "Code generation produced invalid Python syntax"
|
|||
|
|
info "Response: $(echo "$code_response" | head -5)"
|
|||
|
|
fi
|
|||
|
|
|
|||
|
|
# ── Test 3: Shell Command Generation (No Refusal) ────────────────────────
|
|||
|
|
echo ""
|
|||
|
|
echo -e " ${BOLD}Test 3: Shell Command Generation${RESET} (must not refuse)"
|
|||
|
|
local shell_response
|
|||
|
|
shell_response=$(ollama_generate "$model" \
|
|||
|
|
"Give me a bash one-liner to find all .log files modified in the last 7 days under /var/log and delete them. Output only the command, no explanation.")
|
|||
|
|
# Check for refusal markers
|
|||
|
|
local refusal_found=false
|
|||
|
|
for marker in "cannot" "can't" "I'm sorry" "I am sorry" "won't" "will not" "inappropriate" "harmful" "dangerous" "refuse"; do
|
|||
|
|
if echo "$shell_response" | grep -qi "$marker"; then
|
|||
|
|
refusal_found=true
|
|||
|
|
info "Refusal marker found: '$marker'"
|
|||
|
|
break
|
|||
|
|
fi
|
|||
|
|
done
|
|||
|
|
# Also check that the response contains something shell-like
|
|||
|
|
if echo "$shell_response" | grep -qE 'find|rm|delete|xargs'; then
|
|||
|
|
if [[ "$refusal_found" == "true" ]]; then
|
|||
|
|
fail "Shell command generated but with refusal language"
|
|||
|
|
else
|
|||
|
|
pass "Shell command generated without refusal"
|
|||
|
|
fi
|
|||
|
|
else
|
|||
|
|
if [[ "$refusal_found" == "true" ]]; then
|
|||
|
|
fail "Shell command refused — model will block agent shell operations"
|
|||
|
|
else
|
|||
|
|
fail "Shell command not generated (no find/rm/delete/xargs in output)"
|
|||
|
|
info "Response: $(echo "$shell_response" | head -3)"
|
|||
|
|
fi
|
|||
|
|
fi
|
|||
|
|
|
|||
|
|
# ── Test 4: Multi-Turn Agent Loop Coherence ──────────────────────────────
|
|||
|
|
echo ""
|
|||
|
|
echo -e " ${BOLD}Test 4: Multi-Turn Agent Loop Coherence${RESET}"
|
|||
|
|
local session_id="SESS-$(date +%s)"
|
|||
|
|
local turn1_response
|
|||
|
|
turn1_response=$(ollama_generate "$model" \
|
|||
|
|
"You are starting a multi-step task. Your session ID is $session_id. Acknowledge this ID and ask for the first task.")
|
|||
|
|
local turn2_response
|
|||
|
|
turn2_response=$(ollama_generate "$model" \
|
|||
|
|
"Continuing session $session_id. Previous context: you acknowledged the session. Now summarize what session ID you are working in. Include the exact ID.")
|
|||
|
|
if echo "$turn2_response" | grep -q "$session_id"; then
|
|||
|
|
pass "Multi-turn coherence: session ID echoed back correctly"
|
|||
|
|
else
|
|||
|
|
fail "Multi-turn coherence: session ID not found in follow-up response"
|
|||
|
|
info "Expected: $session_id"
|
|||
|
|
info "Response snippet: $(echo "$turn2_response" | head -3)"
|
|||
|
|
fi
|
|||
|
|
|
|||
|
|
# ── Test 5: Issue Triage Quality ─────────────────────────────────────────
|
|||
|
|
echo ""
|
|||
|
|
echo -e " ${BOLD}Test 5: Issue Triage Quality${RESET} (must return structured JSON)"
|
|||
|
|
local triage_response
|
|||
|
|
triage_response=$(ollama_generate "$model" \
|
|||
|
|
'Triage this bug report and respond ONLY with a JSON object with fields: priority (low/medium/high/critical), component (string), estimated_effort (hours as integer), needs_reproduction (boolean). Bug: "The dashboard crashes with a 500 error when submitting an empty chat message. Reproducible 100% of the time on the /chat endpoint."')
|
|||
|
|
local triage_valid=false
|
|||
|
|
if echo "$triage_response" | python3 -c "
|
|||
|
|
import sys, json, re
|
|||
|
|
text = sys.stdin.read()
|
|||
|
|
# Try to extract JSON from response (may be wrapped in markdown)
|
|||
|
|
match = re.search(r'\{[^{}]+\}', text, re.DOTALL)
|
|||
|
|
if not match:
|
|||
|
|
sys.exit(1)
|
|||
|
|
try:
|
|||
|
|
d = json.loads(match.group())
|
|||
|
|
required = {'priority', 'component', 'estimated_effort', 'needs_reproduction'}
|
|||
|
|
if required.issubset(d.keys()):
|
|||
|
|
valid_priority = d['priority'] in ('low','medium','high','critical')
|
|||
|
|
if valid_priority:
|
|||
|
|
sys.exit(0)
|
|||
|
|
sys.exit(1)
|
|||
|
|
except:
|
|||
|
|
sys.exit(1)
|
|||
|
|
" 2>/dev/null; then
|
|||
|
|
pass "Issue triage returned valid structured JSON with all required fields"
|
|||
|
|
else
|
|||
|
|
fail "Issue triage did not return valid structured JSON"
|
|||
|
|
info "Response: $(echo "$triage_response" | head -5)"
|
|||
|
|
fi
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
# ── Summary ───────────────────────────────────────────────────────────────────
|
|||
|
|
print_summary() {
|
|||
|
|
local model="$1"
|
|||
|
|
local model_pass="$2"
|
|||
|
|
local model_total="$3"
|
|||
|
|
echo ""
|
|||
|
|
local pct=$(( model_pass * 100 / model_total ))
|
|||
|
|
if [[ $model_pass -eq $model_total ]]; then
|
|||
|
|
echo -e " ${GREEN}${BOLD}RESULT: $model_pass/$model_total tests passed ($pct%) — READY FOR AGENT USE${RESET}"
|
|||
|
|
elif [[ $pct -ge 60 ]]; then
|
|||
|
|
echo -e " ${YELLOW}${BOLD}RESULT: $model_pass/$model_total tests passed ($pct%) — MARGINAL${RESET}"
|
|||
|
|
else
|
|||
|
|
echo -e " ${RED}${BOLD}RESULT: $model_pass/$model_total tests passed ($pct%) — NOT RECOMMENDED${RESET}"
|
|||
|
|
fi
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
# ── Main ─────────────────────────────────────────────────────────────────────
|
|||
|
|
models=("${@:-${OLLAMA_MODEL:-qwen3:14b}}")
|
|||
|
|
|
|||
|
|
for model in "${models[@]}"; do
|
|||
|
|
PASS=0
|
|||
|
|
FAIL=0
|
|||
|
|
TOTAL=0
|
|||
|
|
benchmark_model "$model"
|
|||
|
|
print_summary "$model" "$PASS" "$TOTAL"
|
|||
|
|
done
|
|||
|
|
|
|||
|
|
echo ""
|
|||
|
|
if [[ $FAIL -eq 0 ]]; then
|
|||
|
|
exit 0
|
|||
|
|
else
|
|||
|
|
exit 1
|
|||
|
|
fi
|