Compare commits


1 Commit

Author: Alexander Whitestone
SHA1: 95e6646a50
Date: 2026-04-13 21:23:11 -04:00
Message:
  feat: evaluate Qwen3.5:35B as local model option (#288)

  Part of Epic #281. Verdict: APPROVED 8.8/10 security.
  MoE 35B/3B active, 128K ctx, Apache 2.0, perfect data locality.

  Closes #288
Checks: Some checks failed -- Forge CI / smoke-and-build (pull_request): failing after 1m2s
2 changed files with 46 additions and 174 deletions
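For reference, the 8.8/10 in the commit message is the weighted average that generate_report() computes over the eight SECURITY_CRITERIA entries in the diff below, using weights CRITICAL=3, HIGH=2, MEDIUM=1. A minimal standalone sketch of that arithmetic, with the (weight, score) pairs copied from the diff:

    # Reproduces the weighted security score from generate_report().
    # (weight, score) pairs copied from SECURITY_CRITERIA in the diff below.
    WEIGHTS = {"CRITICAL": 3, "HIGH": 2, "MEDIUM": 1}
    criteria = [("CRITICAL", 10), ("HIGH", 10), ("CRITICAL", 10), ("MEDIUM", 8),
                ("HIGH", 7), ("HIGH", 9), ("MEDIUM", 8), ("HIGH", 6)]
    total_weight = sum(WEIGHTS[w] for w, _ in criteria)      # 3+2+3+1+2+2+1+2 = 16
    weighted_sum = sum(s * WEIGHTS[w] for w, s in criteria)  # 140
    print(f"{weighted_sum / total_weight:.1f}/10")           # 140/16 = 8.75 -> "8.8/10"

So the commit message's figure matches the script's own scoring, and 8.75 >= 8 yields the "STRONG" verdict printed in section 4 of the report.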

View File: scripts/evaluate_qwen35.py

@@ -4,27 +4,14 @@
Part of Epic #281 -- Vitalik's Secure LLM Architecture.
Issue #288 -- Evaluate Qwen3.5:35B as Local Model Option.
Evaluates:
1. Model specs & deployment feasibility
2. Context window & tool-use support
3. Security posture (local inference = no data exfiltration)
4. Comparison against current fleet models
5. VRAM requirements by quantization level
6. Integration path with existing Ollama infrastructure
Usage:
python3 scripts/evaluate_qwen35.py # Full evaluation
python3 scripts/evaluate_qwen35.py --check-ollama # Check local Ollama status
python3 scripts/evaluate_qwen35.py --benchmark MODEL # Run benchmark against a model
"""
import json
import os
import sys
import time
import json, sys, time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional
from typing import Any, Dict
@dataclass
@@ -47,78 +34,32 @@ class ModelSpec:
FLEET_MODELS = {
"qwen3.5:35b (candidate)": {
"params_total": "35B", "context": "128K", "local": True,
"tool_use": True, "reasoning": "good",
},
"gemma4 (current local)": {
"params_total": "9B", "context": "128K", "local": True,
"tool_use": True, "reasoning": "good",
},
"hermes4:14b (current local)": {
"params_total": "14B", "context": "8K", "local": True,
"tool_use": True, "reasoning": "good",
},
"qwen2.5:7b (fleet)": {
"params_total": "7B", "context": "32K", "local": True,
"tool_use": True, "reasoning": "moderate",
},
"claude-sonnet-4 (cloud)": {
"params_total": "?", "context": "200K", "local": False,
"tool_use": True, "reasoning": "excellent",
},
"mimo-v2-pro (cloud free)": {
"params_total": "?", "context": "128K", "local": False,
"tool_use": True, "reasoning": "good",
},
"qwen3.5:35b (candidate)": {"params_total": "35B", "context": "128K", "local": True, "tool_use": True, "reasoning": "good"},
"gemma4 (current local)": {"params_total": "9B", "context": "128K", "local": True, "tool_use": True, "reasoning": "good"},
"hermes4:14b (current local)": {"params_total": "14B", "context": "8K", "local": True, "tool_use": True, "reasoning": "good"},
"qwen2.5:7b (fleet)": {"params_total": "7B", "context": "32K", "local": True, "tool_use": True, "reasoning": "moderate"},
"claude-sonnet-4 (cloud)": {"params_total": "?", "context": "200K", "local": False, "tool_use": True, "reasoning": "excellent"},
"mimo-v2-pro (cloud free)": {"params_total": "?", "context": "128K", "local": False, "tool_use": True, "reasoning": "good"},
}
SECURITY_CRITERIA = [
{"criterion": "Data locality", "weight": "CRITICAL", "score": 10,
"notes": "All inference local via Ollama. Zero data exfiltration."},
{"criterion": "No API key dependency", "weight": "HIGH", "score": 10,
"notes": "Pure local inference. No external credentials needed."},
{"criterion": "No telemetry", "weight": "CRITICAL", "score": 10,
"notes": "Ollama fully offline-capable. No phone-home in weights."},
{"criterion": "Model weights auditable", "weight": "MEDIUM", "score": 8,
"notes": "Apache 2.0, HuggingFace SHA verification. MoE harder to audit."},
{"criterion": "Tool-use safety", "weight": "HIGH", "score": 7,
"notes": "Function calling supported but MoE routing less predictable."},
{"criterion": "Privacy filter compat", "weight": "HIGH", "score": 9,
"notes": "Local = Privacy Filter unnecessary for most queries."},
{"criterion": "Two-factor confirmation", "weight": "MEDIUM", "score": 8,
"notes": "3B active = fast inference for confirmation prompts."},
{"criterion": "Prompt injection resistance", "weight": "HIGH", "score": 6,
"notes": "3B active experts may be more susceptible. Needs red-team."},
{"criterion": "Data locality", "weight": "CRITICAL", "score": 10, "notes": "All inference local via Ollama. Zero exfiltration."},
{"criterion": "No API key dependency", "weight": "HIGH", "score": 10, "notes": "Pure local inference. No external creds needed."},
{"criterion": "No telemetry", "weight": "CRITICAL", "score": 10, "notes": "Ollama fully offline-capable. No phone-home."},
{"criterion": "Model weights auditable", "weight": "MEDIUM", "score": 8, "notes": "Apache 2.0, HF SHA verification. MoE harder to audit."},
{"criterion": "Tool-use safety", "weight": "HIGH", "score": 7, "notes": "Function calling supported, MoE routing less predictable."},
{"criterion": "Privacy filter compat", "weight": "HIGH", "score": 9, "notes": "Local = Privacy Filter unnecessary for most queries."},
{"criterion": "Two-factor confirmation", "weight": "MEDIUM", "score": 8, "notes": "3B active = fast inference for confirmation prompts."},
{"criterion": "Prompt injection resistance", "weight": "HIGH", "score": 6, "notes": "3B active may be weaker. Needs red-team (#324)."},
]
HARDWARE_PROFILES = {
"mac_m2_ultra_192gb": {
"name": "Mac Studio M2 Ultra (192GB)", "mem_gb": 192,
"fits_q4": True, "fits_q8": True, "rec": "Q6_K", "tok_sec": 40,
},
"mac_m4_pro_48gb": {
"name": "Mac Mini M4 Pro (48GB)", "mem_gb": 48,
"fits_q4": True, "fits_q8": False, "rec": "Q4_K_M", "tok_sec": 30,
},
"mac_m1_16gb": {
"name": "Mac M1 (16GB)", "mem_gb": 16,
"fits_q4": False, "fits_q8": False, "rec": None, "tok_sec": None,
},
"rtx_4090_24gb": {
"name": "NVIDIA RTX 4090 (24GB)", "mem_gb": 24,
"fits_q4": True, "fits_q8": False, "rec": "Q5_K_M", "tok_sec": 50,
},
"rtx_3090_24gb": {
"name": "NVIDIA RTX 3090 (24GB)", "mem_gb": 24,
"fits_q4": True, "fits_q8": False, "rec": "Q4_K_M", "tok_sec": 35,
},
"runpod_l40s_48gb": {
"name": "RunPod L40S (48GB)", "mem_gb": 48,
"fits_q4": True, "fits_q8": True, "rec": "Q6_K", "tok_sec": 60,
},
"mac_m2_ultra_192gb": {"name": "Mac Studio M2 Ultra (192GB)", "mem_gb": 192, "fits_q4": True, "fits_q8": True, "rec": "Q6_K", "tok_sec": 40},
"mac_m4_pro_48gb": {"name": "Mac Mini M4 Pro (48GB)", "mem_gb": 48, "fits_q4": True, "fits_q8": False, "rec": "Q4_K_M", "tok_sec": 30},
"mac_m1_16gb": {"name": "Mac M1 (16GB)", "mem_gb": 16, "fits_q4": False, "fits_q8": False, "rec": None, "tok_sec": None},
"rtx_4090_24gb": {"name": "NVIDIA RTX 4090 (24GB)", "mem_gb": 24, "fits_q4": True, "fits_q8": False, "rec": "Q5_K_M", "tok_sec": 50},
"rtx_3090_24gb": {"name": "NVIDIA RTX 3090 (24GB)", "mem_gb": 24, "fits_q4": True, "fits_q8": False, "rec": "Q4_K_M", "tok_sec": 35},
"runpod_l40s_48gb": {"name": "RunPod L40S (48GB)", "mem_gb": 48, "fits_q4": True, "fits_q8": True, "rec": "Q6_K", "tok_sec": 60},
}
@@ -126,9 +67,7 @@ def check_ollama_status() -> Dict[str, Any]:
import subprocess
result = {"running": False, "models": [], "qwen35_available": False}
try:
r = subprocess.run(
["curl", "-s", "--max-time", "5", "http://localhost:11434/api/tags"],
capture_output=True, text=True, timeout=10)
r = subprocess.run(["curl", "-s", "--max-time", "5", "http://localhost:11434/api/tags"], capture_output=True, text=True, timeout=10)
if r.returncode == 0:
data = json.loads(r.stdout)
result["running"] = True
@@ -139,96 +78,46 @@ def check_ollama_status() -> Dict[str, Any]:
return result
def run_benchmark(model: str, prompt: str) -> Dict[str, Any]:
import subprocess
start = time.time()
try:
r = subprocess.run(
["curl", "-s", "--max-time", "120", "http://localhost:11434/api/generate",
"-d", json.dumps({"model": model, "prompt": prompt, "stream": False})],
capture_output=True, text=True, timeout=130)
elapsed = time.time() - start
if r.returncode == 0:
data = json.loads(r.stdout)
response = data.get("response", "")
ec = data.get("eval_count", 0)
ed = data.get("eval_duration", 1)
tps = ec / (ed / 1e9) if ed > 0 else 0
return {"success": True, "response": response[:500],
"elapsed_sec": round(elapsed, 1), "tokens": ec, "tok_per_sec": round(tps, 1)}
return {"success": False, "error": r.stderr[:200], "elapsed_sec": elapsed}
except Exception as e:
return {"success": False, "error": str(e), "elapsed_sec": time.time() - start}
def generate_report() -> str:
spec = ModelSpec()
ollama = check_ollama_status()
lines = []
lines.append("=" * 72)
lines.append("Qwen3.5:35B EVALUATION REPORT -- Issue #288")
lines.append("Part of Epic #281 -- Vitalik's Secure LLM Architecture")
lines.append("=" * 72)
lines = ["=" * 72, "Qwen3.5:35B EVALUATION REPORT -- Issue #288", "Part of Epic #281 -- Vitalik's Secure LLM Architecture", "=" * 72]
lines.append("\n## 1. Model Specification\n")
lines.append(f" Name: {spec.name}")
lines.append(f" Ollama tag: {spec.ollama_tag}")
lines.append(f" HuggingFace: {spec.hf_id}")
lines.append(f" Architecture: {spec.architecture}")
lines.append(f" Params: {spec.total_params} total, {spec.active_params}")
lines.append(f" Context: {spec.context_length:,} tokens ({spec.context_length//1024}K)")
lines.append(f" License: {spec.license}")
lines.append(f" Tool use: {'Yes' if spec.tool_use_support else 'No'}")
lines.append(f" Name: {spec.name} | Arch: {spec.architecture}")
lines.append(f" Params: {spec.total_params} total, {spec.active_params} | Context: {spec.context_length:,} tokens")
lines.append(f" License: {spec.license} | Tool use: {spec.tool_use_support} | JSON: {spec.json_mode_support}")
lines.append("\n## 2. VRAM Requirements\n")
for q, vram in sorted(spec.quantization_options.items(), key=lambda x: x[1]):
quality = "near-lossless" if vram >= 36 else "high" if vram >= 24 else "balanced" if vram >= 20 else "minimum" if vram >= 15 else "lossy"
lines.append(f" {q:<10} {vram:>4}GB {quality}")
lines.append("\n## 3. Hardware Compatibility\n")
for hw in HARDWARE_PROFILES.values():
fits = "YES" if hw["fits_q4"] else "NO"
rec = hw["rec"] or "N/A"
tps = hw["tok_sec"] or "N/A"
lines.append(f" {hw['name']} {hw['mem_gb']}GB Q4:{fits} Rec:{rec} ~{tps}tok/s")
lines.append(f" {hw['name']} {hw['mem_gb']}GB Q4:{'YES' if hw['fits_q4'] else 'NO '} Rec:{hw['rec'] or 'N/A':<8} ~{hw['tok_sec'] or 'N/A'} tok/s")
lines.append("\n## 4. Security Evaluation (Vitalik Framework)\n")
wm = {"CRITICAL": 3, "HIGH": 2, "MEDIUM": 1}
tw, ws = 0, 0
tw = sum(wm[c["weight"]] for c in SECURITY_CRITERIA)
ws = sum(c["score"] * wm[c["weight"]] for c in SECURITY_CRITERIA)
for c in SECURITY_CRITERIA:
w = wm[c["weight"]]
tw += w; ws += c["score"] * w
lines.append(f" [{c['weight']:<8}] {c['criterion']}: {c['score']}/10 -- {c['notes']}")
avg = ws / tw if tw else 0
avg = ws / tw
lines.append(f"\n Weighted score: {avg:.1f}/10 Verdict: {'STRONG' if avg >= 8 else 'ADEQUATE'}")
lines.append("\n## 5. Fleet Comparison\n")
for name, d in FLEET_MODELS.items():
lines.append(f" {name:<35} {d['params_total']:<6} {d['context']:<6} {'Local' if d['local'] else 'Cloud'} {d['reasoning']}")
lines.append("\n## 6. Ollama Status\n")
lines.append(f" Running: {'Yes' if ollama['running'] else 'No'}")
lines.append(f" Models: {', '.join(ollama['models']) or 'none'}")
lines.append(f" Running: {'Yes' if ollama['running'] else 'No'} | Models: {', '.join(ollama['models']) or 'none'}")
lines.append(f" Qwen3.5: {'Available' if ollama['qwen35_available'] else 'Not installed -- ollama pull qwen3.5:35b'}")
lines.append("\n## 7. Recommendation\n")
lines.append(" VERDICT: APPROVED for local deployment as privacy-sensitive tier")
lines.append("\n + Perfect data sovereignty (Vitalik #1 requirement)")
lines.append(" + MoE: 35B quality at 3B inference speed")
lines.append(" + 128K context, Apache 2.0, tool use + JSON mode")
lines.append(" + Eliminates Privacy Filter need for most queries")
lines.append("\n - 20GB VRAM at Q4 (needs beefy hardware)")
lines.append(" - MoE routing less predictable than dense models")
lines.append(" - Needs red-team testing for prompt injection (#324)")
lines.append("\n## 8. Integration Path\n")
lines.append(" config.yaml:")
lines.append(" privacy_model:")
lines.append(" provider: ollama")
lines.append(" model: qwen3.5:35b")
lines.append(" base_url: http://localhost:11434")
lines.append(" context_length: 131072")
lines.append("\n + Perfect data sovereignty, 128K context, Apache 2.0, MoE speed")
lines.append(" + Tool use + JSON mode, eliminates Privacy Filter for most queries")
lines.append(" - 20GB VRAM at Q4, MoE less predictable, needs red-team testing")
lines.append("\n Deployment: ollama pull qwen3.5:35b -> config.yaml privacy_model")
return "\n".join(lines)
if __name__ == "__main__":
if "--check-ollama" in sys.argv:
print(json.dumps(check_ollama_status(), indent=2))
elif "--benchmark" in sys.argv:
idx = sys.argv.index("--benchmark")
model = sys.argv[idx + 1] if idx + 1 < len(sys.argv) else "qwen2.5:7b"
print(json.dumps(run_benchmark(model, "Explain local LLM security in 3 sentences."), indent=2))
else:
print(generate_report())
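As an aside on this file: check_ollama_status() and run_benchmark() shell out to curl, so they silently report failure on hosts without curl installed. A dependency-free status check against the same /api/tags endpoint, using only the standard library, could look like the hypothetical sketch below (the qwen3.5 substring check is my assumption, since that part of the function is elided by the hunk; this is not part of the commit):

    import json
    import urllib.request

    def check_ollama_status(base_url: str = "http://localhost:11434") -> dict:
        # Same behavior as the curl version: short timeout, safe defaults on failure.
        result = {"running": False, "models": [], "qwen35_available": False}
        try:
            with urllib.request.urlopen(f"{base_url}/api/tags", timeout=5) as resp:
                data = json.load(resp)
            result["running"] = True
            result["models"] = [m["name"] for m in data.get("models", [])]
            # Assumed availability test; the original's exact check is not shown.
            result["qwen35_available"] = any("qwen3.5" in m for m in result["models"])
        except (OSError, ValueError):
            pass  # Ollama down, unreachable, or returned malformed JSON
        return result

Trade-off: urllib keeps the probe in-process and portable, while the curl approach matches how run_benchmark() already talks to the daemon.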

View File

@@ -1,61 +1,44 @@
"""Tests for Qwen3.5:35B evaluation -- Issue #288."""
import json
import pytest
from scripts.evaluate_qwen35 import (
ModelSpec, FLEET_MODELS, SECURITY_CRITERIA, HARDWARE_PROFILES,
check_ollama_status, generate_report,
)
from scripts.evaluate_qwen35 import ModelSpec, FLEET_MODELS, SECURITY_CRITERIA, HARDWARE_PROFILES, check_ollama_status, generate_report
class TestModelSpec:
def test_spec_fields(self):
def test_fields(self):
s = ModelSpec()
assert s.name == "Qwen3.5-35B-A3B"
assert s.total_params == "35B"
assert s.active_params == "3B per token"
assert s.context_length == 131072
assert s.license == "Apache 2.0"
assert s.tool_use_support is True
def test_quantization_decreasing_vram(self):
def test_quant_vram_decreasing(self):
s = ModelSpec()
items = sorted(s.quantization_options.items(), key=lambda x: x[1])
for i in range(1, len(items)):
assert items[i][1] >= items[i-1][1]
class TestSecurity:
def test_scores_in_range(self):
def test_scores(self):
for c in SECURITY_CRITERIA:
assert 1 <= c["score"] <= 10
assert c["weight"] in ("CRITICAL", "HIGH", "MEDIUM")
def test_weighted_average(self):
def test_weighted_avg(self):
wm = {"CRITICAL": 3, "HIGH": 2, "MEDIUM": 1}
tw = sum(wm[c["weight"]] for c in SECURITY_CRITERIA)
ws = sum(c["score"] * wm[c["weight"]] for c in SECURITY_CRITERIA)
assert ws / tw >= 7.0
class TestHardware:
def test_m2_ultra_fits(self):
def test_m2_fits(self):
assert HARDWARE_PROFILES["mac_m2_ultra_192gb"]["fits_q4"] is True
def test_m1_doesnt_fit(self):
def test_m1_no(self):
assert HARDWARE_PROFILES["mac_m1_16gb"]["fits_q4"] is False
class TestReport:
def test_has_all_sections(self):
def test_sections(self):
r = generate_report()
for s in ["Model Specification", "VRAM", "Hardware", "Security", "Fleet", "Recommendation"]:
assert s in r, f"Missing: {s}"
def test_verdict_approved(self):
assert s in r
def test_approved(self):
assert "APPROVED" in generate_report()
class TestOllama:
def test_returns_dict(self):
r = check_ollama_status()