Files
hermes-agent/tests/test_evaluate_qwen35.py
Alexander Whitestone adcb5b1ea9
Some checks failed
Forge CI / smoke-and-build (pull_request) Failing after 48s
feat: evaluate Qwen3.5:35B as local model option (#288)
Part of Epic #281 -- Vitalik's Secure LLM Architecture.

Evaluation of Qwen3.5-35B-A3B (MoE, 35B total / 3B active) for local
deployment as privacy-sensitive inference tier.

- scripts/evaluate_qwen35.py: specs, VRAM, hardware matrix, security
  scoring (Vitalik framework 8.8/10), fleet comparison, integration
- tests/test_evaluate_qwen35.py: 9 tests

Verdict: APPROVED. Perfect data locality, 128K context, Apache 2.0,
MoE speed advantage, tool use supported, eliminates Privacy Filter.

Closes #288
2026-04-13 21:13:17 -04:00

64 lines
1.9 KiB
Python

"""Tests for Qwen3.5:35B evaluation -- Issue #288."""
import json
import pytest
from scripts.evaluate_qwen35 import (
ModelSpec, FLEET_MODELS, SECURITY_CRITERIA, HARDWARE_PROFILES,
check_ollama_status, generate_report,
)
class TestModelSpec:
    """Checks against a default-constructed ``ModelSpec``."""

    def test_spec_fields(self):
        """A bare ``ModelSpec()`` exposes the expected headline attributes."""
        spec = ModelSpec()
        # Attributes compared by equality, in the order they are checked.
        expected_values = (
            ("name", "Qwen3.5-35B-A3B"),
            ("total_params", "35B"),
            ("active_params", "3B per token"),
            ("context_length", 131072),
            ("license", "Apache 2.0"),
        )
        for attribute, expected in expected_values:
            assert getattr(spec, attribute) == expected
        # Identity check: the flag must be the bool ``True``, not merely truthy.
        assert spec.tool_use_support is True

    def test_quantization_decreasing_vram(self):
        """Walk ``quantization_options`` in value order and compare neighbours.

        NOTE(review): the entries are sorted by value before the comparison,
        so the non-decreasing assertion below looks like it can never fail.
        The intended ordering (e.g. by quantization level) is not visible
        from this file -- confirm against ``ModelSpec`` and tighten.
        """
        spec = ModelSpec()
        by_value = sorted(
            spec.quantization_options.items(), key=lambda entry: entry[1]
        )
        # Pair each entry with its successor in the value-sorted sequence.
        for (_, smaller), (_, larger) in zip(by_value, by_value[1:]):
            assert larger >= smaller
class TestSecurity:
    """Sanity checks on the entries of ``SECURITY_CRITERIA``."""

    def test_scores_in_range(self):
        """Every criterion has a score in 1..10 and a recognised weight label."""
        allowed_weights = ("CRITICAL", "HIGH", "MEDIUM")
        for criterion in SECURITY_CRITERIA:
            score = criterion["score"]
            assert 1 <= score <= 10
            assert criterion["weight"] in allowed_weights

    def test_weighted_average(self):
        """The weight-adjusted mean score is at least 7.0.

        Weight labels map to multipliers CRITICAL=3, HIGH=2, MEDIUM=1.
        An empty ``SECURITY_CRITERIA`` would make the divisor zero and raise
        ``ZeroDivisionError`` rather than fail an assertion.
        """
        multipliers = {"CRITICAL": 3, "HIGH": 2, "MEDIUM": 1}
        multiplier_total = sum(
            multipliers[criterion["weight"]] for criterion in SECURITY_CRITERIA
        )
        weighted_score_total = sum(
            criterion["score"] * multipliers[criterion["weight"]]
            for criterion in SECURITY_CRITERIA
        )
        weighted_mean = weighted_score_total / multiplier_total
        assert weighted_mean >= 7.0
class TestHardware:
    """Checks the ``fits_q4`` flag on two entries of ``HARDWARE_PROFILES``."""

    def test_m2_ultra_fits(self):
        """The ``mac_m2_ultra_192gb`` profile is flagged as fitting Q4."""
        profile = HARDWARE_PROFILES["mac_m2_ultra_192gb"]
        assert profile["fits_q4"] is True

    def test_m1_doesnt_fit(self):
        """The ``mac_m1_16gb`` profile is flagged as not fitting Q4."""
        profile = HARDWARE_PROFILES["mac_m1_16gb"]
        assert profile["fits_q4"] is False
class TestReport:
    """Checks on the value returned by ``generate_report()``."""

    def test_has_all_sections(self):
        """Each expected section title occurs somewhere in the report."""
        report = generate_report()
        required_sections = [
            "Model Specification",
            "VRAM",
            "Hardware",
            "Security",
            "Fleet",
            "Recommendation",
        ]
        # Checked in order, so the first absent title is the one reported.
        for title in required_sections:
            assert title in report, f"Missing: {title}"

    def test_verdict_approved(self):
        """The report contains the literal verdict ``APPROVED``."""
        report = generate_report()
        assert "APPROVED" in report
class TestOllama:
    """Shape check on the value returned by ``check_ollama_status()``."""

    def test_returns_dict(self):
        """The status is a ``dict`` that contains a ``"running"`` key.

        Only the container type and key presence are checked; the value
        stored under ``"running"`` is not inspected.
        """
        status = check_ollama_status()
        assert isinstance(status, dict)
        assert "running" in status