forked from Rockachopa/Timmy-time-dashboard
343 lines
12 KiB
Python
343 lines
12 KiB
Python
#!/usr/bin/env python3
|
||
"""Hermes 4 smoke test and tool-calling validation script.
|
||
|
||
Tests the Hermes 4 14B model after importing into Ollama. Covers:
|
||
1. Basic connectivity — model responds
|
||
2. Memory usage — under 28 GB with model loaded
|
||
3. Tool calling — structured JSON output (not raw text)
|
||
4. Reasoning — <think> tag toggling (listed as a goal; this script has no automated check for it yet)
|
||
5. Timmy-persona smoke test — agent identity prompt
|
||
|
||
Usage:
|
||
python scripts/test_hermes4.py # Run all tests
|
||
python scripts/test_hermes4.py --model hermes4-14b
|
||
python scripts/test_hermes4.py --model hermes4-36b --ollama-url http://localhost:11434
|
||
|
||
Epic: #1091 Project Bannerlord — AutoLoRA Sovereignty Loop (Step 2 of 7)
|
||
Refs: #1101
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
import subprocess
|
||
import sys
|
||
import time
|
||
from typing import Any
|
||
|
||
try:
|
||
import requests
|
||
except ImportError:
|
||
print("ERROR: 'requests' not installed. Run: pip install requests")
|
||
sys.exit(1)
|
||
|
||
# Base URL of the Ollama HTTP API; main() replaces it with --ollama-url.
OLLAMA_URL: str = "http://localhost:11434"

# Model name used when --model is not supplied.
DEFAULT_MODEL: str = "hermes4-14b"

# Ceiling (in GB) that the memory-usage check compares Ollama's RSS against.
MEMORY_LIMIT_GB: float = 28.0
|
||
|
||
# ── Tool schema used for tool-calling tests ──────────────────────────────────
|
||
|
||
# Function-tool definition that test_tool_calling sends in the "tools" list
# of its /api/chat request; the model is expected to call it by name.
READ_FILE_TOOL = {
    "type": "function",
    "function": {
        "name": "read_file",
        "description": "Read the contents of a file at the given path",
        "parameters": {
            "type": "object",
            "properties": {
                "path": {
                    "type": "string",
                    "description": "Absolute or relative path to the file",
                },
            },
            "required": ["path"],
        },
    },
}
|
||
|
||
# Second function-tool definition (Gitea issue listing). No test visible in
# this file passes it to the model; it is kept as a ready-made schema.
LIST_ISSUES_TOOL = {
    "type": "function",
    "function": {
        "name": "list_issues",
        "description": "List open issues from a Gitea repository",
        "parameters": {
            "type": "object",
            "properties": {
                "repo": {
                    "type": "string",
                    "description": "owner/repo slug",
                },
                "state": {
                    "type": "string",
                    "enum": ["open", "closed", "all"],
                    "description": "Issue state filter",
                },
            },
            "required": ["repo"],
        },
    },
}
|
||
|
||
|
||
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||
|
||
|
||
def _post(endpoint: str, payload: dict, timeout: int = 60) -> dict[str, Any]:
    """Send ``payload`` as a JSON POST to Ollama and return the decoded reply.

    ``endpoint`` is appended verbatim to the module-level ``OLLAMA_URL``.
    ``timeout`` is handed to ``requests`` unchanged, and an HTTP error status
    is raised by ``raise_for_status()`` before the body is decoded.
    """
    response = requests.post(
        f"{OLLAMA_URL}{endpoint}",
        json=payload,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()
|
||
|
||
|
||
def _ollama_memory_gb() -> float:
    """Estimate the combined resident memory of all Ollama processes, in GB.

    Runs ``ps -axo pid,comm,rss`` and sums the last column (RSS) of every
    output row that mentions "ollama", case-insensitively.

    Returns:
        The summed RSS converted from KB to GB, or ``0.0`` when ``ps`` cannot
        be run or no matching row with a numeric RSS is found.
    """
    try:
        result = subprocess.run(
            ["ps", "-axo", "pid,comm,rss"],
            capture_output=True,
            text=True,
            check=False,
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # ps is missing or not executable, or its output could not be decoded
        # as text. Callers treat 0.0 as "could not measure".
        return 0.0

    total_kb = 0
    for line in result.stdout.splitlines():
        if "ollama" not in line.lower():
            continue
        # The command column may itself contain spaces, so take the RSS from
        # the right-hand end of the row. ps reports the rss keyword in KB.
        fields = line.rsplit(None, 1)
        try:
            total_kb += int(fields[-1])
        except (ValueError, IndexError):
            # Row without a numeric RSS; ignore it.
            continue

    return total_kb / (1024 * 1024)  # KB → GB
|
||
|
||
|
||
def _check_model_available(model: str) -> bool:
    """Return True if ``model`` is one of the models Ollama lists.

    Queries ``GET {OLLAMA_URL}/api/tags`` and compares ``model`` against the
    ``name`` of each listed entry. The comparison is exact rather than a
    substring test, so ``hermes4`` is no longer reported as available merely
    because ``hermes4-14b:latest`` exists. A ``model`` given without a tag is
    looked up as ``<model>:latest``, the tag Ollama applies by default.

    Returns:
        True when the model is listed; False when it is not, or when the
        request or the decoding of its reply fails for any reason.
    """
    # Only a ':' in the final path segment is a tag separator; an earlier one
    # would belong to a registry host:port prefix.
    has_tag = ":" in model.rsplit("/", 1)[-1]
    wanted = model if has_tag else f"{model}:latest"

    try:
        resp = requests.get(f"{OLLAMA_URL}/api/tags", timeout=10)
        resp.raise_for_status()
        listed = {entry.get("name") for entry in resp.json().get("models", [])}
    except Exception:
        # Deliberately broad: this is a best-effort gate, and any failure
        # (server down, bad JSON, unexpected shape) means "not available".
        return False

    return wanted in listed
|
||
|
||
|
||
def _chat(model: str, messages: list[dict], tools: list | None = None) -> dict:
    """Issue a non-streaming ``/api/chat`` request and return the reply.

    The ``tools`` key is only added to the request when ``tools`` is a
    non-empty list. The request uses a 120-second timeout.
    """
    optional = {"tools": tools} if tools else {}
    body: dict = {
        "model": model,
        "messages": messages,
        "stream": False,
        **optional,
    }
    return _post("/api/chat", body, timeout=120)
|
||
|
||
|
||
# ── Test cases ────────────────────────────────────────────────────────────────
|
||
|
||
|
||
def test_model_available(model: str) -> bool:
    """Test 1 of 5: pass when ``model`` is registered in Ollama.

    On failure, prints the ``ollama create`` / ``ollama pull`` commands that
    would register the model.
    """
    print(f"\n[1/5] Checking model availability: {model}")

    if not _check_model_available(model):
        import_hint = (
            f" ✗ {model} not found. Import with:\n"
            f" ollama create {model} -f Modelfile.hermes4-14b\n"
            f" Or pull directly if on registry:\n"
            f" ollama pull {model}"
        )
        print(import_hint)
        return False

    print(f" ✓ {model} is available in Ollama")
    return True
|
||
|
||
|
||
def test_basic_response(model: str) -> bool:
    """Test 2 of 5: pass when the reply to a fixed prompt contains HERMES_OK.

    Any exception raised while sending the request or inspecting the reply
    is printed and counted as a failure.
    """
    print("\n[2/5] Basic response test")
    prompt = [{"role": "user", "content": "Reply with exactly: HERMES_OK"}]
    try:
        started = time.time()
        reply = _chat(model, prompt)
        elapsed = time.time() - started
        content = reply.get("message", {}).get("content", "")
        ok = "HERMES_OK" in content
        if ok:
            print(f" ✓ Basic response OK ({elapsed:.1f}s): {content.strip()}")
        else:
            print(f" ✗ Unexpected response ({elapsed:.1f}s): {content[:200]!r}")
        return ok
    except Exception as exc:
        print(f" ✗ Request failed: {exc}")
        return False
|
||
|
||
|
||
def test_memory_usage() -> bool:
    """Test 3 of 5: pass when Ollama's measured RSS is below MEMORY_LIMIT_GB.

    A measurement of exactly 0.0 means the usage could not be determined;
    the check is then skipped and counted as a pass.
    """
    print(f"\n[3/5] Memory usage check (limit: {MEMORY_LIMIT_GB} GB)")
    usage_gb = _ollama_memory_gb()

    if usage_gb == 0.0:
        print(" ~ Could not determine memory usage (ps unavailable?), skipping")
        return True

    within_limit = usage_gb < MEMORY_LIMIT_GB
    if within_limit:
        print(f" ✓ Memory usage: {usage_gb:.1f} GB (under {MEMORY_LIMIT_GB} GB limit)")
    else:
        print(
            f" ✗ Memory usage: {usage_gb:.1f} GB exceeds {MEMORY_LIMIT_GB} GB limit.\n"
            " Consider using Q4_K_M quantisation or reducing num_ctx."
        )
    return within_limit
|
||
|
||
|
||
def test_tool_calling(model: str) -> bool:
    """Test 4 of 5: pass when the model answers a tool-use prompt with a tool call.

    Outcomes:
      * A structured ``tool_calls`` entry naming ``read_file`` is a pass.
      * A structured entry naming any other function is a failure, and is
        reported as one (it is no longer printed with a ✓).
      * No structured call, but text mentioning ``read_file`` alongside a
        ``{``, is a soft pass (a tool call written out as text).
      * Anything else, or any exception, is a failure.
    """
    print("\n[4/5] Tool-calling test")
    messages = [
        {
            "role": "user",
            "content": "Please read the file at /tmp/test.txt using the read_file tool.",
        }
    ]
    try:
        t0 = time.time()
        data = _chat(model, messages, tools=[READ_FILE_TOOL])
        elapsed = time.time() - t0
        # "or" fallbacks: a key that is present but null must behave like a
        # missing key instead of raising further down.
        msg = data.get("message") or {}
        tool_calls = msg.get("tool_calls") or []

        if tool_calls:
            # Only the first tool call is inspected.
            fn = tool_calls[0].get("function") or {}
            name = fn.get("name")
            details = (
                f" function: {name}\n"
                f" arguments: {json.dumps(fn.get('arguments', {}), indent=6)}"
            )
            if name == "read_file":
                print(f" ✓ Tool call produced ({elapsed:.1f}s):\n{details}")
                return True
            print(
                f" ✗ Tool call names the wrong function ({elapsed:.1f}s); "
                f"expected 'read_file':\n{details}"
            )
            return False

        # Some models return JSON in the content instead of tool_calls
        content = msg.get("content") or ""
        if "read_file" in content and "{" in content:
            print(
                f" ~ Model returned tool call as text (not structured). ({elapsed:.1f}s)\n"
                f" This is acceptable for the base model before fine-tuning.\n"
                f" Content: {content[:300]}"
            )
            # Partial pass — model attempted tool calling but via text
            return True

        print(
            f" ✗ No tool call in response ({elapsed:.1f}s).\n"
            f" Content: {content[:300]!r}"
        )
        return False
    except Exception as exc:
        print(f" ✗ Tool-calling request failed: {exc}")
        return False
|
||
|
||
|
||
def test_timmy_persona(model: str) -> bool:
    """Test 5 of 5: send a Timmy system prompt and look for the name in the reply.

    This is a soft check: a reply that never mentions "Timmy" is reported
    with a ``~`` note but still passes. Only an exception fails the test.
    """
    print("\n[5/5] Timmy-persona smoke test")
    system_prompt = (
        "You are Timmy, Alexander's personal AI agent. "
        "You are concise, direct, and helpful. "
        "You always start your responses with 'Timmy here:'."
    )
    conversation = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": "What is your name and what can you help me with?",
        },
    ]
    try:
        started = time.time()
        reply = _chat(model, conversation)
        elapsed = time.time() - started
        content = reply.get("message", {}).get("content", "")
        mentions_name = "Timmy" in content or "timmy" in content.lower()
        if mentions_name:
            print(f" ✓ Persona accepted ({elapsed:.1f}s): {content[:200].strip()}")
        else:
            print(
                f" ~ Persona response lacks 'Timmy' identifier ({elapsed:.1f}s).\n"
                f" This is a fine-tuning target.\n"
                f" Response: {content[:200]!r}"
            )
        # Soft pass either way: the base model is not expected to be
        # perfectly in-character.
        return True
    except Exception as exc:
        print(f" ✗ Persona test failed: {exc}")
        return False
|
||
|
||
|
||
# ── Main ──────────────────────────────────────────────────────────────────────
|
||
|
||
|
||
def main() -> int:
    """Parse CLI arguments, run the five checks and return the exit code.

    Recognised options are ``--model`` and ``--ollama-url``. The availability
    check is a gate: if the model is not registered, the remaining checks are
    skipped.

    Returns:
        0 when every executed check passed, otherwise 1.
    """
    # The global declaration must come before any use of OLLAMA_URL in this
    # function. Reading the name first (for the argparse defaults) and
    # declaring it global afterwards is a compile-time SyntaxError.
    global OLLAMA_URL

    parser = argparse.ArgumentParser(description="Hermes 4 smoke test suite")
    parser.add_argument(
        "--model",
        default=DEFAULT_MODEL,
        help=f"Ollama model name (default: {DEFAULT_MODEL})",
    )
    parser.add_argument(
        "--ollama-url",
        default=OLLAMA_URL,
        help=f"Ollama base URL (default: {OLLAMA_URL})",
    )
    args = parser.parse_args()

    # The helpers build URLs as OLLAMA_URL + endpoint, so drop any trailing
    # slash here to avoid a double slash.
    OLLAMA_URL = args.ollama_url.rstrip("/")
    model = args.model

    print("=" * 60)
    print(f"Hermes 4 Validation Suite — {model}")
    print(f"Ollama: {OLLAMA_URL}")
    print("=" * 60)

    results: dict[str, bool] = {}

    # Test 1: availability (gate — skip remaining if model missing)
    results["available"] = test_model_available(model)
    if not results["available"]:
        print("\n⚠ Model not available — skipping remaining tests.")
        print(" Import the model first (see Modelfile.hermes4-14b).")
        _print_summary(results)
        return 1

    # Tests 2–5
    results["basic_response"] = test_basic_response(model)
    results["memory_usage"] = test_memory_usage()
    results["tool_calling"] = test_tool_calling(model)
    results["timmy_persona"] = test_timmy_persona(model)

    return _print_summary(results)
|
||
|
||
|
||
def _print_summary(results: dict[str, bool]) -> int:
    """Print the pass/fail table plus a verdict and return the exit code.

    The verdict is "all passed" when every entry is truthy, a tool-calling
    notice when ``results["tool_calling"]`` is exactly ``False``, and a
    generic partial-pass note otherwise.

    Returns:
        0 when every entry in ``results`` passed, otherwise 1.
    """
    total = len(results)
    passed = sum(results.values())
    all_passed = passed == total
    rule = "=" * 60

    print("\n" + rule)
    print(f"Results: {passed}/{total} passed")
    print(rule)
    for name, ok in results.items():
        print(f" {'✓' if ok else '✗'} {name}")

    if all_passed:
        verdict = (
            "\n✓ All tests passed. Hermes 4 is ready for AutoLoRA fine-tuning.",
            " Next step: document WORK vs FAIL skill list → fine-tuning targets.",
        )
    elif results.get("tool_calling") is False:
        verdict = (
            "\n⚠ Tool-calling FAILED. This is the primary fine-tuning target.",
            " Base model may need LoRA tuning on tool-use examples.",
        )
    else:
        verdict = ("\n~ Partial pass. Review failures above before fine-tuning.",)
    for line in verdict:
        print(line)

    return 0 if all_passed else 1
|
||
|
||
|
||
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
|