Timmy-time-dashboard/scripts/test_hermes4.py

#!/usr/bin/env python3
"""Hermes 4 smoke test and tool-calling validation script.

Tests a Hermes 4 model (14B by default) after it has been imported into Ollama. Covers:
  1. Model availability — model is listed by Ollama
  2. Basic connectivity — model responds
  3. Memory usage — under 28 GB with model loaded
  4. Tool calling — structured tool_calls output (a tool call emitted as raw text is a soft pass)
  5. Timmy-persona smoke test — agent identity prompt

Not yet covered: reasoning — <think> tag toggling.

Usage:
    python scripts/test_hermes4.py                    # Run all tests
    python scripts/test_hermes4.py --model hermes4-14b
    python scripts/test_hermes4.py --model hermes4-36b --ollama-url http://localhost:11434

Epic: #1091 Project Bannerlord — AutoLoRA Sovereignty Loop (Step 2 of 7)
Refs: #1101
"""

from __future__ import annotations

import argparse
import json
import subprocess
import sys
import time
from typing import Any

try:
    import requests
except ImportError:
    print("ERROR: 'requests' not installed. Run: pip install requests")
    sys.exit(1)

# Base URL of the Ollama HTTP API; main() rebinds this from --ollama-url.
OLLAMA_URL = "http://localhost:11434"
# Model name used when --model is not supplied on the command line.
DEFAULT_MODEL = "hermes4-14b"
# Ceiling, in GB, that test_memory_usage() allows for the summed Ollama RSS.
MEMORY_LIMIT_GB = 28.0

# ── Tool schema used for tool-calling tests ──────────────────────────────────

def _function_tool(name: str, description: str, properties: dict, required: list) -> dict:
    """Wrap a name, description and parameter spec in a function-tool schema dict."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": required,
            },
        },
    }


# Tool offered to the model by test_tool_calling().
READ_FILE_TOOL = _function_tool(
    "read_file",
    "Read the contents of a file at the given path",
    {
        "path": {
            "type": "string",
            "description": "Absolute or relative path to the file",
        }
    },
    ["path"],
)

# Second sample schema; not referenced by the test cases visible in this file.
LIST_ISSUES_TOOL = _function_tool(
    "list_issues",
    "List open issues from a Gitea repository",
    {
        "repo": {"type": "string", "description": "owner/repo slug"},
        "state": {
            "type": "string",
            "enum": ["open", "closed", "all"],
            "description": "Issue state filter",
        },
    },
    ["repo"],
)


# ── Helpers ───────────────────────────────────────────────────────────────────


def _post(endpoint: str, payload: dict, timeout: int = 60) -> dict[str, Any]:
    """Send *payload* as a JSON POST to an Ollama endpoint and decode the reply.

    Args:
        endpoint: Path appended verbatim to OLLAMA_URL (e.g. "/api/chat").
        payload: Request body, serialised as JSON by requests.
        timeout: Timeout in seconds handed to requests.post.

    Returns:
        The response body parsed as JSON.

    Raises:
        Whatever requests.post, raise_for_status() or json() raise; an HTTP
        error status is turned into an exception rather than returned.
    """
    response = requests.post(f"{OLLAMA_URL}{endpoint}", json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()


def _ollama_memory_gb() -> float:
    """Estimate the combined resident memory of Ollama processes, in GB.

    Runs ``ps -axo pid,comm,rss`` and sums the last field of every output row
    that mentions "ollama" (case-insensitive). ``ps`` reports ``rss`` in
    kilobytes, so the sum is converted KB → GB before returning.

    Returns:
        The summed RSS in GB, or 0.0 when ``ps`` cannot be run, its output
        cannot be decoded, or no matching row is found.
    """
    try:
        result = subprocess.run(
            ["ps", "-axo", "pid,comm,rss"],
            capture_output=True,
            text=True,
            check=False,
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # ps is missing or not executable, or its output could not be decoded
        # as text; callers treat 0.0 as "could not determine".
        return 0.0

    total_kb = 0
    for line in result.stdout.splitlines():
        if "ollama" not in line.lower():
            continue
        # rss was requested last, so read it from the end of the row; that way
        # a command name containing spaces cannot shift the column.
        fields = line.split()
        try:
            total_kb += int(fields[-1])
        except (ValueError, IndexError):
            # Row without a numeric rss field; ignore it.
            continue
    return total_kb / (1024 * 1024)  # KB → GB


def _check_model_available(model: str) -> bool:
    """Report whether *model* appears among the models Ollama lists.

    Fetches ``/api/tags`` and does a substring match of *model* against each
    listed name. Any exception along the way (request failure, bad status,
    unexpected payload) yields False.
    """
    try:
        response = requests.get(f"{OLLAMA_URL}/api/tags", timeout=10)
        response.raise_for_status()
        catalogue = response.json().get("models", [])
        # Collect every name first so a malformed entry fails the whole check.
        listed_names = [entry["name"] for entry in catalogue]
        for listed in listed_names:
            if model in listed:
                return True
        return False
    except Exception:
        return False


def _chat(model: str, messages: list[dict], tools: list | None = None) -> dict:
    """Issue a non-streaming ``/api/chat`` request and return the parsed reply.

    Args:
        model: Ollama model name.
        messages: Chat messages to send.
        tools: Optional tool schemas; the "tools" key is only included in the
            request when this is truthy.

    Returns:
        The decoded JSON response from _post (120 s timeout).
    """
    optional_fields = {"tools": tools} if tools else {}
    request_body: dict = {
        "model": model,
        "messages": messages,
        "stream": False,
        **optional_fields,
    }
    return _post("/api/chat", request_body, timeout=120)


# ── Test cases ────────────────────────────────────────────────────────────────


def test_model_available(model: str) -> bool:
    """Step 1/5: pass when *model* is registered in Ollama.

    Prints the outcome and, on failure, the commands for importing or pulling
    the model. Returns True when the model is found, False otherwise.
    """
    print(f"\n[1/5] Checking model availability: {model}")
    if not _check_model_available(model):
        hint_lines = [
            f"  ✗ {model} not found. Import with:",
            f"    ollama create {model} -f Modelfile.hermes4-14b",
            "  Or pull directly if on registry:",
            f"    ollama pull {model}",
        ]
        print("\n".join(hint_lines))
        return False

    print(f"  ✓ {model} is available in Ollama")
    return True


def test_basic_response(model: str) -> bool:
    """Step 2/5: pass when the model's reply contains the token HERMES_OK.

    Sends a single user prompt, times the round trip, and prints the result.
    Any exception raised while requesting or reading the reply is reported
    and counted as a failure.
    """
    print("\n[2/5] Basic response test")
    prompt = [{"role": "user", "content": "Reply with exactly: HERMES_OK"}]
    try:
        started = time.time()
        reply = _chat(model, prompt)
        elapsed = time.time() - started
        content = reply.get("message", {}).get("content", "")
        echoed = "HERMES_OK" in content
        if echoed:
            print(f"  ✓ Basic response OK ({elapsed:.1f}s): {content.strip()}")
        else:
            print(f"  ✗ Unexpected response ({elapsed:.1f}s): {content[:200]!r}")
        return echoed
    except Exception as exc:
        print(f"  ✗ Request failed: {exc}")
        return False


def test_memory_usage() -> bool:
    """Step 3/5: pass when the measured Ollama RSS is below MEMORY_LIMIT_GB.

    A reading of exactly 0.0 is treated as "could not measure" and is reported
    as a skip that still counts as a pass.
    """
    print(f"\n[3/5] Memory usage check (limit: {MEMORY_LIMIT_GB} GB)")
    usage_gb = _ollama_memory_gb()

    if usage_gb == 0.0:
        print("  ~ Could not determine memory usage (ps unavailable?), skipping")
        return True

    within_limit = usage_gb < MEMORY_LIMIT_GB
    if within_limit:
        print(f"  ✓ Memory usage: {usage_gb:.1f} GB (under {MEMORY_LIMIT_GB} GB limit)")
    else:
        print(
            f"  ✗ Memory usage: {usage_gb:.1f} GB exceeds {MEMORY_LIMIT_GB} GB limit.\n"
            "  Consider using Q4_K_M quantisation or reducing num_ctx."
        )
    return within_limit


def test_tool_calling(model: str) -> bool:
    """Step 4/5: check that a tool-use prompt yields a call to ``read_file``.

    Sends one user message together with READ_FILE_TOOL and classifies the
    reply (only the first entry of ``tool_calls`` is inspected):

    * structured ``tool_calls`` naming ``read_file`` → pass;
    * structured ``tool_calls`` naming anything else → fail;
    * no ``tool_calls``, but text content containing both ``read_file`` and
      ``{`` → soft pass (the tool call was emitted as text);
    * anything else, or any exception while requesting/reading → fail.

    Args:
        model: Ollama model name to query.

    Returns:
        True on pass or soft pass, False otherwise.
    """
    print("\n[4/5] Tool-calling test")
    messages = [
        {
            "role": "user",
            "content": "Please read the file at /tmp/test.txt using the read_file tool.",
        }
    ]
    try:
        t0 = time.time()
        data = _chat(model, messages, tools=[READ_FILE_TOOL])
        elapsed = time.time() - t0
        # `or` fallbacks also cover keys that are present but null.
        msg = data.get("message") or {}
        tool_calls = msg.get("tool_calls") or []

        if tool_calls:
            fn = tool_calls[0].get("function") or {}
            name = fn.get("name")
            matched = name == "read_file"
            # Keep the printed marker in step with the returned result: a
            # structured call to the wrong function is a failure, not a ✓.
            icon = "✓" if matched else "✗"
            headline = (
                "Tool call produced" if matched else "Tool call names the wrong function"
            )
            print(
                f"  {icon} {headline} ({elapsed:.1f}s):\n"
                f"    function: {name}\n"
                f"    arguments: {json.dumps(fn.get('arguments', {}), indent=6)}"
            )
            return matched

        # Some models return JSON in the content instead of tool_calls
        content = msg.get("content") or ""
        if "read_file" in content and "{" in content:
            print(
                f"  ~ Model returned tool call as text (not structured). ({elapsed:.1f}s)\n"
                f"    This is acceptable for the base model before fine-tuning.\n"
                f"    Content: {content[:300]}"
            )
            # Partial pass — model attempted tool calling but via text
            return True

        print(
            f"  ✗ No tool call in response ({elapsed:.1f}s).\n"
            f"    Content: {content[:300]!r}"
        )
        return False
    except Exception as exc:
        print(f"  ✗ Tool-calling request failed: {exc}")
        return False


def test_timmy_persona(model: str) -> bool:
    """Step 5/5: send a Timmy persona system prompt and inspect the reply.

    This is a soft check: a reply that does not mention "Timmy" is reported
    as a fine-tuning target but still passes. Only an exception while
    requesting or reading the reply makes the test fail.
    """
    print("\n[5/5] Timmy-persona smoke test")
    persona_prompt = (
        "You are Timmy, Alexander's personal AI agent. "
        "You are concise, direct, and helpful. "
        "You always start your responses with 'Timmy here:'."
    )
    conversation = [
        {"role": "system", "content": persona_prompt},
        {"role": "user", "content": "What is your name and what can you help me with?"},
    ]
    try:
        started = time.time()
        reply = _chat(model, conversation)
        elapsed = time.time() - started
        content = reply.get("message", {}).get("content", "")
        if "Timmy" in content or "timmy" in content.lower():
            print(f"  ✓ Persona accepted ({elapsed:.1f}s): {content[:200].strip()}")
        else:
            # Soft pass — base model isn't expected to be perfectly in-character
            print(
                f"  ~ Persona response lacks 'Timmy' identifier ({elapsed:.1f}s).\n"
                f"    This is a fine-tuning target.\n"
                f"    Response: {content[:200]!r}"
            )
        return True
    except Exception as exc:
        print(f"  ✗ Persona test failed: {exc}")
        return False


# ── Main ──────────────────────────────────────────────────────────────────────


def main() -> int:
    """Parse CLI arguments, run the five checks and return a process exit code.

    ``--ollama-url`` (trailing slashes stripped) replaces the module-level
    OLLAMA_URL that the helpers read. The availability check is a gate: when
    the model is missing, the remaining tests are skipped and 1 is returned.

    Returns:
        The value of _print_summary: 0 when every executed test passed,
        1 otherwise.
    """
    # Must come before any use of OLLAMA_URL in this function (including the
    # argparse defaults below); Python rejects a `global` statement that
    # follows a reference to the same name with a SyntaxError.
    global OLLAMA_URL

    parser = argparse.ArgumentParser(description="Hermes 4 smoke test suite")
    parser.add_argument(
        "--model",
        default=DEFAULT_MODEL,
        help=f"Ollama model name (default: {DEFAULT_MODEL})",
    )
    parser.add_argument(
        "--ollama-url",
        default=OLLAMA_URL,
        help=f"Ollama base URL (default: {OLLAMA_URL})",
    )
    args = parser.parse_args()

    OLLAMA_URL = args.ollama_url.rstrip("/")
    model = args.model

    print("=" * 60)
    print(f"Hermes 4 Validation Suite — {model}")
    print(f"Ollama: {OLLAMA_URL}")
    print("=" * 60)

    results: dict[str, bool] = {}

    # Test 1: availability (gate — skip remaining if model missing)
    results["available"] = test_model_available(model)
    if not results["available"]:
        print("\n⚠ Model not available — skipping remaining tests.")
        print("  Import the model first (see Modelfile.hermes4-14b).")
        _print_summary(results)
        return 1

    # Tests 2–5
    results["basic_response"] = test_basic_response(model)
    results["memory_usage"] = test_memory_usage()
    results["tool_calling"] = test_tool_calling(model)
    results["timmy_persona"] = test_timmy_persona(model)

    return _print_summary(results)


def _print_summary(results: dict[str, bool]) -> int:
    """Print a pass/fail table plus a closing verdict, and return an exit code.

    Args:
        results: Mapping of test name to outcome, printed in insertion order.

    Returns:
        0 when every entry passed, 1 otherwise.
    """
    banner = "=" * 60
    total = len(results)
    passed = sum(results.values())

    print(f"\n{banner}")
    print(f"Results: {passed}/{total} passed")
    print(banner)
    for name, ok in results.items():
        print(f"  {'✓' if ok else '✗'} {name}")

    if passed == total:
        print("\n✓ All tests passed. Hermes 4 is ready for AutoLoRA fine-tuning.")
        print("  Next step: document WORK vs FAIL skill list → fine-tuning targets.")
        return 0

    # A tool-calling failure gets its own message ahead of the generic one.
    if results.get("tool_calling") is False:
        print("\n⚠ Tool-calling FAILED. This is the primary fine-tuning target.")
        print("  Base model may need LoRA tuning on tool-use examples.")
    else:
        print("\n~ Partial pass. Review failures above before fine-tuning.")
    return 1


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())