Files
the-nexus/tests/test_llama_client.py
Timmy (WHIP) ac2ec40657
feat: standardize llama.cpp backend for sovereign local inference
Closes #1123. Implements all three phases of the local LLM standardization:

PHASE 1 — Deployment:
- docs/local-llm.md: full deployment guide (build, model download, health check,
  model path convention /opt/models/llama/, hardware recommendations)
- systemd/llama-server.service: hardened unit with resource limits and auto-restart
- Health check: /health endpoint + model loaded verification
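
As an illustration of the health check described above, a minimal probe against the
/health endpoint could look like the sketch below. The endpoint path and the
status/model_loaded fields match what the test suite in this file mocks; the
llama_is_ready helper is hypothetical, not part of the shipped tooling.

    # Minimal health-check sketch (assumes /health returns JSON like
    # {"status": "ok", "model_loaded": true}, as mocked in the tests below).
    import json
    import urllib.request

    LLAMA_ENDPOINT = "http://localhost:11435"  # fleet default

    def llama_is_ready(endpoint: str = LLAMA_ENDPOINT, timeout: float = 5.0) -> bool:
        """Return True if llama-server answers /health and reports a loaded model."""
        try:
            with urllib.request.urlopen(f"{endpoint}/health", timeout=timeout) as resp:
                payload = json.loads(resp.read().decode("utf-8"))
        except OSError:
            return False
        return payload.get("status") == "ok" and payload.get("model_loaded", False)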

PHASE 2 — Hermes Integration:
- bin/llama_client.py: OpenAI-compatible Python client wrapping llama.cpp HTTP API
  (chat completions, streaming, raw completions, health check, model listing,
  benchmarking, full CLI interface)
- nexus/llama_provider.py: Hermes inference router provider adapter
  - Activates when external APIs fail, when LOCAL_ONLY=true, or on an explicit local request
  - Response format normalized to OpenAI-compatible chat completions
  - Token usage estimated and logged
  - Health caching with TTL for efficiency
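
For orientation, a rough usage sketch of the client surface that the tests in this
file exercise (LlamaClient, ChatMessage, simple_chat, chat). Treat it as illustrative
of the API shape, not as the provider adapter's literal code.

    from bin.llama_client import ChatMessage, LlamaClient

    client = LlamaClient()  # defaults to http://localhost:11435

    # One-shot helper: returns the reply text directly.
    reply = client.simple_chat("Are you helpful?", system="You are helpful.")

    # Full chat call: returns a response with text / tokens_used / finish_reason.
    messages = [ChatMessage(role="user", content="Hello")]
    response = client.chat(messages, max_tokens=100, temperature=0.3)
    print(response.text, response.tokens_used, response.finish_reason)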

PHASE 3 — Optimization & Ops:
- Benchmarking: client.benchmark() + CLI benchmark command (see the sketch below)
- Quantization guide: Q4_K_M recommended for the fleet, Q6_K for high-RAM hosts, Q3_K for low-RAM hosts
- Model recommendations for VPS Beta (3B), VPS Alpha (7B), Mac (7B Q6_K)
- Night watch integration: health probe script with auto-restart
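
A hedged sketch of what the night-watch probe amounts to: reuse the client's
is_healthy() check and let systemd restart the unit when it fails. The systemctl
invocation and the standalone-script shape are assumptions about the ops side; only
the unit name (llama-server) and is_healthy() come from this change.

    # Illustrative night-watch probe: restart llama-server if the API is down.
    # Requires privileges to run systemctl (e.g. root or a sudoers entry).
    import subprocess

    from bin.llama_client import LlamaClient

    def probe_and_restart() -> None:
        client = LlamaClient()  # default endpoint http://localhost:11435
        if client.is_healthy():
            return
        # Unhealthy or unreachable: ask systemd to restart the service.
        subprocess.run(["systemctl", "restart", "llama-server"], check=False)

    if __name__ == "__main__":
        probe_and_restart()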

Fleet standard model: Qwen2.5-7B-Instruct-Q4_K_M.gguf
Default endpoint: http://localhost:11435
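
A quick throughput sanity check against the fleet standard model can go through the
client's benchmark helper (the Benchmarking item above); the result keys used here
are the ones the tests assert, and the iteration count is arbitrary.

    from bin.llama_client import LlamaClient

    client = LlamaClient()  # http://localhost:11435 by default
    stats = client.benchmark(iterations=3)
    print(f"{stats['iterations']} runs, "
          f"{stats['avg_latency_ms']:.0f} ms avg, "
          f"{stats['tok_per_sec']:.1f} tok/s")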

22 tests pass.
2026-04-13 21:16:31 -04:00


"""Tests for llama_client — OpenAI-compatible client for llama.cpp."""
import json
import sys
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from bin.llama_client import (
    LlamaClient,
    ChatMessage,
    CompletionResponse,
    HealthStatus,
)

# ---------------------------------------------------------------------------
# ChatMessage
# ---------------------------------------------------------------------------
class TestChatMessage:
    def test_creation(self):
        msg = ChatMessage(role="user", content="Hello")
        assert msg.role == "user"
        assert msg.content == "Hello"

    def test_system_message(self):
        msg = ChatMessage(role="system", content="You are helpful.")
        assert msg.role == "system"

# ---------------------------------------------------------------------------
# HealthStatus
# ---------------------------------------------------------------------------
class TestHealthStatus:
    def test_healthy(self):
        status = HealthStatus(healthy=True, endpoint="http://localhost:11435", model_loaded=True)
        assert status.healthy is True
        assert status.model_loaded is True

    def test_unhealthy(self):
        status = HealthStatus(healthy=False, endpoint="http://localhost:11435", error="Connection refused")
        assert status.healthy is False
        assert status.error == "Connection refused"

# ---------------------------------------------------------------------------
# LlamaClient
# ---------------------------------------------------------------------------
class TestLlamaClient:
    def test_default_endpoint(self):
        client = LlamaClient()
        assert client.endpoint == "http://localhost:11435"

    def test_custom_endpoint(self):
        client = LlamaClient(endpoint="http://192.168.1.10:8080")
        assert client.endpoint == "http://192.168.1.10:8080"

    def test_trailing_slash_stripped(self):
        client = LlamaClient(endpoint="http://localhost:11435/")
        assert client.endpoint == "http://localhost:11435"

    def test_custom_model(self):
        client = LlamaClient(model="mistral-7b")
        assert client.model == "mistral-7b"

    @patch("bin.llama_client._http_get")
    def test_health_check_success(self, mock_get):
        mock_get.return_value = {"status": "ok", "model_loaded": True}
        client = LlamaClient()
        status = client.health_check()
        assert status.healthy is True
        assert status.model_loaded is True
        mock_get.assert_called_once_with("http://localhost:11435/health")

    @patch("bin.llama_client._http_get")
    def test_health_check_failure(self, mock_get):
        mock_get.side_effect = ConnectionError("refused")
        client = LlamaClient()
        status = client.health_check()
        assert status.healthy is False
        assert "refused" in status.error

    @patch("bin.llama_client._http_get")
    def test_is_healthy_true(self, mock_get):
        mock_get.return_value = {"status": "ok"}
        client = LlamaClient()
        assert client.is_healthy() is True

    @patch("bin.llama_client._http_get")
    def test_is_healthy_false(self, mock_get):
        mock_get.side_effect = ConnectionError("down")
        client = LlamaClient()
        assert client.is_healthy() is False

    @patch("bin.llama_client._http_get")
    def test_list_models(self, mock_get):
        mock_get.return_value = {
            "data": [{"id": "qwen2.5-7b", "object": "model"}]
        }
        client = LlamaClient()
        models = client.list_models()
        assert len(models) == 1
        assert models[0]["id"] == "qwen2.5-7b"

    @patch("bin.llama_client._http_get")
    def test_list_models_empty(self, mock_get):
        mock_get.side_effect = ConnectionError("down")
        client = LlamaClient()
        models = client.list_models()
        assert models == []

    @patch("bin.llama_client._http_post")
    def test_chat_success(self, mock_post):
        mock_post.return_value = {
            "model": "qwen2.5-7b",
            "choices": [{"message": {"content": "Hello! How can I help?"}, "finish_reason": "stop"}],
            "usage": {"total_tokens": 25},
        }
        client = LlamaClient()
        messages = [ChatMessage(role="user", content="Hello")]
        response = client.chat(messages)
        assert response.text == "Hello! How can I help?"
        assert response.tokens_used == 25
        assert response.finish_reason == "stop"
        assert response.latency_ms > 0

    @patch("bin.llama_client._http_post")
    def test_chat_custom_params(self, mock_post):
        mock_post.return_value = {
            "choices": [{"message": {"content": "OK"}, "finish_reason": "stop"}],
            "usage": {},
        }
        client = LlamaClient()
        messages = [ChatMessage(role="user", content="test")]
        client.chat(messages, max_tokens=100, temperature=0.3)
        call_data = mock_post.call_args[0][1]
        assert call_data["max_tokens"] == 100
        assert call_data["temperature"] == 0.3

    @patch("bin.llama_client._http_post")
    def test_chat_connection_error(self, mock_post):
        mock_post.side_effect = ConnectionError("down")
        client = LlamaClient()
        messages = [ChatMessage(role="user", content="test")]
        with pytest.raises(ConnectionError):
            client.chat(messages)

    @patch("bin.llama_client._http_post")
    def test_simple_chat(self, mock_post):
        mock_post.return_value = {
            "choices": [{"message": {"content": "I am well!"}, "finish_reason": "stop"}],
            "usage": {"total_tokens": 15},
        }
        client = LlamaClient()
        result = client.simple_chat("How are you?")
        assert result == "I am well!"

    @patch("bin.llama_client._http_post")
    def test_simple_chat_with_system(self, mock_post):
        mock_post.return_value = {
            "choices": [{"message": {"content": "Yes"}, "finish_reason": "stop"}],
            "usage": {},
        }
        client = LlamaClient()
        client.simple_chat("Are you helpful?", system="You are helpful.")
        call_data = mock_post.call_args[0][1]
        assert len(call_data["messages"]) == 2
        assert call_data["messages"][0]["role"] == "system"

    @patch("bin.llama_client._http_post")
    def test_complete(self, mock_post):
        mock_post.return_value = {
            "content": "Once upon a time...",
            "tokens_predicted": 50,
        }
        client = LlamaClient()
        response = client.complete("Once upon a time")
        assert response.text == "Once upon a time..."
        assert response.tokens_used == 50

    @patch("bin.llama_client.time.time")
    @patch("bin.llama_client._http_post")
    def test_benchmark(self, mock_post, mock_time):
        mock_post.return_value = {
            "choices": [{"message": {"content": "OK"}, "finish_reason": "stop"}],
            "usage": {"total_tokens": 10},
        }
        # Simulate 50ms latency per call
        mock_time.side_effect = [0.0, 0.05, 0.05, 0.1, 0.1, 0.15, 0.15, 0.2, 0.2, 0.25]
        client = LlamaClient()
        result = client.benchmark(iterations=3)
        assert result["iterations"] == 3
        assert result["avg_latency_ms"] > 0
        assert result["tok_per_sec"] > 0

    def test_env_override(self):
        with patch.dict("os.environ", {"LLAMA_ENDPOINT": "http://custom:9999"}):
            from importlib import reload
            import bin.llama_client as mod
            reload(mod)
            # Default endpoint reads from env at import time
            assert mod.DEFAULT_ENDPOINT == "http://custom:9999"