Closes #1123. Implements all three phases of the local LLM standardization.

PHASE 1 — Deployment:
- docs/local-llm.md: full deployment guide (build, model download, health check, model path convention /opt/models/llama/, hardware recommendations)
- systemd/llama-server.service: hardened unit with resource limits and auto-restart
- Health check: /health endpoint + model-loaded verification

PHASE 2 — Hermes Integration:
- bin/llama_client.py: OpenAI-compatible Python client wrapping the llama.cpp HTTP API (chat completions, streaming, raw completions, health check, model listing, benchmarking, full CLI interface)
- nexus/llama_provider.py: Hermes inference-router provider adapter
  - Activates when external APIs fail, when LOCAL_ONLY=true is set, or on an explicit local request
  - Responses normalized to OpenAI-compatible chat completions
  - Token usage estimated and logged
  - Health caching with a TTL for efficiency

PHASE 3 — Optimization & Ops:
- Benchmarking: client.benchmark() + a CLI benchmark command
- Quantization guide: Q4_K_M recommended for the fleet, Q6_K for high-RAM hosts, Q3_K for low-RAM hosts
- Model recommendations for VPS Beta (3B), VPS Alpha (7B), and Mac (7B Q6_K)
- Night watch integration: health probe script with auto-restart

Fleet standard model: Qwen2.5-7B-Instruct-Q4_K_M.gguf
Default endpoint: http://localhost:11435

22 tests pass.
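For quick review, here is a minimal usage sketch of the new client. It sticks to the surface exercised by the test file below (`LlamaClient`, `ChatMessage`, `health_check`, `simple_chat`, `chat`, `benchmark`); treat the prompts and printed fields as illustrative and check `bin/llama_client.py` for the authoritative signatures.

```python
from bin.llama_client import LlamaClient, ChatMessage

# Defaults to http://localhost:11435 (overridable via LLAMA_ENDPOINT).
client = LlamaClient()

# Verify the server is up and a model is loaded before routing work to it.
status = client.health_check()
if not status.healthy:
    raise SystemExit(f"llama-server unavailable: {status.error}")

# One-shot helper: returns just the reply text.
print(client.simple_chat("Summarize the fleet model policy.", system="You are terse."))

# Full chat call: returns a response with text, token usage, and latency.
resp = client.chat(
    [ChatMessage(role="user", content="Hello")],
    max_tokens=100,
    temperature=0.3,
)
print(resp.text, resp.tokens_used, resp.latency_ms)

# Phase 3 benchmarking (also exposed via the CLI benchmark command).
print(client.benchmark(iterations=3))
```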
"""Tests for llama_client — OpenAI-compatible client for llama.cpp."""
|
|
import json
|
|
from unittest.mock import MagicMock, patch
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
import sys
|
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
|
|
from bin.llama_client import (
|
|
LlamaClient,
|
|
ChatMessage,
|
|
CompletionResponse,
|
|
HealthStatus,
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# ChatMessage
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestChatMessage:
|
|
def test_creation(self):
|
|
msg = ChatMessage(role="user", content="Hello")
|
|
assert msg.role == "user"
|
|
assert msg.content == "Hello"
|
|
|
|
def test_system_message(self):
|
|
msg = ChatMessage(role="system", content="You are helpful.")
|
|
assert msg.role == "system"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# HealthStatus
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestHealthStatus:
|
|
def test_healthy(self):
|
|
status = HealthStatus(healthy=True, endpoint="http://localhost:11435", model_loaded=True)
|
|
assert status.healthy is True
|
|
assert status.model_loaded is True
|
|
|
|
def test_unhealthy(self):
|
|
status = HealthStatus(healthy=False, endpoint="http://localhost:11435", error="Connection refused")
|
|
assert status.healthy is False
|
|
assert status.error == "Connection refused"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# LlamaClient
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestLlamaClient:
|
|
def test_default_endpoint(self):
|
|
client = LlamaClient()
|
|
assert client.endpoint == "http://localhost:11435"
|
|
|
|
def test_custom_endpoint(self):
|
|
client = LlamaClient(endpoint="http://192.168.1.10:8080")
|
|
assert client.endpoint == "http://192.168.1.10:8080"
|
|
|
|
def test_trailing_slash_stripped(self):
|
|
client = LlamaClient(endpoint="http://localhost:11435/")
|
|
assert client.endpoint == "http://localhost:11435"
|
|
|
|
def test_custom_model(self):
|
|
client = LlamaClient(model="mistral-7b")
|
|
assert client.model == "mistral-7b"
|
|
|
|
@patch("bin.llama_client._http_get")
|
|
def test_health_check_success(self, mock_get):
|
|
mock_get.return_value = {"status": "ok", "model_loaded": True}
|
|
client = LlamaClient()
|
|
status = client.health_check()
|
|
assert status.healthy is True
|
|
assert status.model_loaded is True
|
|
mock_get.assert_called_once_with("http://localhost:11435/health")
|
|
|
|
@patch("bin.llama_client._http_get")
|
|
def test_health_check_failure(self, mock_get):
|
|
mock_get.side_effect = ConnectionError("refused")
|
|
client = LlamaClient()
|
|
status = client.health_check()
|
|
assert status.healthy is False
|
|
assert "refused" in status.error
|
|
|
|
@patch("bin.llama_client._http_get")
|
|
def test_is_healthy_true(self, mock_get):
|
|
mock_get.return_value = {"status": "ok"}
|
|
client = LlamaClient()
|
|
assert client.is_healthy() is True
|
|
|
|
@patch("bin.llama_client._http_get")
|
|
def test_is_healthy_false(self, mock_get):
|
|
mock_get.side_effect = ConnectionError("down")
|
|
client = LlamaClient()
|
|
assert client.is_healthy() is False
|
|
|
|
@patch("bin.llama_client._http_get")
|
|
def test_list_models(self, mock_get):
|
|
mock_get.return_value = {
|
|
"data": [{"id": "qwen2.5-7b", "object": "model"}]
|
|
}
|
|
client = LlamaClient()
|
|
models = client.list_models()
|
|
assert len(models) == 1
|
|
assert models[0]["id"] == "qwen2.5-7b"
|
|
|
|
@patch("bin.llama_client._http_get")
|
|
def test_list_models_empty(self, mock_get):
|
|
mock_get.side_effect = ConnectionError("down")
|
|
client = LlamaClient()
|
|
models = client.list_models()
|
|
assert models == []
|
|
|
|
@patch("bin.llama_client._http_post")
|
|
def test_chat_success(self, mock_post):
|
|
mock_post.return_value = {
|
|
"model": "qwen2.5-7b",
|
|
"choices": [{"message": {"content": "Hello! How can I help?"}, "finish_reason": "stop"}],
|
|
"usage": {"total_tokens": 25},
|
|
}
|
|
client = LlamaClient()
|
|
messages = [ChatMessage(role="user", content="Hello")]
|
|
response = client.chat(messages)
|
|
assert response.text == "Hello! How can I help?"
|
|
assert response.tokens_used == 25
|
|
assert response.finish_reason == "stop"
|
|
assert response.latency_ms > 0
|
|
|
|
@patch("bin.llama_client._http_post")
|
|
def test_chat_custom_params(self, mock_post):
|
|
mock_post.return_value = {
|
|
"choices": [{"message": {"content": "OK"}, "finish_reason": "stop"}],
|
|
"usage": {},
|
|
}
|
|
client = LlamaClient()
|
|
messages = [ChatMessage(role="user", content="test")]
|
|
client.chat(messages, max_tokens=100, temperature=0.3)
|
|
call_data = mock_post.call_args[0][1]
|
|
assert call_data["max_tokens"] == 100
|
|
assert call_data["temperature"] == 0.3
|
|
|
|
@patch("bin.llama_client._http_post")
|
|
def test_chat_connection_error(self, mock_post):
|
|
mock_post.side_effect = ConnectionError("down")
|
|
client = LlamaClient()
|
|
messages = [ChatMessage(role="user", content="test")]
|
|
with pytest.raises(ConnectionError):
|
|
client.chat(messages)
|
|
|
|
@patch("bin.llama_client._http_post")
|
|
def test_simple_chat(self, mock_post):
|
|
mock_post.return_value = {
|
|
"choices": [{"message": {"content": "I am well!"}, "finish_reason": "stop"}],
|
|
"usage": {"total_tokens": 15},
|
|
}
|
|
client = LlamaClient()
|
|
result = client.simple_chat("How are you?")
|
|
assert result == "I am well!"
|
|
|
|
@patch("bin.llama_client._http_post")
|
|
def test_simple_chat_with_system(self, mock_post):
|
|
mock_post.return_value = {
|
|
"choices": [{"message": {"content": "Yes"}, "finish_reason": "stop"}],
|
|
"usage": {},
|
|
}
|
|
client = LlamaClient()
|
|
client.simple_chat("Are you helpful?", system="You are helpful.")
|
|
call_data = mock_post.call_args[0][1]
|
|
assert len(call_data["messages"]) == 2
|
|
assert call_data["messages"][0]["role"] == "system"
|
|
|
|
@patch("bin.llama_client._http_post")
|
|
def test_complete(self, mock_post):
|
|
mock_post.return_value = {
|
|
"content": "Once upon a time...",
|
|
"tokens_predicted": 50,
|
|
}
|
|
client = LlamaClient()
|
|
response = client.complete("Once upon a time")
|
|
assert response.text == "Once upon a time..."
|
|
assert response.tokens_used == 50
|
|
|
|
@patch("bin.llama_client.time.time")
|
|
@patch("bin.llama_client._http_post")
|
|
def test_benchmark(self, mock_post, mock_time):
|
|
mock_post.return_value = {
|
|
"choices": [{"message": {"content": "OK"}, "finish_reason": "stop"}],
|
|
"usage": {"total_tokens": 10},
|
|
}
|
|
# Simulate 50ms latency per call
|
|
mock_time.side_effect = [0.0, 0.05, 0.05, 0.1, 0.1, 0.15, 0.15, 0.2, 0.2, 0.25]
|
|
client = LlamaClient()
|
|
result = client.benchmark(iterations=3)
|
|
assert result["iterations"] == 3
|
|
assert result["avg_latency_ms"] > 0
|
|
assert result["tok_per_sec"] > 0
|
|
|
|
def test_env_override(self):
|
|
with patch.dict("os.environ", {"LLAMA_ENDPOINT": "http://custom:9999"}):
|
|
from importlib import reload
|
|
import bin.llama_client as mod
|
|
reload(mod)
|
|
# Default endpoint reads from env at import time
|
|
assert mod.DEFAULT_ENDPOINT == "http://custom:9999"
|
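For the Phase 3 night-watch item, this is the shape a probe-and-restart script can take on top of `is_healthy()`. It is an illustration, not the script shipped in this PR: the unit name `llama-server.service` comes from the Phase 1 file list, while the sudo call, the 30-second warm-up wait, and the exit-code convention are assumptions made for the sketch.

```python
#!/usr/bin/env python3
"""Illustrative night-watch probe: restart llama-server when the health check fails."""
import subprocess
import sys
import time

from bin.llama_client import LlamaClient


def probe_and_restart(endpoint: str = "http://localhost:11435") -> int:
    client = LlamaClient(endpoint=endpoint)
    if client.is_healthy():
        return 0

    # Health check failed: bounce the hardened unit from Phase 1, then re-probe.
    subprocess.run(["sudo", "systemctl", "restart", "llama-server.service"], check=False)
    time.sleep(30)  # give llama-server time to reload the model before re-checking
    return 0 if client.is_healthy() else 1


if __name__ == "__main__":
    sys.exit(probe_and_restart())
```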