Closes #1123. Implements all three phases of the local LLM standardization.

PHASE 1 — Deployment:
- docs/local-llm.md: full deployment guide (build, model download, health check, model path convention /opt/models/llama/, hardware recommendations)
- systemd/llama-server.service: hardened unit with resource limits and auto-restart
- Health check: /health endpoint + model-loaded verification

PHASE 2 — Hermes Integration:
- bin/llama_client.py: OpenAI-compatible Python client wrapping the llama.cpp HTTP API (chat completions, streaming, raw completions, health check, model listing, benchmarking, full CLI interface)
- nexus/llama_provider.py: Hermes inference-router provider adapter
  - Activates when external APIs fail, when LOCAL_ONLY=true is set, or on an explicit local request
  - Responses normalized to OpenAI-compatible chat completions
  - Token usage estimated and logged
  - Health caching with a TTL for efficiency

PHASE 3 — Optimization & Ops:
- Benchmarking: client.benchmark() + a CLI benchmark command
- Quantization guide: Q4_K_M recommended for the fleet, Q6_K for high-RAM hosts, Q3_K for low-RAM hosts
- Model recommendations for VPS Beta (3B), VPS Alpha (7B), and Mac (7B Q6_K)
- Night watch integration: health probe script with auto-restart

Fleet standard model: Qwen2.5-7B-Instruct-Q4_K_M.gguf
Default endpoint: http://localhost:11435

22 tests pass.
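For quick review, here is a minimal usage sketch of the new client. It sticks to the surface exercised by the test file below (`LlamaClient`, `ChatMessage`, `health_check`, `simple_chat`, `chat`, `benchmark`); treat the prompts and printed fields as illustrative and check `bin/llama_client.py` for the authoritative signatures.

```python
from bin.llama_client import LlamaClient, ChatMessage

# Defaults to http://localhost:11435 (overridable via LLAMA_ENDPOINT).
client = LlamaClient()

# Verify the server is up and a model is loaded before routing work to it.
status = client.health_check()
if not status.healthy:
    raise SystemExit(f"llama-server unavailable: {status.error}")

# One-shot helper: returns just the reply text.
print(client.simple_chat("Summarize the fleet model policy.", system="You are terse."))

# Full chat call: returns a response with text, token usage, and latency.
resp = client.chat(
    [ChatMessage(role="user", content="Hello")],
    max_tokens=100,
    temperature=0.3,
)
print(resp.text, resp.tokens_used, resp.latency_ms)

# Phase 3 benchmarking (also exposed via the CLI benchmark command).
print(client.benchmark(iterations=3))
```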
"""Tests for llama_client — OpenAI-compatible client for llama.cpp."""
|
|
import json
|
|
from unittest.mock import MagicMock, patch
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
import sys
|
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
|
|
from bin.llama_client import (
|
|
LlamaClient,
|
|
ChatMessage,
|
|
CompletionResponse,
|
|
HealthStatus,
|
|
)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# ChatMessage
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestChatMessage:
|
|
def test_creation(self):
|
|
msg = ChatMessage(role="user", content="Hello")
|
|
assert msg.role == "user"
|
|
assert msg.content == "Hello"
|
|
|
|
def test_system_message(self):
|
|
msg = ChatMessage(role="system", content="You are helpful.")
|
|
assert msg.role == "system"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# HealthStatus
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestHealthStatus:
|
|
def test_healthy(self):
|
|
status = HealthStatus(healthy=True, endpoint="http://localhost:11435", model_loaded=True)
|
|
assert status.healthy is True
|
|
assert status.model_loaded is True
|
|
|
|
def test_unhealthy(self):
|
|
status = HealthStatus(healthy=False, endpoint="http://localhost:11435", error="Connection refused")
|
|
assert status.healthy is False
|
|
assert status.error == "Connection refused"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# LlamaClient
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestLlamaClient:
|
|
def test_default_endpoint(self):
|
|
client = LlamaClient()
|
|
assert client.endpoint == "http://localhost:11435"
|
|
|
|
def test_custom_endpoint(self):
|
|
client = LlamaClient(endpoint="http://192.168.1.10:8080")
|
|
assert client.endpoint == "http://192.168.1.10:8080"
|
|
|
|
def test_trailing_slash_stripped(self):
|
|
client = LlamaClient(endpoint="http://localhost:11435/")
|
|
assert client.endpoint == "http://localhost:11435"
|
|
|
|
def test_custom_model(self):
|
|
client = LlamaClient(model="mistral-7b")
|
|
assert client.model == "mistral-7b"
|
|
|
|
@patch("bin.llama_client._http_get")
|
|
def test_health_check_success(self, mock_get):
|
|
mock_get.return_value = {"status": "ok", "model_loaded": True}
|
|
client = LlamaClient()
|
|
status = client.health_check()
|
|
assert status.healthy is True
|
|
assert status.model_loaded is True
|
|
mock_get.assert_called_once_with("http://localhost:11435/health")
|
|
|
|
@patch("bin.llama_client._http_get")
|
|
def test_health_check_failure(self, mock_get):
|
|
mock_get.side_effect = ConnectionError("refused")
|
|
client = LlamaClient()
|
|
status = client.health_check()
|
|
assert status.healthy is False
|
|
assert "refused" in status.error
|
|
|
|
@patch("bin.llama_client._http_get")
|
|
def test_is_healthy_true(self, mock_get):
|
|
mock_get.return_value = {"status": "ok"}
|
|
client = LlamaClient()
|
|
assert client.is_healthy() is True
|
|
|
|
@patch("bin.llama_client._http_get")
|
|
def test_is_healthy_false(self, mock_get):
|
|
mock_get.side_effect = ConnectionError("down")
|
|
client = LlamaClient()
|
|
assert client.is_healthy() is False
|
|
|
|
@patch("bin.llama_client._http_get")
|
|
def test_list_models(self, mock_get):
|
|
mock_get.return_value = {
|
|
"data": [{"id": "qwen2.5-7b", "object": "model"}]
|
|
}
|
|
client = LlamaClient()
|
|
models = client.list_models()
|
|
assert len(models) == 1
|
|
assert models[0]["id"] == "qwen2.5-7b"
|
|
|
|
@patch("bin.llama_client._http_get")
|
|
def test_list_models_empty(self, mock_get):
|
|
mock_get.side_effect = ConnectionError("down")
|
|
client = LlamaClient()
|
|
models = client.list_models()
|
|
assert models == []
|
|
|
|
@patch("bin.llama_client._http_post")
|
|
def test_chat_success(self, mock_post):
|
|
mock_post.return_value = {
|
|
"model": "qwen2.5-7b",
|
|
"choices": [{"message": {"content": "Hello! How can I help?"}, "finish_reason": "stop"}],
|
|
"usage": {"total_tokens": 25},
|
|
}
|
|
client = LlamaClient()
|
|
messages = [ChatMessage(role="user", content="Hello")]
|
|
response = client.chat(messages)
|
|
assert response.text == "Hello! How can I help?"
|
|
assert response.tokens_used == 25
|
|
assert response.finish_reason == "stop"
|
|
assert response.latency_ms > 0
|
|
|
|
@patch("bin.llama_client._http_post")
|
|
def test_chat_custom_params(self, mock_post):
|
|
mock_post.return_value = {
|
|
"choices": [{"message": {"content": "OK"}, "finish_reason": "stop"}],
|
|
"usage": {},
|
|
}
|
|
client = LlamaClient()
|
|
messages = [ChatMessage(role="user", content="test")]
|
|
client.chat(messages, max_tokens=100, temperature=0.3)
|
|
call_data = mock_post.call_args[0][1]
|
|
assert call_data["max_tokens"] == 100
|
|
assert call_data["temperature"] == 0.3
|
|
|
|
@patch("bin.llama_client._http_post")
|
|
def test_chat_connection_error(self, mock_post):
|
|
mock_post.side_effect = ConnectionError("down")
|
|
client = LlamaClient()
|
|
messages = [ChatMessage(role="user", content="test")]
|
|
with pytest.raises(ConnectionError):
|
|
client.chat(messages)
|
|
|
|
@patch("bin.llama_client._http_post")
|
|
def test_simple_chat(self, mock_post):
|
|
mock_post.return_value = {
|
|
"choices": [{"message": {"content": "I am well!"}, "finish_reason": "stop"}],
|
|
"usage": {"total_tokens": 15},
|
|
}
|
|
client = LlamaClient()
|
|
result = client.simple_chat("How are you?")
|
|
assert result == "I am well!"
|
|
|
|
@patch("bin.llama_client._http_post")
|
|
def test_simple_chat_with_system(self, mock_post):
|
|
mock_post.return_value = {
|
|
"choices": [{"message": {"content": "Yes"}, "finish_reason": "stop"}],
|
|
"usage": {},
|
|
}
|
|
client = LlamaClient()
|
|
client.simple_chat("Are you helpful?", system="You are helpful.")
|
|
call_data = mock_post.call_args[0][1]
|
|
assert len(call_data["messages"]) == 2
|
|
assert call_data["messages"][0]["role"] == "system"
|
|
|
|
@patch("bin.llama_client._http_post")
|
|
def test_complete(self, mock_post):
|
|
mock_post.return_value = {
|
|
"content": "Once upon a time...",
|
|
"tokens_predicted": 50,
|
|
}
|
|
client = LlamaClient()
|
|
response = client.complete("Once upon a time")
|
|
assert response.text == "Once upon a time..."
|
|
assert response.tokens_used == 50
|
|
|
|
@patch("bin.llama_client.time.time")
|
|
@patch("bin.llama_client._http_post")
|
|
def test_benchmark(self, mock_post, mock_time):
|
|
mock_post.return_value = {
|
|
"choices": [{"message": {"content": "OK"}, "finish_reason": "stop"}],
|
|
"usage": {"total_tokens": 10},
|
|
}
|
|
# Simulate 50ms latency per call
|
|
mock_time.side_effect = [0.0, 0.05, 0.05, 0.1, 0.1, 0.15, 0.15, 0.2, 0.2, 0.25]
|
|
client = LlamaClient()
|
|
result = client.benchmark(iterations=3)
|
|
assert result["iterations"] == 3
|
|
assert result["avg_latency_ms"] > 0
|
|
assert result["tok_per_sec"] > 0
|
|
|
|
def test_env_override(self):
|
|
with patch.dict("os.environ", {"LLAMA_ENDPOINT": "http://custom:9999"}):
|
|
from importlib import reload
|
|
import bin.llama_client as mod
|
|
reload(mod)
|
|
# Default endpoint reads from env at import time
|
|
assert mod.DEFAULT_ENDPOINT == "http://custom:9999"
|
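For the Phase 3 night-watch item, this is the shape a probe-and-restart script can take on top of `is_healthy()`. It is an illustration, not the script shipped in this PR: the unit name `llama-server.service` comes from the Phase 1 file list, while the sudo call, the 30-second warm-up wait, and the exit-code convention are assumptions made for the sketch.

```python
#!/usr/bin/env python3
"""Illustrative night-watch probe: restart llama-server when the health check fails."""
import subprocess
import sys
import time

from bin.llama_client import LlamaClient


def probe_and_restart(endpoint: str = "http://localhost:11435") -> int:
    client = LlamaClient(endpoint=endpoint)
    if client.is_healthy():
        return 0

    # Health check failed: bounce the hardened unit from Phase 1, then re-probe.
    subprocess.run(["sudo", "systemctl", "restart", "llama-server.service"], check=False)
    time.sleep(30)  # give llama-server time to reload the model before re-checking
    return 0 if client.is_healthy() else 1


if __name__ == "__main__":
    sys.exit(probe_and_restart())
```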