74 lines
3.1 KiB
Python
74 lines
3.1 KiB
Python
"""llama_provider.py — Hermes inference router provider for llama.cpp."""
|
|
import logging, os, time
|
|
from dataclasses import dataclass
|
|
from typing import Optional
|
|
from bin.llama_client import ChatMessage, LlamaClient
|
|
|
|
logger = logging.getLogger("nexus.llama_provider")


def _env_flag(name: str, default: str) -> bool:
    """Interpret the environment variable *name* as a boolean flag.

    "true", "1", and "yes" (case-insensitive) are truthy; every other
    value — including the *default* used when the variable is unset —
    is falsy. Extracted so the parsing rule lives in exactly one place.
    """
    return os.environ.get(name, default).lower() in ("true", "1", "yes")


# llama.cpp server location and model selection (overridable via environment).
LLAMA_ENDPOINT = os.environ.get("LLAMA_ENDPOINT", "http://localhost:11435")
LLAMA_MODEL = os.environ.get("LLAMA_MODEL", "qwen2.5-7b")

# When true, route all inference to the local provider unconditionally.
LOCAL_ONLY = _env_flag("LOCAL_ONLY", "false")

# When true, fall back to local llama.cpp after an external provider fails.
FALLBACK_ON_FAILURE = _env_flag("LLAMA_FALLBACK", "true")
|
|
|
|
@dataclass
class ProviderResult:
    """Outcome of one inference call routed through a provider.

    On failure, ``text`` is empty and ``error`` carries the reason;
    on success ``error`` is None.
    """

    # Generated completion text (empty string when the call failed).
    text: str
    # Backend identifier; this module always produces "llama.cpp".
    provider: str = "llama.cpp"
    # Model name reported by the server, or the client's configured model.
    model: str = ""
    # Token count reported by the backend for this completion.
    tokens_used: int = 0
    # Wall time of the request in milliseconds.
    latency_ms: float = 0.0
    # Backend finish reason (e.g. stop/length), as returned by the client.
    finish_reason: str = ""
    # True for this provider: inference ran on the local machine.
    is_local: bool = True
    # Human-readable failure description; None on success.
    error: Optional[str] = None
|
|
|
|
class LlamaProvider:
    """Inference provider backed by a local llama.cpp server.

    Wraps ``LlamaClient`` with TTL-cached health checks, message
    validation, and the routing-policy helpers the Hermes router calls.
    """

    def __init__(self, endpoint=LLAMA_ENDPOINT, model=LLAMA_MODEL, local_only=LOCAL_ONLY,
                 health_ttl: float = 30.0):
        """
        Args:
            endpoint: Base URL of the llama.cpp server.
            model: Model identifier requested from the server.
            local_only: If true, this provider is always selected.
            health_ttl: Seconds a health-check verdict stays cached
                (previously a hard-coded 30; default preserves behavior).
        """
        self.client = LlamaClient(endpoint=endpoint, model=model)
        self.local_only = local_only
        self.endpoint = endpoint
        self.health_ttl = health_ttl
        self._last_health: Optional[bool] = None  # None = never checked
        self._last_check = 0.0  # monotonic timestamp of last real check

    def available(self) -> bool:
        """Return True when the server is healthy and a model is loaded.

        The verdict is cached for ``health_ttl`` seconds so every infer()
        call doesn't hit the health endpoint.
        """
        # time.monotonic(), not time.time(): wall-clock adjustments (NTP,
        # DST, manual changes) must not stretch or invalidate the TTL.
        now = time.monotonic()
        if self._last_health is not None and (now - self._last_check) < self.health_ttl:
            return self._last_health
        status = self.client.health_check()
        self._last_health = status.healthy and status.model_loaded
        self._last_check = now
        if not self._last_health:
            logger.warning("llama.cpp unhealthy: %s", status.error or "model not loaded")
        return self._last_health

    def infer(self, messages, max_tokens=512, temperature=0.7, model=None, **kwargs) -> ProviderResult:
        """Run a chat completion against the local server.

        Args:
            messages: Iterable of dicts with "role" and "content" keys;
                entries missing either key are silently dropped.
            max_tokens: Completion-length cap forwarded to the client.
            temperature: Sampling temperature forwarded to the client.
            model: Accepted for router-interface compatibility but not
                forwarded; the client uses its construction-time model.
                NOTE(review): confirm whether a per-call model override
                should be plumbed through to LlamaClient.chat.
            **kwargs: Ignored; present so router-level extras don't crash.

        Returns:
            ProviderResult with the generated text, or with ``error`` set
            and empty ``text`` on failure — this method never raises.
        """
        if not self.available():
            return ProviderResult(text="", error=f"llama.cpp at {self.endpoint} unavailable")
        chat_msgs = [
            ChatMessage(m["role"], m["content"])
            for m in messages
            if "role" in m and "content" in m
        ]
        if not chat_msgs:
            return ProviderResult(text="", error="No valid messages")
        start = time.monotonic()  # monotonic clock for a trustworthy latency
        try:
            resp = self.client.chat(chat_msgs, max_tokens=max_tokens, temperature=temperature)
            return ProviderResult(
                text=resp.text,
                provider="llama.cpp",
                model=resp.model or self.client.model,
                tokens_used=resp.tokens_used,
                latency_ms=(time.monotonic() - start) * 1000,
                finish_reason=resp.finish_reason,
                is_local=True,
            )
        except Exception as e:
            # Best-effort contract: callers receive an error result, not a raise.
            logger.error("llama.cpp failed: %s", e)
            return ProviderResult(text="", error=str(e))

    def should_use_local(self, external_failed: bool = False, explicit_local: bool = False) -> bool:
        """Routing policy: decide whether inference should run locally.

        Local wins when LOCAL_ONLY mode is on, when the caller explicitly
        asked for it, or — with fallback enabled — when an external
        provider just failed and the local server is actually reachable.
        """
        if self.local_only:
            return True
        if explicit_local:
            return True
        if external_failed and FALLBACK_ON_FAILURE:
            return self.available()
        return False

    def status(self) -> dict:
        """Return a JSON-serializable health snapshot (always uncached)."""
        h = self.client.health_check()
        return {
            "provider": "llama.cpp",
            "endpoint": self.endpoint,
            "healthy": h.healthy,
            "model_loaded": h.model_loaded,
            "model_name": h.model_name,
            "local_only": self.local_only,
        }

    def get_name(self) -> str:
        """Provider identifier used by the router."""
        return "llama.cpp"

    def get_priority(self) -> int:
        """Lower sorts first: 0 forces local-first when LOCAL_ONLY is set."""
        return 0 if self.local_only else 100
|