Files
the-nexus/nexus/llama_provider.py
Alexander Whitestone e27c51c6da
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled
Staging Verification Gate / verify-staging (push) Has been cancelled
feat: standardize llama.cpp backend for sovereign local inference (#1123)
2026-04-14 01:48:26 +00:00

74 lines
3.1 KiB
Python

"""llama_provider.py — Hermes inference router provider for llama.cpp."""
import logging, os, time
from dataclasses import dataclass
from typing import Optional
from bin.llama_client import ChatMessage, LlamaClient
logger = logging.getLogger("nexus.llama_provider")
LLAMA_ENDPOINT = os.environ.get("LLAMA_ENDPOINT", "http://localhost:11435")
LLAMA_MODEL = os.environ.get("LLAMA_MODEL", "qwen2.5-7b")
LOCAL_ONLY = os.environ.get("LOCAL_ONLY", "false").lower() in ("true", "1", "yes")
FALLBACK_ON_FAILURE = os.environ.get("LLAMA_FALLBACK", "true").lower() in ("true", "1", "yes")
@dataclass
class ProviderResult:
text: str
provider: str = "llama.cpp"
model: str = ""
tokens_used: int = 0
latency_ms: float = 0.0
finish_reason: str = ""
is_local: bool = True
error: Optional[str] = None
class LlamaProvider:
def __init__(self, endpoint=LLAMA_ENDPOINT, model=LLAMA_MODEL, local_only=LOCAL_ONLY):
self.client = LlamaClient(endpoint=endpoint, model=model)
self.local_only = local_only
self.endpoint = endpoint
self._last_health = None
self._last_check = 0.0
def available(self):
now = time.time()
if self._last_health is not None and (now - self._last_check) < 30:
return self._last_health
status = self.client.health_check()
self._last_health = status.healthy and status.model_loaded
self._last_check = now
if not self._last_health:
logger.warning("llama.cpp unhealthy: %s", status.error or "model not loaded")
return self._last_health
def infer(self, messages, max_tokens=512, temperature=0.7, model=None, **kwargs):
if not self.available():
return ProviderResult(text="", error=f"llama.cpp at {self.endpoint} unavailable")
chat_msgs = [ChatMessage(m["role"], m["content"]) for m in messages if "role" in m and "content" in m]
if not chat_msgs:
return ProviderResult(text="", error="No valid messages")
start = time.time()
try:
resp = self.client.chat(chat_msgs, max_tokens=max_tokens, temperature=temperature)
return ProviderResult(text=resp.text, provider="llama.cpp",
model=resp.model or self.client.model, tokens_used=resp.tokens_used,
latency_ms=(time.time()-start)*1000, finish_reason=resp.finish_reason, is_local=True)
except Exception as e:
logger.error("llama.cpp failed: %s", e)
return ProviderResult(text="", error=str(e))
def should_use_local(self, external_failed=False, explicit_local=False):
if self.local_only: return True
if explicit_local: return True
if external_failed and FALLBACK_ON_FAILURE: return self.available()
return False
def status(self):
h = self.client.health_check()
return {"provider": "llama.cpp", "endpoint": self.endpoint,
"healthy": h.healthy, "model_loaded": h.model_loaded,
"model_name": h.model_name, "local_only": self.local_only}
def get_name(self): return "llama.cpp"
def get_priority(self): return 0 if self.local_only else 100