feat: standardize llama.cpp backend (#1123 )

2026-04-14 01:42:40 +00:00 · 2026-04-14 01:42:37 +00:00 · 2026-04-14 01:42:29 +00:00 · 2026-04-14 01:41:35 +00:00 · 2026-04-14 01:40:14 +00:00
5 changed files with 405 additions and 0 deletions
--- a/bin/llama_client.py
+++ b/bin/llama_client.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+"""llama_client.py — OpenAI-compatible client for llama.cpp HTTP API."""
+import argparse, json, os, sys, time
+from dataclasses import dataclass
+from typing import Generator, Optional
+import urllib.request, urllib.error
+
+# Defaults are read from the environment once, at import time.
+# int()/float() raise ValueError during import if LLAMA_MAX_TOKENS or
+# LLAMA_TEMPERATURE is set to a non-numeric value.
+DEFAULT_ENDPOINT = os.environ.get("LLAMA_ENDPOINT", "http://localhost:11435")
+DEFAULT_MODEL = os.environ.get("LLAMA_MODEL", "qwen2.5-7b")
+DEFAULT_MAX_TOKENS = int(os.environ.get("LLAMA_MAX_TOKENS", "512"))
+DEFAULT_TEMPERATURE = float(os.environ.get("LLAMA_TEMPERATURE", "0.7"))
+
+@dataclass
+class ChatMessage:
+    """One chat turn; sent to the server as a {"role", "content"} object."""
+    role: str  # this file itself only uses "system" and "user"
+    content: str
+
+@dataclass
+class CompletionResponse:
+    """Result of a chat or raw completion request."""
+    text: str  # generated text
+    # chat() fills this from usage.total_tokens, complete() from tokens_predicted.
+    tokens_used: int = 0
+    latency_ms: float = 0.0  # elapsed time of the HTTP round trip, in milliseconds
+    model: str = ""
+    finish_reason: str = ""  # taken from the first choice by chat(); complete() leaves it empty
+
+@dataclass
+class HealthStatus:
+    """Outcome of a GET /health probe as produced by LlamaClient.health_check()."""
+    healthy: bool  # True when the request completed without raising
+    endpoint: str  # base URL that was probed
+    model_loaded: bool = False
+    model_name: str = ""  # "model_path" from the reply, else the client's configured model
+    error: str = ""  # str() of the exception when the probe failed
+
+def _http_post(url, data, timeout=120):
+    body = json.dumps(data).encode()
+    req = urllib.request.Request(url, data=body, headers={"Content-Type": "application/json"}, method="POST")
+    with urllib.request.urlopen(req, timeout=timeout) as resp:
+        return json.loads(resp.read())
+
+def _http_get(url, timeout=10):
+    req = urllib.request.Request(url, headers={"Accept": "application/json"})
+    with urllib.request.urlopen(req, timeout=timeout) as resp:
+        return json.loads(resp.read())
+
+class LlamaClient:
+    def __init__(self, endpoint=DEFAULT_ENDPOINT, model=DEFAULT_MODEL):
+        self.endpoint = endpoint.rstrip("/")
+        self.model = model
+
+    def health_check(self) -> HealthStatus:
+        try:
+            data = _http_get(f"{self.endpoint}/health")
+            return HealthStatus(healthy=True, endpoint=self.endpoint,
+                model_loaded=data.get("status") == "ok" or data.get("model_loaded", False),
+                model_name=data.get("model_path", self.model))
+        except Exception as e:
+            return HealthStatus(healthy=False, endpoint=self.endpoint, error=str(e))
+
+    def is_healthy(self) -> bool:
+        return self.health_check().healthy
+
+    def list_models(self) -> list:
+        try:
+            data = _http_get(f"{self.endpoint}/v1/models")
+            return data.get("data", [])
+        except Exception:
+            return []
+
+    def chat(self, messages, max_tokens=DEFAULT_MAX_TOKENS, temperature=DEFAULT_TEMPERATURE, stream=False):
+        payload = {"model": self.model,
+            "messages": [{"role": m.role, "content": m.content} for m in messages],
+            "max_tokens": max_tokens, "temperature": temperature, "stream": stream}
+        start = time.time()
+        data = _http_post(f"{self.endpoint}/v1/chat/completions", payload)
+        latency = (time.time() - start) * 1000
+        choice = data.get("choices", [{}])[0]
+        msg = choice.get("message", {})
+        usage = data.get("usage", {})
+        return CompletionResponse(text=msg.get("content", ""),
+            tokens_used=usage.get("total_tokens", 0), latency_ms=latency,
+            model=data.get("model", self.model), finish_reason=choice.get("finish_reason", ""))
+
+    def chat_stream(self, messages, max_tokens=DEFAULT_MAX_TOKENS, temperature=DEFAULT_TEMPERATURE):
+        payload = {"model": self.model,
+            "messages": [{"role": m.role, "content": m.content} for m in messages],
+            "max_tokens": max_tokens, "temperature": temperature, "stream": True}
+        req = urllib.request.Request(f"{self.endpoint}/v1/chat/completions",
+            data=json.dumps(payload).encode(), headers={"Content-Type": "application/json"}, method="POST")
+        with urllib.request.urlopen(req, timeout=300) as resp:
+            for line in resp:
+                line = line.decode().strip()
+                if line.startswith("data: "):
+                    chunk = line[6:]
+                    if chunk == "[DONE]": break
+                    try:
+                        data = json.loads(chunk)
+                        content = data.get("choices", [{}])[0].get("delta", {}).get("content", "")
+                        if content: yield content
+                    except json.JSONDecodeError: continue
+
+    def simple_chat(self, prompt, system=None, max_tokens=DEFAULT_MAX_TOKENS):
+        messages = []
+        if system: messages.append(ChatMessage(role="system", content=system))
+        messages.append(ChatMessage(role="user", content=prompt))
+        return self.chat(messages, max_tokens=max_tokens).text
+
+    def complete(self, prompt, max_tokens=DEFAULT_MAX_TOKENS, temperature=DEFAULT_TEMPERATURE):
+        payload = {"prompt": prompt, "n_predict": max_tokens, "temperature": temperature}
+        start = time.time()
+        data = _http_post(f"{self.endpoint}/completion", payload)
+        return CompletionResponse(text=data.get("content", ""),
+            tokens_used=data.get("tokens_predicted", 0), latency_ms=(time.time()-start)*1000, model=self.model)
+
+    def benchmark(self, prompt="Explain sovereignty in 3 sentences.", iterations=5, max_tokens=128):
+        latencies, token_counts = [], []
+        for _ in range(iterations):
+            resp = self.chat([ChatMessage(role="user", content=prompt)], max_tokens=max_tokens)
+            latencies.append(resp.latency_ms)
+            token_counts.append(resp.tokens_used)
+        avg_lat = sum(latencies)/len(latencies)
+        avg_tok = sum(token_counts)/len(token_counts)
+        return {"iterations": iterations, "prompt": prompt,
+            "avg_latency_ms": round(avg_lat, 1), "min_latency_ms": round(min(latencies), 1),
+            "max_latency_ms": round(max(latencies), 1), "avg_tokens": round(avg_tok, 1),
+            "tok_per_sec": round((avg_tok/avg_lat)*1000 if avg_lat > 0 else 0, 1)}
+
+def main():
+    p = argparse.ArgumentParser(description="llama.cpp client CLI")
+    p.add_argument("--url", default=DEFAULT_ENDPOINT)
+    p.add_argument("--model", default=DEFAULT_MODEL)
+    sub = p.add_subparsers(dest="cmd")
+    sub.add_parser("health")
+    sub.add_parser("models")
+    cp = sub.add_parser("chat"); cp.add_argument("prompt"); cp.add_argument("--system"); cp.add_argument("--max-tokens", type=int, default=DEFAULT_MAX_TOKENS); cp.add_argument("--stream", action="store_true")
+    bp = sub.add_parser("benchmark"); bp.add_argument("--prompt", default="Explain sovereignty."); bp.add_argument("--iterations", type=int, default=5); bp.add_argument("--max-tokens", type=int, default=128)
+    args = p.parse_args()
+    client = LlamaClient(args.url, args.model)
+    if args.cmd == "health":
+        print(json.dumps(client.health_check().__dict__, indent=2)); sys.exit(0 if client.is_healthy() else 1)
+    elif args.cmd == "models":
+        print(json.dumps(client.list_models(), indent=2))
+    elif args.cmd == "chat":
+        if args.stream:
+            msgs = []
+            if args.system: msgs.append(ChatMessage("system", args.system))
+            msgs.append(ChatMessage("user", args.prompt))
+            for chunk in client.chat_stream(msgs, max_tokens=args.max_tokens): print(chunk, end="", flush=True)
+            print()
+        else: print(client.simple_chat(args.prompt, system=args.system, max_tokens=args.max_tokens))
+    elif args.cmd == "benchmark":
+        print(json.dumps(client.benchmark(args.prompt, args.iterations, args.max_tokens), indent=2))
+    else: p.print_help()
+
+if __name__ == "__main__": main()
--- a/docs/local-llm.md
+++ b/docs/local-llm.md
@@ -0,0 +1,54 @@
+# Local LLM Deployment Guide — llama.cpp Sovereign Inference
+
+llama.cpp provides sovereign, offline-capable inference on CPU, CUDA, and
+Apple Silicon. One binary, one model path, one health endpoint.
+
+## Quick Start
+
+    git clone https://github.com/ggerganov/llama.cpp.git
+    cd llama.cpp && cmake -B build && cmake --build build --config Release -j$(nproc)
+    sudo cp build/bin/llama-server /usr/local/bin/
+    mkdir -p /opt/models/llama
+    wget -O /opt/models/llama/Qwen2.5-7B-Instruct-Q4_K_M.gguf "https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF/resolve/main/qwen2.5-7b-instruct-q4_k_m.gguf"
+    llama-server -m /opt/models/llama/Qwen2.5-7B-Instruct-Q4_K_M.gguf --host 0.0.0.0 --port 11435 -c 4096 -t $(nproc) --cont-batching
+    curl http://localhost:11435/health
+
+## Model Path Convention
+
+- /opt/models/llama/ — Production (system-wide)
+- ~/models/llama/ — Per-user (dev)
+- MODEL_DIR env var — Override
+
+## Recommended Models
+
+- Qwen2.5-7B-Instruct (4.7GB, 8GB RAM, 25-40 tok/s) — Fleet standard
+- Qwen2.5-3B-Instruct (2.0GB, 4GB RAM, 50-80 tok/s) — VPS Beta
+- Mistral-7B-Instruct-v0.3 (4.4GB, 8GB RAM) — Alternative
+
+## Quantization Guide
+
+- Q6_K (5.5GB) — Best quality/speed, RAM > 12GB
+- Q4_K_M (4.7GB) — Fleet standard
+- Q3_K_M (3.4GB) — < 6GB RAM fallback
+
+## Hardware Targets
+
+- VPS Beta (2 vCPU, 4GB): Qwen2.5-3B-Q4_K_M, ctx 2048, ~40-60 tok/s
+- VPS Alpha (4 vCPU, 8GB): Qwen2.5-7B-Q4_K_M, ctx 4096, ~20-35 tok/s
+- Mac Apple Silicon: Qwen2.5-7B-Q6_K, Metal, ~30-50 tok/s
+
+## Health Check
+
+    curl -sf http://localhost:11435/health
+    curl -s http://localhost:11435/v1/models
+
+## API Compatibility
+
+llama-server exposes an OpenAI-compatible API at /v1/chat/completions.
+
+## Troubleshooting
+
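+- Server won't start: try a smaller model or a lower quantization (e.g. Q3_K_M)
+- Slow generation: set -t to the number of available cores
+- Out of memory (OOM): reduce the -c context size
+- Port already in use: find the process with lsof -i :11435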
--- a/nexus/llama_provider.py
+++ b/nexus/llama_provider.py
@@ -0,0 +1,73 @@
+"""llama_provider.py — Hermes inference router provider for llama.cpp local server."""
+import logging, os, time
+from dataclasses import dataclass
+from typing import Optional
+from bin.llama_client import ChatMessage, LlamaClient
+
+logger = logging.getLogger("nexus.llama_provider")
+
+# Environment-driven settings, read once at import time.
+LLAMA_ENDPOINT = os.environ.get("LLAMA_ENDPOINT", "http://localhost:11435")
+LLAMA_MODEL = os.environ.get("LLAMA_MODEL", "qwen2.5-7b")
+# Boolean flags: "true", "1" or "yes" (case-insensitive) enable them.
+LOCAL_ONLY = os.environ.get("LOCAL_ONLY", "false").lower() in ("true", "1", "yes")
+# When enabled, should_use_local() offers the local server after an external failure.
+FALLBACK_ON_FAILURE = os.environ.get("LLAMA_FALLBACK", "true").lower() in ("true", "1", "yes")
+
+@dataclass
+class ProviderResult:
+    """Outcome of one LlamaProvider.infer() call, successful or not."""
+    text: str  # generated text; "" on every error path in infer()
+    provider: str = "llama.cpp"
+    model: str = ""
+    tokens_used: int = 0
+    latency_ms: float = 0.0  # measured around the client call, in milliseconds
+    finish_reason: str = ""
+    is_local: bool = True
+    error: Optional[str] = None  # None on success, otherwise a short description
+
+class LlamaProvider:
+    def __init__(self, endpoint=LLAMA_ENDPOINT, model=LLAMA_MODEL, local_only=LOCAL_ONLY):
+        self.client = LlamaClient(endpoint=endpoint, model=model)
+        self.local_only = local_only
+        self.endpoint = endpoint
+        self._last_health = None
+        self._last_check = 0.0
+
+    def available(self):
+        now = time.time()
+        if self._last_health is not None and (now - self._last_check) < 30:
+            return self._last_health
+        status = self.client.health_check()
+        self._last_health = status.healthy and status.model_loaded
+        self._last_check = now
+        if not self._last_health:
+            logger.warning("llama.cpp unhealthy: %s", status.error or "model not loaded")
+        return self._last_health
+
+    def infer(self, messages, max_tokens=512, temperature=0.7, model=None, **kwargs):
+        if not self.available():
+            return ProviderResult(text="", error=f"llama.cpp at {self.endpoint} unavailable")
+        chat_msgs = [ChatMessage(m["role"], m["content"]) for m in messages if "role" in m and "content" in m]
+        if not chat_msgs:
+            return ProviderResult(text="", error="No valid messages")
+        start = time.time()
+        try:
+            resp = self.client.chat(chat_msgs, max_tokens=max_tokens, temperature=temperature)
+            return ProviderResult(text=resp.text, provider="llama.cpp",
+                model=resp.model or self.client.model, tokens_used=resp.tokens_used,
+                latency_ms=(time.time()-start)*1000, finish_reason=resp.finish_reason, is_local=True)
+        except Exception as e:
+            logger.error("llama.cpp failed: %s", e)
+            return ProviderResult(text="", error=str(e))
+
+    def should_use_local(self, external_failed=False, explicit_local=False):
+        if self.local_only: return True
+        if explicit_local: return True
+        if external_failed and FALLBACK_ON_FAILURE: return self.available()
+        return False
+
+    def status(self):
+        h = self.client.health_check()
+        return {"provider": "llama.cpp", "endpoint": self.endpoint,
+            "healthy": h.healthy, "model_loaded": h.model_loaded,
+            "model_name": h.model_name, "local_only": self.local_only}
+
+    def get_name(self): return "llama.cpp"
+    def get_priority(self): return 0 if self.local_only else 100
--- a/systemd/llama-server.service
+++ b/systemd/llama-server.service
@@ -0,0 +1,29 @@
+# systemd unit running llama-server as a long-lived local inference service.
+[Unit]
+Description=llama.cpp Local LLM Server
+After=network-online.target
+Wants=network-online.target
+
+[Service]
+Type=simple
+# NOTE(review): the server runs as root; consider a dedicated unprivileged
+# user that can read the model directory.
+User=root
+Environment=MODEL_PATH=/opt/models/llama/Qwen2.5-7B-Instruct-Q4_K_M.gguf
+# NOTE(review): 0.0.0.0 listens on every interface; use 127.0.0.1 if only
+# local clients need access.
+Environment=LLAMA_HOST=0.0.0.0
+Environment=LLAMA_PORT=11435
+Environment=LLAMA_CTX_SIZE=4096
+Environment=LLAMA_THREADS=4
+ExecStart=/usr/local/bin/llama-server -m ${MODEL_PATH} --host ${LLAMA_HOST} --port ${LLAMA_PORT} -c ${LLAMA_CTX_SIZE} -t ${LLAMA_THREADS} --cont-batching
+Restart=on-failure
+RestartSec=10
+MemoryMax=12G
+# CPUQuota is relative to ONE CPU: 90% would squeeze all 4 worker threads
+# into less than a single core. 360% is 90% of the 4 cores in LLAMA_THREADS;
+# adjust both together.
+CPUQuota=360%
+NoNewPrivileges=true
+ProtectSystem=strict
+ProtectHome=read-only
+ReadWritePaths=/opt/models
+PrivateTmp=true
+StandardOutput=journal
+StandardError=journal
+SyslogIdentifier=llama-server
+
+[Install]
+WantedBy=multi-user.target
--- a/tests/test_llama_client.py
+++ b/tests/test_llama_client.py
@@ -0,0 +1,95 @@
+"""Tests for llama_client."""
+from unittest.mock import patch
+from pathlib import Path
+import pytest, sys
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+from bin.llama_client import LlamaClient, ChatMessage, HealthStatus
+
+class TestChatMessage:
+    def test_creation(self):
+        m = ChatMessage("user", "Hello")
+        assert m.role == "user" and m.content == "Hello"
+
+class TestHealthStatus:
+    def test_healthy(self):
+        s = HealthStatus(True, "http://x:11435", model_loaded=True)
+        assert s.healthy and s.model_loaded
+
+class TestLlamaClient:
+    def test_defaults(self):
+        c = LlamaClient()
+        assert c.endpoint == "http://localhost:11435"
+        assert c.model == "qwen2.5-7b"
+
+    def test_custom(self):
+        c = LlamaClient("http://x:8080", "mistral")
+        assert c.endpoint == "http://x:8080" and c.model == "mistral"
+
+    def test_trailing_slash(self):
+        assert LlamaClient("http://x/").endpoint == "http://x"
+
+    @patch("bin.llama_client._http_get")
+    def test_health_ok(self, m):
+        m.return_value = {"status": "ok"}
+        assert LlamaClient().health_check().healthy is True
+
+    @patch("bin.llama_client._http_get")
+    def test_health_fail(self, m):
+        m.side_effect = ConnectionError("down")
+        s = LlamaClient().health_check()
+        assert s.healthy is False and "down" in s.error
+
+    @patch("bin.llama_client._http_get")
+    def test_is_healthy(self, m):
+        m.return_value = {"status": "ok"}
+        assert LlamaClient().is_healthy() is True
+        m.side_effect = ConnectionError()
+        assert LlamaClient().is_healthy() is False
+
+    @patch("bin.llama_client._http_get")
+    def test_list_models(self, m):
+        m.return_value = {"data": [{"id": "qwen"}]}
+        assert len(LlamaClient().list_models()) == 1
+
+    @patch("bin.llama_client._http_get")
+    def test_list_models_fail(self, m):
+        m.side_effect = ConnectionError()
+        assert LlamaClient().list_models() == []
+
+    @patch("bin.llama_client._http_post")
+    def test_chat(self, m):
+        m.return_value = {"choices": [{"message": {"content": "Hi"}, "finish_reason": "stop"}], "usage": {"total_tokens": 10}}
+        r = LlamaClient().chat([ChatMessage("user", "test")])
+        assert r.text == "Hi" and r.tokens_used == 10
+
+    @patch("bin.llama_client._http_post")
+    def test_chat_params(self, m):
+        m.return_value = {"choices": [{"message": {"content": "OK"}, "finish_reason": "stop"}], "usage": {}}
+        LlamaClient().chat([ChatMessage("user", "t")], max_tokens=100, temperature=0.3)
+        d = m.call_args[0][1]
+        assert d["max_tokens"] == 100 and d["temperature"] == 0.3
+
+    @patch("bin.llama_client._http_post")
+    def test_simple_chat(self, m):
+        m.return_value = {"choices": [{"message": {"content": "Yes"}, "finish_reason": "stop"}], "usage": {}}
+        assert LlamaClient().simple_chat("test") == "Yes"
+
+    @patch("bin.llama_client._http_post")
+    def test_simple_chat_system(self, m):
+        m.return_value = {"choices": [{"message": {"content": "OK"}, "finish_reason": "stop"}], "usage": {}}
+        LlamaClient().simple_chat("t", system="be helpful")
+        assert len(m.call_args[0][1]["messages"]) == 2
+
+    @patch("bin.llama_client._http_post")
+    def test_complete(self, m):
+        m.return_value = {"content": "result", "tokens_predicted": 50}
+        r = LlamaClient().complete("prompt")
+        assert r.text == "result" and r.tokens_used == 50
+
+    @patch("bin.llama_client.time.time")
+    @patch("bin.llama_client._http_post")
+    def test_benchmark(self, mp, mt):
+        mp.return_value = {"choices": [{"message": {"content": "OK"}, "finish_reason": "stop"}], "usage": {"total_tokens": 10}}
+        mt.side_effect = [0.0, 0.05, 0.05, 0.1, 0.1, 0.15]
+        r = LlamaClient().benchmark(iterations=2)
+        assert r["iterations"] == 2 and r["avg_latency_ms"] > 0 and r["tok_per_sec"] > 0
Author	SHA1	Message	Date
Alexander Whitestone	449170070b	feat: standardize llama.cpp backend (#1123 ) Some checks failed Deploy Nexus / deploy (push) Has been cancelled Details Staging Verification Gate / verify-staging (push) Has been cancelled Details CI / test (pull_request) Failing after 49s Details Review Approval Gate / verify-review (pull_request) Failing after 7s Details CI / validate (pull_request) Failing after 53s Details	2026-04-14 01:42:40 +00:00
Alexander Whitestone	3ed6bce5a0	feat: standardize llama.cpp backend (#1123 ) Some checks failed Deploy Nexus / deploy (push) Has been cancelled Details Staging Verification Gate / verify-staging (push) Has been cancelled Details	2026-04-14 01:42:37 +00:00
Alexander Whitestone	2ecb4cd3a4	feat: standardize llama.cpp backend (#1123 ) Some checks failed Deploy Nexus / deploy (push) Has been cancelled Details Staging Verification Gate / verify-staging (push) Has been cancelled Details	2026-04-14 01:42:29 +00:00
Alexander Whitestone	1c67f91b74	feat: standardize llama.cpp backend for sovereign local inference (#1123 ) Some checks failed Deploy Nexus / deploy (push) Has been cancelled Details Staging Verification Gate / verify-staging (push) Has been cancelled Details	2026-04-14 01:41:35 +00:00
Alexander Whitestone	53d9a55444	feat: standardize llama.cpp backend for sovereign local inference (#1123 ) Some checks failed Deploy Nexus / deploy (push) Has been cancelled Details Staging Verification Gate / verify-staging (push) Has been cancelled Details	2026-04-14 01:40:14 +00:00