Compare commits

...

9 Commits

SHA1 Message Date
84eb8104d8 feat: standardize llama.cpp backend for sovereign local inference (#1123)
2026-04-14 01:48:34 +00:00
93228388d7 feat: standardize llama.cpp backend for sovereign local inference (#1123)
2026-04-14 01:48:29 +00:00
e27c51c6da feat: standardize llama.cpp backend for sovereign local inference (#1123)
2026-04-14 01:48:26 +00:00
ed79826608 feat: standardize llama.cpp backend for sovereign local inference (#1123)
2026-04-14 01:48:23 +00:00
e438662c97 feat: standardize llama.cpp backend for sovereign local inference (#1123)
2026-04-14 01:48:17 +00:00
449170070b feat: standardize llama.cpp backend (#1123)
2026-04-14 01:42:40 +00:00
3ed6bce5a0 feat: standardize llama.cpp backend (#1123)
2026-04-14 01:42:37 +00:00
2ecb4cd3a4 feat: standardize llama.cpp backend (#1123)
2026-04-14 01:42:29 +00:00
1c67f91b74 feat: standardize llama.cpp backend for sovereign local inference (#1123)
2026-04-14 01:41:35 +00:00
5 changed files with 362 additions and 15 deletions

bin/llama_client.py (new file, 154 lines)

@@ -0,0 +1,154 @@
#!/usr/bin/env python3
"""llama_client.py — OpenAI-compatible client for llama.cpp HTTP API."""
import argparse
import json
import os
import sys
import time
import urllib.error
import urllib.request
from dataclasses import dataclass
from typing import Generator, Optional

DEFAULT_ENDPOINT = os.environ.get("LLAMA_ENDPOINT", "http://localhost:11435")
DEFAULT_MODEL = os.environ.get("LLAMA_MODEL", "qwen2.5-7b")
DEFAULT_MAX_TOKENS = int(os.environ.get("LLAMA_MAX_TOKENS", "512"))
DEFAULT_TEMPERATURE = float(os.environ.get("LLAMA_TEMPERATURE", "0.7"))


@dataclass
class ChatMessage:
    role: str
    content: str


@dataclass
class CompletionResponse:
    text: str
    tokens_used: int = 0
    latency_ms: float = 0.0
    model: str = ""
    finish_reason: str = ""


@dataclass
class HealthStatus:
    healthy: bool
    endpoint: str
    model_loaded: bool = False
    model_name: str = ""
    error: str = ""


def _http_post(url, data, timeout=120):
    body = json.dumps(data).encode()
    req = urllib.request.Request(url, data=body,
                                 headers={"Content-Type": "application/json"},
                                 method="POST")
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())


def _http_get(url, timeout=10):
    req = urllib.request.Request(url, headers={"Accept": "application/json"})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())


class LlamaClient:
    def __init__(self, endpoint=DEFAULT_ENDPOINT, model=DEFAULT_MODEL):
        self.endpoint = endpoint.rstrip("/")
        self.model = model

    def health_check(self) -> HealthStatus:
        # llama-server reports {"status": "ok"} once the model is loaded.
        try:
            data = _http_get(f"{self.endpoint}/health")
            return HealthStatus(
                healthy=True, endpoint=self.endpoint,
                model_loaded=data.get("status") == "ok" or data.get("model_loaded", False),
                model_name=data.get("model_path", self.model))
        except Exception as e:
            return HealthStatus(healthy=False, endpoint=self.endpoint, error=str(e))

    def is_healthy(self) -> bool:
        return self.health_check().healthy

    def list_models(self) -> list:
        try:
            data = _http_get(f"{self.endpoint}/v1/models")
            return data.get("data", [])
        except Exception:
            return []

    def chat(self, messages, max_tokens=DEFAULT_MAX_TOKENS,
             temperature=DEFAULT_TEMPERATURE, stream=False) -> CompletionResponse:
        # Blocking completion; token-by-token streaming lives in chat_stream().
        payload = {"model": self.model,
                   "messages": [{"role": m.role, "content": m.content} for m in messages],
                   "max_tokens": max_tokens, "temperature": temperature, "stream": stream}
        start = time.time()
        data = _http_post(f"{self.endpoint}/v1/chat/completions", payload)
        latency = (time.time() - start) * 1000
        choice = data.get("choices", [{}])[0]
        msg = choice.get("message", {})
        usage = data.get("usage", {})
        return CompletionResponse(text=msg.get("content", ""),
                                  tokens_used=usage.get("total_tokens", 0),
                                  latency_ms=latency,
                                  model=data.get("model", self.model),
                                  finish_reason=choice.get("finish_reason", ""))

    def chat_stream(self, messages, max_tokens=DEFAULT_MAX_TOKENS,
                    temperature=DEFAULT_TEMPERATURE) -> Generator[str, None, None]:
        # Parse the server-sent-events stream and yield content deltas.
        payload = {"model": self.model,
                   "messages": [{"role": m.role, "content": m.content} for m in messages],
                   "max_tokens": max_tokens, "temperature": temperature, "stream": True}
        req = urllib.request.Request(f"{self.endpoint}/v1/chat/completions",
                                     data=json.dumps(payload).encode(),
                                     headers={"Content-Type": "application/json"},
                                     method="POST")
        with urllib.request.urlopen(req, timeout=300) as resp:
            for line in resp:
                line = line.decode().strip()
                if not line.startswith("data: "):
                    continue
                chunk = line[6:]
                if chunk == "[DONE]":
                    break
                try:
                    data = json.loads(chunk)
                    content = data.get("choices", [{}])[0].get("delta", {}).get("content", "")
                    if content:
                        yield content
                except json.JSONDecodeError:
                    continue

    def simple_chat(self, prompt, system: Optional[str] = None,
                    max_tokens=DEFAULT_MAX_TOKENS) -> str:
        messages = []
        if system:
            messages.append(ChatMessage(role="system", content=system))
        messages.append(ChatMessage(role="user", content=prompt))
        return self.chat(messages, max_tokens=max_tokens).text

    def complete(self, prompt, max_tokens=DEFAULT_MAX_TOKENS,
                 temperature=DEFAULT_TEMPERATURE) -> CompletionResponse:
        # Raw llama.cpp /completion endpoint (not OpenAI-compatible).
        payload = {"prompt": prompt, "n_predict": max_tokens, "temperature": temperature}
        start = time.time()
        data = _http_post(f"{self.endpoint}/completion", payload)
        return CompletionResponse(text=data.get("content", ""),
                                  tokens_used=data.get("tokens_predicted", 0),
                                  latency_ms=(time.time() - start) * 1000,
                                  model=self.model)

    def benchmark(self, prompt="Explain sovereignty in 3 sentences.",
                  iterations=5, max_tokens=128) -> dict:
        latencies, token_counts = [], []
        for _ in range(iterations):
            resp = self.chat([ChatMessage(role="user", content=prompt)], max_tokens=max_tokens)
            latencies.append(resp.latency_ms)
            token_counts.append(resp.tokens_used)
        avg_lat = sum(latencies) / len(latencies)
        avg_tok = sum(token_counts) / len(token_counts)
        return {"iterations": iterations, "prompt": prompt,
                "avg_latency_ms": round(avg_lat, 1),
                "min_latency_ms": round(min(latencies), 1),
                "max_latency_ms": round(max(latencies), 1),
                "avg_tokens": round(avg_tok, 1),
                "tok_per_sec": round((avg_tok / avg_lat) * 1000 if avg_lat > 0 else 0, 1)}


def main():
    p = argparse.ArgumentParser(description="llama.cpp client CLI")
    p.add_argument("--url", default=DEFAULT_ENDPOINT)
    p.add_argument("--model", default=DEFAULT_MODEL)
    sub = p.add_subparsers(dest="cmd")
    sub.add_parser("health")
    sub.add_parser("models")
    cp = sub.add_parser("chat")
    cp.add_argument("prompt")
    cp.add_argument("--system")
    cp.add_argument("--max-tokens", type=int, default=DEFAULT_MAX_TOKENS)
    cp.add_argument("--stream", action="store_true")
    bp = sub.add_parser("benchmark")
    bp.add_argument("--prompt", default="Explain sovereignty.")
    bp.add_argument("--iterations", type=int, default=5)
    bp.add_argument("--max-tokens", type=int, default=128)
    args = p.parse_args()
    client = LlamaClient(args.url, args.model)
    if args.cmd == "health":
        print(json.dumps(client.health_check().__dict__, indent=2))
        sys.exit(0 if client.is_healthy() else 1)
    elif args.cmd == "models":
        print(json.dumps(client.list_models(), indent=2))
    elif args.cmd == "chat":
        if args.stream:
            msgs = []
            if args.system:
                msgs.append(ChatMessage("system", args.system))
            msgs.append(ChatMessage("user", args.prompt))
            for chunk in client.chat_stream(msgs, max_tokens=args.max_tokens):
                print(chunk, end="", flush=True)
            print()
        else:
            print(client.simple_chat(args.prompt, system=args.system,
                                     max_tokens=args.max_tokens))
    elif args.cmd == "benchmark":
        print(json.dumps(client.benchmark(args.prompt, args.iterations, args.max_tokens),
                         indent=2))
    else:
        p.print_help()


if __name__ == "__main__":
    main()
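
A minimal usage sketch for the client above (illustrative, not part of the diff), assuming a llama-server instance is already listening on the default endpoint; endpoint and model alias are the module defaults:

# Hypothetical usage sketch: drive the new client directly.
# LLAMA_ENDPOINT / LLAMA_MODEL environment overrides still apply.
from bin.llama_client import LlamaClient

client = LlamaClient()
status = client.health_check()
if status.healthy and status.model_loaded:
    print(client.simple_chat("Summarize llama.cpp in one sentence.",
                             system="Be concise."))
else:
    print("server unavailable:", status.error or "model not loaded")

The same flow is exposed on the command line through the health, chat, and benchmark subcommands of main().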

@@ -1,7 +1,6 @@
# Local LLM Deployment Guide — llama.cpp Sovereign Inference
# Local LLM Deployment Guide — llama.cpp
llama.cpp provides sovereign, offline-capable inference on CPU, CUDA, and
Apple Silicon. One binary, one model path, one health endpoint.
Standardizes local LLM inference across the fleet. One binary, one model path, one health endpoint.
## Quick Start
@@ -21,15 +20,15 @@ Apple Silicon. One binary, one model path, one health endpoint.
## Recommended Models
- Qwen2.5-7B-Instruct (4.7GB, 8GB RAM, 25-40 tok/s) — Fleet standard
- Qwen2.5-3B-Instruct (2.0GB, 4GB RAM, 50-80 tok/s) — VPS Beta
- Qwen2.5-7B-Instruct (4.7GB, 8GB RAM) — Fleet standard
- Qwen2.5-3B-Instruct (2.0GB, 4GB RAM) — VPS Beta
- Mistral-7B-Instruct-v0.3 (4.4GB, 8GB RAM) — Alternative
## Quantization Guide
## Quantization
- Q6_K (5.5GB) — Best quality/speed, RAM > 12GB
- Q6_K (5.5GB) — Best quality/speed
- Q4_K_M (4.7GB) — Fleet standard
- Q3_K_M (3.4GB) — < 6GB RAM fallback
- Q3_K_M (3.4GB) — Low-RAM fallback
## Hardware Targets
@@ -39,16 +38,15 @@ Apple Silicon. One binary, one model path, one health endpoint.
## Health Check
curl -sf http://localhost:11435/health
curl -s http://localhost:11435/v1/models
curl -sf http://localhost:11435/health && echo OK || echo FAIL
## API Compatibility
## API
llama-server exposes OpenAI-compatible API at /v1/chat/completions.
## Troubleshooting
- Won't start: use smaller model or lower quant
- Slow: match -t to available cores
- OOM: reduce -c context size
- Port in use: lsof -i :11435
- Won't start → smaller model / lower quant
- Slow → match -t to cores
- OOM → reduce -c context
- Port conflict → lsof -i :11435
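
To make the API section above concrete, a raw request sketch equivalent to what bin/llama_client.py sends (port and model alias are the fleet defaults from this guide; the payload shows only the fields the client uses):

# Sketch of a raw /v1/chat/completions call against a local llama-server.
import json, urllib.request

payload = {
    "model": "qwen2.5-7b",
    "messages": [{"role": "user", "content": "Hello"}],
    "max_tokens": 64,
}
req = urllib.request.Request(
    "http://localhost:11435/v1/chat/completions",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req, timeout=120) as resp:
    body = json.loads(resp.read())
print(body["choices"][0]["message"]["content"])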

nexus/llama_provider.py (new file, 73 lines)

@@ -0,0 +1,73 @@
"""llama_provider.py — Hermes inference router provider for llama.cpp."""
import logging, os, time
from dataclasses import dataclass
from typing import Optional
from bin.llama_client import ChatMessage, LlamaClient
logger = logging.getLogger("nexus.llama_provider")
LLAMA_ENDPOINT = os.environ.get("LLAMA_ENDPOINT", "http://localhost:11435")
LLAMA_MODEL = os.environ.get("LLAMA_MODEL", "qwen2.5-7b")
LOCAL_ONLY = os.environ.get("LOCAL_ONLY", "false").lower() in ("true", "1", "yes")
FALLBACK_ON_FAILURE = os.environ.get("LLAMA_FALLBACK", "true").lower() in ("true", "1", "yes")
@dataclass
class ProviderResult:
text: str
provider: str = "llama.cpp"
model: str = ""
tokens_used: int = 0
latency_ms: float = 0.0
finish_reason: str = ""
is_local: bool = True
error: Optional[str] = None
class LlamaProvider:
def __init__(self, endpoint=LLAMA_ENDPOINT, model=LLAMA_MODEL, local_only=LOCAL_ONLY):
self.client = LlamaClient(endpoint=endpoint, model=model)
self.local_only = local_only
self.endpoint = endpoint
self._last_health = None
self._last_check = 0.0
def available(self):
now = time.time()
if self._last_health is not None and (now - self._last_check) < 30:
return self._last_health
status = self.client.health_check()
self._last_health = status.healthy and status.model_loaded
self._last_check = now
if not self._last_health:
logger.warning("llama.cpp unhealthy: %s", status.error or "model not loaded")
return self._last_health
def infer(self, messages, max_tokens=512, temperature=0.7, model=None, **kwargs):
if not self.available():
return ProviderResult(text="", error=f"llama.cpp at {self.endpoint} unavailable")
chat_msgs = [ChatMessage(m["role"], m["content"]) for m in messages if "role" in m and "content" in m]
if not chat_msgs:
return ProviderResult(text="", error="No valid messages")
start = time.time()
try:
resp = self.client.chat(chat_msgs, max_tokens=max_tokens, temperature=temperature)
return ProviderResult(text=resp.text, provider="llama.cpp",
model=resp.model or self.client.model, tokens_used=resp.tokens_used,
latency_ms=(time.time()-start)*1000, finish_reason=resp.finish_reason, is_local=True)
except Exception as e:
logger.error("llama.cpp failed: %s", e)
return ProviderResult(text="", error=str(e))
def should_use_local(self, external_failed=False, explicit_local=False):
if self.local_only: return True
if explicit_local: return True
if external_failed and FALLBACK_ON_FAILURE: return self.available()
return False
def status(self):
h = self.client.health_check()
return {"provider": "llama.cpp", "endpoint": self.endpoint,
"healthy": h.healthy, "model_loaded": h.model_loaded,
"model_name": h.model_name, "local_only": self.local_only}
def get_name(self): return "llama.cpp"
def get_priority(self): return 0 if self.local_only else 100
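
An illustrative wiring sketch for the provider (the surrounding Hermes router is assumed and not shown in this diff): should_use_local() gates routing, and infer() returns a ProviderResult either way:

# Hypothetical caller: route a request through LlamaProvider.
from nexus.llama_provider import LlamaProvider

provider = LlamaProvider()  # honors LLAMA_ENDPOINT / LLAMA_MODEL / LOCAL_ONLY
messages = [{"role": "user", "content": "ping"}]

if provider.should_use_local(explicit_local=True):
    result = provider.infer(messages, max_tokens=32)
    if result.error:
        print("local inference unavailable:", result.error)
    else:
        print(f"{result.provider} ({result.latency_ms:.0f} ms):", result.text)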

@@ -0,0 +1,29 @@
[Unit]
Description=llama.cpp Local LLM Server
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
User=root
Environment=MODEL_PATH=/opt/models/llama/Qwen2.5-7B-Instruct-Q4_K_M.gguf
Environment=LLAMA_HOST=0.0.0.0
Environment=LLAMA_PORT=11435
Environment=LLAMA_CTX_SIZE=4096
Environment=LLAMA_THREADS=4
ExecStart=/usr/local/bin/llama-server -m ${MODEL_PATH} --host ${LLAMA_HOST} --port ${LLAMA_PORT} -c ${LLAMA_CTX_SIZE} -t ${LLAMA_THREADS} --cont-batching
Restart=on-failure
RestartSec=10
MemoryMax=12G
CPUQuota=90%
NoNewPrivileges=true
ProtectSystem=strict
ProtectHome=read-only
ReadWritePaths=/opt/models
PrivateTmp=true
StandardOutput=journal
StandardError=journal
SyslogIdentifier=llama-server
[Install]
WantedBy=multi-user.target
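
Once the unit is started, a readiness probe like the sketch below can gate traffic until the model is loaded. The port and the status semantics mirror the Environment= lines above and the /health handling in bin/llama_client.py; the retry budget is an arbitrary assumption:

# Readiness probe sketch (illustrative): wait for the systemd-managed
# server to finish loading its model before routing traffic to it.
import json
import time
import urllib.request

def wait_for_llama(url="http://localhost:11435/health", retries=30, delay=2.0):
    for _ in range(retries):
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if json.loads(resp.read()).get("status") == "ok":
                    return True
        except (OSError, ValueError):
            pass  # server not up yet, or still loading
        time.sleep(delay)
    return False

if __name__ == "__main__":
    print("ready" if wait_for_llama() else "timed out")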

@@ -0,0 +1,93 @@
"""Tests for llama_client."""
from unittest.mock import patch
from pathlib import Path
import pytest, sys
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from bin.llama_client import LlamaClient, ChatMessage, HealthStatus
class TestChatMessage:
def test_creation(self):
m = ChatMessage("user", "Hello")
assert m.role == "user" and m.content == "Hello"
class TestHealthStatus:
def test_healthy(self):
s = HealthStatus(True, "http://x:11435", model_loaded=True)
assert s.healthy and s.model_loaded
class TestLlamaClient:
def test_defaults(self):
c = LlamaClient()
assert c.endpoint == "http://localhost:11435"
assert c.model == "qwen2.5-7b"
def test_custom(self):
c = LlamaClient("http://x:8080", "mistral")
assert c.endpoint == "http://x:8080" and c.model == "mistral"
def test_trailing_slash(self):
assert LlamaClient("http://x/").endpoint == "http://x"
@patch("bin.llama_client._http_get")
def test_health_ok(self, m):
m.return_value = {"status": "ok"}
assert LlamaClient().health_check().healthy is True
@patch("bin.llama_client._http_get")
def test_health_fail(self, m):
m.side_effect = ConnectionError("down")
s = LlamaClient().health_check()
assert s.healthy is False and "down" in s.error
@patch("bin.llama_client._http_get")
def test_is_healthy(self, m):
m.return_value = {"status": "ok"}
assert LlamaClient().is_healthy() is True
@patch("bin.llama_client._http_get")
def test_list_models(self, m):
m.return_value = {"data": [{"id": "qwen"}]}
assert len(LlamaClient().list_models()) == 1
@patch("bin.llama_client._http_get")
def test_list_models_fail(self, m):
m.side_effect = ConnectionError()
assert LlamaClient().list_models() == []
@patch("bin.llama_client._http_post")
def test_chat(self, m):
m.return_value = {"choices": [{"message": {"content": "Hi"}, "finish_reason": "stop"}], "usage": {"total_tokens": 10}}
r = LlamaClient().chat([ChatMessage("user", "test")])
assert r.text == "Hi" and r.tokens_used == 10
@patch("bin.llama_client._http_post")
def test_chat_params(self, m):
m.return_value = {"choices": [{"message": {"content": "OK"}, "finish_reason": "stop"}], "usage": {}}
LlamaClient().chat([ChatMessage("user", "t")], max_tokens=100, temperature=0.3)
d = m.call_args[0][1]
assert d["max_tokens"] == 100 and d["temperature"] == 0.3
@patch("bin.llama_client._http_post")
def test_simple_chat(self, m):
m.return_value = {"choices": [{"message": {"content": "Yes"}, "finish_reason": "stop"}], "usage": {}}
assert LlamaClient().simple_chat("test") == "Yes"
@patch("bin.llama_client._http_post")
def test_simple_chat_system(self, m):
m.return_value = {"choices": [{"message": {"content": "OK"}, "finish_reason": "stop"}], "usage": {}}
LlamaClient().simple_chat("t", system="be helpful")
assert len(m.call_args[0][1]["messages"]) == 2
@patch("bin.llama_client._http_post")
def test_complete(self, m):
m.return_value = {"content": "result", "tokens_predicted": 50}
r = LlamaClient().complete("prompt")
assert r.text == "result" and r.tokens_used == 50
@patch("bin.llama_client.time.time")
@patch("bin.llama_client._http_post")
def test_benchmark(self, mp, mt):
mp.return_value = {"choices": [{"message": {"content": "OK"}, "finish_reason": "stop"}], "usage": {"total_tokens": 10}}
mt.side_effect = [0.0, 0.05, 0.05, 0.1, 0.1, 0.15]
r = LlamaClient().benchmark(iterations=2)
assert r["iterations"] == 2 and r["avg_latency_ms"] > 0 and r["tok_per_sec"] > 0
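
One coverage gap a reviewer might flag: chat_stream has no test. A hypothetical sketch that fakes the server-sent-events body by patching urllib.request.urlopen (the byte payload and test name are illustrative):

# Hypothetical additional test (not in this diff): cover chat_stream.
import io
from unittest.mock import MagicMock, patch

from bin.llama_client import ChatMessage, LlamaClient

@patch("bin.llama_client.urllib.request.urlopen")
def test_chat_stream_yields_deltas(m):
    sse = (b'data: {"choices":[{"delta":{"content":"Hel"}}]}\n'
           b'data: {"choices":[{"delta":{"content":"lo"}}]}\n'
           b'data: [DONE]\n')
    ctx = MagicMock()
    ctx.__enter__.return_value = io.BytesIO(sse)  # file-like SSE response
    m.return_value = ctx
    chunks = list(LlamaClient().chat_stream([ChatMessage("user", "hi")]))
    assert "".join(chunks) == "Hello"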