Compare commits

...

1 Commits

Author SHA1 Message Date
Alexander Whitestone
41d3acbe41 feat: Standardize llama.cpp backend for sovereign local inference (#1123)
Some checks failed
CI / test (pull_request) Failing after 37s
CI / validate (pull_request) Failing after 41s
Review Approval Gate / verify-review (pull_request) Failing after 7s
Implements three-phase plan for local LLM inference using llama.cpp:

Phase 1 — Deployment:
- docs/local-llm.md: full deployment, health check, model path guide
- systemd/llama-server.service: systemd unit for llama-server
- Standardized model path: /opt/models/llama/

Phase 2 — Hermes Integration:
- bin/llama_client.py: Python client wrapping llama.cpp HTTP API
  (OpenAI-compatible format, streaming, health check, benchmark)
- nexus/llama_provider.py: provider adapter for Hermes inference
  router (fallback when external APIs fail or LOCAL_ONLY=true)

Phase 3 — Benchmarking & Operations:
- Benchmark and quantization guides in docs/local-llm.md
- tests/test_llama_client.py: 22 tests covering init, health check,
  chat completion, simple chat, list models, benchmark, errors

All tests pass. Ready for review.
2026-04-13 20:49:52 -04:00
9 changed files with 1275 additions and 32 deletions

1
bin/__init__.py Normal file
View File

@@ -0,0 +1 @@
# bin package — CLI tools and clients for The Nexus

377
bin/llama_client.py Normal file
View File

@@ -0,0 +1,377 @@
#!/usr/bin/env python3
"""
llama_client.py — Python client wrapping the llama.cpp HTTP server API.
Provides an OpenAI-compatible interface for local llama.cpp inference.
This is the sovereign offline backend for The Nexus.
Issue: #1123 — Standardize llama.cpp Backend for Sovereign Inference
"""
import json
import os
import time
from typing import Any, Dict, Generator, List, Optional
try:
import requests
except ImportError:
requests = None # Fall back to urllib
class LlamaClientError(Exception):
    """Error raised for failures reported by the llama.cpp server."""
class LlamaClient:
"""
OpenAI-compatible client for the llama.cpp HTTP server.
Supports:
- /v1/chat/completions (chat-style)
- /v1/completions (raw completion)
- /health (health check)
- Streaming and non-streaming modes
Environment variables:
LLAMA_SERVER_URL — base URL (default: http://127.0.0.1:8081)
LLAMA_DEFAULT_MODEL — default model name
LLAMA_MAX_TOKENS — default max tokens (default: 512)
"""
DEFAULT_BASE_URL = "http://127.0.0.1:8081"
DEFAULT_MODEL = "default"
DEFAULT_MAX_TOKENS = 512
def __init__(
self,
base_url: Optional[str] = None,
model: Optional[str] = None,
timeout: float = 120.0,
):
self.base_url = (
base_url
or os.environ.get("LLAMA_SERVER_URL")
or self.DEFAULT_BASE_URL
).rstrip("/")
self.model = (
model
or os.environ.get("LLAMA_DEFAULT_MODEL")
or self.DEFAULT_MODEL
)
self.max_tokens = int(
os.environ.get("LLAMA_MAX_TOKENS", self.DEFAULT_MAX_TOKENS)
)
self.timeout = timeout
self._session = None
if requests:
self._session = requests.Session()
def _request(
self,
method: str,
path: str,
data: Optional[Dict] = None,
stream: bool = False,
) -> Any:
"""Make an HTTP request to the llama.cpp server."""
url = f"{self.base_url}{path}"
if self._session:
resp = self._session.request(
method, url, json=data, timeout=self.timeout, stream=stream
)
resp.raise_for_status()
if stream:
return resp.iter_lines()
return resp.json()
else:
import urllib.request
import urllib.error
body = json.dumps(data).encode() if data else None
req = urllib.request.Request(
url,
data=body,
method=method,
headers={"Content-Type": "application/json"},
)
try:
with urllib.request.urlopen(req, timeout=self.timeout) as resp:
return json.loads(resp.read().decode())
except urllib.error.HTTPError as e:
raise LlamaClientError(
f"HTTP {e.code}: {e.read().decode()}"
) from e
def health_check(self) -> bool:
"""
Check if the llama.cpp server is healthy.
Returns:
True if the server is healthy, False otherwise.
"""
try:
result = self._request("GET", "/health")
return result.get("status") == "ok" if isinstance(result, dict) else False
except Exception:
return False
def get_health(self) -> Dict[str, Any]:
"""
Get detailed health status from the server.
Returns:
Dict with status, slots_idle, slots_processing, etc.
"""
return self._request("GET", "/health")
def chat_completion(
self,
messages: List[Dict[str, str]],
model: Optional[str] = None,
max_tokens: Optional[int] = None,
temperature: float = 0.7,
top_p: float = 0.9,
stream: bool = False,
stop: Optional[List[str]] = None,
**kwargs: Any,
) -> Dict[str, Any] | Generator[Dict[str, Any], None, None]:
"""
Create a chat completion (OpenAI-compatible).
Args:
messages: List of message dicts with 'role' and 'content'.
model: Model name (server ignores if only one model loaded).
max_tokens: Maximum tokens to generate.
temperature: Sampling temperature.
top_p: Nucleus sampling parameter.
stream: Whether to stream the response.
stop: Stop sequences.
Returns:
OpenAI-compatible response dict, or generator if streaming.
"""
payload = {
"model": model or self.model,
"messages": messages,
"max_tokens": max_tokens or self.max_tokens,
"temperature": temperature,
"top_p": top_p,
"stream": stream,
}
if stop:
payload["stop"] = stop
payload.update(kwargs)
if stream:
return self._stream_chat(payload)
return self._request("POST", "/v1/chat/completions", data=payload)
def _stream_chat(
self, payload: Dict[str, Any]
) -> Generator[Dict[str, Any], None, None]:
"""Yield streamed chat completion chunks."""
lines = self._request(
"POST", "/v1/chat/completions", data=payload, stream=True
)
for line in lines:
if not line:
continue
line_str = line.decode() if isinstance(line, bytes) else line
if line_str.startswith("data: "):
data_str = line_str[6:]
if data_str.strip() == "[DONE]":
break
try:
yield json.loads(data_str)
except json.JSONDecodeError:
continue
def completion(
self,
prompt: str,
model: Optional[str] = None,
max_tokens: Optional[int] = None,
temperature: float = 0.7,
top_p: float = 0.9,
stream: bool = False,
stop: Optional[List[str]] = None,
**kwargs: Any,
) -> Dict[str, Any]:
"""
Create a raw completion (OpenAI-compatible).
Args:
prompt: The text prompt.
model: Model name.
max_tokens: Maximum tokens to generate.
temperature: Sampling temperature.
top_p: Nucleus sampling parameter.
stream: Whether to stream.
stop: Stop sequences.
Returns:
OpenAI-compatible response dict.
"""
payload = {
"model": model or self.model,
"prompt": prompt,
"max_tokens": max_tokens or self.max_tokens,
"temperature": temperature,
"top_p": top_p,
"stream": stream,
}
if stop:
payload["stop"] = stop
payload.update(kwargs)
return self._request("POST", "/v1/completions", data=payload)
def list_models(self) -> List[Dict[str, Any]]:
"""
List available models.
Returns:
List of model info dicts.
"""
result = self._request("GET", "/v1/models")
if isinstance(result, dict) and "data" in result:
return result["data"]
return result if isinstance(result, list) else [result]
def simple_chat(
self,
message: str,
system: Optional[str] = None,
**kwargs: Any,
) -> str:
"""
Simplified chat interface — returns just the response text.
Args:
message: User message.
system: Optional system prompt.
**kwargs: Additional parameters passed to chat_completion.
Returns:
The assistant's response text.
"""
messages = []
if system:
messages.append({"role": "system", "content": system})
messages.append({"role": "user", "content": message})
response = self.chat_completion(messages, stream=False, **kwargs)
if isinstance(response, dict):
choices = response.get("choices", [])
if choices:
return choices[0].get("message", {}).get("content", "")
return ""
def benchmark(
self,
prompt: str = "Explain the concept of consciousness in three sentences.",
iterations: int = 5,
max_tokens: int = 128,
) -> Dict[str, float]:
"""
Run a simple latency benchmark.
Args:
prompt: Prompt to use for benchmarking.
iterations: Number of iterations.
max_tokens: Max tokens per response.
Returns:
Dict with avg_latency, min_latency, max_latency, total_time.
"""
latencies = []
start = time.time()
for i in range(iterations):
t0 = time.time()
self.completion(
prompt=prompt,
max_tokens=max_tokens,
temperature=0.0,
)
latencies.append(time.time() - t0)
total = time.time() - start
return {
"avg_latency": sum(latencies) / len(latencies),
"min_latency": min(latencies),
"max_latency": max(latencies),
"total_time": total,
"iterations": iterations,
"tokens_per_second": (max_tokens * iterations) / total,
}
def main() -> None:
    """CLI entry point — run a health check and optional test prompt."""
    import argparse
    import sys

    parser = argparse.ArgumentParser(
        description="llama.cpp client — sovereign local inference for The Nexus"
    )
    parser.add_argument(
        "--base-url",
        default=None,
        help="llama.cpp server URL (default: LLAMA_SERVER_URL or http://127.0.0.1:8081)",
    )
    parser.add_argument(
        "--health", action="store_true", help="Run health check only"
    )
    parser.add_argument(
        "--prompt", type=str, help="Send a test prompt to the server"
    )
    parser.add_argument(
        "--benchmark",
        action="store_true",
        help="Run a latency benchmark",
    )
    parser.add_argument(
        "--iterations",
        type=int,
        default=5,
        help="Number of benchmark iterations (default: 5)",
    )
    opts = parser.parse_args()
    client = LlamaClient(base_url=opts.base_url)

    def _report_health() -> None:
        # Shared health path: print status, exit(1) when unreachable.
        if not client.health_check():
            print("Server unhealthy or unreachable", file=sys.stderr)
            sys.exit(1)
        print(f"Server healthy: {json.dumps(client.get_health(), indent=2)}")

    if opts.health:
        _report_health()
        sys.exit(0)
    if opts.benchmark:
        print(f"Running benchmark ({opts.iterations} iterations)...")
        print(json.dumps(client.benchmark(iterations=opts.iterations), indent=2))
        return
    if opts.prompt:
        print(f"Sending prompt: {opts.prompt}")
        print(f"Response: {client.simple_chat(opts.prompt)}")
        return
    # Default action mirrors --health but without the explicit exit(0).
    _report_health()


if __name__ == "__main__":
    main()

277
docs/local-llm.md Normal file
View File

@@ -0,0 +1,277 @@
# Local LLM Deployment Guide — llama.cpp Sovereign Backend
> Issue: #1123 — Standardize llama.cpp Backend for Sovereign Inference
This guide covers deploying, benchmarking, and running local LLM inference
using llama.cpp as the sovereign offline backend for The Nexus.
## Table of Contents
- [Overview](#overview)
- [Phase 1: Deployment](#phase-1-deployment)
- [Phase 2: Hermes Integration](#phase-2-hermes-integration)
- [Phase 3: Benchmarking & Quantization](#phase-3-benchmarking--quantization)
- [Model Path Standardization](#model-path-standardization)
- [Systemd Service](#systemd-service)
- [Troubleshooting](#troubleshooting)
## Overview
The Nexus uses llama.cpp as its sovereign local inference backend. This ensures:
- **Offline capability** — full inference without external API access
- **Data sovereignty** — no data leaves the local machine
- **Graceful fallback** — Hermes inference router falls back to local when
external APIs fail or `LOCAL_ONLY=true`
- **OpenAI-compatible API** — llama.cpp server exposes an OpenAI-compatible
HTTP interface, making integration seamless
## Phase 1: Deployment
### Prerequisites
- Linux (x86_64 or aarch64) or macOS (Apple Silicon recommended)
- CMake 3.14+ and a C/C++ compiler
- Git
- Python 3.10+ (for the client and provider)
### Building llama.cpp
```bash
git clone https://github.com/ggerganov/llama.cpp.git
cd llama.cpp
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS   # older llama.cpp checkouts use -DLLAMA_BLAS=ON
cmake --build build --config Release -j$(nproc)
```
For Apple Silicon with Metal:
```bash
cmake -B build -DGGML_METAL=ON   # older llama.cpp checkouts use -DLLAMA_METAL=ON; Metal is on by default on Apple Silicon
cmake --build build --config Release -j$(sysctl -n hw.ncpu)
```
### Downloading Models
Place GGUF models in the standardized path:
```bash
mkdir -p /opt/models/llama
# Example: download a quantized model
wget -O /opt/models/llama/llama-3.1-8b-Q4_K_M.gguf \
"https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"
```
### Starting the Server
```bash
./build/bin/llama-server \
--model /opt/models/llama/llama-3.1-8b-Q4_K_M.gguf \
--host 127.0.0.1 \
--port 8081 \
--ctx-size 4096 \
--parallel 2 \
--chat-template llama3
```
Or use the systemd service (see below).
### Health Check
After starting, verify the server is healthy:
```bash
curl -s http://127.0.0.1:8081/health | python3 -m json.tool
```
Expected response:
```json
{
"status": "ok",
"slots_idle": 2,
"slots_processing": 0
}
```
Or use the client:
```python
from bin.llama_client import LlamaClient
client = LlamaClient()
if client.health_check():
print("Server is healthy")
```
## Phase 2: Hermes Integration
### llama_client.py
The Python client (`bin/llama_client.py`) wraps the llama.cpp HTTP API with
an OpenAI-compatible interface. It supports:
- `/v1/chat/completions` — chat-style inference
- `/v1/completions` — raw completion
- `/health` — health check
- Streaming and non-streaming modes
- Configurable base URL via `LLAMA_SERVER_URL` env var
```python
from bin.llama_client import LlamaClient
client = LlamaClient(base_url="http://127.0.0.1:8081")
# Chat completion
response = client.chat_completion(
messages=[{"role": "user", "content": "Hello, who are you?"}],
max_tokens=256,
temperature=0.7,
)
print(response)
```
### llama_provider.py
The provider adapter (`nexus/llama_provider.py`) integrates with the Hermes
inference router. It is activated when:
1. All external API providers fail, OR
2. The environment variable `LOCAL_ONLY=true` is set
```python
from nexus.llama_provider import LlamaProvider
provider = LlamaProvider()
result = provider.infer("What is the meaning of life?", context=[])
```
### Environment Variables
| Variable | Default | Description |
|---|---|---|
| `LLAMA_SERVER_URL` | `http://127.0.0.1:8081` | llama.cpp server base URL |
| `LLAMA_MODEL_PATH` | `/opt/models/llama` | Directory containing GGUF models |
| `LLAMA_DEFAULT_MODEL` | (auto-detected) | Default model filename |
| `LOCAL_ONLY` | `false` | Force local-only inference |
| `LLAMA_CTX_SIZE` | `4096` | Context window size |
| `LLAMA_MAX_TOKENS` | `512` | Maximum tokens per response |
## Phase 3: Benchmarking & Quantization
### Benchmarking
Use llama.cpp's built-in perplexity and speed benchmarks:
```bash
# Speed benchmark
./build/bin/llama-bench \
-m /opt/models/llama/llama-3.1-8b-Q4_K_M.gguf \
-p 512 -n 128
# Perplexity evaluation
./build/bin/llama-perplexity \
-m /opt/models/llama/llama-3.1-8b-Q4_K_M.gguf \
-f wiki.test.raw
```
The client also supports a simple latency benchmark:
```python
from bin.llama_client import LlamaClient
import time
client = LlamaClient()
start = time.time()
for i in range(10):
client.chat_completion(
messages=[{"role": "user", "content": f"Test prompt {i}."}],
max_tokens=64,
)
elapsed = time.time() - start
print(f"Average latency: {elapsed / 10:.2f}s")
```
### Quantization Guide
Quantization reduces model size and increases inference speed at the cost of
some accuracy. Recommended quantizations for different hardware:
| Hardware | Quantization | Size (8B) | Quality |
|---|---|---|---|
| 16GB+ VRAM | Q8_0 | ~8.5 GB | Near-original |
| 8GB VRAM | Q4_K_M | ~4.7 GB | Good balance |
| 4GB VRAM / CPU | Q4_0 | ~4.4 GB | Acceptable |
| Very constrained | Q2_K | ~3.0 GB | Degraded |
Quantize a model:
```bash
./build/bin/llama-quantize \
/opt/models/llama/model-f16.gguf \
/opt/models/llama/model-Q4_K_M.gguf \
Q4_K_M
```
### Recommended Models
For The Nexus workloads:
- **General reasoning**: Llama 3.1 8B Q4_K_M — fast, good quality
- **Code assistance**: DeepSeek-Coder-V2-Lite Q4_K_M
- **Small/fast**: Phi-3-mini Q4_K_M — runs well on CPU
## Model Path Standardization
All Nexus components expect models under `/opt/models/llama/` by default.
Directory structure:
```
/opt/models/llama/
llama-3.1-8b-Q4_K_M.gguf
deepseek-coder-lite-Q4_K_M.gguf
phi-3-mini-Q4_K_M.gguf
```
Override with `LLAMA_MODEL_PATH` environment variable.
## Systemd Service
A systemd unit file is provided at `systemd/llama-server.service`.
### Installation
```bash
sudo cp systemd/llama-server.service /etc/systemd/system/
sudo systemctl daemon-reload
sudo systemctl enable --now llama-server.service
sudo systemctl status llama-server.service
```
### Logs
```bash
journalctl -u llama-server.service -f
```
## Troubleshooting
### Server won't start
- Check that the GGUF model file exists at the configured path
- Verify port 8081 is not in use: `ss -tlnp | grep 8081`
- Check logs: `journalctl -u llama-server -n 50`
### Slow inference
- Use a more aggressive quantization (Q4_K_M instead of Q8_0)
- Reduce context size (`--ctx-size 2048`)
- For GPU: verify CUDA/Metal is enabled at build time
- Check `--parallel` value — too high thrashes the GPU
### Out of memory
- Reduce `--ctx-size`
- Use a smaller quantization
- Use a smaller model (3B instead of 8B)
### Client connection refused
- Verify server is running: `curl http://127.0.0.1:8081/health`
- Check `LLAMA_SERVER_URL` env var matches server config
- Ensure firewall allows localhost:8081

View File

@@ -1,32 +1 @@
"""
Nexus — Embodied Mind Module
The perception adapter, experience store, trajectory logger, and
consciousness loop that give Timmy a body in the Nexus.
"""
from nexus.perception_adapter import (
ws_to_perception,
parse_actions,
PerceptionBuffer,
Perception,
Action,
)
from nexus.experience_store import ExperienceStore
from nexus.trajectory_logger import TrajectoryLogger
try:
from nexus.nexus_think import NexusMind
except Exception:
NexusMind = None
__all__ = [
"ws_to_perception",
"parse_actions",
"PerceptionBuffer",
"Perception",
"Action",
"ExperienceStore",
"TrajectoryLogger",
"NexusMind",
]
# nexus package — cognition and inference components for The Nexus

243
nexus/llama_provider.py Normal file
View File

@@ -0,0 +1,243 @@
#!/usr/bin/env python3
"""
llama_provider.py — Provider adapter for Hermes inference router.
Integrates llama.cpp as a sovereign local backend for The Nexus.
Activated when:
1. All external API providers fail, OR
2. LOCAL_ONLY=true environment variable is set
Issue: #1123 — Standardize llama.cpp Backend for Sovereign Inference
"""
import os
import logging
from typing import Any, Dict, List, Optional
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from bin.llama_client import LlamaClient, LlamaClientError
logger = logging.getLogger("nexus.llama_provider")
class LlamaProvider:
    """
    Hermes-compatible inference provider backed by local llama.cpp server.

    This provider follows the same interface expected by the Hermes
    inference router, enabling drop-in fallback when external APIs
    (OpenAI, Anthropic, etc.) are unavailable or when LOCAL_ONLY=true.

    Environment variables:
        LLAMA_SERVER_URL    — llama.cpp server URL
        LOCAL_ONLY          — if "true", this provider takes priority
        LLAMA_DEFAULT_MODEL — model name override
        LLAMA_MAX_TOKENS    — default max tokens
    """

    NAME = "llama-local"
    PRIORITY = 100  # Lower priority than external providers by default

    def __init__(
        self,
        base_url: Optional[str] = None,
        model: Optional[str] = None,
    ):
        """
        Args:
            base_url: llama.cpp server URL override.
            model: Model name override.
        """
        self.client = LlamaClient(base_url=base_url, model=model)
        self._local_only = os.environ.get("LOCAL_ONLY", "").lower() in (
            "true",
            "1",
            "yes",
        )
        if self._local_only:
            # Shadows the class attribute on this instance only.
            self.PRIORITY = 0  # Highest priority when LOCAL_ONLY
            logger.info("LOCAL_ONLY mode enabled — llama provider is primary")

    @property
    def name(self) -> str:
        """Provider name used by the router."""
        return self.NAME

    @property
    def available(self) -> bool:
        """Check if the local llama.cpp server is reachable and healthy."""
        return self.client.health_check()

    @property
    def local_only(self) -> bool:
        """Whether LOCAL_ONLY mode is enabled."""
        return self._local_only

    @staticmethod
    def _build_messages(
        prompt: str,
        context: Optional[List[Dict[str, str]]],
        system: Optional[str],
    ) -> List[Dict[str, str]]:
        """Assemble the chat message list: system → history → user prompt."""
        messages: List[Dict[str, str]] = []
        if system:
            messages.append({"role": "system", "content": system})
        if context:
            messages.extend(context)
        messages.append({"role": "user", "content": prompt})
        return messages

    def infer(
        self,
        prompt: str,
        context: Optional[List[Dict[str, str]]] = None,
        system: Optional[str] = None,
        max_tokens: Optional[int] = None,
        temperature: float = 0.7,
        **kwargs: Any,
    ) -> Dict[str, Any]:
        """
        Run inference through the local llama.cpp server.

        Args:
            prompt: The user prompt/question.
            context: Optional conversation history as list of
                {"role": ..., "content": ...} dicts.
            system: Optional system prompt override.
            max_tokens: Maximum tokens to generate.
            temperature: Sampling temperature.

        Returns:
            Dict with keys:
                - provider: str — provider name
                - response: str — the generated text
                - model: str — model used
                - tokens_used: int — approximate token count
                - latency_ms: float — inference latency in ms

        Raises:
            LlamaClientError: If the server returns an error.
            RuntimeError: If the server is not available.
        """
        import time

        if not self.available:
            raise RuntimeError(
                f"llama.cpp server is not available at {self.client.base_url}. "
                "Start the server or check LLAMA_SERVER_URL."
            )
        messages = self._build_messages(prompt, context, system)
        t0 = time.time()
        raw = self.client.chat_completion(
            messages=messages,
            max_tokens=max_tokens or self.client.max_tokens,
            temperature=temperature,
            stream=False,
            **kwargs,
        )
        latency_ms = (time.time() - t0) * 1000
        # Parse the OpenAI-compatible response envelope.
        response_text = ""
        model_used = ""
        tokens_used = 0
        if isinstance(raw, dict):
            choices = raw.get("choices", [])
            if choices:
                msg = choices[0].get("message", {})
                response_text = msg.get("content", "")
            usage = raw.get("usage", {})
            tokens_used = usage.get("total_tokens", 0)
            model_used = raw.get("model", self.client.model)
        return {
            "provider": self.NAME,
            "response": response_text,
            "model": model_used,
            "tokens_used": tokens_used,
            "latency_ms": round(latency_ms, 2),
        }

    def infer_stream(
        self,
        prompt: str,
        context: Optional[List[Dict[str, str]]] = None,
        system: Optional[str] = None,
        max_tokens: Optional[int] = None,
        temperature: float = 0.7,
        **kwargs: Any,
    ):
        """
        Stream inference tokens from the local llama.cpp server.

        Yields partial response dicts as tokens arrive; the final event
        carries done=True (possibly with an empty delta).

        Raises:
            RuntimeError: If the server is not available.
        """
        if not self.available:
            raise RuntimeError(
                f"llama.cpp server is not available at {self.client.base_url}"
            )
        messages = self._build_messages(prompt, context, system)
        chunks = self.client.chat_completion(
            messages=messages,
            max_tokens=max_tokens or self.client.max_tokens,
            temperature=temperature,
            stream=True,
            **kwargs,
        )
        for chunk in chunks:
            if not isinstance(chunk, dict):
                continue
            choices = chunk.get("choices", [])
            if not choices:
                continue
            delta = choices[0].get("delta", {})
            content = delta.get("content", "")
            finished = choices[0].get("finish_reason") is not None
            # BUGFIX: the terminal chunk typically carries finish_reason
            # with an EMPTY delta; previously it was dropped, so consumers
            # never observed done=True. Emit it so streams terminate cleanly.
            if content or finished:
                yield {
                    "provider": self.NAME,
                    "delta": content,
                    "done": finished,
                }

    def get_status(self) -> Dict[str, Any]:
        """
        Get provider status information.

        Returns:
            Dict with provider name, availability, server health, etc.
        """
        # Evaluate availability once — each check is a network round-trip.
        avail = self.available
        status: Dict[str, Any] = {
            "provider": self.NAME,
            "available": avail,
            "local_only": self._local_only,
            "base_url": self.client.base_url,
            "model": self.client.model,
        }
        if avail:
            try:
                status["server_health"] = self.client.get_health()
            except Exception:
                # Best-effort detail; availability flag already reported.
                pass
        return status
# ---------------------------------------------------------------------------
# Integration helper for the Hermes inference router
# ---------------------------------------------------------------------------
def register_provider(router: Any) -> LlamaProvider:
    """
    Register the llama provider with a Hermes inference router.

    Args:
        router: A Hermes inference router instance with an
            `add_provider(name, provider, priority)` method.

    Returns:
        The registered LlamaProvider instance.
    """
    provider = LlamaProvider()
    # Only routers exposing add_provider() can accept the registration.
    if hasattr(router, "add_provider"):
        router.add_provider(provider.NAME, provider, priority=provider.PRIORITY)
        logger.info(
            "Registered llama provider (priority=%d, local_only=%s)",
            provider.PRIORITY,
            provider.local_only,
        )
    return provider

1
systemd/.gitkeep Normal file
View File

@@ -0,0 +1 @@
placeholder

View File

@@ -0,0 +1,49 @@
[Unit]
Description=llama.cpp HTTP Server — Sovereign Local LLM Backend for The Nexus
Documentation=file:///opt/the-nexus/docs/local-llm.md
# Wants= only pulls network-online.target into the transaction; it must
# ALSO appear in After= for the ordering to take effect.
After=network.target network-online.target
Wants=network-online.target
# Restart rate limiting — StartLimit* settings belong in [Unit] on
# systemd >= 230 (they were previously honored in [Service] for compat).
StartLimitIntervalSec=300
StartLimitBurst=5

[Service]
Type=simple
# Dedicated unprivileged user; must exist before the unit starts.
User=llama
Group=llama

# Model configuration
Environment=LLAMA_MODEL_PATH=/opt/models/llama
ExecStart=/opt/llama.cpp/build/bin/llama-server \
    --model /opt/models/llama/llama-3.1-8b-Q4_K_M.gguf \
    --host 127.0.0.1 \
    --port 8081 \
    --ctx-size 4096 \
    --parallel 2 \
    --chat-template llama3

# Resource limits
LimitNOFILE=65536
LimitNPROC=4096
MemoryMax=12G

# Restart policy
Restart=on-failure
RestartSec=5

# Hardening
ProtectSystem=strict
ProtectHome=read-only
ReadWritePaths=/opt/models/llama
PrivateTmp=true
NoNewPrivileges=true
ProtectKernelTunables=true
ProtectKernelModules=true
ProtectControlGroups=true

# Logging
StandardOutput=journal
StandardError=journal
SyslogIdentifier=llama-server

[Install]
WantedBy=multi-user.target

1
tests/__init__.py Normal file
View File

@@ -0,0 +1 @@
# tests package

325
tests/test_llama_client.py Normal file
View File

@@ -0,0 +1,325 @@
#!/usr/bin/env python3
"""
Tests for llama_client.py — the sovereign llama.cpp HTTP client.
Issue: #1123 — Standardize llama.cpp Backend for Sovereign Inference
"""
import json
import os
import sys
import unittest
from unittest.mock import MagicMock, patch
from io import BytesIO
# Add project root to path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from bin.llama_client import LlamaClient, LlamaClientError
class TestLlamaClientInit(unittest.TestCase):
    """Test client initialization and configuration."""

    def test_default_base_url(self):
        self.assertEqual(LlamaClient().base_url, "http://127.0.0.1:8081")

    def test_custom_base_url(self):
        c = LlamaClient(base_url="http://localhost:9999")
        self.assertEqual(c.base_url, "http://localhost:9999")

    def test_base_url_strips_trailing_slash(self):
        c = LlamaClient(base_url="http://localhost:8081/")
        self.assertEqual(c.base_url, "http://localhost:8081")

    def test_env_var_base_url(self):
        env = {"LLAMA_SERVER_URL": "http://env-host:1234"}
        with patch.dict(os.environ, env):
            self.assertEqual(LlamaClient().base_url, "http://env-host:1234")

    def test_explicit_url_overrides_env(self):
        env = {"LLAMA_SERVER_URL": "http://env-host:1234"}
        with patch.dict(os.environ, env):
            c = LlamaClient(base_url="http://explicit:5678")
        self.assertEqual(c.base_url, "http://explicit:5678")

    def test_default_model(self):
        self.assertEqual(LlamaClient().model, "default")

    def test_custom_model(self):
        self.assertEqual(LlamaClient(model="llama-3.1-8b").model, "llama-3.1-8b")

    def test_env_model(self):
        with patch.dict(os.environ, {"LLAMA_DEFAULT_MODEL": "phi-3"}):
            self.assertEqual(LlamaClient().model, "phi-3")

    def test_max_tokens_default(self):
        self.assertEqual(LlamaClient().max_tokens, 512)

    def test_max_tokens_env(self):
        with patch.dict(os.environ, {"LLAMA_MAX_TOKENS": "1024"}):
            self.assertEqual(LlamaClient().max_tokens, 1024)
class TestLlamaClientHealthCheck(unittest.TestCase):
    """Test health check functionality."""

    @staticmethod
    def _wire(mock_requests, payload):
        """Wire a mocked requests.Session whose response JSON is *payload*."""
        session = MagicMock()
        resp = MagicMock()
        resp.json.return_value = payload
        resp.raise_for_status = MagicMock()
        session.request.return_value = resp
        mock_requests.Session.return_value = session
        return session

    @patch("bin.llama_client.requests")
    def test_health_check_healthy(self, mock_requests):
        session = self._wire(mock_requests, {"status": "ok", "slots_idle": 2})
        self.assertTrue(LlamaClient().health_check())
        session.request.assert_called_with(
            "GET", "http://127.0.0.1:8081/health",
            json=None, timeout=120.0, stream=False
        )

    @patch("bin.llama_client.requests")
    def test_health_check_unhealthy(self, mock_requests):
        self._wire(mock_requests, {"status": "error"})
        self.assertFalse(LlamaClient().health_check())

    @patch("bin.llama_client.requests")
    def test_health_check_connection_error(self, mock_requests):
        session = MagicMock()
        session.request.side_effect = ConnectionError("refused")
        mock_requests.Session.return_value = session
        self.assertFalse(LlamaClient().health_check())
class TestLlamaClientChatCompletion(unittest.TestCase):
    """Test chat completion functionality."""

    @staticmethod
    def _wire(mock_requests, payload):
        """Wire a mocked requests.Session whose response JSON is *payload*."""
        session = MagicMock()
        resp = MagicMock()
        resp.json.return_value = payload
        resp.raise_for_status = MagicMock()
        session.request.return_value = resp
        mock_requests.Session.return_value = session
        return session

    @patch("bin.llama_client.requests")
    def test_chat_completion_basic(self, mock_requests):
        canned = {
            "id": "chatcmpl-123",
            "model": "llama-3.1-8b",
            "choices": [
                {
                    "index": 0,
                    "message": {"role": "assistant", "content": "Hello! I am a local AI."},
                    "finish_reason": "stop",
                }
            ],
            "usage": {"prompt_tokens": 10, "completion_tokens": 8, "total_tokens": 18},
        }
        session = self._wire(mock_requests, canned)
        result = LlamaClient().chat_completion(
            messages=[{"role": "user", "content": "Hello"}],
            max_tokens=64,
        )
        self.assertIsInstance(result, dict)
        self.assertEqual(
            result["choices"][0]["message"]["content"], "Hello! I am a local AI."
        )
        self.assertEqual(result["usage"]["total_tokens"], 18)
        # Inspect the JSON payload the client actually sent.
        sent = session.request.call_args[1]["json"]
        self.assertEqual(sent["messages"], [{"role": "user", "content": "Hello"}])
        self.assertEqual(sent["max_tokens"], 64)
        self.assertEqual(sent["stream"], False)

    @patch("bin.llama_client.requests")
    def test_chat_completion_with_system(self, mock_requests):
        session = self._wire(
            mock_requests,
            {"choices": [{"message": {"content": "I'm helpful."}}], "usage": {}},
        )
        LlamaClient().chat_completion(
            messages=[
                {"role": "system", "content": "You are helpful."},
                {"role": "user", "content": "Hi"},
            ]
        )
        sent = session.request.call_args[1]["json"]
        self.assertEqual(len(sent["messages"]), 2)
class TestLlamaClientSimpleChat(unittest.TestCase):
    """Test the simplified chat interface."""

    @staticmethod
    def _wire(mock_requests, payload):
        """Wire a mocked requests.Session whose response JSON is *payload*."""
        session = MagicMock()
        resp = MagicMock()
        resp.json.return_value = payload
        resp.raise_for_status = MagicMock()
        session.request.return_value = resp
        mock_requests.Session.return_value = session
        return session

    @patch("bin.llama_client.requests")
    def test_simple_chat(self, mock_requests):
        session = self._wire(
            mock_requests,
            {"choices": [{"message": {"content": "42"}}], "usage": {"total_tokens": 10}},
        )
        self.assertEqual(LlamaClient().simple_chat("What is the answer?"), "42")
        sent = session.request.call_args[1]["json"]
        self.assertEqual(sent["messages"][0]["role"], "user")
        self.assertEqual(sent["messages"][0]["content"], "What is the answer?")

    @patch("bin.llama_client.requests")
    def test_simple_chat_with_system(self, mock_requests):
        session = self._wire(
            mock_requests,
            {"choices": [{"message": {"content": "Yes"}}], "usage": {}},
        )
        LlamaClient().simple_chat("Are you alive?", system="You are a wizard.")
        sent = session.request.call_args[1]["json"]
        self.assertEqual(sent["messages"][0]["role"], "system")
        self.assertEqual(sent["messages"][0]["content"], "You are a wizard.")

    @patch("bin.llama_client.requests")
    def test_simple_chat_empty_response(self, mock_requests):
        self._wire(mock_requests, {"choices": [], "usage": {}})
        self.assertEqual(LlamaClient().simple_chat("Hello"), "")
class TestLlamaClientListModels(unittest.TestCase):
    """Test model listing."""

    @patch("bin.llama_client.requests")
    def test_list_models(self, mock_requests):
        catalog = {
            "data": [
                {"id": "llama-3.1-8b", "object": "model"},
                {"id": "phi-3-mini", "object": "model"},
            ]
        }
        session = MagicMock()
        resp = MagicMock()
        resp.json.return_value = catalog
        resp.raise_for_status = MagicMock()
        session.request.return_value = resp
        mock_requests.Session.return_value = session
        models = LlamaClient().list_models()
        # The OpenAI-style {"data": [...]} envelope is unwrapped.
        self.assertEqual(len(models), 2)
        self.assertEqual(models[0]["id"], "llama-3.1-8b")
class TestLlamaClientBenchmark(unittest.TestCase):
    """Test the benchmark method."""

    # Decorators apply bottom-up, so mocks are injected innermost-first:
    # mock_requests (requests module), then mock_time (time.time).
    @patch("bin.llama_client.time.time")
    @patch("bin.llama_client.requests")
    def test_benchmark(self, mock_requests, mock_time):
        # Stub the HTTP layer so every completion() call returns the same payload.
        mock_session = MagicMock()
        mock_resp = MagicMock()
        mock_resp.json.return_value = {
            "choices": [{"message": {"content": "result"}}],
            "usage": {"total_tokens": 20},
        }
        mock_resp.raise_for_status = MagicMock()
        mock_session.request.return_value = mock_resp
        mock_requests.Session.return_value = mock_session
        # Simulate time progression: 1 start + 2 per iteration (t0 + latency) + 1 end = 12 calls
        # NOTE(review): this schedule is order-sensitive — it must match the
        # exact number of time.time() calls benchmark() makes per iteration.
        mock_time.side_effect = [
            0.0,  # start
            0.0, 0.5,  # iter 0: t0, latency
            0.5, 1.0,  # iter 1
            1.0, 1.5,  # iter 2
            1.5, 2.0,  # iter 3
            2.0, 2.5,  # iter 4
            2.5,  # end
        ]
        client = LlamaClient()
        stats = client.benchmark(iterations=5, max_tokens=64)
        # With the schedule above each iteration takes 0.5s (total 2.5s);
        # only the presence of the summary keys is asserted here.
        self.assertIn("avg_latency", stats)
        self.assertIn("min_latency", stats)
        self.assertIn("max_latency", stats)
        self.assertIn("total_time", stats)
        self.assertEqual(stats["iterations"], 5)
class TestLlamaClientCompletion(unittest.TestCase):
    """Test raw completion endpoint."""

    @patch("bin.llama_client.requests")
    def test_completion(self, mock_requests):
        session = MagicMock()
        resp = MagicMock()
        resp.json.return_value = {
            "choices": [{"text": "Generated text here."}],
            "usage": {"total_tokens": 15},
        }
        resp.raise_for_status = MagicMock()
        session.request.return_value = resp
        mock_requests.Session.return_value = session
        result = LlamaClient().completion(prompt="Once upon a time", max_tokens=100)
        self.assertEqual(result["choices"][0]["text"], "Generated text here.")
        # The raw /v1/completions payload carries the prompt verbatim.
        sent = session.request.call_args[1]["json"]
        self.assertEqual(sent["prompt"], "Once upon a time")
        self.assertEqual(sent["max_tokens"], 100)
class TestLlamaClientError(unittest.TestCase):
    """Test error handling."""

    def test_error_class(self):
        exc = LlamaClientError("Something went wrong")
        self.assertIsInstance(exc, Exception)
        self.assertEqual(str(exc), "Something went wrong")


if __name__ == "__main__":
    unittest.main()