Compare commits
1 Commits
fix/883
...
whip/1123-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
41d3acbe41 |
1
bin/__init__.py
Normal file
1
bin/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# bin package — CLI tools and clients for The Nexus
|
||||
377
bin/llama_client.py
Normal file
377
bin/llama_client.py
Normal file
@@ -0,0 +1,377 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
llama_client.py — Python client wrapping the llama.cpp HTTP server API.
|
||||
|
||||
Provides an OpenAI-compatible interface for local llama.cpp inference.
|
||||
This is the sovereign offline backend for The Nexus.
|
||||
|
||||
Issue: #1123 — Standardize llama.cpp Backend for Sovereign Inference
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from typing import Any, Dict, Generator, List, Optional
|
||||
|
||||
try:
|
||||
import requests
|
||||
except ImportError:
|
||||
requests = None # Fall back to urllib
|
||||
|
||||
|
||||
class LlamaClientError(Exception):
|
||||
"""Raised when the llama.cpp server returns an error."""
|
||||
pass
|
||||
|
||||
|
||||
class LlamaClient:
|
||||
"""
|
||||
OpenAI-compatible client for the llama.cpp HTTP server.
|
||||
|
||||
Supports:
|
||||
- /v1/chat/completions (chat-style)
|
||||
- /v1/completions (raw completion)
|
||||
- /health (health check)
|
||||
- Streaming and non-streaming modes
|
||||
|
||||
Environment variables:
|
||||
LLAMA_SERVER_URL — base URL (default: http://127.0.0.1:8081)
|
||||
LLAMA_DEFAULT_MODEL — default model name
|
||||
LLAMA_MAX_TOKENS — default max tokens (default: 512)
|
||||
"""
|
||||
|
||||
DEFAULT_BASE_URL = "http://127.0.0.1:8081"
|
||||
DEFAULT_MODEL = "default"
|
||||
DEFAULT_MAX_TOKENS = 512
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
base_url: Optional[str] = None,
|
||||
model: Optional[str] = None,
|
||||
timeout: float = 120.0,
|
||||
):
|
||||
self.base_url = (
|
||||
base_url
|
||||
or os.environ.get("LLAMA_SERVER_URL")
|
||||
or self.DEFAULT_BASE_URL
|
||||
).rstrip("/")
|
||||
self.model = (
|
||||
model
|
||||
or os.environ.get("LLAMA_DEFAULT_MODEL")
|
||||
or self.DEFAULT_MODEL
|
||||
)
|
||||
self.max_tokens = int(
|
||||
os.environ.get("LLAMA_MAX_TOKENS", self.DEFAULT_MAX_TOKENS)
|
||||
)
|
||||
self.timeout = timeout
|
||||
self._session = None
|
||||
if requests:
|
||||
self._session = requests.Session()
|
||||
|
||||
def _request(
|
||||
self,
|
||||
method: str,
|
||||
path: str,
|
||||
data: Optional[Dict] = None,
|
||||
stream: bool = False,
|
||||
) -> Any:
|
||||
"""Make an HTTP request to the llama.cpp server."""
|
||||
url = f"{self.base_url}{path}"
|
||||
|
||||
if self._session:
|
||||
resp = self._session.request(
|
||||
method, url, json=data, timeout=self.timeout, stream=stream
|
||||
)
|
||||
resp.raise_for_status()
|
||||
if stream:
|
||||
return resp.iter_lines()
|
||||
return resp.json()
|
||||
else:
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
|
||||
body = json.dumps(data).encode() if data else None
|
||||
req = urllib.request.Request(
|
||||
url,
|
||||
data=body,
|
||||
method=method,
|
||||
headers={"Content-Type": "application/json"},
|
||||
)
|
||||
try:
|
||||
with urllib.request.urlopen(req, timeout=self.timeout) as resp:
|
||||
return json.loads(resp.read().decode())
|
||||
except urllib.error.HTTPError as e:
|
||||
raise LlamaClientError(
|
||||
f"HTTP {e.code}: {e.read().decode()}"
|
||||
) from e
|
||||
|
||||
def health_check(self) -> bool:
|
||||
"""
|
||||
Check if the llama.cpp server is healthy.
|
||||
|
||||
Returns:
|
||||
True if the server is healthy, False otherwise.
|
||||
"""
|
||||
try:
|
||||
result = self._request("GET", "/health")
|
||||
return result.get("status") == "ok" if isinstance(result, dict) else False
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def get_health(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Get detailed health status from the server.
|
||||
|
||||
Returns:
|
||||
Dict with status, slots_idle, slots_processing, etc.
|
||||
"""
|
||||
return self._request("GET", "/health")
|
||||
|
||||
def chat_completion(
|
||||
self,
|
||||
messages: List[Dict[str, str]],
|
||||
model: Optional[str] = None,
|
||||
max_tokens: Optional[int] = None,
|
||||
temperature: float = 0.7,
|
||||
top_p: float = 0.9,
|
||||
stream: bool = False,
|
||||
stop: Optional[List[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> Dict[str, Any] | Generator[Dict[str, Any], None, None]:
|
||||
"""
|
||||
Create a chat completion (OpenAI-compatible).
|
||||
|
||||
Args:
|
||||
messages: List of message dicts with 'role' and 'content'.
|
||||
model: Model name (server ignores if only one model loaded).
|
||||
max_tokens: Maximum tokens to generate.
|
||||
temperature: Sampling temperature.
|
||||
top_p: Nucleus sampling parameter.
|
||||
stream: Whether to stream the response.
|
||||
stop: Stop sequences.
|
||||
|
||||
Returns:
|
||||
OpenAI-compatible response dict, or generator if streaming.
|
||||
"""
|
||||
payload = {
|
||||
"model": model or self.model,
|
||||
"messages": messages,
|
||||
"max_tokens": max_tokens or self.max_tokens,
|
||||
"temperature": temperature,
|
||||
"top_p": top_p,
|
||||
"stream": stream,
|
||||
}
|
||||
if stop:
|
||||
payload["stop"] = stop
|
||||
payload.update(kwargs)
|
||||
|
||||
if stream:
|
||||
return self._stream_chat(payload)
|
||||
return self._request("POST", "/v1/chat/completions", data=payload)
|
||||
|
||||
def _stream_chat(
|
||||
self, payload: Dict[str, Any]
|
||||
) -> Generator[Dict[str, Any], None, None]:
|
||||
"""Yield streamed chat completion chunks."""
|
||||
lines = self._request(
|
||||
"POST", "/v1/chat/completions", data=payload, stream=True
|
||||
)
|
||||
for line in lines:
|
||||
if not line:
|
||||
continue
|
||||
line_str = line.decode() if isinstance(line, bytes) else line
|
||||
if line_str.startswith("data: "):
|
||||
data_str = line_str[6:]
|
||||
if data_str.strip() == "[DONE]":
|
||||
break
|
||||
try:
|
||||
yield json.loads(data_str)
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
def completion(
|
||||
self,
|
||||
prompt: str,
|
||||
model: Optional[str] = None,
|
||||
max_tokens: Optional[int] = None,
|
||||
temperature: float = 0.7,
|
||||
top_p: float = 0.9,
|
||||
stream: bool = False,
|
||||
stop: Optional[List[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Create a raw completion (OpenAI-compatible).
|
||||
|
||||
Args:
|
||||
prompt: The text prompt.
|
||||
model: Model name.
|
||||
max_tokens: Maximum tokens to generate.
|
||||
temperature: Sampling temperature.
|
||||
top_p: Nucleus sampling parameter.
|
||||
stream: Whether to stream.
|
||||
stop: Stop sequences.
|
||||
|
||||
Returns:
|
||||
OpenAI-compatible response dict.
|
||||
"""
|
||||
payload = {
|
||||
"model": model or self.model,
|
||||
"prompt": prompt,
|
||||
"max_tokens": max_tokens or self.max_tokens,
|
||||
"temperature": temperature,
|
||||
"top_p": top_p,
|
||||
"stream": stream,
|
||||
}
|
||||
if stop:
|
||||
payload["stop"] = stop
|
||||
payload.update(kwargs)
|
||||
return self._request("POST", "/v1/completions", data=payload)
|
||||
|
||||
def list_models(self) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
List available models.
|
||||
|
||||
Returns:
|
||||
List of model info dicts.
|
||||
"""
|
||||
result = self._request("GET", "/v1/models")
|
||||
if isinstance(result, dict) and "data" in result:
|
||||
return result["data"]
|
||||
return result if isinstance(result, list) else [result]
|
||||
|
||||
def simple_chat(
|
||||
self,
|
||||
message: str,
|
||||
system: Optional[str] = None,
|
||||
**kwargs: Any,
|
||||
) -> str:
|
||||
"""
|
||||
Simplified chat interface — returns just the response text.
|
||||
|
||||
Args:
|
||||
message: User message.
|
||||
system: Optional system prompt.
|
||||
**kwargs: Additional parameters passed to chat_completion.
|
||||
|
||||
Returns:
|
||||
The assistant's response text.
|
||||
"""
|
||||
messages = []
|
||||
if system:
|
||||
messages.append({"role": "system", "content": system})
|
||||
messages.append({"role": "user", "content": message})
|
||||
|
||||
response = self.chat_completion(messages, stream=False, **kwargs)
|
||||
if isinstance(response, dict):
|
||||
choices = response.get("choices", [])
|
||||
if choices:
|
||||
return choices[0].get("message", {}).get("content", "")
|
||||
return ""
|
||||
|
||||
def benchmark(
|
||||
self,
|
||||
prompt: str = "Explain the concept of consciousness in three sentences.",
|
||||
iterations: int = 5,
|
||||
max_tokens: int = 128,
|
||||
) -> Dict[str, float]:
|
||||
"""
|
||||
Run a simple latency benchmark.
|
||||
|
||||
Args:
|
||||
prompt: Prompt to use for benchmarking.
|
||||
iterations: Number of iterations.
|
||||
max_tokens: Max tokens per response.
|
||||
|
||||
Returns:
|
||||
Dict with avg_latency, min_latency, max_latency, total_time.
|
||||
"""
|
||||
latencies = []
|
||||
start = time.time()
|
||||
|
||||
for i in range(iterations):
|
||||
t0 = time.time()
|
||||
self.completion(
|
||||
prompt=prompt,
|
||||
max_tokens=max_tokens,
|
||||
temperature=0.0,
|
||||
)
|
||||
latencies.append(time.time() - t0)
|
||||
|
||||
total = time.time() - start
|
||||
return {
|
||||
"avg_latency": sum(latencies) / len(latencies),
|
||||
"min_latency": min(latencies),
|
||||
"max_latency": max(latencies),
|
||||
"total_time": total,
|
||||
"iterations": iterations,
|
||||
"tokens_per_second": (max_tokens * iterations) / total,
|
||||
}
|
||||
|
||||
|
||||
def main() -> None:
|
||||
"""CLI entry point — run a health check and optional test prompt."""
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="llama.cpp client — sovereign local inference for The Nexus"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--base-url",
|
||||
default=None,
|
||||
help="llama.cpp server URL (default: LLAMA_SERVER_URL or http://127.0.0.1:8081)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--health", action="store_true", help="Run health check only"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--prompt", type=str, help="Send a test prompt to the server"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--benchmark",
|
||||
action="store_true",
|
||||
help="Run a latency benchmark",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--iterations",
|
||||
type=int,
|
||||
default=5,
|
||||
help="Number of benchmark iterations (default: 5)",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
client = LlamaClient(base_url=args.base_url)
|
||||
|
||||
if args.health:
|
||||
if client.health_check():
|
||||
health = client.get_health()
|
||||
print(f"Server healthy: {json.dumps(health, indent=2)}")
|
||||
sys.exit(0)
|
||||
else:
|
||||
print("Server unhealthy or unreachable", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
if args.benchmark:
|
||||
print(f"Running benchmark ({args.iterations} iterations)...")
|
||||
stats = client.benchmark(iterations=args.iterations)
|
||||
print(json.dumps(stats, indent=2))
|
||||
return
|
||||
|
||||
if args.prompt:
|
||||
print(f"Sending prompt: {args.prompt}")
|
||||
response = client.simple_chat(args.prompt)
|
||||
print(f"Response: {response}")
|
||||
return
|
||||
|
||||
# Default: health check
|
||||
if client.health_check():
|
||||
health = client.get_health()
|
||||
print(f"Server healthy: {json.dumps(health, indent=2)}")
|
||||
else:
|
||||
print("Server unhealthy or unreachable", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
277
docs/local-llm.md
Normal file
277
docs/local-llm.md
Normal file
@@ -0,0 +1,277 @@
|
||||
# Local LLM Deployment Guide — llama.cpp Sovereign Backend
|
||||
|
||||
> Issue: #1123 — Standardize llama.cpp Backend for Sovereign Inference
|
||||
|
||||
This guide covers deploying, benchmarking, and running local LLM inference
|
||||
using llama.cpp as the sovereign offline backend for The Nexus.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [Overview](#overview)
|
||||
- [Phase 1: Deployment](#phase-1-deployment)
|
||||
- [Phase 2: Hermes Integration](#phase-2-hermes-integration)
|
||||
- [Phase 3: Benchmarking & Quantization](#phase-3-benchmarking--quantization)
|
||||
- [Model Path Standardization](#model-path-standardization)
|
||||
- [Systemd Service](#systemd-service)
|
||||
- [Troubleshooting](#troubleshooting)
|
||||
|
||||
## Overview
|
||||
|
||||
The Nexus uses llama.cpp as its sovereign local inference backend. This ensures:
|
||||
|
||||
- **Offline capability** — full inference without external API access
|
||||
- **Data sovereignty** — no data leaves the local machine
|
||||
- **Graceful fallback** — Hermes inference router falls back to local when
|
||||
external APIs fail or `LOCAL_ONLY=true`
|
||||
- **OpenAI-compatible API** — llama.cpp server exposes an OpenAI-compatible
|
||||
HTTP interface, making integration seamless
|
||||
|
||||
## Phase 1: Deployment
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- Linux (x86_64 or aarch64) or macOS (Apple Silicon recommended)
|
||||
- CMake 3.14+ and a C/C++ compiler
|
||||
- Git
|
||||
- Python 3.10+ (for the client and provider)
|
||||
|
||||
### Building llama.cpp
|
||||
|
||||
```bash
|
||||
git clone https://github.com/ggerganov/llama.cpp.git
|
||||
cd llama.cpp
|
||||
cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
|
||||
cmake --build build --config Release -j$(nproc)
|
||||
```
|
||||
|
||||
For Apple Silicon with Metal:
|
||||
```bash
|
||||
cmake -B build -DLLAMA_METAL=ON
|
||||
cmake --build build --config Release -j$(sysctl -n hw.ncpu)
|
||||
```
|
||||
|
||||
### Downloading Models
|
||||
|
||||
Place GGUF models in the standardized path:
|
||||
|
||||
```bash
|
||||
mkdir -p /opt/models/llama
|
||||
|
||||
# Example: download a quantized model
|
||||
wget -O /opt/models/llama/llama-3.1-8b-Q4_K_M.gguf \
|
||||
"https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"
|
||||
```
|
||||
|
||||
### Starting the Server
|
||||
|
||||
```bash
|
||||
./build/bin/llama-server \
|
||||
--model /opt/models/llama/llama-3.1-8b-Q4_K_M.gguf \
|
||||
--host 127.0.0.1 \
|
||||
--port 8081 \
|
||||
--ctx-size 4096 \
|
||||
--parallel 2 \
|
||||
--chat-template llama3
|
||||
```
|
||||
|
||||
Or use the systemd service (see below).
|
||||
|
||||
### Health Check
|
||||
|
||||
After starting, verify the server is healthy:
|
||||
|
||||
```bash
|
||||
curl -s http://127.0.0.1:8081/health | python3 -m json.tool
|
||||
```
|
||||
|
||||
Expected response:
|
||||
```json
|
||||
{
|
||||
"status": "ok",
|
||||
"slots_idle": 2,
|
||||
"slots_processing": 0
|
||||
}
|
||||
```
|
||||
|
||||
Or use the client:
|
||||
```python
|
||||
from bin.llama_client import LlamaClient
|
||||
|
||||
client = LlamaClient()
|
||||
if client.health_check():
|
||||
print("Server is healthy")
|
||||
```
|
||||
|
||||
## Phase 2: Hermes Integration
|
||||
|
||||
### llama_client.py
|
||||
|
||||
The Python client (`bin/llama_client.py`) wraps the llama.cpp HTTP API with
|
||||
an OpenAI-compatible interface. It supports:
|
||||
|
||||
- `/v1/chat/completions` — chat-style inference
|
||||
- `/v1/completions` — raw completion
|
||||
- `/health` — health check
|
||||
- Streaming and non-streaming modes
|
||||
- Configurable base URL via `LLAMA_SERVER_URL` env var
|
||||
|
||||
```python
|
||||
from bin.llama_client import LlamaClient
|
||||
|
||||
client = LlamaClient(base_url="http://127.0.0.1:8081")
|
||||
|
||||
# Chat completion
|
||||
response = client.chat_completion(
|
||||
messages=[{"role": "user", "content": "Hello, who are you?"}],
|
||||
max_tokens=256,
|
||||
temperature=0.7,
|
||||
)
|
||||
print(response)
|
||||
```
|
||||
|
||||
### llama_provider.py
|
||||
|
||||
The provider adapter (`nexus/llama_provider.py`) integrates with the Hermes
|
||||
inference router. It is activated when:
|
||||
|
||||
1. All external API providers fail, OR
|
||||
2. The environment variable `LOCAL_ONLY=true` is set
|
||||
|
||||
```python
|
||||
from nexus.llama_provider import LlamaProvider
|
||||
|
||||
provider = LlamaProvider()
|
||||
result = provider.infer("What is the meaning of life?", context=[])
|
||||
```
|
||||
|
||||
### Environment Variables
|
||||
|
||||
| Variable | Default | Description |
|
||||
|---|---|---|
|
||||
| `LLAMA_SERVER_URL` | `http://127.0.0.1:8081` | llama.cpp server base URL |
|
||||
| `LLAMA_MODEL_PATH` | `/opt/models/llama` | Directory containing GGUF models |
|
||||
| `LLAMA_DEFAULT_MODEL` | (auto-detected) | Default model filename |
|
||||
| `LOCAL_ONLY` | `false` | Force local-only inference |
|
||||
| `LLAMA_CTX_SIZE` | `4096` | Context window size |
|
||||
| `LLAMA_MAX_TOKENS` | `512` | Maximum tokens per response |
|
||||
|
||||
## Phase 3: Benchmarking & Quantization
|
||||
|
||||
### Benchmarking
|
||||
|
||||
Use llama.cpp's built-in perplexity and speed benchmarks:
|
||||
|
||||
```bash
|
||||
# Speed benchmark
|
||||
./build/bin/llama-bench \
|
||||
-m /opt/models/llama/llama-3.1-8b-Q4_K_M.gguf \
|
||||
-p 512 -n 128
|
||||
|
||||
# Perplexity evaluation
|
||||
./build/bin/llama-perplexity \
|
||||
-m /opt/models/llama/llama-3.1-8b-Q4_K_M.gguf \
|
||||
-f wiki.test.raw
|
||||
```
|
||||
|
||||
The client also supports a simple latency benchmark:
|
||||
|
||||
```python
|
||||
from bin.llama_client import LlamaClient
|
||||
import time
|
||||
|
||||
client = LlamaClient()
|
||||
|
||||
start = time.time()
|
||||
for i in range(10):
|
||||
client.chat_completion(
|
||||
messages=[{"role": "user", "content": f"Test prompt {i}."}],
|
||||
max_tokens=64,
|
||||
)
|
||||
elapsed = time.time() - start
|
||||
print(f"Average latency: {elapsed / 10:.2f}s")
|
||||
```
|
||||
|
||||
### Quantization Guide
|
||||
|
||||
Quantization reduces model size and increases inference speed at the cost of
|
||||
some accuracy. Recommended quantizations for different hardware:
|
||||
|
||||
| Hardware | Quantization | Size (8B) | Quality |
|
||||
|---|---|---|---|
|
||||
| 16GB+ VRAM | Q8_0 | ~8.5 GB | Near-original |
|
||||
| 8GB VRAM | Q4_K_M | ~4.7 GB | Good balance |
|
||||
| 4GB VRAM / CPU | Q4_0 | ~4.4 GB | Acceptable |
|
||||
| Very constrained | Q2_K | ~3.0 GB | Degraded |
|
||||
|
||||
Quantize a model:
|
||||
|
||||
```bash
|
||||
./build/bin/llama-quantize \
|
||||
/opt/models/llama/model-f16.gguf \
|
||||
/opt/models/llama/model-Q4_K_M.gguf \
|
||||
Q4_K_M
|
||||
```
|
||||
|
||||
### Recommended Models
|
||||
|
||||
For The Nexus workloads:
|
||||
|
||||
- **General reasoning**: Llama 3.1 8B Q4_K_M — fast, good quality
|
||||
- **Code assistance**: DeepSeek-Coder-V2-Lite Q4_K_M
|
||||
- **Small/fast**: Phi-3-mini Q4_K_M — runs well on CPU
|
||||
|
||||
## Model Path Standardization
|
||||
|
||||
All Nexus components expect models under `/opt/models/llama/` by default.
|
||||
|
||||
Directory structure:
|
||||
```
|
||||
/opt/models/llama/
|
||||
llama-3.1-8b-Q4_K_M.gguf
|
||||
deepseek-coder-lite-Q4_K_M.gguf
|
||||
phi-3-mini-Q4_K_M.gguf
|
||||
```
|
||||
|
||||
Override with `LLAMA_MODEL_PATH` environment variable.
|
||||
|
||||
## Systemd Service
|
||||
|
||||
A systemd unit file is provided at `systemd/llama-server.service`.
|
||||
|
||||
### Installation
|
||||
|
||||
```bash
|
||||
sudo cp systemd/llama-server.service /etc/systemd/system/
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl enable --now llama-server.service
|
||||
sudo systemctl status llama-server.service
|
||||
```
|
||||
|
||||
### Logs
|
||||
|
||||
```bash
|
||||
journalctl -u llama-server.service -f
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Server won't start
|
||||
- Check that the GGUF model file exists at the configured path
|
||||
- Verify port 8081 is not in use: `ss -tlnp | grep 8081`
|
||||
- Check logs: `journalctl -u llama-server -n 50`
|
||||
|
||||
### Slow inference
|
||||
- Use a more aggressive quantization (Q4_K_M instead of Q8_0)
|
||||
- Reduce context size (`--ctx-size 2048`)
|
||||
- For GPU: verify CUDA/Metal is enabled at build time
|
||||
- Check `--parallel` value — too high thrashes the GPU
|
||||
|
||||
### Out of memory
|
||||
- Reduce `--ctx-size`
|
||||
- Use a smaller quantization
|
||||
- Use a smaller model (3B instead of 8B)
|
||||
|
||||
### Client connection refused
|
||||
- Verify server is running: `curl http://127.0.0.1:8081/health`
|
||||
- Check `LLAMA_SERVER_URL` env var matches server config
|
||||
- Ensure firewall allows localhost:8081
|
||||
@@ -1,32 +1 @@
|
||||
"""
|
||||
Nexus — Embodied Mind Module
|
||||
|
||||
The perception adapter, experience store, trajectory logger, and
|
||||
consciousness loop that give Timmy a body in the Nexus.
|
||||
"""
|
||||
|
||||
from nexus.perception_adapter import (
|
||||
ws_to_perception,
|
||||
parse_actions,
|
||||
PerceptionBuffer,
|
||||
Perception,
|
||||
Action,
|
||||
)
|
||||
from nexus.experience_store import ExperienceStore
|
||||
from nexus.trajectory_logger import TrajectoryLogger
|
||||
|
||||
try:
|
||||
from nexus.nexus_think import NexusMind
|
||||
except Exception:
|
||||
NexusMind = None
|
||||
|
||||
__all__ = [
|
||||
"ws_to_perception",
|
||||
"parse_actions",
|
||||
"PerceptionBuffer",
|
||||
"Perception",
|
||||
"Action",
|
||||
"ExperienceStore",
|
||||
"TrajectoryLogger",
|
||||
"NexusMind",
|
||||
]
|
||||
# nexus package — cognition and inference components for The Nexus
|
||||
|
||||
243
nexus/llama_provider.py
Normal file
243
nexus/llama_provider.py
Normal file
@@ -0,0 +1,243 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
llama_provider.py — Provider adapter for Hermes inference router.
|
||||
|
||||
Integrates llama.cpp as a sovereign local backend for The Nexus.
|
||||
Activated when:
|
||||
1. All external API providers fail, OR
|
||||
2. LOCAL_ONLY=true environment variable is set
|
||||
|
||||
Issue: #1123 — Standardize llama.cpp Backend for Sovereign Inference
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import sys
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
from bin.llama_client import LlamaClient, LlamaClientError
|
||||
|
||||
logger = logging.getLogger("nexus.llama_provider")
|
||||
|
||||
|
||||
class LlamaProvider:
|
||||
"""
|
||||
Hermes-compatible inference provider backed by local llama.cpp server.
|
||||
|
||||
This provider follows the same interface expected by the Hermes
|
||||
inference router, enabling drop-in fallback when external APIs
|
||||
(OpenAI, Anthropic, etc.) are unavailable or when LOCAL_ONLY=true.
|
||||
|
||||
Environment variables:
|
||||
LLAMA_SERVER_URL — llama.cpp server URL
|
||||
LOCAL_ONLY — if "true", this provider takes priority
|
||||
LLAMA_DEFAULT_MODEL — model name override
|
||||
LLAMA_MAX_TOKENS — default max tokens
|
||||
"""
|
||||
|
||||
NAME = "llama-local"
|
||||
PRIORITY = 100 # Lower priority than external providers by default
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
base_url: Optional[str] = None,
|
||||
model: Optional[str] = None,
|
||||
):
|
||||
self.client = LlamaClient(base_url=base_url, model=model)
|
||||
self._local_only = os.environ.get("LOCAL_ONLY", "").lower() in (
|
||||
"true",
|
||||
"1",
|
||||
"yes",
|
||||
)
|
||||
if self._local_only:
|
||||
self.PRIORITY = 0 # Highest priority when LOCAL_ONLY
|
||||
logger.info("LOCAL_ONLY mode enabled — llama provider is primary")
|
||||
|
||||
@property
|
||||
def name(self) -> str:
|
||||
return self.NAME
|
||||
|
||||
@property
|
||||
def available(self) -> bool:
|
||||
"""Check if the local llama.cpp server is reachable and healthy."""
|
||||
return self.client.health_check()
|
||||
|
||||
@property
|
||||
def local_only(self) -> bool:
|
||||
"""Whether LOCAL_ONLY mode is enabled."""
|
||||
return self._local_only
|
||||
|
||||
def infer(
|
||||
self,
|
||||
prompt: str,
|
||||
context: Optional[List[Dict[str, str]]] = None,
|
||||
system: Optional[str] = None,
|
||||
max_tokens: Optional[int] = None,
|
||||
temperature: float = 0.7,
|
||||
**kwargs: Any,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Run inference through the local llama.cpp server.
|
||||
|
||||
Args:
|
||||
prompt: The user prompt/question.
|
||||
context: Optional conversation history as list of
|
||||
{"role": ..., "content": ...} dicts.
|
||||
system: Optional system prompt override.
|
||||
max_tokens: Maximum tokens to generate.
|
||||
temperature: Sampling temperature.
|
||||
|
||||
Returns:
|
||||
Dict with keys:
|
||||
- provider: str — provider name
|
||||
- response: str — the generated text
|
||||
- model: str — model used
|
||||
- tokens_used: int — approximate token count
|
||||
- latency_ms: float — inference latency in ms
|
||||
|
||||
Raises:
|
||||
LlamaClientError: If the server returns an error.
|
||||
RuntimeError: If the server is not available.
|
||||
"""
|
||||
import time
|
||||
|
||||
if not self.available:
|
||||
raise RuntimeError(
|
||||
f"llama.cpp server is not available at {self.client.base_url}. "
|
||||
"Start the server or check LLAMA_SERVER_URL."
|
||||
)
|
||||
|
||||
messages = []
|
||||
if system:
|
||||
messages.append({"role": "system", "content": system})
|
||||
if context:
|
||||
messages.extend(context)
|
||||
messages.append({"role": "user", "content": prompt})
|
||||
|
||||
t0 = time.time()
|
||||
raw = self.client.chat_completion(
|
||||
messages=messages,
|
||||
max_tokens=max_tokens or self.client.max_tokens,
|
||||
temperature=temperature,
|
||||
stream=False,
|
||||
**kwargs,
|
||||
)
|
||||
latency_ms = (time.time() - t0) * 1000
|
||||
|
||||
# Parse OpenAI-compatible response
|
||||
response_text = ""
|
||||
model_used = ""
|
||||
tokens_used = 0
|
||||
|
||||
if isinstance(raw, dict):
|
||||
choices = raw.get("choices", [])
|
||||
if choices:
|
||||
msg = choices[0].get("message", {})
|
||||
response_text = msg.get("content", "")
|
||||
usage = raw.get("usage", {})
|
||||
tokens_used = usage.get("total_tokens", 0)
|
||||
model_used = raw.get("model", self.client.model)
|
||||
|
||||
return {
|
||||
"provider": self.NAME,
|
||||
"response": response_text,
|
||||
"model": model_used,
|
||||
"tokens_used": tokens_used,
|
||||
"latency_ms": round(latency_ms, 2),
|
||||
}
|
||||
|
||||
def infer_stream(
|
||||
self,
|
||||
prompt: str,
|
||||
context: Optional[List[Dict[str, str]]] = None,
|
||||
system: Optional[str] = None,
|
||||
max_tokens: Optional[int] = None,
|
||||
temperature: float = 0.7,
|
||||
**kwargs: Any,
|
||||
):
|
||||
"""
|
||||
Stream inference tokens from the local llama.cpp server.
|
||||
|
||||
Yields partial response dicts as tokens arrive.
|
||||
"""
|
||||
if not self.available:
|
||||
raise RuntimeError(
|
||||
f"llama.cpp server is not available at {self.client.base_url}"
|
||||
)
|
||||
|
||||
messages = []
|
||||
if system:
|
||||
messages.append({"role": "system", "content": system})
|
||||
if context:
|
||||
messages.extend(context)
|
||||
messages.append({"role": "user", "content": prompt})
|
||||
|
||||
chunks = self.client.chat_completion(
|
||||
messages=messages,
|
||||
max_tokens=max_tokens or self.client.max_tokens,
|
||||
temperature=temperature,
|
||||
stream=True,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
for chunk in chunks:
|
||||
if isinstance(chunk, dict):
|
||||
choices = chunk.get("choices", [])
|
||||
if choices:
|
||||
delta = choices[0].get("delta", {})
|
||||
content = delta.get("content", "")
|
||||
if content:
|
||||
yield {
|
||||
"provider": self.NAME,
|
||||
"delta": content,
|
||||
"done": choices[0].get("finish_reason") is not None,
|
||||
}
|
||||
|
||||
def get_status(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Get provider status information.
|
||||
|
||||
Returns:
|
||||
Dict with provider name, availability, server health, etc.
|
||||
"""
|
||||
status: Dict[str, Any] = {
|
||||
"provider": self.NAME,
|
||||
"available": self.available,
|
||||
"local_only": self._local_only,
|
||||
"base_url": self.client.base_url,
|
||||
"model": self.client.model,
|
||||
}
|
||||
if self.available:
|
||||
try:
|
||||
health = self.client.get_health()
|
||||
status["server_health"] = health
|
||||
except Exception:
|
||||
pass
|
||||
return status
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Integration helper for the Hermes inference router
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def register_provider(router: Any) -> LlamaProvider:
|
||||
"""
|
||||
Register the llama provider with a Hermes inference router.
|
||||
|
||||
Args:
|
||||
router: A Hermes inference router instance with an
|
||||
`add_provider(name, provider, priority)` method.
|
||||
|
||||
Returns:
|
||||
The registered LlamaProvider instance.
|
||||
"""
|
||||
provider = LlamaProvider()
|
||||
if hasattr(router, "add_provider"):
|
||||
router.add_provider(provider.NAME, provider, priority=provider.PRIORITY)
|
||||
logger.info(
|
||||
"Registered llama provider (priority=%d, local_only=%s)",
|
||||
provider.PRIORITY,
|
||||
provider.local_only,
|
||||
)
|
||||
return provider
|
||||
1
systemd/.gitkeep
Normal file
1
systemd/.gitkeep
Normal file
@@ -0,0 +1 @@
|
||||
placeholder
|
||||
49
systemd/llama-server.service
Normal file
49
systemd/llama-server.service
Normal file
@@ -0,0 +1,49 @@
|
||||
[Unit]
|
||||
Description=llama.cpp HTTP Server — Sovereign Local LLM Backend for The Nexus
|
||||
Documentation=file:///opt/the-nexus/docs/local-llm.md
|
||||
After=network.target
|
||||
Wants=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=llama
|
||||
Group=llama
|
||||
|
||||
# Model configuration
|
||||
Environment=LLAMA_MODEL_PATH=/opt/models/llama
|
||||
ExecStart=/opt/llama.cpp/build/bin/llama-server \
|
||||
--model /opt/models/llama/llama-3.1-8b-Q4_K_M.gguf \
|
||||
--host 127.0.0.1 \
|
||||
--port 8081 \
|
||||
--ctx-size 4096 \
|
||||
--parallel 2 \
|
||||
--chat-template llama3
|
||||
|
||||
# Resource limits
|
||||
LimitNOFILE=65536
|
||||
LimitNPROC=4096
|
||||
MemoryMax=12G
|
||||
|
||||
# Restart policy
|
||||
Restart=on-failure
|
||||
RestartSec=5
|
||||
StartLimitIntervalSec=300
|
||||
StartLimitBurst=5
|
||||
|
||||
# Hardening
|
||||
ProtectSystem=strict
|
||||
ProtectHome=read-only
|
||||
ReadWritePaths=/opt/models/llama
|
||||
PrivateTmp=true
|
||||
NoNewPrivileges=true
|
||||
ProtectKernelTunables=true
|
||||
ProtectKernelModules=true
|
||||
ProtectControlGroups=true
|
||||
|
||||
# Logging
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
SyslogIdentifier=llama-server
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
1
tests/__init__.py
Normal file
1
tests/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# tests package
|
||||
325
tests/test_llama_client.py
Normal file
325
tests/test_llama_client.py
Normal file
@@ -0,0 +1,325 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests for llama_client.py — the sovereign llama.cpp HTTP client.
|
||||
|
||||
Issue: #1123 — Standardize llama.cpp Backend for Sovereign Inference
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import unittest
|
||||
from unittest.mock import MagicMock, patch
|
||||
from io import BytesIO
|
||||
|
||||
# Add project root to path
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
from bin.llama_client import LlamaClient, LlamaClientError
|
||||
|
||||
|
||||
class TestLlamaClientInit(unittest.TestCase):
    """Test client initialization and configuration.

    Tests that assert built-in defaults first clear the environment so the
    suite passes even on hosts that export LLAMA_SERVER_URL,
    LLAMA_DEFAULT_MODEL, or LLAMA_MAX_TOKENS (previously those tests would
    pick up the ambient value and fail).
    """

    @staticmethod
    def _clean_env():
        """Return a context manager yielding an empty os.environ."""
        return patch.dict(os.environ, {}, clear=True)

    def test_default_base_url(self):
        # Must not be influenced by a pre-set LLAMA_SERVER_URL.
        with self._clean_env():
            client = LlamaClient()
        self.assertEqual(client.base_url, "http://127.0.0.1:8081")

    def test_custom_base_url(self):
        client = LlamaClient(base_url="http://localhost:9999")
        self.assertEqual(client.base_url, "http://localhost:9999")

    def test_base_url_strips_trailing_slash(self):
        client = LlamaClient(base_url="http://localhost:8081/")
        self.assertEqual(client.base_url, "http://localhost:8081")

    def test_env_var_base_url(self):
        with patch.dict(os.environ, {"LLAMA_SERVER_URL": "http://env-host:1234"}):
            client = LlamaClient()
            self.assertEqual(client.base_url, "http://env-host:1234")

    def test_explicit_url_overrides_env(self):
        with patch.dict(os.environ, {"LLAMA_SERVER_URL": "http://env-host:1234"}):
            client = LlamaClient(base_url="http://explicit:5678")
            self.assertEqual(client.base_url, "http://explicit:5678")

    def test_default_model(self):
        # Must not be influenced by a pre-set LLAMA_DEFAULT_MODEL.
        with self._clean_env():
            client = LlamaClient()
        self.assertEqual(client.model, "default")

    def test_custom_model(self):
        client = LlamaClient(model="llama-3.1-8b")
        self.assertEqual(client.model, "llama-3.1-8b")

    def test_env_model(self):
        with patch.dict(os.environ, {"LLAMA_DEFAULT_MODEL": "phi-3"}):
            client = LlamaClient()
            self.assertEqual(client.model, "phi-3")

    def test_max_tokens_default(self):
        # Must not be influenced by a pre-set LLAMA_MAX_TOKENS.
        with self._clean_env():
            client = LlamaClient()
        self.assertEqual(client.max_tokens, 512)

    def test_max_tokens_env(self):
        with patch.dict(os.environ, {"LLAMA_MAX_TOKENS": "1024"}):
            client = LlamaClient()
            self.assertEqual(client.max_tokens, 1024)
|
||||
|
||||
|
||||
class TestLlamaClientHealthCheck(unittest.TestCase):
    """Exercise LlamaClient.health_check() against a mocked HTTP layer."""

    @staticmethod
    def _wire_server(mock_requests, payload):
        """Install a fake requests.Session whose responses decode to *payload*.

        Returns the fake session so callers can inspect the request it saw.
        """
        session = MagicMock()
        response = MagicMock()
        response.json.return_value = payload
        response.raise_for_status = MagicMock()
        session.request.return_value = response
        mock_requests.Session.return_value = session
        return session

    @patch("bin.llama_client.requests")
    def test_health_check_healthy(self, mock_requests):
        session = self._wire_server(mock_requests, {"status": "ok", "slots_idle": 2})

        self.assertTrue(LlamaClient().health_check())
        session.request.assert_called_with(
            "GET",
            "http://127.0.0.1:8081/health",
            json=None,
            timeout=120.0,
            stream=False,
        )

    @patch("bin.llama_client.requests")
    def test_health_check_unhealthy(self, mock_requests):
        self._wire_server(mock_requests, {"status": "error"})

        self.assertFalse(LlamaClient().health_check())

    @patch("bin.llama_client.requests")
    def test_health_check_connection_error(self, mock_requests):
        # Simulate a refused connection: every request raises.
        session = MagicMock()
        session.request.side_effect = ConnectionError("refused")
        mock_requests.Session.return_value = session

        self.assertFalse(LlamaClient().health_check())
|
||||
|
||||
|
||||
class TestLlamaClientChatCompletion(unittest.TestCase):
    """Exercise the OpenAI-style /v1/chat/completions wrapper."""

    @staticmethod
    def _fake_server(mock_requests, payload):
        """Install a fake requests.Session answering with *payload*; return it."""
        session = MagicMock()
        response = MagicMock()
        response.json.return_value = payload
        response.raise_for_status = MagicMock()
        session.request.return_value = response
        mock_requests.Session.return_value = session
        return session

    @patch("bin.llama_client.requests")
    def test_chat_completion_basic(self, mock_requests):
        reply = {
            "id": "chatcmpl-123",
            "model": "llama-3.1-8b",
            "choices": [
                {
                    "index": 0,
                    "message": {"role": "assistant", "content": "Hello! I am a local AI."},
                    "finish_reason": "stop",
                }
            ],
            "usage": {"prompt_tokens": 10, "completion_tokens": 8, "total_tokens": 18},
        }
        session = self._fake_server(mock_requests, reply)

        result = LlamaClient().chat_completion(
            messages=[{"role": "user", "content": "Hello"}],
            max_tokens=64,
        )

        self.assertIsInstance(result, dict)
        self.assertEqual(
            result["choices"][0]["message"]["content"], "Hello! I am a local AI."
        )
        self.assertEqual(result["usage"]["total_tokens"], 18)

        # Inspect the JSON body that was actually sent to the server.
        sent = session.request.call_args[1]["json"]
        self.assertEqual(sent["messages"], [{"role": "user", "content": "Hello"}])
        self.assertEqual(sent["max_tokens"], 64)
        self.assertEqual(sent["stream"], False)

    @patch("bin.llama_client.requests")
    def test_chat_completion_with_system(self, mock_requests):
        session = self._fake_server(
            mock_requests,
            {"choices": [{"message": {"content": "I'm helpful."}}], "usage": {}},
        )

        conversation = [
            {"role": "system", "content": "You are helpful."},
            {"role": "user", "content": "Hi"},
        ]
        LlamaClient().chat_completion(messages=conversation)

        # Both the system and user turns must be forwarded verbatim.
        sent = session.request.call_args[1]["json"]
        self.assertEqual(len(sent["messages"]), 2)
|
||||
|
||||
|
||||
class TestLlamaClientSimpleChat(unittest.TestCase):
    """Exercise the simplified one-shot chat interface."""

    @staticmethod
    def _fake_server(mock_requests, payload):
        """Install a fake requests.Session answering with *payload*; return it."""
        session = MagicMock()
        response = MagicMock()
        response.json.return_value = payload
        response.raise_for_status = MagicMock()
        session.request.return_value = response
        mock_requests.Session.return_value = session
        return session

    @patch("bin.llama_client.requests")
    def test_simple_chat(self, mock_requests):
        session = self._fake_server(
            mock_requests,
            {"choices": [{"message": {"content": "42"}}], "usage": {"total_tokens": 10}},
        )

        answer = LlamaClient().simple_chat("What is the answer?")
        self.assertEqual(answer, "42")

        # A bare prompt becomes a single user message.
        sent = session.request.call_args[1]["json"]
        self.assertEqual(sent["messages"][0]["role"], "user")
        self.assertEqual(sent["messages"][0]["content"], "What is the answer?")

    @patch("bin.llama_client.requests")
    def test_simple_chat_with_system(self, mock_requests):
        session = self._fake_server(
            mock_requests,
            {"choices": [{"message": {"content": "Yes"}}], "usage": {}},
        )

        LlamaClient().simple_chat("Are you alive?", system="You are a wizard.")

        # The system prompt must be prepended as the first message.
        sent = session.request.call_args[1]["json"]
        self.assertEqual(sent["messages"][0]["role"], "system")
        self.assertEqual(sent["messages"][0]["content"], "You are a wizard.")

    @patch("bin.llama_client.requests")
    def test_simple_chat_empty_response(self, mock_requests):
        # A server reply with no choices should degrade to an empty string.
        self._fake_server(mock_requests, {"choices": [], "usage": {}})

        self.assertEqual(LlamaClient().simple_chat("Hello"), "")
|
||||
|
||||
|
||||
class TestLlamaClientListModels(unittest.TestCase):
    """Exercise model listing against a mocked /v1/models endpoint."""

    @patch("bin.llama_client.requests")
    def test_list_models(self, mock_requests):
        catalog = {
            "data": [
                {"id": "llama-3.1-8b", "object": "model"},
                {"id": "phi-3-mini", "object": "model"},
            ]
        }
        session = MagicMock()
        response = MagicMock()
        response.json.return_value = catalog
        response.raise_for_status = MagicMock()
        session.request.return_value = response
        mock_requests.Session.return_value = session

        models = LlamaClient().list_models()

        # The "data" list is unwrapped and returned in server order.
        self.assertEqual(len(models), 2)
        self.assertEqual(models[0]["id"], "llama-3.1-8b")
|
||||
|
||||
|
||||
class TestLlamaClientBenchmark(unittest.TestCase):
    """Exercise the latency benchmark helper with a fully fake clock."""

    @patch("bin.llama_client.time.time")
    @patch("bin.llama_client.requests")
    def test_benchmark(self, mock_requests, mock_time):
        session = MagicMock()
        response = MagicMock()
        response.json.return_value = {
            "choices": [{"message": {"content": "result"}}],
            "usage": {"total_tokens": 20},
        }
        response.raise_for_status = MagicMock()
        session.request.return_value = response
        mock_requests.Session.return_value = session

        # time.time() is consumed 12 times for 5 iterations: once at the
        # start, twice per iteration (t0 and the post-call reading), and
        # once at the end. Each iteration takes 0.5 simulated seconds.
        ticks = [0.0]  # benchmark start
        for step in range(5):
            ticks.extend((step * 0.5, step * 0.5 + 0.5))  # t0, latency reading
        ticks.append(2.5)  # benchmark end
        mock_time.side_effect = ticks

        stats = LlamaClient().benchmark(iterations=5, max_tokens=64)

        for key in ("avg_latency", "min_latency", "max_latency", "total_time"):
            self.assertIn(key, stats)
        self.assertEqual(stats["iterations"], 5)
|
||||
|
||||
|
||||
class TestLlamaClientCompletion(unittest.TestCase):
    """Exercise the raw (non-chat) /v1/completions endpoint."""

    @patch("bin.llama_client.requests")
    def test_completion(self, mock_requests):
        session = MagicMock()
        response = MagicMock()
        response.json.return_value = {
            "choices": [{"text": "Generated text here."}],
            "usage": {"total_tokens": 15},
        }
        response.raise_for_status = MagicMock()
        session.request.return_value = response
        mock_requests.Session.return_value = session

        result = LlamaClient().completion(prompt="Once upon a time", max_tokens=100)

        self.assertEqual(result["choices"][0]["text"], "Generated text here.")

        # Raw completions send a "prompt" field, not a "messages" list.
        sent = session.request.call_args[1]["json"]
        self.assertEqual(sent["prompt"], "Once upon a time")
        self.assertEqual(sent["max_tokens"], 100)
|
||||
|
||||
|
||||
class TestLlamaClientError(unittest.TestCase):
    """Sanity-check the client's exception type."""

    def test_error_class(self):
        message = "Something went wrong"
        error = LlamaClientError(message)
        # Must behave like any standard exception: catchable and printable.
        self.assertIsInstance(error, Exception)
        self.assertEqual(str(error), message)
|
||||
|
||||
|
||||
# Allow running this file directly: `python tests/test_llama_client.py`.
if __name__ == "__main__":
    unittest.main()
|
||||
Reference in New Issue
Block a user