Compare commits
1 Commits
fix/879
...
whip/1123-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ac2ec40657 |
354
bin/llama_client.py
Normal file
354
bin/llama_client.py
Normal file
@@ -0,0 +1,354 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
llama_client.py — OpenAI-compatible client for llama.cpp HTTP API.
|
||||
|
||||
Wraps the llama-server endpoint for use as a sovereign local LLM backend.
|
||||
Supports chat completions, raw completions, streaming, health checks,
|
||||
model listing, and benchmarking.
|
||||
|
||||
Usage:
|
||||
python3 bin/llama_client.py chat "Hello, how are you?"
|
||||
python3 bin/llama_client.py health
|
||||
python3 bin/llama_client.py models
|
||||
python3 bin/llama_client.py benchmark --iterations 10
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Generator, Optional
|
||||
|
||||
try:
|
||||
import requests
|
||||
except ImportError:
|
||||
requests = None # fallback to urllib
|
||||
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Configuration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
DEFAULT_ENDPOINT = os.environ.get("LLAMA_ENDPOINT", "http://localhost:11435")
|
||||
DEFAULT_MODEL = os.environ.get("LLAMA_MODEL", "qwen2.5-7b")
|
||||
DEFAULT_MAX_TOKENS = int(os.environ.get("LLAMA_MAX_TOKENS", "512"))
|
||||
DEFAULT_TEMPERATURE = float(os.environ.get("LLAMA_TEMPERATURE", "0.7"))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Data classes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
|
||||
class ChatMessage:
|
||||
role: str # "system", "user", "assistant"
|
||||
content: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class CompletionResponse:
|
||||
text: str
|
||||
tokens_used: int = 0
|
||||
latency_ms: float = 0.0
|
||||
model: str = ""
|
||||
finish_reason: str = ""
|
||||
|
||||
|
||||
@dataclass
|
||||
class HealthStatus:
|
||||
healthy: bool
|
||||
endpoint: str
|
||||
model_loaded: bool = False
|
||||
model_name: str = ""
|
||||
error: str = ""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# HTTP helper (works with or without requests library)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _http_post(url: str, data: dict, timeout: int = 120) -> dict:
|
||||
"""POST JSON to URL, return parsed JSON response."""
|
||||
body = json.dumps(data).encode("utf-8")
|
||||
req = urllib.request.Request(
|
||||
url,
|
||||
data=body,
|
||||
headers={"Content-Type": "application/json"},
|
||||
method="POST",
|
||||
)
|
||||
try:
|
||||
with urllib.request.urlopen(req, timeout=timeout) as resp:
|
||||
return json.loads(resp.read().decode("utf-8"))
|
||||
except urllib.error.URLError as e:
|
||||
raise ConnectionError(f"Cannot reach {url}: {e}")
|
||||
|
||||
|
||||
def _http_get(url: str, timeout: int = 10) -> dict:
|
||||
"""GET URL, return parsed JSON response."""
|
||||
req = urllib.request.Request(url, headers={"Accept": "application/json"})
|
||||
try:
|
||||
with urllib.request.urlopen(req, timeout=timeout) as resp:
|
||||
return json.loads(resp.read().decode("utf-8"))
|
||||
except urllib.error.URLError as e:
|
||||
raise ConnectionError(f"Cannot reach {url}: {e}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# LlamaClient
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class LlamaClient:
    """OpenAI-compatible client for a llama.cpp HTTP server (llama-server).

    Wraps /health, /v1/models, /v1/chat/completions and /completion. All
    network access goes through the module-level _http_get/_http_post
    helpers, so the client works without the ``requests`` library.
    """

    def __init__(self, endpoint: str = DEFAULT_ENDPOINT, model: str = DEFAULT_MODEL):
        """Create a client.

        Args:
            endpoint: Base URL of llama-server; a trailing slash is stripped.
            model: Model name sent with OpenAI-compatible requests.
        """
        self.endpoint = endpoint.rstrip("/")
        self.model = model

    # --- Health ---

    def health_check(self) -> HealthStatus:
        """Probe the /health endpoint.

        Never raises: connection and parse errors are captured in the
        returned HealthStatus instead.
        """
        try:
            data = _http_get(f"{self.endpoint}/health")
            # llama-server reports {"status": "ok"} once a model is loaded;
            # also accept an explicit model_loaded flag for compatibility.
            model_loaded = data.get("status", "") == "ok" or data.get("model_loaded", False)
            return HealthStatus(
                healthy=True,
                endpoint=self.endpoint,
                model_loaded=model_loaded,
                model_name=data.get("model_path", self.model),
            )
        except Exception as e:
            return HealthStatus(healthy=False, endpoint=self.endpoint, error=str(e))

    def is_healthy(self) -> bool:
        """Quick boolean health check."""
        return self.health_check().healthy

    # --- Models ---

    def list_models(self) -> list[dict]:
        """List loaded models (OpenAI-compatible /v1/models).

        Returns an empty list when the server is unreachable.
        """
        try:
            data = _http_get(f"{self.endpoint}/v1/models")
            return data.get("data", [])
        except Exception:
            return []

    # --- Chat completions ---

    def chat(
        self,
        messages: list[ChatMessage],
        max_tokens: int = DEFAULT_MAX_TOKENS,
        temperature: float = DEFAULT_TEMPERATURE,
        stream: bool = False,
    ) -> CompletionResponse:
        """Send a chat completion request (OpenAI-compatible /v1/chat/completions).

        Args:
            messages: Conversation history as ChatMessage objects.
            max_tokens: Maximum tokens to generate.
            temperature: Sampling temperature.
            stream: Kept for interface compatibility but ignored; this
                method always performs a blocking, non-streaming request.
                Use chat_stream() for token streaming.

        Returns:
            CompletionResponse with text, token usage and wall-clock latency.

        Raises:
            ConnectionError: if the server is unreachable.
        """
        payload = {
            "model": self.model,
            "messages": [{"role": m.role, "content": m.content} for m in messages],
            "max_tokens": max_tokens,
            "temperature": temperature,
            # Always request a single JSON body: sending stream=True here
            # would make the server emit SSE, which this parser cannot read.
            "stream": False,
        }

        start = time.time()
        data = _http_post(f"{self.endpoint}/v1/chat/completions", payload)
        latency = (time.time() - start) * 1000

        # Guard against an explicit empty "choices" list, not just a
        # missing key, so we never IndexError on odd server responses.
        choice = (data.get("choices") or [{}])[0]
        message = choice.get("message", {})
        usage = data.get("usage", {})

        return CompletionResponse(
            text=message.get("content", ""),
            tokens_used=usage.get("total_tokens", 0),
            latency_ms=latency,
            model=data.get("model", self.model),
            finish_reason=choice.get("finish_reason", ""),
        )

    def chat_stream(
        self,
        messages: list[ChatMessage],
        max_tokens: int = DEFAULT_MAX_TOKENS,
        temperature: float = DEFAULT_TEMPERATURE,
    ) -> Generator[str, None, None]:
        """Stream chat completion tokens as they are generated.

        Yields:
            Content fragments (str) from the server-sent event stream.

        Raises:
            urllib.error.URLError: if the server is unreachable.
        """
        payload = {
            "model": self.model,
            "messages": [{"role": m.role, "content": m.content} for m in messages],
            "max_tokens": max_tokens,
            "temperature": temperature,
            "stream": True,
        }
        body = json.dumps(payload).encode("utf-8")
        req = urllib.request.Request(
            f"{self.endpoint}/v1/chat/completions",
            data=body,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=300) as resp:
            for line in resp:
                line = line.decode("utf-8").strip()
                if line.startswith("data: "):
                    chunk = line[6:]
                    if chunk == "[DONE]":
                        break
                    try:
                        data = json.loads(chunk)
                        # Same empty-"choices" guard as in chat().
                        delta = (data.get("choices") or [{}])[0].get("delta", {})
                        content = delta.get("content", "")
                        if content:
                            yield content
                    except json.JSONDecodeError:
                        # Skip malformed SSE chunks rather than aborting.
                        continue

    # --- Simple helpers ---

    def simple_chat(
        self,
        prompt: str,
        system: Optional[str] = None,
        max_tokens: int = DEFAULT_MAX_TOKENS,
    ) -> str:
        """One-shot chat: send *prompt* (with optional system message), return text."""
        messages = []
        if system:
            messages.append(ChatMessage(role="system", content=system))
        messages.append(ChatMessage(role="user", content=prompt))
        response = self.chat(messages, max_tokens=max_tokens)
        return response.text

    # --- Raw completion ---

    def complete(
        self,
        prompt: str,
        max_tokens: int = DEFAULT_MAX_TOKENS,
        temperature: float = DEFAULT_TEMPERATURE,
    ) -> CompletionResponse:
        """Raw text completion via llama.cpp's native /completion endpoint.

        Raises:
            ConnectionError: if the server is unreachable.
        """
        payload = {
            "prompt": prompt,
            "n_predict": max_tokens,  # llama.cpp-native name for max_tokens
            "temperature": temperature,
        }
        start = time.time()
        data = _http_post(f"{self.endpoint}/completion", payload)
        latency = (time.time() - start) * 1000

        return CompletionResponse(
            text=data.get("content", ""),
            tokens_used=data.get("tokens_predicted", 0),
            latency_ms=latency,
            model=self.model,
        )

    # --- Benchmark ---

    def benchmark(
        self,
        prompt: str = "Explain sovereignty in 3 sentences.",
        iterations: int = 5,
        max_tokens: int = 128,
    ) -> dict:
        """Run *iterations* chat calls and report latency + throughput stats.

        Args:
            prompt: Prompt sent on every iteration.
            iterations: Number of calls; must be >= 1.
            max_tokens: Generation cap per call.

        Raises:
            ValueError: if iterations < 1 (previously a ZeroDivisionError).
            ConnectionError: if the server is unreachable.
        """
        if iterations < 1:
            raise ValueError("iterations must be >= 1")

        latencies = []
        token_counts = []

        for _ in range(iterations):
            messages = [ChatMessage(role="user", content=prompt)]
            resp = self.chat(messages, max_tokens=max_tokens)
            latencies.append(resp.latency_ms)
            token_counts.append(resp.tokens_used)

        avg_latency = sum(latencies) / len(latencies)
        avg_tokens = sum(token_counts) / len(token_counts)
        tok_per_sec = (avg_tokens / avg_latency) * 1000 if avg_latency > 0 else 0

        return {
            "iterations": iterations,
            "prompt": prompt,
            "avg_latency_ms": round(avg_latency, 1),
            "min_latency_ms": round(min(latencies), 1),
            "max_latency_ms": round(max(latencies), 1),
            "avg_tokens": round(avg_tokens, 1),
            "tok_per_sec": round(tok_per_sec, 1),
        }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main():
    """CLI entry point: health, models, chat and benchmark subcommands.

    Exit code is 0 on success; the health subcommand exits 1 when the
    server is unhealthy.
    """
    parser = argparse.ArgumentParser(description="llama.cpp client CLI")
    parser.add_argument("--url", default=DEFAULT_ENDPOINT, help="llama-server endpoint")
    parser.add_argument("--model", default=DEFAULT_MODEL, help="Model name")

    sub = parser.add_subparsers(dest="command")

    # health
    sub.add_parser("health", help="Check server health")

    # models
    sub.add_parser("models", help="List loaded models")

    # chat
    chat_p = sub.add_parser("chat", help="One-shot chat")
    chat_p.add_argument("prompt", help="User message")
    chat_p.add_argument("--system", default=None, help="System prompt")
    chat_p.add_argument("--max-tokens", type=int, default=DEFAULT_MAX_TOKENS)
    chat_p.add_argument("--stream", action="store_true", help="Stream response")

    # benchmark
    bench_p = sub.add_parser("benchmark", help="Run benchmark")
    bench_p.add_argument("--prompt", default="Explain sovereignty in 3 sentences.")
    bench_p.add_argument("--iterations", type=int, default=5)
    bench_p.add_argument("--max-tokens", type=int, default=128)

    args = parser.parse_args()
    client = LlamaClient(endpoint=args.url, model=args.model)

    if args.command == "health":
        status = client.health_check()
        print(json.dumps(status.__dict__, indent=2))
        sys.exit(0 if status.healthy else 1)

    elif args.command == "models":
        models = client.list_models()
        print(json.dumps(models, indent=2))

    elif args.command == "chat":
        if args.stream:
            messages = []
            if args.system:
                messages.append(ChatMessage(role="system", content=args.system))
            messages.append(ChatMessage(role="user", content=args.prompt))
            for chunk in client.chat_stream(messages, max_tokens=args.max_tokens):
                print(chunk, end="", flush=True)
            print()  # final newline after the streamed text
        else:
            result = client.simple_chat(args.prompt, system=args.system, max_tokens=args.max_tokens)
            print(result)

    elif args.command == "benchmark":
        # Reject non-positive counts up front: the benchmark math divides
        # by the iteration count, so 0 would crash with ZeroDivisionError.
        if args.iterations < 1:
            parser.error("--iterations must be >= 1")
        result = client.benchmark(
            prompt=args.prompt,
            iterations=args.iterations,
            max_tokens=args.max_tokens,
        )
        print(json.dumps(result, indent=2))

    else:
        parser.print_help()


if __name__ == "__main__":
    main()
|
||||
184
docs/local-llm.md
Normal file
184
docs/local-llm.md
Normal file
@@ -0,0 +1,184 @@
|
||||
# Local LLM Deployment Guide — llama.cpp Sovereign Inference
|
||||
|
||||
## Overview
|
||||
|
||||
llama.cpp provides sovereign, offline-capable inference on CPU, CUDA, and
|
||||
Apple Silicon. This guide standardizes deployment across the fleet.
|
||||
|
||||
**Golden path:** One binary, one model path, one health endpoint.
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# 1. Install llama.cpp (build from source)
|
||||
git clone https://github.com/ggerganov/llama.cpp.git
|
||||
cd llama.cpp && cmake -B build && cmake --build build --config Release -j$(nproc)
|
||||
sudo cp build/bin/llama-server /usr/local/bin/
|
||||
|
||||
# 2. Download a model
|
||||
mkdir -p /opt/models/llama
|
||||
wget -O /opt/models/llama/Qwen2.5-7B-Instruct-Q4_K_M.gguf \
|
||||
"https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF/resolve/main/qwen2.5-7b-instruct-q4_k_m.gguf"
|
||||
|
||||
# 3. Start the server
|
||||
llama-server -m /opt/models/llama/Qwen2.5-7B-Instruct-Q4_K_M.gguf \
|
||||
--host 0.0.0.0 --port 11435 -c 4096 -t $(nproc) --cont-batching
|
||||
|
||||
# 4. Verify
|
||||
curl http://localhost:11435/health
|
||||
```
|
||||
|
||||
## Model Path Convention
|
||||
|
||||
| Path | Purpose |
|
||||
|------|---------|
|
||||
| `/opt/models/llama/` | Production models (system-wide) |
|
||||
| `~/models/llama/` | Per-user models (development) |
|
||||
| `MODEL_DIR` env var | Override default path |
|
||||
|
||||
All fleet nodes should use `/opt/models/llama/` for consistency.
|
||||
|
||||
## Recommended Models
|
||||
|
||||
| Model | Size (Q4_K_M) | RAM | Tokens/sec (est.) | Use Case |
|
||||
|-------|---------------|-----|-------------------|----------|
|
||||
| Qwen2.5-7B-Instruct | 4.7 GB | 8 GB | 25-40 | General chat, code assist |
|
||||
| Qwen2.5-3B-Instruct | 2.0 GB | 4 GB | 50-80 | Fast responses, lightweight |
|
||||
| Llama-3.2-3B-Instruct | 2.0 GB | 4 GB | 50-80 | Alternative small model |
|
||||
| Mistral-7B-Instruct-v0.3 | 4.4 GB | 8 GB | 25-40 | Strong reasoning |
|
||||
| Phi-3.5-mini-instruct | 2.3 GB | 4 GB | 45-70 | Microsoft small model |
|
||||
|
||||
**Fleet standard:** `Qwen2.5-7B-Instruct-Q4_K_M.gguf`
|
||||
|
||||
## Quantization Guide
|
||||
|
||||
| Quantization | Size (7B) | Quality | Speed | Recommendation |
|
||||
|-------------|-----------|---------|-------|----------------|
|
||||
| Q8_0 | 7.2 GB | Excellent | Slow | Only if RAM allows |
|
||||
| Q6_K | 5.5 GB | Very Good | Medium | Best quality/speed ratio |
|
||||
| Q5_K_M | 5.0 GB | Good | Medium | Good balance |
|
||||
| **Q4_K_M** | **4.7 GB** | **Good** | **Fast** | **Fleet standard** |
|
||||
| Q3_K_M | 3.4 GB | Fair | Fast | Low-memory fallback |
|
||||
| Q2_K | 2.8 GB | Poor | Very Fast | Emergency only |
|
||||
|
||||
**Rule of thumb:** Use Q4_K_M unless you have <6GB RAM (then Q3_K_M) or >16GB RAM (then Q6_K).
|
||||
|
||||
## Hardware Recommendations
|
||||
|
||||
### VPS Beta (2 vCPU, 4 GB RAM)
|
||||
- Model: Qwen2.5-3B-Instruct-Q4_K_M (2.0 GB)
|
||||
- Context: 2048 tokens
|
||||
- Threads: 2
|
||||
- Expected: ~40-60 tok/s
|
||||
|
||||
### VPS Alpha (4 vCPU, 8 GB RAM)
|
||||
- Model: Qwen2.5-7B-Instruct-Q4_K_M (4.7 GB)
|
||||
- Context: 4096 tokens
|
||||
- Threads: 4
|
||||
- Expected: ~20-35 tok/s
|
||||
|
||||
### Local Mac (Apple Silicon, 16+ GB)
|
||||
- Model: Qwen2.5-7B-Instruct-Q6_K (5.5 GB)
|
||||
- Context: 8192 tokens
|
||||
- Metal acceleration enabled
|
||||
- Expected: ~30-50 tok/s
|
||||
|
||||
## Health Check
|
||||
|
||||
```bash
|
||||
# Simple health probe
|
||||
curl -sf http://localhost:11435/health && echo "OK" || echo "FAIL"
|
||||
|
||||
# Detailed status
|
||||
curl -s http://localhost:11435/health | python3 -m json.tool
|
||||
|
||||
# Model loaded check
|
||||
curl -s http://localhost:11435/v1/models | python3 -c "
|
||||
import sys, json
|
||||
data = json.load(sys.stdin)
|
||||
models = [m['id'] for m in data.get('data', [])]
|
||||
print(f'Loaded: {models}' if models else 'No models loaded')
|
||||
"
|
||||
```
|
||||
|
||||
## Night Watch Integration
|
||||
|
||||
Add to your health check cron:
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# llama-health.sh — probe local llama.cpp server
|
||||
ENDPOINT="${LLAMA_ENDPOINT:-http://localhost:11435}"
|
||||
|
||||
if ! curl -sf "$ENDPOINT/health" > /dev/null 2>&1; then
|
||||
echo "ALERT: llama.cpp server at $ENDPOINT is DOWN"
|
||||
# Auto-restart if systemd service exists
|
||||
if systemctl cat llama-server > /dev/null 2>&1; then sudo systemctl restart llama-server; fi
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Verify model is loaded
|
||||
MODELS=$(curl -s "$ENDPOINT/v1/models" | python3 -c "
|
||||
import sys, json
|
||||
data = json.load(sys.stdin)
|
||||
print(len(data.get('data', [])))
|
||||
" 2>/dev/null)
|
||||
|
||||
if [ "$MODELS" = "0" ] || [ -z "$MODELS" ]; then
|
||||
echo "WARNING: llama.cpp server running but no model loaded"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "OK: llama.cpp healthy, $MODELS model(s) loaded"
|
||||
```
|
||||
|
||||
## Benchmarking
|
||||
|
||||
```bash
|
||||
# Using the built-in llama_client.py benchmark
|
||||
python3 bin/llama_client.py --url http://localhost:11435 benchmark --prompt "Explain sovereignty in 3 sentences." --iterations 10
|
||||
|
||||
# Using llama.cpp native benchmark
|
||||
llama-bench -m /opt/models/llama/Qwen2.5-7B-Instruct-Q4_K_M.gguf -t 4
|
||||
```
|
||||
|
||||
## API Compatibility
|
||||
|
||||
llama-server exposes an OpenAI-compatible API:
|
||||
|
||||
```bash
|
||||
# Chat completions (compatible with OpenAI SDK)
|
||||
curl http://localhost:11435/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "qwen2.5-7b",
|
||||
"messages": [{"role": "user", "content": "Hello"}],
|
||||
"max_tokens": 256,
|
||||
"temperature": 0.7
|
||||
}'
|
||||
|
||||
# Raw completions
|
||||
curl http://localhost:11435/completion \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"prompt": "Once upon a time", "n_predict": 128}'
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
| Problem | Cause | Fix |
|
||||
|---------|-------|-----|
|
||||
| Server won't start | Not enough RAM | Use smaller model or lower quantization |
|
||||
| Slow inference | Wrong thread count | Match `-t` to available cores |
|
||||
| Out of memory during load | Context too large | Reduce `-c` parameter |
|
||||
| Model not found | Wrong path | Check `ls /opt/models/llama/` |
|
||||
| Port already in use | Another process on 11435 | `lsof -i :11435` then kill |
|
||||
|
||||
## systemd Service
|
||||
|
||||
See `systemd/llama-server.service` in this repo. Install:
|
||||
|
||||
```bash
|
||||
sudo cp systemd/llama-server.service /etc/systemd/system/
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl enable --now llama-server
|
||||
```
|
||||
207
nexus/llama_provider.py
Normal file
207
nexus/llama_provider.py
Normal file
@@ -0,0 +1,207 @@
|
||||
"""
|
||||
llama_provider.py — Hermes inference router provider for llama.cpp local server.
|
||||
|
||||
Integrates local llama.cpp as a first-class provider in the Hermes inference
|
||||
router. Activates when:
|
||||
- External API rate-limits or fails
|
||||
- Config flag LOCAL_ONLY=true is set
|
||||
- User explicitly requests a local model
|
||||
|
||||
Response format is normalized to match OpenAI-compatible chat completions.
|
||||
Token usage is estimated and logged (even if approximate).
|
||||
|
||||
Usage in Hermes inference router:
|
||||
|
||||
from nexus.llama_provider import LlamaProvider
|
||||
|
||||
provider = LlamaProvider()
|
||||
if provider.available():
|
||||
response = provider.infer(messages, max_tokens=512)
|
||||
"""
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
from bin.llama_client import ChatMessage, LlamaClient
|
||||
|
||||
logger = logging.getLogger("nexus.llama_provider")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Configuration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
LLAMA_ENDPOINT = os.environ.get("LLAMA_ENDPOINT", "http://localhost:11435")
|
||||
LLAMA_MODEL = os.environ.get("LLAMA_MODEL", "qwen2.5-7b")
|
||||
LOCAL_ONLY = os.environ.get("LOCAL_ONLY", "false").lower() in ("true", "1", "yes")
|
||||
FALLBACK_ON_FAILURE = os.environ.get("LLAMA_FALLBACK", "true").lower() in ("true", "1", "yes")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Provider result
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
class ProviderResult:
    """Normalized response shared by all inference providers.

    A failed call is represented by an empty ``text`` plus a non-None
    ``error``; successful calls leave ``error`` as None.
    """
    text: str                       # generated completion text ("" on failure)
    provider: str = "llama.cpp"     # which backend produced the result
    model: str = ""                 # model name reported by the backend
    tokens_used: int = 0            # total tokens (prompt + completion)
    latency_ms: float = 0.0         # wall-clock round-trip time
    finish_reason: str = ""         # e.g. "stop", "length"
    is_local: bool = True           # llama.cpp always runs locally
    error: Optional[str] = None     # error message, or None on success
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# LlamaProvider
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class LlamaProvider:
    """
    Hermes-compatible provider backed by a local llama.cpp server.

    Selection rules:
      1. LOCAL_ONLY=true           -> always use llama.cpp
      2. external provider failed  -> fall back to llama.cpp (when FALLBACK_ON_FAILURE)
      3. user asked for local      -> use llama.cpp
      4. otherwise                 -> defer to external providers
    """

    def __init__(
        self,
        endpoint: str = LLAMA_ENDPOINT,
        model: str = LLAMA_MODEL,
        local_only: bool = LOCAL_ONLY,
    ):
        self.client = LlamaClient(endpoint=endpoint, model=model)
        self.local_only = local_only
        self.endpoint = endpoint
        # Cached health probe result and its timestamp; refreshed at most
        # once per _health_ttl seconds so we don't hammer /health.
        self._last_health: Optional[bool] = None
        self._last_health_check: float = 0.0
        self._health_ttl: float = 30.0  # seconds

    def available(self) -> bool:
        """Return True when the server is reachable AND a model is loaded."""
        ts = time.time()
        cache_fresh = (
            self._last_health is not None
            and (ts - self._last_health_check) < self._health_ttl
        )
        if cache_fresh:
            return self._last_health

        status = self.client.health_check()
        self._last_health = status.healthy and status.model_loaded
        self._last_health_check = ts

        if not self._last_health:
            logger.warning("llama.cpp server unhealthy: %s", status.error or "model not loaded")

        return self._last_health

    def infer(
        self,
        messages: list[dict],
        max_tokens: int = 512,
        temperature: float = 0.7,
        model: Optional[str] = None,
        **kwargs,
    ) -> ProviderResult:
        """
        Run one inference round-trip through llama.cpp.

        Args:
            messages: List of {"role": ..., "content": ...} dicts; entries
                missing either key are silently dropped.
            max_tokens: Maximum tokens to generate.
            temperature: Sampling temperature.
            model: Accepted for interface parity but ignored — llama.cpp
                serves whatever model the server was started with.

        Returns:
            ProviderResult; on any failure ``text`` is "" and ``error`` is set.
        """
        if not self.available():
            return ProviderResult(
                text="",
                error=f"llama.cpp server at {self.endpoint} is not available",
            )

        # Convert dict messages into the client's ChatMessage objects,
        # dropping malformed entries.
        converted = [
            ChatMessage(role=entry["role"], content=entry["content"])
            for entry in messages
            if "role" in entry and "content" in entry
        ]
        if not converted:
            return ProviderResult(text="", error="No valid messages provided")

        started = time.time()
        try:
            reply = self.client.chat(
                converted,
                max_tokens=max_tokens,
                temperature=temperature,
            )
        except Exception as e:
            logger.error("llama.cpp inference failed: %s", e)
            return ProviderResult(
                text="",
                error=str(e),
            )

        elapsed_ms = (time.time() - started) * 1000
        return ProviderResult(
            text=reply.text,
            provider="llama.cpp",
            model=reply.model or self.client.model,
            tokens_used=reply.tokens_used,
            latency_ms=elapsed_ms,
            finish_reason=reply.finish_reason,
            is_local=True,
        )

    def should_use_local(
        self,
        external_failed: bool = False,
        explicit_local: bool = False,
    ) -> bool:
        """
        Decide whether local llama.cpp should handle the request.

        Args:
            external_failed: True if an external provider just failed.
            explicit_local: True if the user explicitly requested local.

        Returns:
            True if local inference should be used.
        """
        if self.local_only or explicit_local:
            return True
        if external_failed and FALLBACK_ON_FAILURE:
            # Only offer fallback when the local server is actually usable.
            return self.available()
        return False

    def status(self) -> dict:
        """Snapshot of provider health/config for dashboards."""
        health = self.client.health_check()
        loaded = self.client.list_models()
        return {
            "provider": "llama.cpp",
            "endpoint": self.endpoint,
            "healthy": health.healthy,
            "model_loaded": health.model_loaded,
            "model_name": health.model_name,
            "available_models": [entry.get("id", "") for entry in loaded],
            "local_only": self.local_only,
            "fallback_enabled": FALLBACK_ON_FAILURE,
        }

    def get_name(self) -> str:
        return "llama.cpp"

    def get_priority(self) -> int:
        """Lower number = higher priority. Local is last resort."""
        # Local-only deployments promote llama.cpp to the front of the queue.
        return 0 if self.local_only else 100
|
||||
51
systemd/llama-server.service
Normal file
51
systemd/llama-server.service
Normal file
@@ -0,0 +1,51 @@
|
||||
[Unit]
|
||||
Description=llama.cpp Local LLM Server
|
||||
After=network-online.target
|
||||
Wants=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
User=root
|
||||
Group=root
|
||||
|
||||
# Model and server configuration
|
||||
Environment=MODEL_PATH=/opt/models/llama/Qwen2.5-7B-Instruct-Q4_K_M.gguf
|
||||
Environment=LLAMA_HOST=0.0.0.0
|
||||
Environment=LLAMA_PORT=11435
|
||||
Environment=LLAMA_CTX_SIZE=4096
|
||||
Environment=LLAMA_THREADS=4
|
||||
|
||||
ExecStart=/usr/local/bin/llama-server \
|
||||
-m ${MODEL_PATH} \
|
||||
--host ${LLAMA_HOST} \
|
||||
--port ${LLAMA_PORT} \
|
||||
-c ${LLAMA_CTX_SIZE} \
|
||||
-t ${LLAMA_THREADS} \
|
||||
--cont-batching
|
||||
|
||||
Restart=on-failure
|
||||
RestartSec=10
|
||||
StartLimitBurst=3
|
||||
StartLimitIntervalSec=60
|
||||
|
||||
# Resource limits
|
||||
MemoryMax=12G
|
||||
CPUQuota=90%
|
||||
|
||||
# Security hardening
|
||||
NoNewPrivileges=true
|
||||
ProtectSystem=strict
|
||||
ProtectHome=read-only
|
||||
ReadWritePaths=/opt/models
|
||||
PrivateTmp=true
|
||||
ProtectKernelTunables=true
|
||||
ProtectControlGroups=true
|
||||
RestrictSUIDSGID=true
|
||||
|
||||
# Logging
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
SyslogIdentifier=llama-server
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
207
tests/test_llama_client.py
Normal file
207
tests/test_llama_client.py
Normal file
@@ -0,0 +1,207 @@
|
||||
"""Tests for llama_client — OpenAI-compatible client for llama.cpp."""
|
||||
import json
|
||||
from unittest.mock import MagicMock, patch
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
import sys
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||
|
||||
from bin.llama_client import (
|
||||
LlamaClient,
|
||||
ChatMessage,
|
||||
CompletionResponse,
|
||||
HealthStatus,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ChatMessage
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestChatMessage:
    """ChatMessage dataclass construction."""

    def test_creation(self):
        message = ChatMessage(role="user", content="Hello")
        assert (message.role, message.content) == ("user", "Hello")

    def test_system_message(self):
        system_msg = ChatMessage(role="system", content="You are helpful.")
        assert system_msg.role == "system"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# HealthStatus
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestHealthStatus:
    """HealthStatus dataclass in both healthy and failed states."""

    def test_healthy(self):
        up = HealthStatus(healthy=True, endpoint="http://localhost:11435", model_loaded=True)
        assert up.healthy is True
        assert up.model_loaded is True

    def test_unhealthy(self):
        down = HealthStatus(healthy=False, endpoint="http://localhost:11435", error="Connection refused")
        assert down.healthy is False
        assert down.error == "Connection refused"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# LlamaClient
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestLlamaClient:
    """Unit tests for LlamaClient with all HTTP access mocked out."""

    def test_default_endpoint(self):
        client = LlamaClient()
        assert client.endpoint == "http://localhost:11435"

    def test_custom_endpoint(self):
        client = LlamaClient(endpoint="http://192.168.1.10:8080")
        assert client.endpoint == "http://192.168.1.10:8080"

    def test_trailing_slash_stripped(self):
        client = LlamaClient(endpoint="http://localhost:11435/")
        assert client.endpoint == "http://localhost:11435"

    def test_custom_model(self):
        client = LlamaClient(model="mistral-7b")
        assert client.model == "mistral-7b"

    @patch("bin.llama_client._http_get")
    def test_health_check_success(self, mock_get):
        mock_get.return_value = {"status": "ok", "model_loaded": True}
        client = LlamaClient()
        status = client.health_check()
        assert status.healthy is True
        assert status.model_loaded is True
        mock_get.assert_called_once_with("http://localhost:11435/health")

    @patch("bin.llama_client._http_get")
    def test_health_check_failure(self, mock_get):
        mock_get.side_effect = ConnectionError("refused")
        client = LlamaClient()
        status = client.health_check()
        assert status.healthy is False
        assert "refused" in status.error

    @patch("bin.llama_client._http_get")
    def test_is_healthy_true(self, mock_get):
        mock_get.return_value = {"status": "ok"}
        client = LlamaClient()
        assert client.is_healthy() is True

    @patch("bin.llama_client._http_get")
    def test_is_healthy_false(self, mock_get):
        mock_get.side_effect = ConnectionError("down")
        client = LlamaClient()
        assert client.is_healthy() is False

    @patch("bin.llama_client._http_get")
    def test_list_models(self, mock_get):
        mock_get.return_value = {
            "data": [{"id": "qwen2.5-7b", "object": "model"}]
        }
        client = LlamaClient()
        models = client.list_models()
        assert len(models) == 1
        assert models[0]["id"] == "qwen2.5-7b"

    @patch("bin.llama_client._http_get")
    def test_list_models_empty(self, mock_get):
        mock_get.side_effect = ConnectionError("down")
        client = LlamaClient()
        models = client.list_models()
        assert models == []

    @patch("bin.llama_client._http_post")
    def test_chat_success(self, mock_post):
        mock_post.return_value = {
            "model": "qwen2.5-7b",
            "choices": [{"message": {"content": "Hello! How can I help?"}, "finish_reason": "stop"}],
            "usage": {"total_tokens": 25},
        }
        client = LlamaClient()
        messages = [ChatMessage(role="user", content="Hello")]
        response = client.chat(messages)
        assert response.text == "Hello! How can I help?"
        assert response.tokens_used == 25
        assert response.finish_reason == "stop"
        # Low-resolution clocks can legitimately report 0 ms around a mocked
        # call, so assert non-negative rather than strictly positive.
        assert response.latency_ms >= 0

    @patch("bin.llama_client._http_post")
    def test_chat_custom_params(self, mock_post):
        mock_post.return_value = {
            "choices": [{"message": {"content": "OK"}, "finish_reason": "stop"}],
            "usage": {},
        }
        client = LlamaClient()
        messages = [ChatMessage(role="user", content="test")]
        client.chat(messages, max_tokens=100, temperature=0.3)
        call_data = mock_post.call_args[0][1]
        assert call_data["max_tokens"] == 100
        assert call_data["temperature"] == 0.3

    @patch("bin.llama_client._http_post")
    def test_chat_connection_error(self, mock_post):
        mock_post.side_effect = ConnectionError("down")
        client = LlamaClient()
        messages = [ChatMessage(role="user", content="test")]
        with pytest.raises(ConnectionError):
            client.chat(messages)

    @patch("bin.llama_client._http_post")
    def test_simple_chat(self, mock_post):
        mock_post.return_value = {
            "choices": [{"message": {"content": "I am well!"}, "finish_reason": "stop"}],
            "usage": {"total_tokens": 15},
        }
        client = LlamaClient()
        result = client.simple_chat("How are you?")
        assert result == "I am well!"

    @patch("bin.llama_client._http_post")
    def test_simple_chat_with_system(self, mock_post):
        mock_post.return_value = {
            "choices": [{"message": {"content": "Yes"}, "finish_reason": "stop"}],
            "usage": {},
        }
        client = LlamaClient()
        client.simple_chat("Are you helpful?", system="You are helpful.")
        call_data = mock_post.call_args[0][1]
        assert len(call_data["messages"]) == 2
        assert call_data["messages"][0]["role"] == "system"

    @patch("bin.llama_client._http_post")
    def test_complete(self, mock_post):
        mock_post.return_value = {
            "content": "Once upon a time...",
            "tokens_predicted": 50,
        }
        client = LlamaClient()
        response = client.complete("Once upon a time")
        assert response.text == "Once upon a time..."
        assert response.tokens_used == 50

    @patch("bin.llama_client.time.time")
    @patch("bin.llama_client._http_post")
    def test_benchmark(self, mock_post, mock_time):
        mock_post.return_value = {
            "choices": [{"message": {"content": "OK"}, "finish_reason": "stop"}],
            "usage": {"total_tokens": 10},
        }
        # Simulate 50ms latency per call (two time.time() reads per chat()).
        mock_time.side_effect = [0.0, 0.05, 0.05, 0.1, 0.1, 0.15, 0.15, 0.2, 0.2, 0.25]
        client = LlamaClient()
        result = client.benchmark(iterations=3)
        assert result["iterations"] == 3
        assert result["avg_latency_ms"] > 0
        assert result["tok_per_sec"] > 0

    def test_env_override(self):
        from importlib import reload
        import bin.llama_client as mod
        try:
            with patch.dict("os.environ", {"LLAMA_ENDPOINT": "http://custom:9999"}):
                reload(mod)
                # Default endpoint reads from env at import time
                assert mod.DEFAULT_ENDPOINT == "http://custom:9999"
        finally:
            # Reload once more with the original environment so the module's
            # defaults are not polluted for any tests that run after this one.
            reload(mod)
|
||||
Reference in New Issue
Block a user