Closes #1123. Implements all three phases of the local LLM standardization.

PHASE 1 — Deployment:
- docs/local-llm.md: full deployment guide (build, model download, health check, model path convention /opt/models/llama/, hardware recommendations)
- systemd/llama-server.service: hardened unit with resource limits and auto-restart
- Health check: /health endpoint + model-loaded verification

PHASE 2 — Hermes Integration:
- bin/llama_client.py: OpenAI-compatible Python client wrapping the llama.cpp HTTP API (chat completions, streaming, raw completions, health check, model listing, benchmarking, full CLI interface)
- nexus/llama_provider.py: Hermes inference-router provider adapter
  - Activates when external APIs fail, LOCAL_ONLY=true, or on explicit local request (activation sketch below)
  - Response format normalized to OpenAI-compatible chat completions
  - Token usage estimated and logged
  - Health caching with TTL for efficiency

PHASE 3 — Optimization & Ops:
- Benchmarking: client.benchmark() + the CLI benchmark command
- Quantization guide: Q4_K_M recommended for the fleet, Q6_K for high-RAM hosts, Q3_K for low-RAM hosts
- Model recommendations: VPS Beta (3B), VPS Alpha (7B), Mac (7B Q6_K)
- Night watch integration: health probe script with auto-restart (probe sketch below)

Fleet standard model: Qwen2.5-7B-Instruct-Q4_K_M.gguf
Default endpoint: http://localhost:11435

22 tests pass.
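A minimal sketch of the PHASE 2 activation rule plus TTL health caching, assuming `bin/` is importable as a package. `should_use_local` and `_local_healthy` are hypothetical illustration names, not the actual adapter API; the real adapter lives in nexus/llama_provider.py:

```python
# Illustration only: the activation rule + TTL health caching described above.
# `should_use_local` and `_local_healthy` are hypothetical names.
import os
import time

from bin.llama_client import LlamaClient

_health_cache = {"ts": 0.0, "ok": False}
HEALTH_TTL_S = 30.0  # assumed TTL; cached so each request doesn't hit /health


def _local_healthy(client: LlamaClient) -> bool:
    """Health check with TTL caching."""
    now = time.time()
    if now - _health_cache["ts"] > HEALTH_TTL_S:
        _health_cache["ok"] = client.is_healthy()
        _health_cache["ts"] = now
    return _health_cache["ok"]


def should_use_local(client: LlamaClient, external_failed: bool, explicit_local: bool) -> bool:
    """Route to the local provider on external failure, LOCAL_ONLY=true, or explicit request."""
    wanted = (
        external_failed
        or os.environ.get("LOCAL_ONLY", "").lower() == "true"
        or explicit_local
    )
    return wanted and _local_healthy(client)
```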
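And a sketch of the PHASE 3 night-watch probe, under the assumption that the unit installed from systemd/llama-server.service is named `llama-server` and is restarted via systemctl; the actual probe script may differ:

```python
#!/usr/bin/env python3
# Night-watch probe sketch: restart llama-server when /health fails.
# Assumes the unit name matches systemd/llama-server.service.
import subprocess
import sys

from bin.llama_client import LlamaClient


def main() -> int:
    status = LlamaClient().health_check()
    if status.healthy and status.model_loaded:
        return 0
    reason = status.error or "model not loaded"
    print(f"llama-server unhealthy ({reason}); restarting", file=sys.stderr)
    subprocess.run(["systemctl", "restart", "llama-server"], check=False)
    return 1


if __name__ == "__main__":
    sys.exit(main())
```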
bin/llama_client.py (355 lines, 12 KiB, Python)
#!/usr/bin/env python3
"""
llama_client.py — OpenAI-compatible client for llama.cpp HTTP API.

Wraps the llama-server endpoint for use as a sovereign local LLM backend.
Supports chat completions, raw completions, streaming, health checks,
model listing, and benchmarking.

Usage:
    python3 bin/llama_client.py chat "Hello, how are you?"
    python3 bin/llama_client.py health
    python3 bin/llama_client.py models
    python3 bin/llama_client.py benchmark --iterations 10
"""

import argparse
import json
import os
import sys
import time
import urllib.error
import urllib.request
from dataclasses import dataclass
from typing import Generator, Optional

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

DEFAULT_ENDPOINT = os.environ.get("LLAMA_ENDPOINT", "http://localhost:11435")
DEFAULT_MODEL = os.environ.get("LLAMA_MODEL", "qwen2.5-7b")
DEFAULT_MAX_TOKENS = int(os.environ.get("LLAMA_MAX_TOKENS", "512"))
DEFAULT_TEMPERATURE = float(os.environ.get("LLAMA_TEMPERATURE", "0.7"))

# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------

@dataclass
class ChatMessage:
    role: str  # "system", "user", or "assistant"
    content: str


@dataclass
class CompletionResponse:
    text: str
    tokens_used: int = 0
    latency_ms: float = 0.0
    model: str = ""
    finish_reason: str = ""


@dataclass
class HealthStatus:
    healthy: bool
    endpoint: str
    model_loaded: bool = False
    model_name: str = ""
    error: str = ""

# ---------------------------------------------------------------------------
# HTTP helpers (stdlib urllib only; no third-party dependencies)
# ---------------------------------------------------------------------------

def _http_post(url: str, data: dict, timeout: int = 120) -> dict:
    """POST JSON to URL, return parsed JSON response."""
    body = json.dumps(data).encode("utf-8")
    req = urllib.request.Request(
        url,
        data=body,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return json.loads(resp.read().decode("utf-8"))
    except urllib.error.URLError as e:
        raise ConnectionError(f"Cannot reach {url}: {e}") from e


def _http_get(url: str, timeout: int = 10) -> dict:
    """GET URL, return parsed JSON response."""
    req = urllib.request.Request(url, headers={"Accept": "application/json"})
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return json.loads(resp.read().decode("utf-8"))
    except urllib.error.URLError as e:
        raise ConnectionError(f"Cannot reach {url}: {e}") from e

# ---------------------------------------------------------------------------
# LlamaClient
# ---------------------------------------------------------------------------

class LlamaClient:
    """OpenAI-compatible client for the llama.cpp HTTP server."""

    def __init__(self, endpoint: str = DEFAULT_ENDPOINT, model: str = DEFAULT_MODEL):
        self.endpoint = endpoint.rstrip("/")
        self.model = model

    # --- Health ---

    def health_check(self) -> HealthStatus:
        """Probe the /health endpoint."""
        try:
            data = _http_get(f"{self.endpoint}/health")
            # llama-server reports {"status": "ok"} once the model is loaded.
            model_loaded = data.get("status", "") == "ok" or data.get("model_loaded", False)
            return HealthStatus(
                healthy=True,
                endpoint=self.endpoint,
                model_loaded=model_loaded,
                model_name=data.get("model_path", self.model),
            )
        except Exception as e:
            return HealthStatus(healthy=False, endpoint=self.endpoint, error=str(e))

    def is_healthy(self) -> bool:
        """Quick boolean health check."""
        return self.health_check().healthy

    # --- Models ---

    def list_models(self) -> list[dict]:
        """List loaded models (OpenAI-compatible /v1/models)."""
        try:
            data = _http_get(f"{self.endpoint}/v1/models")
            return data.get("data", [])
        except Exception:
            return []

    # --- Chat completions ---

    def chat(
        self,
        messages: list[ChatMessage],
        max_tokens: int = DEFAULT_MAX_TOKENS,
        temperature: float = DEFAULT_TEMPERATURE,
    ) -> CompletionResponse:
        """Send a chat completion request (OpenAI-compatible /v1/chat/completions).

        Always non-streaming; use chat_stream() for token-by-token output.
        """
        payload = {
            "model": self.model,
            "messages": [{"role": m.role, "content": m.content} for m in messages],
            "max_tokens": max_tokens,
            "temperature": temperature,
            "stream": False,
        }

        start = time.time()
        data = _http_post(f"{self.endpoint}/v1/chat/completions", payload)
        latency = (time.time() - start) * 1000

        choice = data.get("choices", [{}])[0]
        message = choice.get("message", {})
        usage = data.get("usage", {})

        return CompletionResponse(
            text=message.get("content", ""),
            tokens_used=usage.get("total_tokens", 0),
            latency_ms=latency,
            model=data.get("model", self.model),
            finish_reason=choice.get("finish_reason", ""),
        )

    def chat_stream(
        self,
        messages: list[ChatMessage],
        max_tokens: int = DEFAULT_MAX_TOKENS,
        temperature: float = DEFAULT_TEMPERATURE,
    ) -> Generator[str, None, None]:
        """Stream chat completion tokens as they arrive (server-sent events)."""
        payload = {
            "model": self.model,
            "messages": [{"role": m.role, "content": m.content} for m in messages],
            "max_tokens": max_tokens,
            "temperature": temperature,
            "stream": True,
        }
        body = json.dumps(payload).encode("utf-8")
        req = urllib.request.Request(
            f"{self.endpoint}/v1/chat/completions",
            data=body,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=300) as resp:
            for line in resp:
                line = line.decode("utf-8").strip()
                # SSE frames look like: data: {...json chunk...}
                if line.startswith("data: "):
                    chunk = line[6:]
                    if chunk == "[DONE]":
                        break
                    try:
                        data = json.loads(chunk)
                        delta = data.get("choices", [{}])[0].get("delta", {})
                        content = delta.get("content", "")
                        if content:
                            yield content
                    except json.JSONDecodeError:
                        continue

    # --- Simple helpers ---

    def simple_chat(
        self,
        prompt: str,
        system: Optional[str] = None,
        max_tokens: int = DEFAULT_MAX_TOKENS,
    ) -> str:
        """One-shot chat: send prompt, return text response."""
        messages = []
        if system:
            messages.append(ChatMessage(role="system", content=system))
        messages.append(ChatMessage(role="user", content=prompt))
        response = self.chat(messages, max_tokens=max_tokens)
        return response.text

    # --- Raw completion ---

    def complete(
        self,
        prompt: str,
        max_tokens: int = DEFAULT_MAX_TOKENS,
        temperature: float = DEFAULT_TEMPERATURE,
    ) -> CompletionResponse:
        """Raw text completion (llama.cpp-native /completion endpoint)."""
        payload = {
            "prompt": prompt,
            "n_predict": max_tokens,
            "temperature": temperature,
        }
        start = time.time()
        data = _http_post(f"{self.endpoint}/completion", payload)
        latency = (time.time() - start) * 1000

        return CompletionResponse(
            text=data.get("content", ""),
            tokens_used=data.get("tokens_predicted", 0),
            latency_ms=latency,
            model=self.model,
        )

    # --- Benchmark ---

    def benchmark(
        self,
        prompt: str = "Explain sovereignty in 3 sentences.",
        iterations: int = 5,
        max_tokens: int = 128,
    ) -> dict:
        """Run N iterations and report latency + throughput stats."""
        latencies = []
        token_counts = []

        for _ in range(iterations):
            messages = [ChatMessage(role="user", content=prompt)]
            resp = self.chat(messages, max_tokens=max_tokens)
            latencies.append(resp.latency_ms)
            token_counts.append(resp.tokens_used)

        avg_latency = sum(latencies) / len(latencies)
        avg_tokens = sum(token_counts) / len(token_counts)
        tok_per_sec = (avg_tokens / avg_latency) * 1000 if avg_latency > 0 else 0

        return {
            "iterations": iterations,
            "prompt": prompt,
            "avg_latency_ms": round(avg_latency, 1),
            "min_latency_ms": round(min(latencies), 1),
            "max_latency_ms": round(max(latencies), 1),
            "avg_tokens": round(avg_tokens, 1),
            "tok_per_sec": round(tok_per_sec, 1),
        }

# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def main():
    parser = argparse.ArgumentParser(description="llama.cpp client CLI")
    parser.add_argument("--url", default=DEFAULT_ENDPOINT, help="llama-server endpoint")
    parser.add_argument("--model", default=DEFAULT_MODEL, help="Model name")

    sub = parser.add_subparsers(dest="command")

    # health
    sub.add_parser("health", help="Check server health")

    # models
    sub.add_parser("models", help="List loaded models")

    # chat
    chat_p = sub.add_parser("chat", help="One-shot chat")
    chat_p.add_argument("prompt", help="User message")
    chat_p.add_argument("--system", default=None, help="System prompt")
    chat_p.add_argument("--max-tokens", type=int, default=DEFAULT_MAX_TOKENS)
    chat_p.add_argument("--stream", action="store_true", help="Stream response")

    # benchmark
    bench_p = sub.add_parser("benchmark", help="Run benchmark")
    bench_p.add_argument("--prompt", default="Explain sovereignty in 3 sentences.")
    bench_p.add_argument("--iterations", type=int, default=5)
    bench_p.add_argument("--max-tokens", type=int, default=128)

    args = parser.parse_args()
    client = LlamaClient(endpoint=args.url, model=args.model)

    if args.command == "health":
        status = client.health_check()
        print(json.dumps(status.__dict__, indent=2))
        sys.exit(0 if status.healthy else 1)

    elif args.command == "models":
        models = client.list_models()
        print(json.dumps(models, indent=2))

    elif args.command == "chat":
        if args.stream:
            messages = []
            if args.system:
                messages.append(ChatMessage(role="system", content=args.system))
            messages.append(ChatMessage(role="user", content=args.prompt))
            for chunk in client.chat_stream(messages, max_tokens=args.max_tokens):
                print(chunk, end="", flush=True)
            print()
        else:
            result = client.simple_chat(args.prompt, system=args.system, max_tokens=args.max_tokens)
            print(result)

    elif args.command == "benchmark":
        result = client.benchmark(
            prompt=args.prompt,
            iterations=args.iterations,
            max_tokens=args.max_tokens,
        )
        print(json.dumps(result, indent=2))

    else:
        parser.print_help()


if __name__ == "__main__":
    main()