Compare commits
11 Commits
queue/1123
...
dawn/1123-
| Author | SHA1 | Date | |
|---|---|---|---|
| ad98bd5ead | |||
| e847b0e473 | |||
| 63c6829ef8 | |||
| a55647d5d3 | |||
| 64719324e0 | |||
| ee6d12ccf6 | |||
| 84eb8104d8 | |||
| 93228388d7 | |||
| e27c51c6da | |||
| ed79826608 | |||
| e438662c97 |
48
.gitattributes
vendored
Normal file
48
.gitattributes
vendored
Normal file
@@ -0,0 +1,48 @@
|
||||
# .gitattributes
|
||||
# Controls git archive exports and helps categorize repo contents.
|
||||
# export-ignore: excluded from `git archive` tarballs and sparse-export contexts.
|
||||
#
|
||||
# For agents blocked by repo size on clone, see CONTRIBUTING.md §"Large-Repo Clone Strategy".
|
||||
|
||||
# ── Documentation & reports (not needed for runtime or tests) ──────────────────
|
||||
docs/ export-ignore
|
||||
reports/ export-ignore
|
||||
audits/ export-ignore
|
||||
reviews/ export-ignore
|
||||
paper/ export-ignore
|
||||
scaffold/ export-ignore
|
||||
playground/ export-ignore
|
||||
examples/ export-ignore
|
||||
intelligence/ export-ignore
|
||||
|
||||
# Root-level narrative docs (keep CLAUDE.md, README.md, CONTRIBUTING.md)
|
||||
FINDINGS-*.md export-ignore
|
||||
FIRST_LIGHT_REPORT*.md export-ignore
|
||||
INVESTIGATION_*.md export-ignore
|
||||
LEGACY_MATRIX_AUDIT.md export-ignore
|
||||
SOUL.md export-ignore
|
||||
POLICY.md export-ignore
|
||||
BROWSER_CONTRACT.md export-ignore
|
||||
EVENNIA_NEXUS_EVENT_PROTOCOL.md export-ignore
|
||||
GAMEPORTAL_PROTOCOL.md export-ignore
|
||||
DEVELOPMENT.md export-ignore
|
||||
|
||||
# ── Operation-specific directories ────────────────────────────────────────────
|
||||
operation-get-a-job/ export-ignore
|
||||
operations/ export-ignore
|
||||
org/ export-ignore
|
||||
concept-packs/ export-ignore
|
||||
evolution/ export-ignore
|
||||
|
||||
# ── Assets (binary/media files not needed for CI) ─────────────────────────────
|
||||
assets/ export-ignore
|
||||
icons/ export-ignore
|
||||
|
||||
# ── Linguist overrides (GitHub/Gitea language stats) ──────────────────────────
|
||||
docs/ linguist-documentation
|
||||
scaffold/ linguist-documentation
|
||||
paper/ linguist-documentation
|
||||
reports/ linguist-documentation
|
||||
audits/ linguist-documentation
|
||||
|
||||
*.md linguist-documentation
|
||||
@@ -136,6 +136,44 @@ Hotfixes require:
|
||||
|
||||
---
|
||||
|
||||
## Large-Repo Clone Strategy
|
||||
|
||||
Some repos in this org (hermes-agent, and the-nexus as it grows) can exceed 1000 tracked files. At that size `git clone --depth 1` can time out, and tree listings run into the Gitea tree-API cap of 1000 entries.
|
||||
|
||||
### Recommended clone patterns for agents
|
||||
|
||||
**Blobless partial clone** — fastest overall; metadata arrives immediately, blobs are fetched on demand:
|
||||
```sh
|
||||
git clone --filter=blob:none --depth 1 <repo-url>
|
||||
```
|
||||
|
||||
**Treeless partial clone** — skips tree objects for past commits; best when you need the full working tree but not the history:
|
||||
```sh
|
||||
git clone --filter=tree:0 <repo-url>
|
||||
```
|
||||
|
||||
**Sparse checkout** — only materialise the subdirectories you actually need:
|
||||
```sh
|
||||
git clone --filter=blob:none --no-checkout <repo-url> myrepo
|
||||
cd myrepo
|
||||
git sparse-checkout init --cone
|
||||
git sparse-checkout set nexus tests # only check out these dirs
|
||||
git checkout main
|
||||
```
|
||||
|
||||
### Gitea tree API workaround
|
||||
|
||||
When the tree endpoint returns exactly 1000 entries (or the response reports `truncated: true`) and you suspect truncation, pass `recursive=1` and page through the results with the `page` parameter:
|
||||
```
|
||||
GET /api/v1/repos/{owner}/{repo}/git/trees/{sha}?recursive=1&page=2
|
||||
```
|
||||
|
||||
### Why `.gitattributes` export-ignore exists
|
||||
|
||||
Directories marked `export-ignore` in `.gitattributes` are excluded from `git archive` tarballs and future sparse-export tooling. This reduces the surface area for export-based agent workflows. It does **not** affect `git clone` directly — use the partial-clone flags above for that.
|
||||
|
||||
---
|
||||
|
||||
## Stale PR Policy
|
||||
|
||||
A cron job runs every 6 hours and auto-closes PRs that are:
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
"""llama_client.py — OpenAI-compatible client for llama.cpp HTTP API."""
|
||||
import argparse, json, os, sys, time
|
||||
from dataclasses import dataclass
|
||||
from typing import Generator, Optional
|
||||
import urllib.request, urllib.error
|
||||
|
||||
DEFAULT_ENDPOINT = os.environ.get("LLAMA_ENDPOINT", "http://localhost:11435")
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
# Local LLM Deployment Guide — llama.cpp Sovereign Inference
|
||||
# Local LLM Deployment Guide — llama.cpp
|
||||
|
||||
llama.cpp provides sovereign, offline-capable inference on CPU, CUDA, and
|
||||
Apple Silicon. One binary, one model path, one health endpoint.
|
||||
Standardizes local LLM inference across the fleet using llama.cpp.
|
||||
|
||||
## Quick Start
|
||||
|
||||
@@ -11,44 +10,41 @@ Apple Silicon. One binary, one model path, one health endpoint.
|
||||
mkdir -p /opt/models/llama
|
||||
wget -O /opt/models/llama/Qwen2.5-7B-Instruct-Q4_K_M.gguf "https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF/resolve/main/qwen2.5-7b-instruct-q4_k_m.gguf"
|
||||
llama-server -m /opt/models/llama/Qwen2.5-7B-Instruct-Q4_K_M.gguf --host 0.0.0.0 --port 11435 -c 4096 -t $(nproc) --cont-batching
|
||||
curl http://localhost:11435/health
|
||||
|
||||
## Model Path Convention
|
||||
## Model Paths
|
||||
|
||||
- /opt/models/llama/ — Production (system-wide)
|
||||
- ~/models/llama/ — Per-user (dev)
|
||||
- /opt/models/llama/ — Production
|
||||
- ~/models/llama/ — Dev
|
||||
- MODEL_DIR env var — Override
|
||||
|
||||
## Recommended Models
|
||||
## Models
|
||||
|
||||
- Qwen2.5-7B-Instruct (4.7GB, 8GB RAM, 25-40 tok/s) — Fleet standard
|
||||
- Qwen2.5-3B-Instruct (2.0GB, 4GB RAM, 50-80 tok/s) — VPS Beta
|
||||
- Mistral-7B-Instruct-v0.3 (4.4GB, 8GB RAM) — Alternative
|
||||
- Qwen2.5-7B-Instruct-Q4_K_M (4.7GB) — Fleet standard, VPS Alpha
|
||||
- Qwen2.5-3B-Instruct-Q4_K_M (2.0GB) — VPS Beta
|
||||
- Mistral-7B-Instruct-v0.3-Q4_K_M (4.4GB) — Alternative
|
||||
|
||||
## Quantization Guide
|
||||
## Quantization
|
||||
|
||||
- Q6_K (5.5GB) — Best quality/speed, RAM > 12GB
|
||||
- Q4_K_M (4.7GB) — Fleet standard
|
||||
- Q3_K_M (3.4GB) — < 6GB RAM fallback
|
||||
- Q6_K (5.5GB) — Best quality/speed, 12GB+ RAM
|
||||
- Q4_K_M (4.7GB) — Fleet standard, 8GB RAM
|
||||
- Q3_K_M (3.4GB) — Low-RAM fallback, 4GB
|
||||
|
||||
## Hardware Targets
|
||||
## Hardware
|
||||
|
||||
- VPS Beta (2 vCPU, 4GB): Qwen2.5-3B-Q4_K_M, ctx 2048, ~40-60 tok/s
|
||||
- VPS Alpha (4 vCPU, 8GB): Qwen2.5-7B-Q4_K_M, ctx 4096, ~20-35 tok/s
|
||||
- Mac Apple Silicon: Qwen2.5-7B-Q6_K, Metal, ~30-50 tok/s
|
||||
- VPS Beta (2c/4GB): 3B-Q4_K_M, ctx 2048, ~40-60 tok/s
|
||||
- VPS Alpha (4c/8GB): 7B-Q4_K_M, ctx 4096, ~20-35 tok/s
|
||||
- Mac (AS/16GB+): 7B-Q6_K, Metal, ~30-50 tok/s
|
||||
|
||||
## Health Check
|
||||
## Health
|
||||
|
||||
curl -sf http://localhost:11435/health
|
||||
curl -s http://localhost:11435/v1/models
|
||||
|
||||
## API Compatibility
|
||||
|
||||
llama-server exposes an OpenAI-compatible API at /v1/chat/completions.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
- Won't start: use smaller model or lower quant
|
||||
- Slow: match -t to available cores
|
||||
- OOM: reduce -c context size
|
||||
- Port in use: lsof -i :11435
|
||||
- Won't start → smaller model / lower quant
|
||||
- Slow → -t to core count
|
||||
- OOM → reduce -c
|
||||
- Port conflict → lsof -i :11435
|
||||
|
||||
See systemd/llama-server.service for production deployment.
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
"""llama_provider.py — Hermes inference router provider for llama.cpp local server."""
|
||||
"""llama_provider.py — Hermes inference router provider for llama.cpp."""
|
||||
import logging, os, time
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
@@ -22,7 +22,6 @@ ProtectHome=read-only
|
||||
ReadWritePaths=/opt/models
|
||||
PrivateTmp=true
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
SyslogIdentifier=llama-server
|
||||
|
||||
[Install]
|
||||
|
||||
@@ -18,8 +18,7 @@ class TestHealthStatus:
|
||||
class TestLlamaClient:
|
||||
def test_defaults(self):
|
||||
c = LlamaClient()
|
||||
assert c.endpoint == "http://localhost:11435"
|
||||
assert c.model == "qwen2.5-7b"
|
||||
assert c.endpoint == "http://localhost:11435" and c.model == "qwen2.5-7b"
|
||||
|
||||
def test_custom(self):
|
||||
c = LlamaClient("http://x:8080", "mistral")
|
||||
@@ -43,8 +42,6 @@ class TestLlamaClient:
|
||||
def test_is_healthy(self, m):
|
||||
m.return_value = {"status": "ok"}
|
||||
assert LlamaClient().is_healthy() is True
|
||||
m.side_effect = ConnectionError()
|
||||
assert LlamaClient().is_healthy() is False
|
||||
|
||||
@patch("bin.llama_client._http_get")
|
||||
def test_list_models(self, m):
|
||||
@@ -77,7 +74,7 @@ class TestLlamaClient:
|
||||
@patch("bin.llama_client._http_post")
|
||||
def test_simple_chat_system(self, m):
|
||||
m.return_value = {"choices": [{"message": {"content": "OK"}, "finish_reason": "stop"}], "usage": {}}
|
||||
LlamaClient().simple_chat("t", system="be helpful")
|
||||
LlamaClient().simple_chat("t", system="helpful")
|
||||
assert len(m.call_args[0][1]["messages"]) == 2
|
||||
|
||||
@patch("bin.llama_client._http_post")
|
||||
|
||||
Reference in New Issue
Block a user