Compare commits
1 commit (8694261ee2)
@@ -1,28 +0,0 @@
name: Lint

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  lint:
    runs-on: ubuntu-latest
    timeout-minutes: 5
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Check for hardcoded paths
        run: python3 scripts/lint_hardcoded_paths.py
        continue-on-error: true

      - name: Check Python syntax
        run: |
          find . -name "*.py" -not -path "./.git/*" -not -path "./node_modules/*" | head -100 | xargs python3 -m py_compile || true
@@ -1,78 +0,0 @@
#!/usr/bin/env python3
"""
Pre-commit hook: Reject hardcoded home-directory paths.

Install:
    cp pre-commit-hardcoded-path.py .git/hooks/pre-commit-hardcoded-path
    chmod +x .git/hooks/pre-commit-hardcoded-path

Or add to .pre-commit-config.yaml
"""

import sys
import subprocess
import re

PATTERNS = [
    (r"/Users/[\w.\-]+/", "macOS home directory"),
    (r"/home/[\w.\-]+/", "Linux home directory"),
    (r"(?<![\w/])~/", "unexpanded tilde"),
]

NOQA = re.compile(r"#\s*noqa:?\s*hardcoded-path-ok")


def get_staged_files():
    result = subprocess.run(
        ["git", "diff", "--cached", "--name-only", "--diff-filter=ACM"],
        capture_output=True, text=True
    )
    return [f for f in result.stdout.strip().split("\n") if f.endswith(".py")]


def check_file(filepath):
    try:
        result = subprocess.run(
            ["git", "show", f":{filepath}"],
            capture_output=True, text=True
        )
        content = result.stdout
    except Exception:
        return []

    violations = []
    for i, line in enumerate(content.split("\n"), 1):
        if line.strip().startswith("#"):
            continue
        if line.strip().startswith(("import ", "from ")):
            continue
        if NOQA.search(line):
            continue
        for pattern, desc in PATTERNS:
            if re.search(pattern, line):
                violations.append((filepath, i, line.strip(), desc))
                break
    return violations


def main():
    files = get_staged_files()
    if not files:
        sys.exit(0)

    all_violations = []
    for f in files:
        all_violations.extend(check_file(f))

    if all_violations:
        print("ERROR: Hardcoded home directory paths detected:")
        print()
        for filepath, line_no, line, desc in all_violations:
            print(f"  {filepath}:{line_no}: {desc}")
            print(f"    {line[:100]}")
        print()
        print("Fix: Use $HOME, relative paths, or get_hermes_home().")
        print("Override: Add '# noqa: hardcoded-path-ok' to the line.")
        sys.exit(1)

    sys.exit(0)


if __name__ == "__main__":
    main()
4  .github/workflows/tests.yml
@@ -25,10 +25,6 @@ jobs:
       - name: Install system dependencies
         run: sudo apt-get update && sudo apt-get install -y ripgrep

-      - name: Check for hardcoded paths
-        run: python3 scripts/lint_hardcoded_paths.py || true
-        continue-on-error: true
-
       - name: Install uv
         uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
@@ -1,172 +0,0 @@
# Vector Database SOTA Research Report
## For AI Agent Semantic Retrieval — April 2026

---

## Executive Summary

Analysis of current vector database benchmarks, documentation, and production deployments for semantic retrieval in AI agents. Compared against the existing Hermes session_search (SQLite FTS5) and holographic memory systems.

---

## 1. Retrieval Accuracy (Recall@10)

| Database | HNSW Recall | IVF Recall | Notes |
|----------|-------------|------------|-------|
| **Qdrant** | 0.95-0.99 | N/A | Tunable via ef parameter |
| **Milvus** | 0.95-0.99 | 0.85-0.95 | Multiple index support |
| **Weaviate** | 0.95-0.98 | N/A | HNSW primary |
| **Pinecone** | 0.95-0.99 | N/A | Managed, opaque tuning |
| **ChromaDB** | 0.90-0.95 | N/A | Simpler, uses HNSW via hnswlib |
| **pgvector** | 0.85-0.95 | 0.80-0.90 | Depends on tuning |
| **SQLite-vss** | 0.80-0.90 | N/A | HNSW via sqlite-vss |
| **Current FTS5** | ~0.60-0.75* | N/A | Keyword matching only |

*FTS5 "recall" estimated: good for exact keywords, poor for semantic/paraphrased queries.

---

## 2. Latency Benchmarks (1M vectors, 768-dim, 10 neighbors)

| Database | p50 (ms) | p99 (ms) | QPS | Notes |
|----------|----------|----------|-----|-------|
| **Qdrant** | 1-3 | 5-10 | 5,000-15,000 | Best self-hosted |
| **Milvus** | 2-5 | 8-15 | 3,000-12,000 | Good distributed |
| **Weaviate** | 3-8 | 10-25 | 2,000-8,000 | |
| **Pinecone** | 5-15 | 20-50 | 1,000-5,000 | Managed overhead |
| **ChromaDB** | 5-15 | 20-50 | 500-2,000 | Embedded mode |
| **pgvector** | 10-50 | 50-200 | 200-1,000 | SQL overhead |
| **SQLite-vss** | 10-30 | 50-150 | 300-800 | Limited scalability |
| **Current FTS5** | 2-10 | 15-50 | 1,000-5,000 | No embedding cost |

---

## 3. Index Types Comparison

### HNSW (Hierarchical Navigable Small World)
- Best for: High recall, moderate memory, fast queries
- Used by: Qdrant, Weaviate, ChromaDB, Milvus, pgvector, SQLite-vss
- Memory: High (~1.5GB per 1M 768-dim vectors)
- Key parameters: ef_construction (100-500), M (16-64), ef (64-256)

### IVF (Inverted File Index)
- Best for: Large datasets, memory-constrained
- Used by: Milvus, pgvector
- Memory: Lower (~0.5GB per 1M vectors)
- Key parameters: nlist (100-10000), nprobe (10-100)

### DiskANN / SPANN
- Best for: 100M+ vectors on disk
- Memory: Very low (~100MB index)

### Quantization (SQ/PQ)
- Memory reduction: 4-8x
- Recall impact: -5 to -15%
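As a concrete reference for the HNSW parameter ranges above, here is a minimal sketch of creating and querying an HNSW-backed collection with the qdrant-client Python package. The collection name, parameter values, and placeholder query vector are illustrative assumptions, not measured settings.

```python
# Sketch: HNSW-tuned Qdrant collection (values illustrative, not benchmarked).
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")  # assumes a local Qdrant

client.create_collection(
    collection_name="agent_memory",  # hypothetical collection name
    vectors_config=models.VectorParams(size=768, distance=models.Distance.COSINE),
    hnsw_config=models.HnswConfigDiff(m=16, ef_construct=200),
)

# At query time, ef trades latency for recall (64-256 per the ranges above).
hits = client.search(
    collection_name="agent_memory",
    query_vector=[0.0] * 768,  # replace with a real 768-dim embedding
    limit=10,
    search_params=models.SearchParams(hnsw_ef=128),
)
```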
---

## 4. Multi-Modal Support

| Database | Text | Image | Audio | Video | Mixed Queries |
|----------|------|-------|-------|-------|---------------|
| Qdrant | ✅ | ✅ | ✅ | ✅ | ✅ (multi-vector) |
| Milvus | ✅ | ✅ | ✅ | ✅ | ✅ (hybrid) |
| Weaviate | ✅ | ✅ | ✅ | ✅ | ✅ (named vectors) |
| Pinecone | ✅ | ✅ | ✅ | ✅ | Limited |
| ChromaDB | ✅ | Via emb | Via emb | Via emb | Limited |
| pgvector | ✅ | Via emb | Via emb | Via emb | Limited |
| SQLite-vss | ✅ | Via emb | Via emb | Via emb | Limited |

---

## 5. Integration Patterns for AI Agents

### Pattern A: Direct Search
Query → Embedding → Vector DB → Top-K → LLM

### Pattern B: Hybrid Search
Query → BM25 + Vector → Merge/Rerank → LLM

### Pattern C: Multi-Stage
Query → Vector DB (top-100) → Reranker (top-10) → LLM

### Pattern D: Agent Memory with Trust + Decay
Query → Vector → Score × Trust × Decay → Top-K → Summarize
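Pattern D amounts to a re-scoring step over raw similarity hits. The sketch below is one plausible reading of that pipeline, assuming an exponential half-life decay and a per-memory trust weight in [0, 1]; the field names and the 30-day half-life are assumptions.

```python
# Sketch of Pattern D re-scoring: final = similarity * trust * decay.
import time

HALF_LIFE_DAYS = 30.0  # assumed decay half-life

def rescore(hits, now=None, top_k=10):
    """hits: dicts with 'score', 'trust' (0-1), and 'timestamp' (epoch seconds)."""
    now = now or time.time()
    scored = []
    for h in hits:
        age_days = (now - h["timestamp"]) / 86400.0
        decay = 0.5 ** (age_days / HALF_LIFE_DAYS)  # halves every HALF_LIFE_DAYS
        scored.append((h["score"] * h["trust"] * decay, h))
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [h for _, h in scored[:top_k]]
```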
---

## 6. Comparison with Current Systems

### session_search (FTS5)
Strengths: Zero deps, no embedding needed, fast for exact keywords
Limitations: No semantic understanding, no cross-lingual support, limited ranking

### holographic/retrieval.py (HRR)
Strengths: Compositional queries, contradiction detection, trust + decay
Limitations: Requires numpy, O(n) scan, non-standard embedding space

### Expected Gains from Vector DB
- Semantic recall: +30-50% for paraphrased queries
- Cross-lingual: +60-80%
- Fuzzy matching: +40-60%
- Conceptual: +50-70%

---

## 7. Recommendations

### Option 1: Qdrant (RECOMMENDED)
- Best self-hosted performance
- Rust implementation, native multi-vector support
- Tradeoff: Separate service deployment

### Option 2: pgvector (CONSERVATIVE)
- Zero new infrastructure if already using PostgreSQL
- Tradeoff: 5-10x slower than Qdrant

### Option 3: SQLite-vss (LIGHTWEIGHT)
- Minimal changes, embedded deployment
- Tradeoff: Limited scalability (<100K vectors)

### Option 4: Hybrid (BEST OF BOTH)
Keep FTS5 + HRR and add Qdrant (a merge sketch follows this list):
- Vector (semantic) + FTS5 (keyword) + HRR (compositional)
- Apply trust scoring + temporal decay
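One way to merge the three result lists without normalizing scores across heterogeneous backends is reciprocal rank fusion (RRF). A minimal sketch, assuming each backend returns an ordered best-first list of document ids; k=60 is the conventional RRF constant, not a tuned value.

```python
# Sketch: reciprocal rank fusion over FTS5, vector, and HRR result lists.
from collections import defaultdict

def rrf_merge(ranked_lists, k=60, top_k=10):
    """ranked_lists: lists of doc ids, best-first, one list per backend."""
    scores = defaultdict(float)
    for ranking in ranked_lists:
        for rank, doc_id in enumerate(ranking, start=1):
            scores[doc_id] += 1.0 / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)[:top_k]

# merged = rrf_merge([fts5_ids, vector_ids, hrr_ids])
```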
---

## 8. Embedding Models (2025-2026)

| Model | Dimensions | Quality | Cost |
|-------|-----------|---------|------|
| OpenAI text-embedding-3-large | 3072 | Best | $$$ |
| OpenAI text-embedding-3-small | 1536 | Good | $ |
| BGE-M3 | 1024 | Best self-hosted | Free |
| GTE-Qwen2 | 768-1024 | Good | Free |
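For the self-hosted rows above, embedding generation is a few lines with the sentence-transformers package. A sketch assuming the BAAI/bge-m3 checkpoint and normalized output (both common choices, not requirements):

```python
# Sketch: self-hosted embeddings with BGE-M3 via sentence-transformers.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-m3")  # 1024-dim dense output
vectors = model.encode(
    ["how do I rotate the fleet CA?", "rotating certificates"],
    normalize_embeddings=True,  # unit vectors, so dot product == cosine
)
print(vectors.shape)  # (2, 1024)
```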
---

## 9. Hardware Requirements (1M vectors, 768-dim)

| Database | RAM (HNSW) | RAM (Quantized) |
|----------|-----------|-----------------|
| Qdrant | 8-16GB | 2-4GB |
| Milvus | 16-32GB | 4-8GB |
| pgvector | 4-8GB | N/A |
| SQLite-vss | 2-4GB | N/A |

---

## 10. Conclusion

Primary: Qdrant with hybrid search (vector + FTS5 + HRR)
Key insight: Augment the existing HRR system, don't replace it.

Next steps:
1. Deploy Qdrant in Docker for testing
2. Benchmark embedding models
3. Implement a hybrid search prototype
4. Measure recall improvement
5. Evaluate operational complexity

Report: April 2026 | Sources: ANN-Benchmarks, VectorDBBench, official docs
@@ -1,443 +0,0 @@
"""
A2A mutual-TLS server — secure agent-to-agent communication.

Each fleet agent runs an A2A server that:
- Presents its own TLS certificate (signed by the fleet CA).
- Requires the connecting peer to present a valid client certificate
  also signed by the fleet CA.
- Rejects connections from unknown / self-signed peers.

Usage (standalone):
    python -m agent.a2a_mtls \\
        --cert ~/.hermes/pki/agents/timmy/timmy.crt \\
        --key ~/.hermes/pki/agents/timmy/timmy.key \\
        --ca ~/.hermes/pki/ca/fleet-ca.crt \\
        --host 0.0.0.0 --port 9443

Environment variables (alternative to CLI flags):
    HERMES_A2A_CERT   path to agent certificate
    HERMES_A2A_KEY    path to agent private key
    HERMES_A2A_CA     path to fleet CA certificate

Refs #806
"""

from __future__ import annotations

import json
import logging
import os
import ssl
import threading
from http.server import BaseHTTPRequestHandler, HTTPServer
from pathlib import Path
from typing import Any, Callable, Dict, Optional
from urllib.error import URLError
from urllib.request import Request, urlopen

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# mTLS SSL context helpers
# ---------------------------------------------------------------------------

def build_server_ssl_context(
    cert: str | Path,
    key: str | Path,
    ca: str | Path,
) -> ssl.SSLContext:
    """Return an SSLContext that presents *cert/key* and requires a valid
    client certificate signed by *ca*.

    Raises ``FileNotFoundError`` if any path is missing.
    Raises ``ssl.SSLError`` if the files are malformed.
    """
    cert, key, ca = Path(cert), Path(key), Path(ca)
    for p in (cert, key, ca):
        if not p.exists():
            raise FileNotFoundError(f"mTLS: file not found: {p}")

    ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
    ctx.minimum_version = ssl.TLSVersion.TLSv1_2
    ctx.load_cert_chain(certfile=str(cert), keyfile=str(key))
    ctx.load_verify_locations(cafile=str(ca))
    # CERT_REQUIRED — reject peers that don't present a cert signed by *ca*.
    ctx.verify_mode = ssl.CERT_REQUIRED
    return ctx


def build_client_ssl_context(
    cert: str | Path,
    key: str | Path,
    ca: str | Path,
) -> ssl.SSLContext:
    """Return an SSLContext for an outgoing mTLS connection.

    Presents *cert/key* as the client identity and verifies the server
    certificate against *ca*.
    """
    cert, key, ca = Path(cert), Path(key), Path(ca)
    for p in (cert, key, ca):
        if not p.exists():
            raise FileNotFoundError(f"mTLS client: file not found: {p}")

    ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    ctx.minimum_version = ssl.TLSVersion.TLSv1_2
    ctx.load_cert_chain(certfile=str(cert), keyfile=str(key))
    ctx.load_verify_locations(cafile=str(ca))
    ctx.verify_mode = ssl.CERT_REQUIRED
    ctx.check_hostname = True
    return ctx


# ---------------------------------------------------------------------------
# Minimal A2A HTTP request handler
# ---------------------------------------------------------------------------

class A2AHandler(BaseHTTPRequestHandler):
    """Handles A2A requests over a mutually-authenticated TLS connection.

    GET  /.well-known/agent-card.json — returns the local agent card.
    POST /a2a/task — dispatches an A2A task (stub).
    """

    def do_GET(self) -> None:  # noqa: N802
        if self.path in ("/.well-known/agent-card.json", "/agent-card.json"):
            self._serve_agent_card()
        else:
            self._send_json(404, {"error": "not found"})

    def do_POST(self) -> None:  # noqa: N802
        if self.path == "/a2a/task":
            self._handle_task()
        else:
            self._send_json(404, {"error": "not found"})

    # ------------------------------------------------------------------
    def _serve_agent_card(self) -> None:
        try:
            from agent.agent_card import get_agent_card_json
            body = get_agent_card_json().encode()
        except Exception as exc:
            logger.warning("agent-card unavailable: %s", exc)
            body = b'{"error": "agent card unavailable"}'
        self._send_raw(200, "application/json", body)

    def _handle_task(self) -> None:
        length = int(self.headers.get("Content-Length", 0))
        _body = self.rfile.read(length) if length else b""
        # Stub: echo back a 202 Accepted with the peer CN so callers can
        # confirm which agent processed the request.
        peer_cn = _peer_cn(self.connection)
        self._send_json(202, {"status": "accepted", "handled_by": peer_cn})

    # ------------------------------------------------------------------
    def _send_json(self, code: int, data: dict) -> None:
        body = json.dumps(data).encode()
        self._send_raw(code, "application/json", body)

    def _send_raw(self, code: int, content_type: str, body: bytes) -> None:
        self.send_response(code)
        self.send_header("Content-Type", content_type)
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, fmt: str, *args: object) -> None:  # type: ignore[override]
        # Route the default stderr access log to the module logger.
        logger.debug("a2a: " + fmt, *args)


def _peer_cn(conn: ssl.SSLSocket) -> Optional[str]:
    """Extract the Common Name from the peer certificate, or None."""
    try:
        peer = conn.getpeercert()
        if not peer:
            return None
        for rdn in peer.get("subject", ()):
            for key, val in rdn:
                if key == "commonName":
                    return val
    except Exception:
        pass
    return None


# ---------------------------------------------------------------------------
# Server lifecycle
# ---------------------------------------------------------------------------

class A2AServer:
    """Mutual-TLS A2A server.

    Example::

        server = A2AServer(
            cert="~/.hermes/pki/agents/timmy/timmy.crt",
            key="~/.hermes/pki/agents/timmy/timmy.key",
            ca="~/.hermes/pki/ca/fleet-ca.crt",
        )
        server.start()  # non-blocking (daemon thread)
        ...
        server.stop()
    """

    def __init__(
        self,
        cert: str | Path,
        key: str | Path,
        ca: str | Path,
        host: str = "0.0.0.0",
        port: int = 9443,
    ) -> None:
        self.cert = Path(cert).expanduser()
        self.key = Path(key).expanduser()
        self.ca = Path(ca).expanduser()
        self.host = host
        self.port = port
        self._httpd: Optional[HTTPServer] = None
        self._thread: Optional[threading.Thread] = None

    def start(self, daemon: bool = True) -> None:
        """Start the server in a background thread (default: daemon)."""
        ssl_ctx = build_server_ssl_context(self.cert, self.key, self.ca)
        self._httpd = HTTPServer((self.host, self.port), A2AHandler)
        self._httpd.socket = ssl_ctx.wrap_socket(
            self._httpd.socket, server_side=True
        )
        self._thread = threading.Thread(
            target=self._httpd.serve_forever, daemon=daemon
        )
        self._thread.start()
        logger.info(
            "A2A mTLS server listening on %s:%s (cert=%s)",
            self.host, self.port, self.cert.name,
        )

    def stop(self) -> None:
        if self._httpd:
            self._httpd.shutdown()
            self._httpd = None
        if self._thread:
            self._thread.join(timeout=5)
            self._thread = None


def server_from_env() -> A2AServer:
    """Build an A2AServer from environment variables / defaults."""
    hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes"))
    agent_name = os.environ.get("HERMES_AGENT_NAME", "hermes").lower()

    default_cert = hermes_home / "pki" / "agents" / agent_name / f"{agent_name}.crt"
    default_key = hermes_home / "pki" / "agents" / agent_name / f"{agent_name}.key"
    default_ca = hermes_home / "pki" / "ca" / "fleet-ca.crt"

    cert = os.environ.get("HERMES_A2A_CERT", str(default_cert))
    key = os.environ.get("HERMES_A2A_KEY", str(default_key))
    ca = os.environ.get("HERMES_A2A_CA", str(default_ca))
    host = os.environ.get("HERMES_A2A_HOST", "0.0.0.0")
    port = int(os.environ.get("HERMES_A2A_PORT", "9443"))

    return A2AServer(cert=cert, key=key, ca=ca, host=host, port=port)


# ---------------------------------------------------------------------------
# CLI entry point
# ---------------------------------------------------------------------------

def _main() -> None:
    import argparse

    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")

    parser = argparse.ArgumentParser(
        description="Hermes A2A mutual-TLS server"
    )
    parser.add_argument("--cert", required=True, help="Path to agent certificate")
    parser.add_argument("--key", required=True, help="Path to agent private key")
    parser.add_argument("--ca", required=True, help="Path to fleet CA certificate")
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--port", type=int, default=9443)
    args = parser.parse_args()

    server = A2AServer(
        cert=args.cert, key=args.key, ca=args.ca,
        host=args.host, port=args.port,
    )
    server.start(daemon=False)


if __name__ == "__main__":
    _main()


# ---------------------------------------------------------------------------
# A2AMTLSServer — routing-based server with context-manager support
# ---------------------------------------------------------------------------

class _RoutingHandler(BaseHTTPRequestHandler):
    """HTTP request handler that dispatches to per-path callables."""

    routes: Dict[str, Callable] = {}

    def log_message(self, fmt: str, *args: Any) -> None:
        logger.debug("A2AMTLSServer: " + fmt, *args)

    def _peer_cn(self) -> Optional[str]:
        cert = self.connection.getpeercert()  # type: ignore[attr-defined]
        if not cert:
            return None
        for rdn in cert.get("subject", ()):
            for attr, value in rdn:
                if attr == "commonName":
                    return value
        return None

    def do_POST(self) -> None:
        handler = self.routes.get(self.path)
        if handler is None:
            self.send_response(404)
            self.end_headers()
            return
        length = int(self.headers.get("Content-Length", 0))
        body = self.rfile.read(length) if length else b""
        try:
            payload = json.loads(body) if body else {}
        except json.JSONDecodeError:
            self.send_response(400)
            self.end_headers()
            return
        result = handler(payload, peer_cn=self._peer_cn())
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.end_headers()
        self.wfile.write(json.dumps(result).encode())

    def do_GET(self) -> None:
        handler = self.routes.get(self.path)
        if handler is None:
            self.send_response(404)
            self.end_headers()
            return
        result = handler({}, peer_cn=self._peer_cn())
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.end_headers()
        self.wfile.write(json.dumps(result).encode())


class A2AMTLSServer:
    """Routing-based mTLS HTTPS server with context-manager support.

    Unlike ``A2AServer`` (which serves fixed A2A paths), this server lets
    callers register arbitrary path handlers — useful for tests and custom
    A2A endpoint implementations.

    Handler signature: ``handler(payload: dict, *, peer_cn: str | None) -> dict``

    Example::

        server = A2AMTLSServer(cert="timmy.crt", key="timmy.key", ca="fleet-ca.crt")
        server.add_route("/tasks/send", my_handler)
        with server:
            ...  # server runs for the duration of the block
    """

    def __init__(
        self,
        cert: str | Path,
        key: str | Path,
        ca: str | Path,
        host: str = "127.0.0.1",
        port: int = 9443,
    ) -> None:
        self.cert = Path(cert).expanduser()
        self.key = Path(key).expanduser()
        self.ca = Path(ca).expanduser()
        self.host = host
        self.port = port
        self._routes: Dict[str, Callable] = {}
        self._httpd: Optional[HTTPServer] = None
        self._thread: Optional[threading.Thread] = None

    def add_route(self, path: str, handler: Callable) -> None:
        self._routes[path] = handler

    def start(self) -> None:
        ssl_ctx = build_server_ssl_context(self.cert, self.key, self.ca)

        class _Handler(_RoutingHandler):
            routes = self._routes

        self._httpd = HTTPServer((self.host, self.port), _Handler)
        self._httpd.socket = ssl_ctx.wrap_socket(self._httpd.socket, server_side=True)
        self._thread = threading.Thread(
            target=self._httpd.serve_forever,
            daemon=True,
            name=f"a2a-mtls-{self.port}",
        )
        self._thread.start()
        logger.info("A2AMTLSServer on %s:%d (mTLS)", self.host, self.port)

    def stop(self) -> None:
        if self._httpd:
            self._httpd.shutdown()
            self._httpd = None
        if self._thread:
            self._thread.join(timeout=5)
            self._thread = None

    def __enter__(self) -> "A2AMTLSServer":
        self.start()
        return self

    def __exit__(self, *_: Any) -> None:
        self.stop()


# ---------------------------------------------------------------------------
# A2AMTLSClient — mTLS HTTP client
# ---------------------------------------------------------------------------

class A2AMTLSClient:
    """HTTP client that presents a fleet cert on every outgoing connection.

    Example::

        client = A2AMTLSClient(cert="allegro.crt", key="allegro.key", ca="fleet-ca.crt")
        result = client.post("https://timmy:9443/tasks/send", json={"task": "..."})
    """

    def __init__(
        self,
        cert: str | Path,
        key: str | Path,
        ca: str | Path,
    ) -> None:
        self._ssl_ctx = build_client_ssl_context(cert, key, ca)
        self._ssl_ctx.check_hostname = False  # callers connecting by IP

    def _request(
        self,
        method: str,
        url: str,
        data: Optional[bytes] = None,
        timeout: float = 10.0,
    ) -> Dict[str, Any]:
        headers = {"Content-Type": "application/json"}
        req = Request(url, data=data, headers=headers, method=method)
        try:
            with urlopen(req, context=self._ssl_ctx, timeout=timeout) as resp:
                body = resp.read()
                return json.loads(body) if body else {}
        except URLError as exc:
            raise ConnectionError(f"A2AMTLSClient {method} {url} failed: {exc.reason}") from exc

    def get(self, url: str, **kwargs: Any) -> Dict[str, Any]:
        return self._request("GET", url, **kwargs)

    def post(self, url: str, json: Optional[Dict[str, Any]] = None, **kwargs: Any) -> Dict[str, Any]:
        import json as _json  # the parameter name shadows the module-level json
        data = _json.dumps(json).encode() if json is not None else None
        return self._request("POST", url, data=data, **kwargs)
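Taken together, the server and client classes above give a full mutually-authenticated round trip. A minimal sketch, assuming the fleet CA has already issued the referenced cert/key files (file names follow the docstring examples and are placeholders):

```python
# Sketch: end-to-end mTLS round trip (cert/key paths are placeholders).
from agent.a2a_mtls import A2AMTLSClient, A2AMTLSServer

def echo_handler(payload, *, peer_cn):
    # peer_cn is the CN from the caller's verified client certificate.
    return {"echo": payload, "caller": peer_cn}

server = A2AMTLSServer(cert="timmy.crt", key="timmy.key", ca="fleet-ca.crt")
server.add_route("/tasks/send", echo_handler)

client = A2AMTLSClient(cert="allegro.crt", key="allegro.key", ca="fleet-ca.crt")

with server:
    result = client.post("https://127.0.0.1:9443/tasks/send", json={"task": "ping"})
    print(result)  # {'echo': {'task': 'ping'}, 'caller': 'allegro'} if CN == agent name
```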
@@ -1,135 +0,0 @@
"""
Agent Card — A2A-compliant agent discovery.
Part of #843: fix: implement A2A agent card for fleet discovery (#819)

Provides metadata about the agent's identity, capabilities, and installed skills
for discovery by other agents in the fleet.
"""

import json
import logging
import os
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional

from hermes_cli import __version__
from hermes_cli.config import load_config, get_hermes_home
from agent.skill_utils import (
    iter_skill_index_files,
    parse_frontmatter,
    get_all_skills_dirs,
    get_disabled_skill_names,
    skill_matches_platform,
)

logger = logging.getLogger(__name__)


@dataclass
class AgentSkill:
    id: str
    name: str
    description: str = ""
    version: str = "1.0.0"


@dataclass
class AgentCapabilities:
    streaming: bool = True
    tools: bool = True
    vision: bool = False
    reasoning: bool = False


@dataclass
class AgentCard:
    name: str
    description: str
    url: str
    version: str = __version__
    capabilities: AgentCapabilities = field(default_factory=AgentCapabilities)
    skills: List[AgentSkill] = field(default_factory=list)
    defaultInputModes: List[str] = field(default_factory=lambda: ["text/plain"])
    defaultOutputModes: List[str] = field(default_factory=lambda: ["text/plain"])


def _load_skills() -> List[AgentSkill]:
    """Scan all enabled skills and return metadata."""
    skills = []
    disabled = get_disabled_skill_names()

    for skills_dir in get_all_skills_dirs():
        if not skills_dir.is_dir():
            continue
        for skill_file in iter_skill_index_files(skills_dir, "SKILL.md"):
            try:
                raw = skill_file.read_text(encoding="utf-8")
                frontmatter, _ = parse_frontmatter(raw)
            except Exception:
                continue

            skill_name = frontmatter.get("name") or skill_file.parent.name
            if str(skill_name) in disabled:
                continue
            if not skill_matches_platform(frontmatter):
                continue

            skills.append(AgentSkill(
                id=str(skill_name),
                name=str(frontmatter.get("name", skill_name)),
                description=str(frontmatter.get("description", "")),
                version=str(frontmatter.get("version", "1.0.0")),
            ))
    return skills


def build_agent_card() -> AgentCard:
    """Build the agent card from current configuration and environment."""
    config = load_config()

    # Identity
    name = os.environ.get("HERMES_AGENT_NAME") or config.get("agent", {}).get("name") or "hermes"
    description = os.environ.get("HERMES_AGENT_DESCRIPTION") or config.get("agent", {}).get("description") or "Sovereign AI agent"

    # URL - try to determine from environment or config
    port = os.environ.get("HERMES_WEB_PORT") or "9119"
    host = os.environ.get("HERMES_WEB_HOST") or "localhost"
    url = f"http://{host}:{port}"

    # Capabilities
    # In a real scenario, we'd check model metadata for vision/reasoning
    capabilities = AgentCapabilities(
        streaming=True,
        tools=True,
        vision=False,  # Default to false unless we can confirm
        reasoning=False,
    )

    # Skills
    skills = _load_skills()

    return AgentCard(
        name=name,
        description=description,
        url=url,
        version=__version__,
        capabilities=capabilities,
        skills=skills,
    )


def get_agent_card_json() -> str:
    """Return the agent card as a JSON string."""
    try:
        card = build_agent_card()
        return json.dumps(asdict(card), indent=2)
    except Exception as e:
        logger.error(f"Failed to build agent card: {e}")
        # Minimal fallback card
        fallback = {
            "name": "hermes",
            "description": "Sovereign AI agent (fallback)",
            "version": __version__,
            "error": str(e),
        }
        return json.dumps(fallback, indent=2)


def validate_agent_card(card_data: Dict[str, Any]) -> bool:
    """Check if the card data complies with the A2A schema."""
    required = ["name", "description", "url", "version"]
    return all(k in card_data for k in required)
@@ -1,273 +0,0 @@
"""
Circuit Breaker for Error Cascading — #885

P(error | prev was error) = 58.6% vs P(error | prev was success) = 25.2%.
That's a 2.33x cascade factor. After 3 consecutive errors, the circuit
opens and the agent must take corrective action.

States:
- CLOSED: Normal operation, errors are counted
- OPEN: Too many consecutive errors, corrective action required
- HALF_OPEN: Testing if errors have cleared

Usage:
    from agent.circuit_breaker import CircuitBreaker, ToolCircuitBreaker

    cb = ToolCircuitBreaker()

    # After each tool call
    if not cb.record_result(success=True):
        # Circuit is open — take corrective action
        cb.get_recovery_action()
"""

import time
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional


class CircuitState(Enum):
    CLOSED = "closed"        # Normal operation
    OPEN = "open"            # Too many errors, block execution
    HALF_OPEN = "half_open"  # Testing recovery


@dataclass
class CircuitBreaker:
    """
    Generic circuit breaker with configurable thresholds.

    Tracks consecutive errors and opens the circuit when the
    error streak exceeds the threshold.
    """
    failure_threshold: int = 3
    recovery_timeout: float = 30.0  # seconds before trying half-open
    success_threshold: int = 2      # successes needed to close from half-open

    state: CircuitState = field(default=CircuitState.CLOSED, init=False)
    consecutive_failures: int = field(default=0, init=False)
    consecutive_successes: int = field(default=0, init=False)
    last_failure_time: Optional[float] = field(default=None, init=False)
    total_trips: int = field(default=0, init=False)
    error_streaks: List[int] = field(default_factory=list, init=False)

    def record_result(self, success: bool) -> bool:
        """
        Record a tool call result. Returns True if circuit allows execution.

        Returns:
            True if circuit is CLOSED or HALF_OPEN (execution allowed)
            False if circuit is OPEN (execution blocked)
        """
        now = time.time()

        if self.state == CircuitState.OPEN:
            # Check if recovery timeout has passed
            if self.last_failure_time and (now - self.last_failure_time) >= self.recovery_timeout:
                self.state = CircuitState.HALF_OPEN
                self.consecutive_successes = 0
                return True  # Allow one test execution
            return False  # Still open

        if success:
            self.consecutive_failures = 0
            self.consecutive_successes += 1

            if self.state == CircuitState.HALF_OPEN:
                if self.consecutive_successes >= self.success_threshold:
                    self.state = CircuitState.CLOSED
                    self.consecutive_successes = 0

            return True
        else:
            self.consecutive_successes = 0
            self.consecutive_failures += 1
            self.last_failure_time = now

            if self.state == CircuitState.HALF_OPEN:
                # Failed during recovery — reopen immediately
                self.state = CircuitState.OPEN
                self.total_trips += 1
                return False

            if self.consecutive_failures >= self.failure_threshold:
                self.state = CircuitState.OPEN
                self.total_trips += 1
                self.error_streaks.append(self.consecutive_failures)
                return False

            return True

    def can_execute(self) -> bool:
        """Check if execution is allowed."""
        if self.state == CircuitState.OPEN:
            if self.last_failure_time:
                now = time.time()
                if (now - self.last_failure_time) >= self.recovery_timeout:
                    self.state = CircuitState.HALF_OPEN
                    self.consecutive_successes = 0
                    return True
            return False
        return True

    def get_state(self) -> Dict[str, Any]:
        """Get current circuit state."""
        return {
            "state": self.state.value,
            "consecutive_failures": self.consecutive_failures,
            "consecutive_successes": self.consecutive_successes,
            "total_trips": self.total_trips,
            "max_streak": max(self.error_streaks) if self.error_streaks else 0,
            "can_execute": self.can_execute(),
        }

    def reset(self):
        """Reset the circuit breaker."""
        self.state = CircuitState.CLOSED
        self.consecutive_failures = 0
        self.consecutive_successes = 0
        self.last_failure_time = None


class ToolCircuitBreaker(CircuitBreaker):
    """
    Circuit breaker specifically for tool call error cascading.

    Provides recovery actions when the circuit opens.
    """

    # Tools that are most effective at recovery (from audit data)
    RECOVERY_TOOLS = [
        "terminal",      # Most effective — 2300 recoveries
        "read_file",     # Reset context by reading something
        "search_files",  # Find what went wrong
    ]

    def get_recovery_action(self) -> Dict[str, Any]:
        """
        Get the recommended recovery action when circuit is open.

        Returns dict with action type and details.
        """
        streak = self.consecutive_failures

        if streak >= 9:
            # After 9 errors: 41/46 recoveries via terminal
            return {
                "action": "terminal_only",
                "reason": f"Error streak of {streak} — terminal is the only reliable recovery",
                "suggested_tool": "terminal",
                "suggested_command": "echo 'Resetting context'",
                "severity": "critical",
            }
        elif streak >= 5:
            return {
                "action": "switch_tool_type",
                "reason": f"Error streak of {streak} — switch to a different tool category",
                "suggested_tools": ["read_file", "search_files", "terminal"],
                "severity": "high",
            }
        elif streak >= self.failure_threshold:
            return {
                "action": "ask_user",
                "reason": f"{streak} consecutive errors — ask user for guidance",
                "suggested_response": "I'm encountering repeated errors. Would you like me to try a different approach?",
                "severity": "medium",
            }
        else:
            return {
                "action": "continue",
                "reason": f"Error streak of {streak} — within tolerance",
                "severity": "low",
            }

    def should_compress_context(self) -> bool:
        """Determine if context compression would help recovery."""
        return self.consecutive_failures >= 5

    def get_blocked_tool(self) -> Optional[str]:
        """Get the tool that should be blocked (if any)."""
        if self.state == CircuitState.OPEN:
            return "last_failed_tool"
        return None


class MultiToolCircuitBreaker:
    """
    Manages per-tool circuit breakers and cross-tool cascade detection.

    When one tool trips its breaker, related tools are also warned.
    """

    def __init__(self):
        self.breakers: Dict[str, ToolCircuitBreaker] = {}
        self.global_streak: int = 0
        self.last_tool: Optional[str] = None
        self.last_success: bool = True

    def get_breaker(self, tool_name: str) -> ToolCircuitBreaker:
        """Get or create a circuit breaker for a tool."""
        if tool_name not in self.breakers:
            self.breakers[tool_name] = ToolCircuitBreaker()
        return self.breakers[tool_name]

    def record_result(self, tool_name: str, success: bool) -> bool:
        """
        Record a tool call result. Returns True if execution should continue.
        """
        breaker = self.get_breaker(tool_name)
        allowed = breaker.record_result(success)

        # Track global streak
        if success:
            self.global_streak = 0
            self.last_success = True
        else:
            self.global_streak += 1
            self.last_success = False

        self.last_tool = tool_name
        return allowed

    def can_execute(self, tool_name: str) -> bool:
        """Check if a specific tool can execute."""
        breaker = self.get_breaker(tool_name)
        return breaker.can_execute()

    def get_global_state(self) -> Dict[str, Any]:
        """Get overall circuit breaker state."""
        return {
            "global_streak": self.global_streak,
            "last_tool": self.last_tool,
            "last_success": self.last_success,
            "tool_states": {
                name: breaker.get_state()
                for name, breaker in self.breakers.items()
                if breaker.consecutive_failures > 0 or breaker.total_trips > 0
            },
            "any_open": any(b.state == CircuitState.OPEN for b in self.breakers.values()),
        }

    def get_recovery_action(self) -> Dict[str, Any]:
        """Get recovery action based on global state."""
        if self.global_streak == 0:
            return {"action": "continue", "reason": "No errors"}

        # Find the breaker with the worst streak
        worst = max(self.breakers.values(), key=lambda b: b.consecutive_failures, default=None)
        if worst and worst.consecutive_failures > 0:
            return worst.get_recovery_action()

        return {
            "action": "continue",
            "reason": f"Global streak: {self.global_streak}",
            "severity": "low",
        }

    def reset_all(self):
        """Reset all circuit breakers."""
        for breaker in self.breakers.values():
            breaker.reset()
        self.global_streak = 0
        self.last_success = True
@@ -1,148 +0,0 @@
"""
Context Budget Tracker - Prevent context window overflow

Poka-yoke: Visual warnings at 70%, 85%, 95% capacity.
Auto-checkpoint at 85%. Pre-flight token estimation.

Issue: #838
"""

import json
import logging
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

logger = logging.getLogger(__name__)

HERMES_HOME = Path.home() / ".hermes"
CHECKPOINT_DIR = HERMES_HOME / "checkpoints"
CHARS_PER_TOKEN = 4

THRESHOLD_WARNING = 0.70
THRESHOLD_CRITICAL = 0.85
THRESHOLD_DANGER = 0.95


class ContextBudget:
    def __init__(self, context_limit: int = 128000, system_tokens: int = 0,
                 used_tokens: int = 0, reserved_tokens: int = 2000):
        self.context_limit = context_limit
        self.system_tokens = system_tokens
        self.used_tokens = used_tokens
        self.reserved_tokens = reserved_tokens

    @property
    def total_used(self) -> int:
        return self.system_tokens + self.used_tokens

    @property
    def available(self) -> int:
        return max(0, self.context_limit - self.reserved_tokens)

    @property
    def remaining(self) -> int:
        return max(0, self.available - self.total_used)

    @property
    def utilization(self) -> float:
        return self.total_used / self.available if self.available > 0 else 1.0


def estimate_tokens(text: str) -> int:
    return len(text) // CHARS_PER_TOKEN if text else 0


def estimate_messages_tokens(messages: List[Dict]) -> int:
    total = 0
    for msg in messages:
        content = msg.get("content", "")
        if isinstance(content, str):
            total += estimate_tokens(content)
        if msg.get("tool_calls"):
            total += 100
    return total


class ContextBudgetTracker:
    def __init__(self, context_limit: int = 128000, session_id: str = ""):
        self.budget = ContextBudget(context_limit=context_limit)
        self.session_id = session_id
        self._checkpointed = False
        self._warnings_given = set()

    def update_from_messages(self, messages: List[Dict]):
        self.budget.used_tokens = estimate_messages_tokens(messages)

    def can_fit(self, additional_tokens: int) -> bool:
        return self.budget.remaining >= additional_tokens

    def preflight_check(self, text: str) -> Tuple[bool, str]:
        tokens = estimate_tokens(text)
        if not self.can_fit(tokens):
            return False, f"Cannot load: ~{tokens:,} tokens needed, {self.budget.remaining:,} remaining"
        would_util = (self.budget.total_used + tokens) / self.budget.available if self.budget.available > 0 else 1.0
        if would_util >= THRESHOLD_DANGER:
            return False, f"Would reach {would_util:.0%} capacity. Summarize or start new session."
        if would_util >= THRESHOLD_CRITICAL:
            return True, f"Warning: will reach {would_util:.0%} capacity."
        return True, ""

    def get_warning(self) -> Optional[str]:
        util = self.budget.utilization
        if util >= THRESHOLD_DANGER and "danger" not in self._warnings_given:
            self._warnings_given.add("danger")
            return f"[CONTEXT CRITICAL: {util:.0%} used -- {self.budget.remaining:,} tokens left. Summarize or start new session.]"
        if util >= THRESHOLD_CRITICAL and "critical" not in self._warnings_given:
            self._warnings_given.add("critical")
            self._auto_checkpoint()
            return f"[CONTEXT WARNING: {util:.0%} used -- consider summarizing. Auto-checkpoint saved.]"
        if util >= THRESHOLD_WARNING and "warning" not in self._warnings_given:
            self._warnings_given.add("warning")
            return f"[CONTEXT: {util:.0%} used -- {self.budget.remaining:,} tokens remaining]"
        return None

    def _auto_checkpoint(self):
        if self._checkpointed or not self.session_id:
            return
        try:
            CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)
            path = CHECKPOINT_DIR / f"{self.session_id}.json"
            path.write_text(json.dumps({
                "session_id": self.session_id,
                "timestamp": time.time(),
                "budget": {"utilization": round(self.budget.utilization * 100, 1)}
            }, indent=2))
            self._checkpointed = True
            logger.info("Auto-checkpoint saved: %s", path)
        except Exception as e:
            logger.error("Auto-checkpoint failed: %s", e)

    def get_status_line(self) -> str:
        util = self.budget.utilization
        remaining = self.budget.remaining
        if util >= THRESHOLD_DANGER:
            return f"RED {util:.0%} used ({remaining:,} left)"
        elif util >= THRESHOLD_CRITICAL:
            return f"ORANGE {util:.0%} used ({remaining:,} left)"
        elif util >= THRESHOLD_WARNING:
            return f"YELLOW {util:.0%} used ({remaining:,} left)"
        return f"GREEN {util:.0%} used ({remaining:,} left)"


_tracker = None

def get_tracker(context_limit=128000, session_id=""):
    global _tracker
    if _tracker is None:
        _tracker = ContextBudgetTracker(context_limit, session_id)
    return _tracker

def check_context_budget(messages, context_limit=128000):
    tracker = get_tracker(context_limit)
    tracker.update_from_messages(messages)
    return tracker.get_warning()

def preflight_token_check(text):
    tracker = get_tracker()
    return tracker.preflight_check(text)
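A short usage sketch for the tracker above. The import path is an assumption (the diff does not show the module's file name), and the message sizes are illustrative:

```python
# Sketch: budget check inside a chat loop (module path and sizes assumed).
from agent.context_budget import check_context_budget, preflight_token_check

# ~110K estimated tokens at 4 chars/token -> ~87% of a 128K window
messages = [{"role": "user", "content": "x" * 440_000}]

warning = check_context_budget(messages, context_limit=128000)
if warning:
    print(warning)  # e.g. "[CONTEXT WARNING: 87% used -- consider summarizing. ...]"

ok, msg = preflight_token_check("y" * 80_000)  # ~20K more tokens won't fit
if not ok:
    print(msg)
```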
@@ -1,149 +0,0 @@
"""
988 Suicide & Crisis Lifeline Integration (#673).

When crisis is detected, provides immediate access to help:
- Phone: 988 (call or text)
- Text: Text HOME to 988
- Chat: 988lifeline.org/chat
- Spanish: 1-888-628-9454
- Emergency: 911

This module provides the resource data. agent/crisis_protocol.py
handles detection. This module formats the resources for display.
"""

from dataclasses import dataclass
from typing import List, Optional


@dataclass
class CrisisResource:
    """A crisis support contact method."""
    name: str
    contact: str
    description: str
    url: str = ""
    available: str = "24/7"
    language: str = "English"


# 988 Suicide & Crisis Lifeline — all channels
LIFELINE_988 = CrisisResource(
    name="988 Suicide and Crisis Lifeline",
    contact="Call or text 988",
    description="Free, confidential support for people in suicidal crisis or emotional distress.",
    url="https://988lifeline.org",
    available="24/7",
    language="English",
)

LIFELINE_988_TEXT = CrisisResource(
    name="988 Crisis Text Line",
    contact="Text HOME to 988",
    description="Free, 24/7 crisis support via text message.",
    url="",
    available="24/7",
    language="English",
)

LIFELINE_988_CHAT = CrisisResource(
    name="988 Lifeline Chat",
    contact="988lifeline.org/chat",
    description="Free, confidential online chat with a trained crisis counselor.",
    url="https://988lifeline.org/chat",
    available="24/7",
    language="English",
)

LIFELINE_988_SPANISH = CrisisResource(
    name="988 Lifeline (Spanish)",
    contact="1-888-628-9454",
    description="Línea de prevención del suicidio en español.",
    url="https://988lifeline.org/help-yourself/en-espanol/",
    available="24/7",
    language="Spanish",
)

CRISIS_TEXT_LINE = CrisisResource(
    name="Crisis Text Line",
    contact="Text HOME to 741741",
    description="Free, 24/7 crisis support via text message.",
    url="https://www.crisistextline.org",
    available="24/7",
    language="English",
)

EMERGENCY_911 = CrisisResource(
    name="Emergency Services",
    contact="911",
    description="Immediate danger — police, fire, ambulance.",
    url="",
    available="24/7",
    language="Any",
)

# All resources in priority order
ALL_RESOURCES: List[CrisisResource] = [
    EMERGENCY_911,
    LIFELINE_988,
    LIFELINE_988_TEXT,
    LIFELINE_988_CHAT,
    CRISIS_TEXT_LINE,
    LIFELINE_988_SPANISH,
]


def get_crisis_resources(language: Optional[str] = None) -> List[CrisisResource]:
    """Get crisis resources, optionally filtered by language.

    Args:
        language: Filter by language ("English", "Spanish", or None for all)

    Returns:
        List of CrisisResource objects
    """
    if language:
        return [r for r in ALL_RESOURCES if r.language.lower() == language.lower()]
    return ALL_RESOURCES


def format_crisis_resources(resources: Optional[List[CrisisResource]] = None) -> str:
    """Format crisis resources as a user-facing message.

    Args:
        resources: List of resources to format. Defaults to all resources.

    Returns:
        Formatted string suitable for displaying to a user in crisis.
    """
    if resources is None:
        resources = ALL_RESOURCES

    lines = ["**Please reach out — help is available right now:**\n"]

    for r in resources:
        if r.url:
            lines.append(f"- **{r.name}:** {r.contact} ({r.url})")
        else:
            lines.append(f"- **{r.name}:** {r.contact}")

    lines.append("")
    lines.append("All services are free, confidential, and available 24/7.")
    lines.append("You are not alone.")

    return "\n".join(lines)


def get_immediate_help_message() -> str:
    """Get the most urgent crisis help message.

    Used when crisis is detected at CRITICAL level.
    """
    return (
        "If you are in immediate danger, call **911** right now.\n\n"
        + format_crisis_resources()
    )
184  agent/mtls.py
@@ -1,184 +0,0 @@
|
||||
"""
|
||||
agent/mtls.py — Mutual TLS support for Hermes A2A communication.
|
||||
|
||||
Provides:
|
||||
- build_server_ssl_context() — SSL context for uvicorn that requires client certs
|
||||
- build_client_ssl_context() — SSL context for httpx/aiohttp A2A clients
|
||||
- MTLSMiddleware — FastAPI middleware that enforces client cert on A2A routes
|
||||
- is_mtls_configured() — Check if env vars are set
|
||||
|
||||
Configuration (environment variables):
|
||||
HERMES_MTLS_CERT Path to this agent's TLS certificate (PEM)
|
||||
HERMES_MTLS_KEY Path to this agent's TLS private key (PEM)
|
||||
HERMES_MTLS_CA Path to the Fleet CA certificate (PEM) — used to verify peers
|
||||
|
||||
All three must be set to enable mTLS. If any is missing, mTLS is disabled and
|
||||
the server falls back to plain HTTP (or regular TLS without client auth).
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import ssl
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# A2A routes that require a valid client certificate when mTLS is enabled.
|
||||
_A2A_PATH_PREFIXES = (
|
||||
"/.well-known/agent-card",
|
||||
"/agent-card",
|
||||
"/api/agent-card",
|
||||
"/a2a/",
|
||||
)
|
||||
|
||||
|
||||
def _get_env(key: str) -> Optional[str]:
|
||||
val = os.environ.get(key, "").strip()
|
||||
return val or None
|
||||
|
||||
|
||||
def is_mtls_configured() -> bool:
|
||||
"""Return True if all three mTLS env vars are set and the files exist."""
|
||||
cert = _get_env("HERMES_MTLS_CERT")
|
||||
key = _get_env("HERMES_MTLS_KEY")
|
||||
ca = _get_env("HERMES_MTLS_CA")
|
||||
if not (cert and key and ca):
|
||||
return False
|
||||
for label, path in (("HERMES_MTLS_CERT", cert), ("HERMES_MTLS_KEY", key), ("HERMES_MTLS_CA", ca)):
|
||||
if not Path(path).is_file():
|
||||
logger.warning("mTLS disabled: %s file not found: %s", label, path)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def build_server_ssl_context() -> ssl.SSLContext:
|
||||
"""
|
||||
Build an SSL context for the A2A server that:
|
||||
- presents its own certificate
|
||||
- requires and verifies the client's certificate against the Fleet CA
|
||||
|
||||
Raises:
|
||||
RuntimeError: if mTLS env vars are not set or files are missing
|
||||
ssl.SSLError: if cert/key/CA files are invalid
|
||||
"""
|
||||
cert = _get_env("HERMES_MTLS_CERT")
|
||||
key = _get_env("HERMES_MTLS_KEY")
|
||||
ca = _get_env("HERMES_MTLS_CA")
|
||||
|
||||
if not (cert and key and ca):
|
||||
raise RuntimeError(
|
||||
"mTLS not configured. Set HERMES_MTLS_CERT, HERMES_MTLS_KEY, and HERMES_MTLS_CA."
|
||||
)
|
||||
|
||||
ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
|
||||
ctx.minimum_version = ssl.TLSVersion.TLSv1_2
|
||||
ctx.load_cert_chain(certfile=cert, keyfile=key)
|
||||
ctx.load_verify_locations(cafile=ca)
|
||||
# CERT_REQUIRED: reject connections without a valid client cert
|
||||
ctx.verify_mode = ssl.CERT_REQUIRED
|
||||
logger.info("mTLS server context built (cert=%s, CA=%s)", cert, ca)
|
||||
return ctx
|
||||
|
||||
|
||||
def build_client_ssl_context() -> ssl.SSLContext:
|
||||
"""
|
||||
Build an SSL context for outbound A2A connections that:
|
||||
- presents this agent's certificate as a client cert
|
||||
- verifies the remote server against the Fleet CA
|
||||
|
||||
Raises:
|
||||
RuntimeError: if mTLS env vars are not set or files are missing
|
||||
ssl.SSLError: if cert/key/CA files are invalid
|
||||
"""
|
||||
cert = _get_env("HERMES_MTLS_CERT")
|
||||
key = _get_env("HERMES_MTLS_KEY")
|
||||
ca = _get_env("HERMES_MTLS_CA")
|
||||
|
||||
if not (cert and key and ca):
|
||||
raise RuntimeError(
|
||||
"mTLS not configured. Set HERMES_MTLS_CERT, HERMES_MTLS_KEY, and HERMES_MTLS_CA."
|
||||
)
|
||||
|
||||
ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
|
||||
ctx.minimum_version = ssl.TLSVersion.TLSv1_2
|
||||
ctx.load_cert_chain(certfile=cert, keyfile=key)
|
||||
ctx.load_verify_locations(cafile=ca)
|
||||
ctx.verify_mode = ssl.CERT_REQUIRED
|
||||
ctx.check_hostname = True
|
||||
logger.info("mTLS client context built (cert=%s, CA=%s)", cert, ca)
|
||||
return ctx

def get_peer_cn(ssl_object) -> Optional[str]:
    """Extract the CN from the peer certificate's subject, or None."""
    try:
        peer_cert = ssl_object.getpeercert()
        if not peer_cert:
            return None
        for rdn in peer_cert.get("subject", ()):
            for attr, value in rdn:
                if attr == "commonName":
                    return value
    except Exception:
        pass
    return None


class MTLSMiddleware:
    """
    ASGI middleware that enforces client certificate verification on A2A routes.

    When mTLS is NOT configured (no env vars) or the route is not an A2A route,
    the request passes through unchanged.

    When mTLS IS configured and the route matches an A2A prefix, the middleware
    checks that the request arrived over a TLS connection with a verified client
    certificate. If not, it returns HTTP 403.

    Note: This middleware only provides defence-in-depth at the app layer.
    The primary enforcement is at the SSL context level (CERT_REQUIRED on the
    server context). This middleware is useful when the server runs behind a
    TLS-terminating proxy that forwards cert info via headers (not yet
    implemented) or for test-time injection.
    """

    def __init__(self, app):
        self.app = app
        self._enabled = is_mtls_configured()
        if self._enabled:
            logger.info("MTLSMiddleware enabled — A2A routes require client cert")

    def _is_a2a_route(self, path: str) -> bool:
        return any(path.startswith(prefix) for prefix in _A2A_PATH_PREFIXES)

    async def __call__(self, scope, receive, send):
        if scope["type"] == "http" and self._enabled and self._is_a2a_route(scope.get("path", "")):
            # Check for client cert in the SSL connection
            transport = scope.get("extensions", {}).get("tls", {})
            peer_cert = transport.get("peer_cert")
            if peer_cert is None:
                # No client cert — reject
                response = _forbidden_response("Client certificate required for A2A endpoints")
                await response(scope, receive, send)
                return

        await self.app(scope, receive, send)


def _forbidden_response(message: str):
    """Return a minimal ASGI 403 response."""
    body = message.encode()

    async def respond(scope, receive, send):
        await send({
            "type": "http.response.start",
            "status": 403,
            "headers": [
                (b"content-type", b"text/plain"),
                (b"content-length", str(len(body)).encode()),
            ],
        })
        await send({"type": "http.response.body", "body": body})

    return respond
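

if __name__ == "__main__":
    # Serving sketch: wrap a trivial ASGI app in the middleware and run it
    # with the same cert requirements the server context enforces. uvicorn
    # and its ssl_* keyword arguments are assumptions about the deployment,
    # not requirements of this module; the app below exists only for the demo.
    import os
    import uvicorn

    async def app(scope, receive, send):
        # Minimal ASGI responder used only for this smoke test.
        await send({"type": "http.response.start", "status": 200,
                    "headers": [(b"content-type", b"text/plain")]})
        await send({"type": "http.response.body", "body": b"ok"})

    uvicorn.run(
        MTLSMiddleware(app),
        host="0.0.0.0",
        port=8443,
        ssl_certfile=os.environ["HERMES_MTLS_CERT"],
        ssl_keyfile=os.environ["HERMES_MTLS_KEY"],
        ssl_ca_certs=os.environ["HERMES_MTLS_CA"],
        ssl_cert_reqs=ssl.CERT_REQUIRED,  # mirrors the server context above
    )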
@@ -1,262 +0,0 @@
"""
Profile Session Isolation — #891

Tags sessions with their originating profile and provides
filtered access so profiles cannot see each other's data.

Current state: All sessions share one state.db with no profile tag.
This module adds profile tagging and filtered queries.

Usage:
    from agent.profile_isolation import tag_session, get_profile_sessions, get_active_profile

    # Tag a new session with the current profile
    tag_session(session_id, profile_name)

    # Get sessions for a specific profile
    sessions = get_profile_sessions("sprint")

    # Get current active profile
    profile = get_active_profile()
"""

import json
import os
import sqlite3
from pathlib import Path
from typing import Any, Dict, List, Optional

HERMES_HOME = Path(os.getenv("HERMES_HOME", str(Path.home() / ".hermes")))
SESSIONS_DB = HERMES_HOME / "sessions" / "state.db"
PROFILE_TAGS_FILE = HERMES_HOME / "profile_session_tags.json"


def get_active_profile() -> str:
    """Get the currently active profile name."""
    config_path = HERMES_HOME / "config.yaml"
    if config_path.exists():
        try:
            import yaml
            with open(config_path) as f:
                cfg = yaml.safe_load(f) or {}
            return cfg.get("active_profile", "default")
        except Exception:
            pass

    # Check environment
    return os.getenv("HERMES_PROFILE", "default")


def _load_tags() -> Dict[str, str]:
    """Load session-to-profile mapping."""
    if not PROFILE_TAGS_FILE.exists():
        return {}
    try:
        with open(PROFILE_TAGS_FILE) as f:
            return json.load(f)
    except Exception:
        return {}


def _save_tags(tags: Dict[str, str]):
    """Save session-to-profile mapping."""
    PROFILE_TAGS_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(PROFILE_TAGS_FILE, "w") as f:
        json.dump(tags, f, indent=2)


def tag_session(session_id: str, profile: Optional[str] = None) -> str:
    """
    Tag a session with its originating profile.

    Returns the profile name used.
    """
    if profile is None:
        profile = get_active_profile()

    tags = _load_tags()
    tags[session_id] = profile
    _save_tags(tags)

    # Also tag in SQLite if available
    _tag_session_in_db(session_id, profile)

    return profile


def _tag_session_in_db(session_id: str, profile: str):
    """Add profile tag to SQLite session store."""
    if not SESSIONS_DB.exists():
        return

    try:
        conn = sqlite3.connect(str(SESSIONS_DB))
        cursor = conn.cursor()

        # Check if sessions table has profile column
        cursor.execute("PRAGMA table_info(sessions)")
        columns = [row[1] for row in cursor.fetchall()]

        if "profile" not in columns:
            # Add profile column
            cursor.execute("ALTER TABLE sessions ADD COLUMN profile TEXT DEFAULT 'default'")

        # Update the session's profile
        cursor.execute(
            "UPDATE sessions SET profile = ? WHERE session_id = ?",
            (profile, session_id)
        )

        conn.commit()
        conn.close()
    except Exception:
        pass  # SQLite might not be available or schema differs


def get_session_profile(session_id: str) -> Optional[str]:
    """Get the profile that owns a session."""
    # Check JSON tags first
    tags = _load_tags()
    if session_id in tags:
        return tags[session_id]

    # Check SQLite
    if SESSIONS_DB.exists():
        try:
            conn = sqlite3.connect(str(SESSIONS_DB))
            cursor = conn.cursor()
            cursor.execute(
                "SELECT profile FROM sessions WHERE session_id = ?",
                (session_id,)
            )
            row = cursor.fetchone()
            conn.close()
            if row:
                return row[0]
        except Exception:
            pass

    return None
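
# Enforcement sketch (hypothetical caller, not part of this module): refuse
# cross-profile access before handing a session to the agent loop.
#
#     def load_session_checked(session_id: str):
#         owner = get_session_profile(session_id)
#         if owner is not None and owner != get_active_profile():
#             raise PermissionError(f"session {session_id} belongs to '{owner}'")
#         ...  # load as usual
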

def get_profile_sessions(
    profile: Optional[str] = None,
    limit: int = 100,
) -> List[Dict[str, Any]]:
    """
    Get sessions belonging to a specific profile.

    Returns list of session dicts.
    """
    if profile is None:
        profile = get_active_profile()

    sessions = []

    # Get from JSON tags
    tags = _load_tags()
    tagged_sessions = [sid for sid, p in tags.items() if p == profile]

    # Get from SQLite with profile filter
    if SESSIONS_DB.exists():
        try:
            conn = sqlite3.connect(str(SESSIONS_DB))
            conn.row_factory = sqlite3.Row
            cursor = conn.cursor()

            # Try profile column first
            try:
                cursor.execute(
                    "SELECT * FROM sessions WHERE profile = ? ORDER BY updated_at DESC LIMIT ?",
                    (profile, limit)
                )
                for row in cursor.fetchall():
                    sessions.append(dict(row))
            except Exception:
                # Fallback: filter by tagged session IDs
                if tagged_sessions:
                    placeholders = ",".join("?" * len(tagged_sessions[:limit]))
                    cursor.execute(
                        f"SELECT * FROM sessions WHERE session_id IN ({placeholders}) ORDER BY updated_at DESC LIMIT ?",
                        (*tagged_sessions[:limit], limit)
                    )
                    for row in cursor.fetchall():
                        sessions.append(dict(row))

            conn.close()
        except Exception:
            pass

    return sessions[:limit]


def filter_sessions_by_profile(
    sessions: List[Dict[str, Any]],
    profile: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Filter a list of sessions to only include those belonging to a profile."""
    if profile is None:
        profile = get_active_profile()

    tags = _load_tags()
    filtered = []

    for session in sessions:
        sid = session.get("session_id") or session.get("id")
        if not sid:
            continue

        # Check tag
        session_profile = tags.get(sid)
        if session_profile is None:
            # Check SQLite
            session_profile = get_session_profile(sid)

        if session_profile == profile or session_profile is None:
            filtered.append(session)

    return filtered


def get_profile_stats() -> Dict[str, Any]:
    """Get statistics about profile session distribution."""
    tags = _load_tags()

    profile_counts = {}
    for sid, profile in tags.items():
        profile_counts[profile] = profile_counts.get(profile, 0) + 1

    total_tagged = len(tags)
    profiles = list(profile_counts.keys())

    return {
        "total_tagged_sessions": total_tagged,
        "profiles": profiles,
        "profile_counts": profile_counts,
        "active_profile": get_active_profile(),
    }


def audit_untagged_sessions() -> List[str]:
    """Find sessions without a profile tag."""
    if not SESSIONS_DB.exists():
        return []

    try:
        conn = sqlite3.connect(str(SESSIONS_DB))
        cursor = conn.cursor()

        # Get all session IDs
        cursor.execute("SELECT session_id FROM sessions")
        all_sessions = {row[0] for row in cursor.fetchall()}
        conn.close()

        # Get tagged sessions
        tags = _load_tags()
        tagged = set(tags.keys())

        # Return untagged
        return list(all_sessions - tagged)
    except Exception:
        return []
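

if __name__ == "__main__":
    # Maintenance sketch: tag every untagged session with the active profile.
    # (Assumption: orphaned sessions should default to the current profile.)
    for sid in audit_untagged_sessions():
        tag_session(sid)
    print(json.dumps(get_profile_stats(), indent=2))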
@@ -1,146 +0,0 @@
"""Provider Preflight — Poka-yoke validation of provider/model config.

Validates provider and model configuration before session start.
Prevents wasted context on misconfigured providers.

Usage:
    from agent.provider_preflight import preflight_check
    result = preflight_check(provider="openrouter", model="xiaomi/mimo-v2-pro")
    if not result["valid"]:
        print(result["errors"])
"""

from __future__ import annotations

import logging
import os
from typing import Any, Dict

logger = logging.getLogger(__name__)


# Provider -> required env var (None means no key is needed)
PROVIDER_KEYS = {
    "openrouter": "OPENROUTER_API_KEY",
    "anthropic": "ANTHROPIC_API_KEY",
    "openai": "OPENAI_API_KEY",
    "nous": "NOUS_API_KEY",
    "ollama": None,  # Local, no key needed
    "local": None,
}

# Sentinel to distinguish "unknown provider" from "known, no key required"
_UNKNOWN = object()


def check_provider_key(provider: str) -> Dict[str, Any]:
    """Check if provider has a valid API key configured."""
    provider_lower = provider.lower().strip()

    env_var = _UNKNOWN
    for known, key in PROVIDER_KEYS.items():
        if known in provider_lower:
            env_var = key
            break

    if env_var is _UNKNOWN:
        # Unknown provider — assume OK (custom/local)
        return {"valid": True, "provider": provider, "key_status": "unknown"}

    if env_var is None:
        # Local provider, no key needed
        return {"valid": True, "provider": provider, "key_status": "not_required"}

    key_value = os.getenv(env_var, "").strip()
    if not key_value:
        return {
            "valid": False,
            "provider": provider,
            "key_status": "missing",
            "error": f"{env_var} is not set. Provider '{provider}' will fail.",
            "fix": f"Set {env_var} in ~/.hermes/.env",
        }

    if len(key_value) < 10:
        return {
            "valid": False,
            "provider": provider,
            "key_status": "too_short",
            "error": f"{env_var} is suspiciously short ({len(key_value)} chars). May be invalid.",
            "fix": f"Verify {env_var} value in ~/.hermes/.env",
        }

    return {"valid": True, "provider": provider, "key_status": "set"}


def check_model_availability(model: str, provider: str) -> Dict[str, Any]:
    """Check if model is likely available for provider."""
    if not model:
        return {"valid": False, "error": "No model specified"}

    # Basic sanity checks
    model_lower = model.lower()

    # Anthropic models should use anthropic provider
    if "claude" in model_lower and "anthropic" not in provider.lower():
        return {
            "valid": True,  # Allow but warn
            "warning": f"Model '{model}' usually runs on Anthropic provider, not '{provider}'",
        }

    # Ollama models
    ollama_indicators = ["llama", "mistral", "qwen", "gemma", "phi", "hermes"]
    if any(x in model_lower for x in ollama_indicators) and ":" not in model:
        return {
            "valid": True,
            "warning": f"Model '{model}' may need a version tag for Ollama (e.g., {model}:latest)",
        }

    return {"valid": True}


def preflight_check(
    provider: str = "",
    model: str = "",
    fallback_provider: str = "",
    fallback_model: str = "",
) -> Dict[str, Any]:
    """Full pre-flight check for provider/model configuration.

    Returns:
        Dict with valid (bool), errors (list), warnings (list).
    """
    errors = []
    warnings = []

    # Check primary provider
    if provider:
        result = check_provider_key(provider)
        if not result["valid"]:
            errors.append(result.get("error", f"Provider {provider} invalid"))

    # Check primary model
    if model:
        result = check_model_availability(model, provider)
        if not result["valid"]:
            errors.append(result.get("error", f"Model {model} invalid"))
        elif result.get("warning"):
            warnings.append(result["warning"])

    # Check fallback
    if fallback_provider:
        result = check_provider_key(fallback_provider)
        if not result["valid"]:
            warnings.append(f"Fallback provider {fallback_provider} also invalid: {result.get('error', '')}")

    if fallback_model:
        result = check_model_availability(fallback_model, fallback_provider)
        if not result["valid"]:
            warnings.append(f"Fallback model {fallback_model} invalid")
        elif result.get("warning"):
            warnings.append(result["warning"])

    return {
        "valid": len(errors) == 0,
        "errors": errors,
        "warnings": warnings,
        "provider": provider,
        "model": model,
    }
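

if __name__ == "__main__":
    # Usage sketch at session start. The provider/model values are
    # illustrative, not defaults defined anywhere in this codebase:
    result = preflight_check(
        provider="anthropic",
        model="claude-sonnet",
        fallback_provider="ollama",
        fallback_model="qwen2.5:7b",
    )
    for w in result["warnings"]:
        print(f"warning: {w}")
    if not result["valid"]:
        raise SystemExit("\n".join(result["errors"]))
    print("preflight OK")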
@@ -1,302 +0,0 @@
"""Self-Modifying Prompt Engine — agent learns from its own failures.

Analyzes session transcripts, identifies failure patterns, and generates
prompt patches to prevent future failures.

The loop: fail → analyze → rewrite → retry → verify improvement.

Usage:
    from agent.self_modify import PromptLearner
    learner = PromptLearner()
    patches = learner.analyze_session(session_data)
    learner.apply_patches(patches)
"""

from __future__ import annotations

import json
import logging
import os
import re
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional

logger = logging.getLogger(__name__)

HERMES_HOME = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
PATCHES_DIR = HERMES_HOME / "prompt_patches"
ROLLBACK_DIR = HERMES_HOME / "prompt_rollback"


@dataclass
class FailurePattern:
    """A detected failure pattern in session transcripts."""
    pattern_type: str  # retry_loop, timeout, error_hallucination, context_loss
    description: str
    frequency: int
    example_messages: List[str] = field(default_factory=list)
    suggested_fix: str = ""


@dataclass
class PromptPatch:
    """A modification to the system prompt based on failure analysis."""
    id: str
    failure_type: str
    original_rule: str
    new_rule: str
    confidence: float
    applied_at: Optional[float] = None
    reverted: bool = False


# Failure detection patterns
FAILURE_SIGNALS = {
    "retry_loop": {
        "patterns": [
            r"(?i)retry(?:ing)?\s*(?:attempt|again)",
            r"(?i)failed.*retrying",
            r"(?i)error.*again",
            r"(?i)attempt\s+\d+\s*(?:of|/)\s*\d+",
        ],
        "description": "Agent stuck in retry loop",
    },
    "timeout": {
        "patterns": [
            r"(?i)timed?\s*out",
            r"(?i)deadline\s+exceeded",
            r"(?i)took\s+(?:too\s+)?long",
        ],
        "description": "Operation timed out",
    },
    "hallucination": {
        "patterns": [
            r"(?i)i\s+(?:don't|do\s+not)\s+(?:have|see|find)\s+(?:any|that|this)\s+(?:information|data|file)",
            r"(?i)the\s+file\s+doesn't\s+exist",
            r"(?i)i\s+(?:made|invented|fabricated)\s+(?:that\s+up|this)",
        ],
        "description": "Agent hallucinated or fabricated information",
    },
    "context_loss": {
        "patterns": [
            r"(?i)i\s+(?:don't|do\s+not)\s+(?:remember|recall|know)\s+(?:what|where|when|how)",
            r"(?i)could\s+you\s+remind\s+me",
            r"(?i)what\s+were\s+we\s+(?:doing|working|talking)\s+(?:on|about)",
        ],
        "description": "Agent lost context from earlier in conversation",
    },
    "tool_failure": {
        "patterns": [
            r"(?i)tool\s+(?:call|execution)\s+failed",
            r"(?i)command\s+not\s+found",
            r"(?i)permission\s+denied",
            r"(?i)no\s+such\s+file",
        ],
        "description": "Tool execution failed",
    },
}

# Prompt improvement templates
PROMPT_FIXES = {
    "retry_loop": (
        "If an operation fails more than twice, stop retrying. "
        "Report the failure and ask the user for guidance. "
        "Do not enter retry loops — they waste tokens."
    ),
    "timeout": (
        "For operations that may take long, set a timeout and report "
        "progress. If an operation takes more than 30 seconds, report "
        "what you've done so far and ask if you should continue."
    ),
    "hallucination": (
        "If you cannot find information, say 'I don't know' or "
        "'I couldn't find that.' Never fabricate information. "
        "If a file doesn't exist, say so — don't guess its contents."
    ),
    "context_loss": (
        "When you need context from earlier in the conversation, "
        "use session_search to find it. Don't ask the user to repeat themselves."
    ),
    "tool_failure": (
        "If a tool fails, check the error message and try a different approach. "
        "Don't retry the exact same command — diagnose first."
    ),
}


class PromptLearner:
    """Analyze session transcripts and generate prompt improvements."""

    def __init__(self):
        PATCHES_DIR.mkdir(parents=True, exist_ok=True)
        ROLLBACK_DIR.mkdir(parents=True, exist_ok=True)

    def analyze_session(self, session_data: dict) -> List[FailurePattern]:
        """Analyze a session for failure patterns.

        Args:
            session_data: Session dict with 'messages' list.

        Returns:
            List of detected failure patterns.
        """
        messages = session_data.get("messages", [])
        patterns_found: Dict[str, FailurePattern] = {}

        for msg in messages:
            content = str(msg.get("content", ""))
            role = msg.get("role", "")

            # Only analyze assistant messages and tool results
            if role not in ("assistant", "tool"):
                continue

            for failure_type, config in FAILURE_SIGNALS.items():
                for pattern in config["patterns"]:
                    if re.search(pattern, content):
                        if failure_type not in patterns_found:
                            patterns_found[failure_type] = FailurePattern(
                                pattern_type=failure_type,
                                description=config["description"],
                                frequency=0,
                                suggested_fix=PROMPT_FIXES.get(failure_type, ""),
                            )
                        patterns_found[failure_type].frequency += 1
                        if len(patterns_found[failure_type].example_messages) < 3:
                            patterns_found[failure_type].example_messages.append(
                                content[:200]
                            )
                        break  # One match per message per type is enough

        return list(patterns_found.values())

    def generate_patches(self, patterns: List[FailurePattern],
                         min_confidence: float = 0.7) -> List[PromptPatch]:
        """Generate prompt patches from failure patterns.

        Args:
            patterns: Detected failure patterns.
            min_confidence: Minimum confidence to generate a patch.

        Returns:
            List of prompt patches.
        """
        patches = []
        for pattern in patterns:
            # Confidence based on frequency
            if pattern.frequency >= 3:
                confidence = 0.9
            elif pattern.frequency >= 2:
                confidence = 0.75
            else:
                confidence = 0.5

            if confidence < min_confidence:
                continue

            if not pattern.suggested_fix:
                continue

            patch = PromptPatch(
                id=f"{pattern.pattern_type}-{int(time.time())}",
                failure_type=pattern.pattern_type,
                original_rule="(missing — no existing rule for this pattern)",
                new_rule=pattern.suggested_fix,
                confidence=confidence,
            )
            patches.append(patch)

        return patches

    def apply_patches(self, patches: List[PromptPatch],
                      prompt_path: Optional[str] = None) -> int:
        """Apply patches to the system prompt.

        Args:
            patches: Patches to apply.
            prompt_path: Path to prompt file (default: ~/.hermes/system_prompt.md)

        Returns:
            Number of patches applied.
        """
        if prompt_path is None:
            prompt_path = str(HERMES_HOME / "system_prompt.md")

        prompt_file = Path(prompt_path)

        # Backup current prompt
        if prompt_file.exists():
            backup = ROLLBACK_DIR / f"{prompt_file.name}.{int(time.time())}.bak"
            backup.write_text(prompt_file.read_text())

        # Read current prompt
        current = prompt_file.read_text() if prompt_file.exists() else ""

        # Apply patches
        applied = 0
        additions = []
        for patch in patches:
            if patch.new_rule not in current:
                additions.append(f"\n## Auto-learned: {patch.failure_type}\n{patch.new_rule}")
                patch.applied_at = time.time()
                applied += 1

        if additions:
            new_content = current + "\n".join(additions)
            prompt_file.write_text(new_content)

        # Log patches
        patches_file = PATCHES_DIR / f"patches-{int(time.time())}.json"
        with open(patches_file, "w") as f:
            json.dump([p.__dict__ for p in patches], f, indent=2, default=str)

        logger.info("Applied %d prompt patches", applied)
        return applied

    def rollback_last(self, prompt_path: Optional[str] = None) -> bool:
        """Rollback to the most recent backup.

        Args:
            prompt_path: Path to prompt file.

        Returns:
            True if rollback succeeded.
        """
        if prompt_path is None:
            prompt_path = str(HERMES_HOME / "system_prompt.md")

        backups = sorted(ROLLBACK_DIR.glob("*.bak"), reverse=True)
        if not backups:
            logger.warning("No backups to rollback to")
            return False

        latest = backups[0]
        Path(prompt_path).write_text(latest.read_text())
        logger.info("Rolled back to %s", latest.name)
        return True

    def learn_from_session(self, session_data: dict) -> Dict[str, Any]:
        """Full learning cycle: analyze → patch → apply.

        Args:
            session_data: Session dict.

        Returns:
            Summary of what was learned and applied.
        """
        patterns = self.analyze_session(session_data)
        patches = self.generate_patches(patterns)
        applied = self.apply_patches(patches)

        return {
            "patterns_detected": len(patterns),
            "patches_generated": len(patches),
            "patches_applied": applied,
            "patterns": [
                {"type": p.pattern_type, "frequency": p.frequency, "description": p.description}
                for p in patterns
            ],
        }
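

if __name__ == "__main__":
    # Demo sketch with a fabricated transcript. The messages are invented
    # purely to trip the detectors; this is analyze-only, so nothing is
    # written to the system prompt:
    demo = {"messages": [
        {"role": "assistant", "content": "Command failed. Retrying attempt 2 of 3..."},
        {"role": "tool", "content": "permission denied"},
    ]}
    for p in PromptLearner().analyze_session(demo):
        print(f"{p.pattern_type}: x{p.frequency} ({p.description})")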
@@ -1,231 +0,0 @@
"""Session compaction with fact extraction.

Before compressing conversation context, extracts durable facts
(user preferences, corrections, project details) and saves them
to the fact store so they survive compression.

Usage:
    from agent.session_compactor import extract_and_save_facts
    facts, saved = extract_and_save_facts(messages)
"""

from __future__ import annotations

import logging
import re
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Tuple

logger = logging.getLogger(__name__)


@dataclass
class ExtractedFact:
    """A fact extracted from conversation."""
    category: str      # "user_pref", "correction", "project", "tool_quirk", "general"
    entity: str        # what the fact is about
    content: str       # the fact itself
    confidence: float  # 0.0-1.0
    source_turn: int   # which message turn it came from
    timestamp: float = 0.0


# Patterns that indicate user preferences
_PREFERENCE_PATTERNS = [
    (r"(?:I|we) (?:prefer|like|want|need) (.+?)(?:\.|$)", "preference"),
    (r"(?:always|never) (?:use|do|run|deploy) (.+?)(?:\.|$)", "preference"),
    (r"(?:my|our) (?:default|preferred|usual) (.+?) (?:is|are) (.+?)(?:\.|$)", "preference"),
    (r"(?:make sure|ensure|remember) (?:to|that) (.+?)(?:\.|$)", "instruction"),
    (r"(?:don'?t|do not) (?:ever|ever again) (.+?)(?:\.|$)", "constraint"),
]

# Patterns that indicate corrections
_CORRECTION_PATTERNS = [
    (r"(?:actually|no[, ]|wait[, ]|correction[: ]|sorry[, ]) (.+)", "correction"),
    (r"(?:I meant|what I meant was|the correct) (.+?)(?:\.|$)", "correction"),
    (r"(?:it'?s|its) (?:not|shouldn'?t be|wrong) (.+?)(?:\.|$)", "correction"),
]

# Patterns that indicate project/tool facts
_PROJECT_PATTERNS = [
    (r"(?:the |our )?(?:project|repo|codebase|code) (?:is|uses|needs|requires) (.+?)(?:\.|$)", "project"),
    (r"(?:deploy|push|commit) (?:to|on) (.+?)(?:\.|$)", "project"),
    (r"(?:this|that|the) (?:server|host|machine|VPS) (?:is|runs|has) (.+?)(?:\.|$)", "infrastructure"),
    (r"(?:model|provider|engine) (?:is|should be|needs to be) (.+?)(?:\.|$)", "config"),
]


def extract_facts_from_messages(messages: List[Dict[str, Any]]) -> List[ExtractedFact]:
    """Extract durable facts from conversation messages.

    Scans user messages for preferences, corrections, project facts,
    and infrastructure details that should survive compression.
    """
    facts = []
    seen_contents = set()

    for turn_idx, msg in enumerate(messages):
        role = msg.get("role", "")
        content = msg.get("content", "")

        # Only scan user messages and assistant responses with corrections
        if role not in ("user", "assistant"):
            continue
        if not content or not isinstance(content, str):
            continue
        if len(content) < 10:
            continue

        # Skip tool results and system messages
        if role == "assistant" and msg.get("tool_calls"):
            continue

        extracted = _extract_from_text(content, turn_idx, role)

        # Deduplicate by content
        for fact in extracted:
            key = f"{fact.category}:{fact.content[:100]}"
            if key not in seen_contents:
                seen_contents.add(key)
                facts.append(fact)

    return facts


def _extract_from_text(text: str, turn_idx: int, role: str) -> List[ExtractedFact]:
    """Extract facts from a single text block."""
    facts = []
    timestamp = time.time()

    # Clean text for pattern matching
    clean = text.strip()

    # User preference patterns (from user messages)
    if role == "user":
        for pattern, subcategory in _PREFERENCE_PATTERNS:
            for match in re.finditer(pattern, clean, re.IGNORECASE):
                content = match.group(1).strip() if match.lastindex else match.group(0).strip()
                if len(content) > 5:
                    facts.append(ExtractedFact(
                        category=f"user_pref.{subcategory}",
                        entity="user",
                        content=content[:200],
                        confidence=0.7,
                        source_turn=turn_idx,
                        timestamp=timestamp,
                    ))

    # Correction patterns (from user messages)
    if role == "user":
        for pattern, subcategory in _CORRECTION_PATTERNS:
            for match in re.finditer(pattern, clean, re.IGNORECASE):
                content = match.group(1).strip() if match.lastindex else match.group(0).strip()
                if len(content) > 5:
                    facts.append(ExtractedFact(
                        category=f"correction.{subcategory}",
                        entity="user",
                        content=content[:200],
                        confidence=0.8,
                        source_turn=turn_idx,
                        timestamp=timestamp,
                    ))

    # Project/infrastructure patterns (from both user and assistant)
    for pattern, subcategory in _PROJECT_PATTERNS:
        for match in re.finditer(pattern, clean, re.IGNORECASE):
            content = match.group(1).strip() if match.lastindex else match.group(0).strip()
            if len(content) > 5:
                facts.append(ExtractedFact(
                    category=f"project.{subcategory}",
                    entity=subcategory,
                    content=content[:200],
                    confidence=0.6,
                    source_turn=turn_idx,
                    timestamp=timestamp,
                ))

    return facts


def save_facts_to_store(facts: List[ExtractedFact], fact_store_fn=None) -> int:
    """Save extracted facts to the fact store.

    Args:
        facts: List of extracted facts.
        fact_store_fn: Optional callable(category, entity, content, trust).
            If None, uses the holographic fact store if available.

    Returns:
        Number of facts saved.
    """
    saved = 0

    if fact_store_fn:
        for fact in facts:
            try:
                fact_store_fn(
                    category=fact.category,
                    entity=fact.entity,
                    content=fact.content,
                    trust=fact.confidence,
                )
                saved += 1
            except Exception as e:
                logger.debug("Failed to save fact: %s", e)
    else:
        # Try holographic fact store
        try:
            from fact_store import fact_store as _fs
            for fact in facts:
                try:
                    _fs(
                        action="add",
                        content=fact.content,
                        category=fact.category,
                        tags=fact.entity,
                        trust_delta=fact.confidence - 0.5,
                    )
                    saved += 1
                except Exception as e:
                    logger.debug("Failed to save fact via fact_store: %s", e)
        except ImportError:
            logger.debug("fact_store not available — facts not persisted")

    return saved


def extract_and_save_facts(
    messages: List[Dict[str, Any]],
    fact_store_fn=None,
) -> Tuple[List[ExtractedFact], int]:
    """Extract facts from messages and save them.

    Returns (extracted_facts, saved_count).
    """
    facts = extract_facts_from_messages(messages)
    if facts:
        logger.info("Extracted %d facts from conversation", len(facts))
        saved = save_facts_to_store(facts, fact_store_fn)
        logger.info("Saved %d/%d facts to store", saved, len(facts))
    else:
        saved = 0
    return facts, saved


def format_facts_summary(facts: List[ExtractedFact]) -> str:
    """Format extracted facts as a readable summary."""
    if not facts:
        return "No facts extracted."

    by_category = {}
    for f in facts:
        by_category.setdefault(f.category, []).append(f)

    lines = [f"Extracted {len(facts)} facts:", ""]
    for cat, cat_facts in sorted(by_category.items()):
        lines.append(f"  {cat}:")
        for f in cat_facts:
            lines.append(f"    - {f.content[:80]}")
    return "\n".join(lines)
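

if __name__ == "__main__":
    # Extraction demo with an invented user message; a no-op store callback
    # keeps the demo side-effect free:
    msgs = [{"role": "user",
             "content": "Always use uv for installs. The repo uses Python 3.11."}]
    facts, saved = extract_and_save_facts(msgs, fact_store_fn=lambda **kw: None)
    print(format_facts_summary(facts))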
@@ -1,146 +0,0 @@
"""Time-aware model routing for cron jobs.

Routes cron tasks to more capable models during off-hours when the user
is not present to correct errors. Reduces error rates during high-error
time windows (e.g., 18:00 evening batches).

Usage:
    from agent.time_aware_routing import resolve_time_aware_model
    model = resolve_time_aware_model(base_model="mimo-v2-pro", is_cron=True)
"""

from __future__ import annotations

import os
import time
from dataclasses import dataclass
from typing import Optional


# Error rate data from empirical audit (2026-04-12)
# Higher error rates during these hours suggest routing to better models
_HIGH_ERROR_HOURS = {
    18: 9.4,  # 18:00 — 9.4% error rate (evening cron batches)
    19: 8.1,
    20: 7.5,
    21: 6.8,
    22: 6.2,
    23: 5.9,
    0: 5.5,
    1: 5.2,
}

# Low error hours — default model is fine
_LOW_ERROR_HOURS = set(range(6, 18))  # 06:00-17:59

# Default strong/cheap models and the upgrade threshold
_DEFAULT_STRONG_MODEL = os.getenv("CRON_STRONG_MODEL", "xiaomi/mimo-v2-pro")
_DEFAULT_CHEAP_MODEL = os.getenv("CRON_CHEAP_MODEL", "qwen2.5:7b")
_ERROR_THRESHOLD = float(os.getenv("CRON_ERROR_THRESHOLD", "6.0"))  # % error rate


@dataclass
class RoutingDecision:
    """Result of time-aware routing."""
    model: str
    provider: str
    reason: str
    hour: int
    error_rate: float
    is_off_hours: bool


def get_hour_error_rate(hour: int) -> float:
    """Get expected error rate for a given hour (0-23)."""
    return _HIGH_ERROR_HOURS.get(hour, 4.0)  # Default 4% for unlisted hours


def is_off_hours(hour: int) -> bool:
    """Check if hour is considered off-hours (higher error rates)."""
    return hour not in _LOW_ERROR_HOURS


def resolve_time_aware_model(
    base_model: str = "",
    base_provider: str = "",
    is_cron: bool = False,
    hour: Optional[int] = None,
) -> RoutingDecision:
    """Resolve model based on time of day and task type.

    During off-hours (evening/night), routes to stronger models for cron
    jobs to compensate for lack of human oversight.

    Args:
        base_model: The model that would normally be used.
        base_provider: The provider for the base model.
        is_cron: Whether this is a cron job (vs interactive session).
        hour: Override hour (for testing). Defaults to current hour.

    Returns:
        RoutingDecision with model, provider, and reasoning.
    """
    if hour is None:
        hour = time.localtime().tm_hour

    error_rate = get_hour_error_rate(hour)
    off_hours = is_off_hours(hour)

    # Interactive sessions always use the base model (user can correct errors)
    if not is_cron:
        return RoutingDecision(
            model=base_model or _DEFAULT_CHEAP_MODEL,
            provider=base_provider,
            reason="Interactive session — user can correct errors",
            hour=hour,
            error_rate=error_rate,
            is_off_hours=off_hours,
        )

    # Cron jobs during low-error hours: use base model
    if not off_hours and error_rate < _ERROR_THRESHOLD:
        return RoutingDecision(
            model=base_model or _DEFAULT_CHEAP_MODEL,
            provider=base_provider,
            reason=f"Low-error hours ({hour}:00, {error_rate}% expected)",
            hour=hour,
            error_rate=error_rate,
            is_off_hours=False,
        )

    # Cron jobs during high-error hours: upgrade to stronger model
    if error_rate >= _ERROR_THRESHOLD:
        return RoutingDecision(
            model=_DEFAULT_STRONG_MODEL,
            provider="nous",
            reason=f"High-error hours ({hour}:00, {error_rate}% expected) — using stronger model",
            hour=hour,
            error_rate=error_rate,
            is_off_hours=True,
        )

    # Off-hours but low error: use base model
    return RoutingDecision(
        model=base_model or _DEFAULT_CHEAP_MODEL,
        provider=base_provider,
        reason=f"Off-hours but low error ({hour}:00, {error_rate}%)",
        hour=hour,
        error_rate=error_rate,
        is_off_hours=off_hours,
    )


def get_routing_report() -> str:
    """Get a report of time-based routing decisions for the next 24 hours."""
    lines = ["Time-Aware Model Routing (24h forecast)", "=" * 40, ""]
    lines.append(f"Error threshold: {_ERROR_THRESHOLD}%")
    lines.append(f"Strong model: {_DEFAULT_STRONG_MODEL}")
    lines.append(f"Cheap model: {_DEFAULT_CHEAP_MODEL}")
    lines.append("")

    for h in range(24):
        decision = resolve_time_aware_model(is_cron=True, hour=h)
        icon = "\U0001f7e2" if decision.model == _DEFAULT_CHEAP_MODEL else "\U0001f534"
        lines.append(f"  {h:02d}:00 {icon} {decision.model:25s} ({decision.error_rate}% error)")

    return "\n".join(lines)
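

if __name__ == "__main__":
    # Print the 24h forecast, then show one dispatch decision. The model
    # and provider overrides are illustrative, not defaults defined here:
    print(get_routing_report())
    d = resolve_time_aware_model(base_model="qwen2.5:7b", base_provider="ollama",
                                 is_cron=True, hour=18)
    print(f"\n18:00 cron -> {d.model} ({d.reason})")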
@@ -1,316 +0,0 @@
#!/usr/bin/env python3
"""
Token Budget — Poka-yoke guard against silent context overflow.

Progressive warning system with circuit breakers:
- 60%: WARNING — log + suggest summarization
- 80%: CAUTION — auto-compress, drop raw tool outputs
- 90%: CRITICAL — block verbose tool calls, force wrap-up
- 95%: STOP — graceful session termination with summary

Also provides tool output budgeting to truncate before overflow.

Usage:
    from agent.token_budget import TokenBudget

    budget = TokenBudget(context_length=128_000)
    budget.update(8000)  # from API response prompt_tokens

    status = budget.check()        # returns BudgetStatus with level + message
    budget.should_block_tools()    # True at 90%+
    budget.should_terminate()      # True at 95%+

    # Tool output budgeting
    remaining = budget.tool_output_budget()
    truncated = budget.truncate_tool_output(output_text, max_chars=remaining)
"""

import logging
from dataclasses import dataclass
from enum import Enum
from typing import Optional

logger = logging.getLogger(__name__)


# ── Thresholds ────────────────────────────────────────────────────────

WARN_PERCENT = 0.60
CAUTION_PERCENT = 0.80
CRITICAL_PERCENT = 0.90
STOP_PERCENT = 0.95

# Reserve 5% of context for system prompt, response, and overhead
RESPONSE_RESERVE_RATIO = 0.05

# Max tool output chars at each level
TOOL_OUTPUT_BUDGETS = {
    "NORMAL": 50_000,
    "WARNING": 20_000,
    "CAUTION": 8_000,
    "CRITICAL": 2_000,
    "STOP": 500,
}


class BudgetLevel(Enum):
    NORMAL = "NORMAL"
    WARNING = "WARNING"
    CAUTION = "CAUTION"
    CRITICAL = "CRITICAL"
    STOP = "STOP"

    @property
    def percent_threshold(self) -> float:
        return {
            BudgetLevel.NORMAL: 0.0,
            BudgetLevel.WARNING: WARN_PERCENT,
            BudgetLevel.CAUTION: CAUTION_PERCENT,
            BudgetLevel.CRITICAL: CRITICAL_PERCENT,
            BudgetLevel.STOP: STOP_PERCENT,
        }[self]

    @property
    def emoji(self) -> str:
        return {
            BudgetLevel.NORMAL: "",
            BudgetLevel.WARNING: "\u26a0\ufe0f",
            BudgetLevel.CAUTION: "\U0001f525",
            BudgetLevel.CRITICAL: "\U0001f6d1",
            BudgetLevel.STOP: "\U0001f6d1",
        }[self]


@dataclass
class BudgetStatus:
    """Current token budget status."""
    level: BudgetLevel
    tokens_used: int
    context_length: int
    percent_used: float
    tokens_remaining: int
    message: str = ""
    should_compress: bool = False
    should_block_tools: bool = False
    should_terminate: bool = False

    def to_indicator(self) -> str:
        """Compact status indicator for CLI display."""
        pct = int(self.percent_used * 100)
        if self.level == BudgetLevel.NORMAL:
            return f"[{pct}%]"
        return f"{self.level.emoji} [{pct}%]"

    def to_bar(self, width: int = 10) -> str:
        """Visual progress bar."""
        filled = int(width * self.percent_used)
        bar = "\u2588" * filled + "\u2591" * (width - filled)
        color = self._bar_color()
        return f"{color}{bar}\033[0m {int(self.percent_used * 100)}%"

    def _bar_color(self) -> str:
        if self.level == BudgetLevel.STOP:
            return "\033[41m"  # red bg
        if self.level == BudgetLevel.CRITICAL:
            return "\033[31m"  # red
        if self.level == BudgetLevel.CAUTION:
            return "\033[33m"  # yellow
        if self.level == BudgetLevel.WARNING:
            return "\033[33m"  # yellow
        return "\033[32m"  # green


class TokenBudget:
    """
    Progressive token budget tracker with poka-yoke circuit breakers.

    Tracks cumulative token usage against a context length and triggers
    escalating actions at each threshold.
    """

    def __init__(
        self,
        context_length: int,
        warn_percent: float = WARN_PERCENT,
        caution_percent: float = CAUTION_PERCENT,
        critical_percent: float = CRITICAL_PERCENT,
        stop_percent: float = STOP_PERCENT,
        response_reserve_ratio: float = RESPONSE_RESERVE_RATIO,
    ):
        self.context_length = context_length
        self.warn_threshold = int(context_length * warn_percent)
        self.caution_threshold = int(context_length * caution_percent)
        self.critical_threshold = int(context_length * critical_percent)
        self.stop_threshold = int(context_length * stop_percent)
        self.response_reserve = int(context_length * response_reserve_ratio)

        self.tokens_used = 0
        self.completions_tokens = 0
        self.total_tool_output_chars = 0
        self._level = BudgetLevel.NORMAL
        self._history: list[int] = []

    def update(self, prompt_tokens: int, completion_tokens: int = 0) -> BudgetStatus:
        """Update budget from API response usage."""
        self.tokens_used = prompt_tokens
        self.completions_tokens = completion_tokens
        self._history.append(prompt_tokens)
        return self.check()

    def check(self) -> BudgetStatus:
        """Evaluate current budget level and return status."""
        pct = self.tokens_used / self.context_length if self.context_length > 0 else 0
        remaining = max(0, self.context_length - self.tokens_used - self.response_reserve)

        # Determine level from the instance thresholds (not the module
        # constants) so custom percentages passed to __init__ are honoured
        if self.tokens_used >= self.stop_threshold:
            level = BudgetLevel.STOP
        elif self.tokens_used >= self.critical_threshold:
            level = BudgetLevel.CRITICAL
        elif self.tokens_used >= self.caution_threshold:
            level = BudgetLevel.CAUTION
        elif self.tokens_used >= self.warn_threshold:
            level = BudgetLevel.WARNING
        else:
            level = BudgetLevel.NORMAL

        # Log transitions (don't log every check)
        if level != self._level:
            self._log_transition(level, pct)
            self._level = level

        messages = {
            BudgetLevel.NORMAL: "",
            BudgetLevel.WARNING: (
                f"Context at {int(pct*100)}%. Consider wrapping up soon or using /compress."
            ),
            BudgetLevel.CAUTION: (
                f"Context at {int(pct*100)}%. Auto-compressing. "
                f"Tool outputs will be truncated."
            ),
            BudgetLevel.CRITICAL: (
                f"Context at {int(pct*100)}%. Verbose tools blocked. "
                f"Session approaching limit — please wrap up."
            ),
            BudgetLevel.STOP: (
                f"Context at {int(pct*100)}%. Session must terminate. "
                f"Saving summary before shutdown."
            ),
        }

        return BudgetStatus(
            level=level,
            tokens_used=self.tokens_used,
            context_length=self.context_length,
            percent_used=pct,
            tokens_remaining=remaining,
            message=messages[level],
            should_compress=level in (BudgetLevel.CAUTION, BudgetLevel.CRITICAL, BudgetLevel.STOP),
            should_block_tools=level in (BudgetLevel.CRITICAL, BudgetLevel.STOP),
            should_terminate=level == BudgetLevel.STOP,
        )

    def should_compress(self) -> bool:
        """True at 80%+ — auto-compression should trigger."""
        return self.tokens_used >= self.caution_threshold

    def should_block_tools(self) -> bool:
        """True at 90%+ — verbose tool calls should be blocked."""
        return self.tokens_used >= self.critical_threshold

    def should_terminate(self) -> bool:
        """True at 95%+ — session should gracefully terminate."""
        return self.tokens_used >= self.stop_threshold

    def tool_output_budget(self) -> int:
        """Max chars allowed for next tool output based on current level."""
        status = self.check()
        return TOOL_OUTPUT_BUDGETS.get(status.level.value, 50_000)

    def truncate_tool_output(self, output: str, max_chars: Optional[int] = None) -> str:
        """Truncate tool output to fit budget. Adds truncation notice."""
        if max_chars is None:
            max_chars = self.tool_output_budget()

        if len(output) <= max_chars:
            return output

        # Preserve start and end, truncate middle
        if max_chars < 200:
            return output[:max_chars] + "\n[...truncated...]"

        head = max_chars // 2
        tail = max_chars - head - 30  # reserve for truncation notice
        truncated = (
            output[:head]
            + f"\n\n[...{len(output) - head - tail:,} chars truncated...]\n\n"
            + output[-tail:]
        )
        return truncated

    def remaining_for_response(self) -> int:
        """Tokens available for the model's response."""
        return max(0, self.context_length - self.tokens_used - self.response_reserve)

    def growth_rate(self) -> Optional[float]:
        """Average token increase per turn (from history)."""
        if len(self._history) < 2:
            return None
        diffs = [self._history[i] - self._history[i-1] for i in range(1, len(self._history))]
        return sum(diffs) / len(diffs)

    def turns_remaining(self) -> Optional[int]:
        """Estimated turns until context is full (based on growth rate)."""
        rate = self.growth_rate()
        if rate is None or rate <= 0:
            return None
        remaining = self.context_length - self.tokens_used
        return int(remaining / rate)

    def reset(self):
        """Reset budget for new session."""
        self.tokens_used = 0
        self.completions_tokens = 0
        self.total_tool_output_chars = 0
        self._level = BudgetLevel.NORMAL
        self._history.clear()

    def _log_transition(self, new_level: BudgetLevel, pct: float):
        """Log budget level transitions."""
        msg = (
            f"Token budget: {self._level.value} -> {new_level.value} "
            f"({self.tokens_used}/{self.context_length} = {pct:.0%})"
        )
        if new_level == BudgetLevel.WARNING:
            logger.warning(msg)
        elif new_level == BudgetLevel.CAUTION:
            logger.warning(msg)
        elif new_level in (BudgetLevel.CRITICAL, BudgetLevel.STOP):
            logger.error(msg)
        else:
            logger.info(msg)

    def summary(self) -> str:
        """Human-readable budget summary."""
        status = self.check()
        turns = self.turns_remaining()
        rate = self.growth_rate()
        lines = [
            f"Token Budget: {status.tokens_used:,} / {status.context_length:,} ({status.percent_used:.0%})",
            f"Level: {status.level.value}",
            f"Remaining: {status.tokens_remaining:,} tokens",
        ]
        if rate is not None:
            lines.append(f"Growth rate: ~{rate:,.0f} tokens/turn")
        if turns is not None:
            lines.append(f"Estimated turns left: ~{turns}")
        if status.message:
            lines.append(f"Action: {status.message}")
        return "\n".join(lines)


# ── Convenience factory ───────────────────────────────────────────────

def create_budget(context_length: int, **kwargs) -> TokenBudget:
    """Create a TokenBudget with defaults."""
    return TokenBudget(context_length=context_length, **kwargs)
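

if __name__ == "__main__":
    # Turn-loop sketch; prompt_tokens would normally come from the
    # provider's usage block (the values here are made up):
    logging.basicConfig(level=logging.INFO)
    budget = create_budget(context_length=128_000)
    for prompt_tokens in (40_000, 80_000, 118_000, 123_000):
        status = budget.update(prompt_tokens)
        print(status.to_indicator(), status.message or "ok")
        if status.should_terminate:
            print(budget.summary())
            break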
@@ -1,156 +0,0 @@
"""Tool fixation detection — break repetitive tool calling loops.

Detects when the agent latches onto one tool and calls it repeatedly
without making progress. Injects a nudge prompt to break the loop.

Usage:
    from agent.tool_fixation_detector import ToolFixationDetector
    detector = ToolFixationDetector()
    nudge = detector.record("execute_code")
    if nudge:
        # Inject nudge into conversation
        messages.append({"role": "system", "content": nudge})
"""

from __future__ import annotations

import os
from dataclasses import dataclass
from typing import List, Optional


# Default thresholds
_DEFAULT_THRESHOLD = int(os.getenv("TOOL_FIXATION_THRESHOLD", "5"))
_DEFAULT_WINDOW = int(os.getenv("TOOL_FIXATION_WINDOW", "10"))


@dataclass
class FixationEvent:
    """Record of a fixation detection."""
    tool_name: str
    streak_length: int
    threshold: int
    nudge_sent: bool = False


class ToolFixationDetector:
    """Detects and breaks tool fixation loops.

    Tracks the sequence of tool calls and detects when the same tool
    is called N times consecutively. When detected, returns a nudge
    prompt to inject into the conversation.
    """

    def __init__(self, threshold: int = 0, window: int = 0):
        self.threshold = threshold or _DEFAULT_THRESHOLD
        self.window = window or _DEFAULT_WINDOW
        self._history: List[str] = []
        self._current_streak: str = ""
        self._streak_count: int = 0
        self._nudges_sent: int = 0
        self._events: List[FixationEvent] = []

    @property
    def nudges_sent(self) -> int:
        return self._nudges_sent

    @property
    def events(self) -> List[FixationEvent]:
        return list(self._events)

    def record(self, tool_name: str) -> Optional[str]:
        """Record a tool call and return nudge prompt if fixation detected.

        Args:
            tool_name: Name of the tool that was called.

        Returns:
            Nudge prompt string if fixation detected, None otherwise.
        """
        self._history.append(tool_name)

        # Trim history to window
        if len(self._history) > self.window:
            self._history = self._history[-self.window:]

        # Update streak
        if tool_name == self._current_streak:
            self._streak_count += 1
        else:
            self._current_streak = tool_name
            self._streak_count = 1

        # Check for fixation
        if self._streak_count >= self.threshold:
            event = FixationEvent(
                tool_name=tool_name,
                streak_length=self._streak_count,
                threshold=self.threshold,
                nudge_sent=True,
            )
            self._events.append(event)
            self._nudges_sent += 1

            return self._build_nudge(tool_name, self._streak_count)

        return None

    def _build_nudge(self, tool_name: str, count: int) -> str:
        """Build a nudge prompt to break the fixation loop."""
        return (
            f"[SYSTEM: You have called `{tool_name}` {count} times in a row "
            f"without switching tools. This suggests a fixation loop. "
            f"Consider:\n"
            f"1. Is the tool returning an error? Read the error carefully.\n"
            f"2. Is there a different tool that could help?\n"
            f"3. Should you ask the user for clarification?\n"
            f"4. Is the task actually complete?\n"
            f"Break the loop by trying a different approach.]"
        )

    def reset(self) -> None:
        """Reset the detector state."""
        self._history.clear()
        self._current_streak = ""
        self._streak_count = 0

    def get_streak_info(self) -> dict:
        """Get current streak information."""
        return {
            "current_tool": self._current_streak,
            "streak_count": self._streak_count,
            "threshold": self.threshold,
            "at_threshold": self._streak_count >= self.threshold,
            "nudges_sent": self._nudges_sent,
        }

    def format_report(self) -> str:
        """Format fixation events as a report."""
        if not self._events:
            return "No tool fixation detected."

        lines = [
            f"Tool Fixation Report ({len(self._events)} events)",
            "=" * 40,
        ]
        for e in self._events:
            lines.append(f"  {e.tool_name}: {e.streak_length} consecutive calls (threshold: {e.threshold})")
        return "\n".join(lines)


# Singleton
_detector: Optional[ToolFixationDetector] = None


def get_fixation_detector() -> ToolFixationDetector:
    """Get or create the singleton detector."""
    global _detector
    if _detector is None:
        _detector = ToolFixationDetector()
    return _detector


def reset_fixation_detector() -> None:
    """Reset the singleton."""
    global _detector
    _detector = None
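

if __name__ == "__main__":
    # Demo: with a threshold of 3 the nudge fires on the third identical
    # call in a row (the threshold value here is illustrative):
    detector = ToolFixationDetector(threshold=3)
    for name in ("execute_code", "execute_code", "execute_code"):
        nudge = detector.record(name)
        if nudge:
            print(nudge)
    print(detector.format_report())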
@@ -1,32 +0,0 @@
---
# fleet_mtls.yml — Deploy mutual-TLS certificates to all fleet agents.
#
# Prerequisites:
#   1. Run scripts/gen_fleet_ca.sh to create the fleet CA.
#   2. For each agent, run:
#        scripts/gen_agent_cert.sh --agent timmy
#        scripts/gen_agent_cert.sh --agent allegro
#        scripts/gen_agent_cert.sh --agent ezra
#
# Usage:
#   ansible-playbook -i inventory/fleet.ini ansible/fleet_mtls.yml
#
# Inventory example (inventory/fleet.ini):
#   [fleet_agents]
#   timmy.local agent_name=timmy
#   allegro.local agent_name=allegro
#   ezra.local agent_name=ezra
#
# Refs #806

- name: Distribute fleet mTLS certificates
  hosts: fleet_agents
  become: true
  vars:
    _pki_base: "{{ lookup('env', 'HOME') }}/.hermes/pki"
  roles:
    - role: fleet_mtls_certs
      vars:
        fleet_mtls_ca_cert_src: "{{ _pki_base }}/ca/fleet-ca.crt"
        fleet_mtls_agent_certs_dir: "{{ _pki_base }}/agents"
        fleet_mtls_agent_name: "{{ agent_name | default(inventory_hostname_short) }}"
@@ -1,12 +0,0 @@
# Example fleet inventory for mutual-TLS cert distribution.
# Copy to fleet.ini and adjust hostnames/IPs.
# Refs #806

[fleet_agents]
timmy ansible_host=192.168.1.10
allegro ansible_host=192.168.1.11
ezra ansible_host=192.168.1.12

[fleet_agents:vars]
ansible_user=hermes
ansible_python_interpreter=/usr/bin/python3
@@ -1,21 +0,0 @@
---
# Default paths on the *control node* where certs are read from.
# Override these in your inventory / group_vars as needed.

# Fleet CA certificate (public; safe to push to all nodes)
fleet_mtls_ca_cert_src: "{{ lookup('env', 'HOME') }}/.hermes/pki/ca/fleet-ca.crt"

# Per-agent cert/key source dir on the control node.
# Expected layout: <fleet_mtls_agent_certs_dir>/<agent_name>/<agent_name>.{crt,key}
fleet_mtls_agent_certs_dir: "{{ lookup('env', 'HOME') }}/.hermes/pki/agents"

# Remote destination paths on the fleet node
fleet_mtls_remote_pki_dir: "/etc/hermes/pki"
fleet_mtls_remote_ca_dir: "{{ fleet_mtls_remote_pki_dir }}/ca"
fleet_mtls_remote_agent_dir: "{{ fleet_mtls_remote_pki_dir }}/agent"

# The agent name to deploy (set per-host in inventory, e.g. timmy / allegro / ezra)
fleet_mtls_agent_name: "{{ inventory_hostname_short }}"

# Hermes service name (for reload notification)
fleet_mtls_hermes_service: "hermes-a2a"
@@ -1,7 +0,0 @@
---
- name: Restart hermes-a2a
  ansible.builtin.systemd:
    name: "{{ fleet_mtls_hermes_service }}"
    state: restarted
  when: ansible_service_mgr == "systemd"
  ignore_errors: true  # service may not exist in all environments
@@ -1,17 +0,0 @@
---
galaxy_info:
  role_name: fleet_mtls_certs
  author: hermes-agent
  description: >
    Distribute fleet CA and per-agent mTLS certificates to Hermes fleet nodes.
    Part of issue #806 — A2A mutual TLS between fleet agents.
  min_ansible_version: "2.14"
  platforms:
    - name: Debian
      versions: [bookworm, bullseye]
    - name: Ubuntu
      versions: ["22.04", "24.04"]
    - name: EL
      versions: ["8", "9"]

dependencies: []
@@ -1,99 +0,0 @@
---
# fleet_mtls_certs/tasks/main.yml
#
# Distribute the fleet CA certificate and the per-agent TLS cert+key to
# each fleet node. Triggers a hermes-a2a service restart when any cert
# changes.
#
# Refs #806 — A2A mutual TLS between fleet agents.

- name: Verify agent cert source files exist on control node
  ansible.builtin.stat:
    path: "{{ item }}"
  register: _src_stat
  delegate_to: localhost
  loop:
    - "{{ fleet_mtls_ca_cert_src }}"
    - "{{ fleet_mtls_agent_certs_dir }}/{{ fleet_mtls_agent_name }}/{{ fleet_mtls_agent_name }}.crt"
    - "{{ fleet_mtls_agent_certs_dir }}/{{ fleet_mtls_agent_name }}/{{ fleet_mtls_agent_name }}.key"
  loop_control:
    label: "{{ item | basename }}"

- name: Fail if any source cert is missing
  ansible.builtin.fail:
    msg: >
      Required cert file not found: {{ item.item }}
      Run scripts/gen_fleet_ca.sh and scripts/gen_agent_cert.sh --agent {{ fleet_mtls_agent_name }} first.
  when: not item.stat.exists
  loop: "{{ _src_stat.results }}"
  loop_control:
    label: "{{ item.item | basename }}"

# -----------------------------------------------------------------------
# Remote directory structure
# -----------------------------------------------------------------------

- name: Create remote PKI directories
  ansible.builtin.file:
    path: "{{ item }}"
    state: directory
    owner: root
    group: root
    mode: "0750"
  loop:
    - "{{ fleet_mtls_remote_pki_dir }}"
    - "{{ fleet_mtls_remote_ca_dir }}"
    - "{{ fleet_mtls_remote_agent_dir }}"

# -----------------------------------------------------------------------
# Fleet CA certificate (public — read-only for all)
# -----------------------------------------------------------------------

- name: Deploy fleet CA certificate
  ansible.builtin.copy:
    src: "{{ fleet_mtls_ca_cert_src }}"
    dest: "{{ fleet_mtls_remote_ca_dir }}/fleet-ca.crt"
    owner: root
    group: root
    mode: "0644"
  notify: Restart hermes-a2a

# -----------------------------------------------------------------------
# Per-agent certificate (public portion)
# -----------------------------------------------------------------------

- name: Deploy agent certificate
  ansible.builtin.copy:
    src: "{{ fleet_mtls_agent_certs_dir }}/{{ fleet_mtls_agent_name }}/{{ fleet_mtls_agent_name }}.crt"
    dest: "{{ fleet_mtls_remote_agent_dir }}/agent.crt"
    owner: root
    group: root
    mode: "0644"
  notify: Restart hermes-a2a

# -----------------------------------------------------------------------
# Per-agent private key (secret — root-only read)
# -----------------------------------------------------------------------

- name: Deploy agent private key
  ansible.builtin.copy:
    src: "{{ fleet_mtls_agent_certs_dir }}/{{ fleet_mtls_agent_name }}/{{ fleet_mtls_agent_name }}.key"
    dest: "{{ fleet_mtls_remote_agent_dir }}/agent.key"
    owner: root
    group: root
    mode: "0600"
  no_log: true  # suppress file content from Ansible output
  notify: Restart hermes-a2a

# -----------------------------------------------------------------------
# Environment file for hermes-a2a systemd unit
# -----------------------------------------------------------------------

- name: Write hermes-a2a environment file
  ansible.builtin.template:
    src: hermes_a2a_env.j2
    dest: /etc/hermes/a2a.env
    owner: root
    group: root
    mode: "0640"
  notify: Restart hermes-a2a
@@ -1,10 +0,0 @@
# Managed by Ansible — fleet_mtls_certs role
# Environment variables for the hermes-a2a systemd service.
# Source this file in the [Service] section: EnvironmentFile=/etc/hermes/a2a.env

HERMES_AGENT_NAME={{ fleet_mtls_agent_name }}
HERMES_A2A_CERT={{ fleet_mtls_remote_agent_dir }}/agent.crt
HERMES_A2A_KEY={{ fleet_mtls_remote_agent_dir }}/agent.key
HERMES_A2A_CA={{ fleet_mtls_remote_ca_dir }}/fleet-ca.crt
HERMES_A2A_HOST=0.0.0.0
HERMES_A2A_PORT=9443
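To illustrate how a service might consume this file, the sketch below builds a server-side mutual-TLS context from the HERMES_A2A_* variables using Python's standard-library ssl module (a hypothetical consumer, not code from the Hermes service itself):

```python
import os
import ssl

# Values come from /etc/hermes/a2a.env, loaded by systemd via EnvironmentFile=.
ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
ctx.load_cert_chain(
    certfile=os.environ["HERMES_A2A_CERT"],
    keyfile=os.environ["HERMES_A2A_KEY"],
)
ctx.load_verify_locations(cafile=os.environ["HERMES_A2A_CA"])
ctx.verify_mode = ssl.CERT_REQUIRED  # peers must present a fleet-CA-signed cert
```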
@@ -1,21 +0,0 @@
---
# Ansible role: hermes_mtls
# Distributes fleet mTLS certificates to Hermes agent nodes.
#
# Required variables (set in inventory / group_vars / --extra-vars):
#   hermes_mtls_local_ca_cert     Local path on the Ansible controller to fleet-ca.crt
#   hermes_mtls_local_agent_cert  Local path to this agent's .crt file
#   hermes_mtls_local_agent_key   Local path to this agent's .key file
#
# Optional overrides:
hermes_mtls_cert_dir: /etc/hermes/certs
hermes_mtls_cert_owner: hermes
hermes_mtls_cert_group: hermes
hermes_mtls_cert_mode: "0640"
hermes_mtls_ca_cert_mode: "0644"

# Env file that Hermes reads on startup (systemd EnvironmentFile or .env)
hermes_mtls_env_file: /etc/hermes/mtls.env

# Hermes systemd service name — restarted after cert changes
hermes_mtls_service: hermes-gateway
@@ -1,7 +0,0 @@
---
- name: Restart hermes service
  ansible.builtin.systemd:
    name: "{{ hermes_mtls_service }}"
    state: restarted
    daemon_reload: true
  when: ansible_service_mgr == "systemd"
@@ -1,16 +0,0 @@
---
galaxy_info:
  role_name: hermes_mtls
  author: Hermes Fleet
  description: Distribute mTLS certificates to Hermes fleet nodes for A2A authentication
  license: MIT
  min_ansible_version: "2.14"
  platforms:
    - name: Ubuntu
      versions: ["22.04", "24.04"]
    - name: Debian
      versions: ["12"]
    - name: EL
      versions: ["9"]

dependencies: []
@@ -1,67 +0,0 @@
---
# hermes_mtls role — distribute fleet mTLS certificates to a Hermes agent node.
#
# This role:
#   1. Creates the cert directory on the remote node
#   2. Copies the fleet CA cert, agent cert, and agent key
#   3. Writes an env file with HERMES_MTLS_* variables
#   4. Restarts the Hermes service if any cert changed

- name: Ensure cert directory exists
  ansible.builtin.file:
    path: "{{ hermes_mtls_cert_dir }}"
    state: directory
    owner: "{{ hermes_mtls_cert_owner }}"
    group: "{{ hermes_mtls_cert_group }}"
    mode: "0750"

- name: Copy fleet CA certificate
  ansible.builtin.copy:
    src: "{{ hermes_mtls_local_ca_cert }}"
    dest: "{{ hermes_mtls_cert_dir }}/fleet-ca.crt"
    owner: "{{ hermes_mtls_cert_owner }}"
    group: "{{ hermes_mtls_cert_group }}"
    mode: "{{ hermes_mtls_ca_cert_mode }}"
  notify: Restart hermes service

- name: Copy agent TLS certificate
  ansible.builtin.copy:
    src: "{{ hermes_mtls_local_agent_cert }}"
    dest: "{{ hermes_mtls_cert_dir }}/agent.crt"
    owner: "{{ hermes_mtls_cert_owner }}"
    group: "{{ hermes_mtls_cert_group }}"
    mode: "{{ hermes_mtls_cert_mode }}"
  notify: Restart hermes service

- name: Copy agent TLS private key
  ansible.builtin.copy:
    src: "{{ hermes_mtls_local_agent_key }}"
    dest: "{{ hermes_mtls_cert_dir }}/agent.key"
    owner: "{{ hermes_mtls_cert_owner }}"
    group: "{{ hermes_mtls_cert_group }}"
    mode: "0600"
  notify: Restart hermes service

- name: Write mTLS environment file
  ansible.builtin.template:
    src: mtls.env.j2
    dest: "{{ hermes_mtls_env_file }}"
    owner: "{{ hermes_mtls_cert_owner }}"
    group: "{{ hermes_mtls_cert_group }}"
    mode: "0640"
  notify: Restart hermes service

- name: Verify cert files are readable by service user
  ansible.builtin.stat:
    path: "{{ item }}"
  loop:
    - "{{ hermes_mtls_cert_dir }}/fleet-ca.crt"
    - "{{ hermes_mtls_cert_dir }}/agent.crt"
    - "{{ hermes_mtls_cert_dir }}/agent.key"
  register: _cert_stat

- name: Assert all cert files exist
  ansible.builtin.assert:
    that: item.stat.exists
    fail_msg: "Expected cert file missing: {{ item.item }}"
  loop: "{{ _cert_stat.results }}"
@@ -1,8 +0,0 @@
# Hermes mTLS environment — generated by hermes_mtls Ansible role
# Source this file or use as a systemd EnvironmentFile=
# WARNING: This file contains the path to the agent's private key.
# Restrict read access to the hermes service user.

HERMES_MTLS_CERT={{ hermes_mtls_cert_dir }}/agent.crt
HERMES_MTLS_KEY={{ hermes_mtls_cert_dir }}/agent.key
HERMES_MTLS_CA={{ hermes_mtls_cert_dir }}/fleet-ca.crt
@@ -1,40 +0,0 @@
# Tool Call Benchmark: Gemma 4 vs mimo-v2-pro

Date: 2026-04-13
Status: Awaiting execution

## Test Design

100 diverse tool calls across 7 categories. (The implemented suite replaces web search, browser automation, and MCP tools with todo/memory/skills tests, since the former need external services.)

| Category | Count | Tools Tested |
|----------|-------|--------------|
| File operations | 23 | read_file, write_file, search_files, patch |
| Terminal commands | 22 | terminal |
| Code execution | 17 | execute_code |
| Delegation | 11 | delegate_task |
| Todo | 7 | todo |
| Memory | 5 | memory |
| Skills | 15 | skills_list, skill_view |

## Metrics

| Metric | mimo-v2-pro | Gemma 4 |
|--------|-------------|---------|
| Schema parse success | — | — |
| Tool execution success | — | — |
| Correct tool selected | — | — |
| Avg latency (s) | — | — |
| Token cost per call | — | — |

## How to Run

The script takes `provider:model` specs via `--models`; passing several specs produces a side-by-side comparison in one report:

```bash
python3 benchmarks/tool_call_benchmark.py --models nous:xiaomi/mimo-v2-pro
python3 benchmarks/tool_call_benchmark.py --models ollama:gemma4:latest
python3 benchmarks/tool_call_benchmark.py --models nous:xiaomi/mimo-v2-pro ollama:gemma4:latest
```

## Gemma 4-Specific Failure Modes

To be documented after benchmark execution.
@@ -1,194 +0,0 @@
[
  {
    "id": "screenshot_github_home",
    "url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
    "category": "screenshot",
    "expected_keywords": ["github", "logo", "mark"],
    "ground_truth_ocr": "",
    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
  },
  {
    "id": "diagram_mermaid_flow",
    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6siSZXVhjQTlgl1nigHg5fRBOzSfebopROCu_cytObSfgLSE1ANOeZWkO2IH5upZxYot8m1hqAdpD_63WRl0xdUG1jdl9kPiOb_EWk2JBtPaiKkF4eVIYgO0EtkW-RSgC4gJ6HJYRG1UNdN0HNVd0Bftjj7X8P92qPj-F8l8T3w",
    "category": "diagram",
    "expected_keywords": ["flow", "diagram", "process"],
    "ground_truth_ocr": "",
    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
  },
  {
    "id": "photo_random_1",
    "url": "https://picsum.photos/seed/vision1/400/300",
    "category": "photo",
    "expected_keywords": [],
    "ground_truth_ocr": "",
    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
  },
  {
    "id": "photo_random_2",
    "url": "https://picsum.photos/seed/vision2/400/300",
    "category": "photo",
    "expected_keywords": [],
    "ground_truth_ocr": "",
    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
  },
  {
    "id": "chart_simple_bar",
    "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}",
    "category": "chart",
    "expected_keywords": ["bar", "chart", "revenue"],
    "ground_truth_ocr": "",
    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
  },
  {
    "id": "chart_pie",
    "url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}",
    "category": "chart",
    "expected_keywords": ["pie", "chart", "percentage"],
    "ground_truth_ocr": "",
    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
  },
  {
    "id": "diagram_org_chart",
    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
    "category": "diagram",
    "expected_keywords": ["organization", "hierarchy", "chart"],
    "ground_truth_ocr": "",
    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
  },
  {
    "id": "screenshot_terminal",
    "url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png",
    "category": "screenshot",
    "expected_keywords": ["terminal", "command", "output"],
    "ground_truth_ocr": "",
    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
  },
  {
    "id": "photo_random_3",
    "url": "https://picsum.photos/seed/vision3/400/300",
    "category": "photo",
    "expected_keywords": [],
    "ground_truth_ocr": "",
    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
  },
  {
    "id": "chart_line",
    "url": "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}",
    "category": "chart",
    "expected_keywords": ["line", "chart", "temperature"],
    "ground_truth_ocr": "",
    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
  },
  {
    "id": "diagram_sequence",
    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
    "category": "diagram",
    "expected_keywords": ["sequence", "interaction", "message"],
    "ground_truth_ocr": "",
    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
  },
  {
    "id": "photo_random_4",
    "url": "https://picsum.photos/seed/vision4/400/300",
    "category": "photo",
    "expected_keywords": [],
    "ground_truth_ocr": "",
    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
  },
  {
    "id": "screenshot_webpage",
    "url": "https://github.githubassets.com/images/modules/site/social-cards.png",
    "category": "screenshot",
    "expected_keywords": ["github", "page", "web"],
    "ground_truth_ocr": "",
    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
  },
  {
    "id": "chart_radar",
    "url": "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}",
    "category": "chart",
    "expected_keywords": ["radar", "chart", "skill"],
    "ground_truth_ocr": "",
    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
  },
  {
    "id": "photo_random_5",
    "url": "https://picsum.photos/seed/vision5/400/300",
    "category": "photo",
    "expected_keywords": [],
    "ground_truth_ocr": "",
    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
  },
  {
    "id": "diagram_class",
    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
    "category": "diagram",
    "expected_keywords": ["class", "object", "attribute"],
    "ground_truth_ocr": "",
    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
  },
  {
    "id": "chart_doughnut",
    "url": "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}",
    "category": "chart",
    "expected_keywords": ["doughnut", "chart", "device"],
    "ground_truth_ocr": "",
    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
  },
  {
    "id": "photo_random_6",
    "url": "https://picsum.photos/seed/vision6/400/300",
    "category": "photo",
    "expected_keywords": [],
    "ground_truth_ocr": "",
    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
  },
  {
    "id": "screenshot_error",
    "url": "https://http.cat/404.jpg",
    "category": "screenshot",
    "expected_keywords": ["404", "error", "cat"],
    "ground_truth_ocr": "",
    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": true}
  },
  {
    "id": "diagram_network",
    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
    "category": "diagram",
    "expected_keywords": ["network", "node", "connection"],
    "ground_truth_ocr": "",
    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
  },
  {
    "id": "photo_random_7",
    "url": "https://picsum.photos/seed/vision7/400/300",
    "category": "photo",
    "expected_keywords": [],
    "ground_truth_ocr": "",
    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
  },
  {
    "id": "chart_stacked_bar",
    "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}",
    "category": "chart",
    "expected_keywords": ["stacked", "bar", "chart"],
    "ground_truth_ocr": "",
    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
  },
  {
    "id": "screenshot_dashboard",
    "url": "https://github.githubassets.com/images/modules/site/features-code-search.png",
    "category": "screenshot",
    "expected_keywords": ["search", "code", "feature"],
    "ground_truth_ocr": "",
    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
  },
  {
    "id": "photo_random_8",
    "url": "https://picsum.photos/seed/vision8/400/300",
    "category": "photo",
    "expected_keywords": [],
    "ground_truth_ocr": "",
    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
  }
]
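Before a benchmark run it can help to validate this manifest; the sketch below (illustrative only, with field names taken from the entries above) flags missing keys and duplicate ids:

```python
import json
from pathlib import Path

REQUIRED = {"id", "url", "category", "expected_keywords",
            "ground_truth_ocr", "expected_structure"}

entries = json.loads(Path("benchmarks/test_images.json").read_text())
seen = set()
for i, entry in enumerate(entries):
    missing = REQUIRED - entry.keys()
    if missing:
        print(f"entry {i}: missing {sorted(missing)}")
    if entry.get("id") in seen:
        print(f"duplicate id: {entry['id']}")
    seen.add(entry.get("id"))
print(f"{len(entries)} entries checked")
```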
@@ -1,614 +0,0 @@
#!/usr/bin/env python3
"""
Tool-Calling Benchmark — Gemma 4 vs mimo-v2-pro regression test.

Runs 100 diverse tool-calling prompts through multiple models and compares
success rates, latency, and token costs.

Usage:
    python3 benchmarks/tool_call_benchmark.py                   # full 100-call suite
    python3 benchmarks/tool_call_benchmark.py --limit 10        # quick smoke test
    python3 benchmarks/tool_call_benchmark.py --models nous     # single model
    python3 benchmarks/tool_call_benchmark.py --category file   # single category

Requires: hermes-agent venv activated, OPENROUTER_API_KEY or equivalent.
"""

import argparse
import json
import sys
import time
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

# Ensure hermes-agent root is importable
REPO_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO_ROOT))


# ---------------------------------------------------------------------------
# Test Definitions
# ---------------------------------------------------------------------------

@dataclass
class ToolCall:
    """A single tool-calling test case."""
    id: str
    category: str
    prompt: str
    expected_tool: str               # tool name we expect the model to call
    expected_params_check: str = ""  # substring expected in JSON args
    timeout: int = 30                # max seconds per call
    notes: str = ""


# fmt: off
SUITE: list[ToolCall] = [
    # ── File Operations (20) ──────────────────────────────────────────────
    ToolCall("file-01", "file", "Read the file /tmp/test_bench.txt and show me its contents.",
             "read_file", "path"),
    ToolCall("file-02", "file", "Write 'hello benchmark' to /tmp/test_bench_out.txt",
             "write_file", "path"),
    ToolCall("file-03", "file", "Search for the word 'import' in all Python files in the current directory.",
             "search_files", "pattern"),
    ToolCall("file-04", "file", "Read lines 1-20 of /etc/hosts",
             "read_file", "offset"),
    ToolCall("file-05", "file", "Patch /tmp/test_bench_out.txt: replace 'hello' with 'goodbye'",
             "patch", "old_string"),
    ToolCall("file-06", "file", "Search for files matching *.py in the current directory.",
             "search_files", "target"),
    ToolCall("file-07", "file", "Read the first 10 lines of /etc/passwd",
             "read_file", "limit"),
    ToolCall("file-08", "file", "Write a JSON config to /tmp/bench_config.json with key 'debug': true",
             "write_file", "content"),
    ToolCall("file-09", "file", "Search for 'def test_' in Python test files.",
             "search_files", "file_glob"),
    ToolCall("file-10", "file", "Read /tmp/bench_config.json and tell me what's in it.",
             "read_file", "bench_config"),
    ToolCall("file-11", "file", "Create a file /tmp/bench_readme.md with one line: '# Benchmark'",
             "write_file", "bench_readme"),
    ToolCall("file-12", "file", "Search for 'TODO' comments in all .py files.",
             "search_files", "TODO"),
    ToolCall("file-13", "file", "Read /tmp/bench_readme.md",
             "read_file", "bench_readme"),
    ToolCall("file-14", "file", "Patch /tmp/bench_readme.md: replace '# Benchmark' with '# Tool Benchmark'",
             "patch", "Tool Benchmark"),
    ToolCall("file-15", "file", "Write a Python one-liner to /tmp/bench_hello.py that prints hello.",
             "write_file", "bench_hello"),
    ToolCall("file-16", "file", "Search for all .json files in /tmp/.",
             "search_files", "json"),
    ToolCall("file-17", "file", "Read /tmp/bench_hello.py and verify it has print('hello').",
             "read_file", "bench_hello"),
    ToolCall("file-18", "file", "Patch /tmp/bench_hello.py to print 'hello world' instead of 'hello'.",
             "patch", "hello world"),
    ToolCall("file-19", "file", "List files matching 'bench*' in /tmp/.",
             "search_files", "bench"),
    ToolCall("file-20", "file", "Read /tmp/test_bench.txt again and summarize its contents.",
             "read_file", "test_bench"),

    # ── Terminal Commands (20) ────────────────────────────────────────────
    ToolCall("term-01", "terminal", "Run `echo hello world` in the terminal.",
             "terminal", "echo"),
    ToolCall("term-02", "terminal", "Run `date` to get the current date and time.",
             "terminal", "date"),
    ToolCall("term-03", "terminal", "Run `uname -a` to get system information.",
             "terminal", "uname"),
    ToolCall("term-04", "terminal", "Run `pwd` to show the current directory.",
             "terminal", "pwd"),
    ToolCall("term-05", "terminal", "Run `ls -la /tmp/ | head -20` to list temp files.",
             "terminal", "head"),
    ToolCall("term-06", "terminal", "Run `whoami` to show the current user.",
             "terminal", "whoami"),
    ToolCall("term-07", "terminal", "Run `df -h` to show disk usage.",
             "terminal", "df"),
    ToolCall("term-08", "terminal", "Run `python3 --version` to check Python version.",
             "terminal", "python3"),
    ToolCall("term-09", "terminal", "Run `cat /etc/hostname` to get the hostname.",
             "terminal", "hostname"),
    ToolCall("term-10", "terminal", "Run `uptime` to see system uptime.",
             "terminal", "uptime"),
    ToolCall("term-11", "terminal", "Run `env | grep PATH` to show the PATH variable.",
             "terminal", "PATH"),
    ToolCall("term-12", "terminal", "Run `wc -l /etc/passwd` to count lines.",
             "terminal", "wc"),
    ToolCall("term-13", "terminal", "Run `echo $SHELL` to show the current shell.",
             "terminal", "SHELL"),
    ToolCall("term-14", "terminal", "Run `free -h || vm_stat` to check memory usage.",
             "terminal", "memory"),
    ToolCall("term-15", "terminal", "Run `id` to show user and group IDs.",
             "terminal", "id"),
    ToolCall("term-16", "terminal", "Run `hostname` to get the machine hostname.",
             "terminal", "hostname"),
    ToolCall("term-17", "terminal", "Run `echo {1..5}` to test brace expansion.",
             "terminal", "echo"),
    ToolCall("term-18", "terminal", "Run `seq 1 5` to generate a number sequence.",
             "terminal", "seq"),
    ToolCall("term-19", "terminal", "Run `python3 -c 'print(2+2)'` to compute 2+2.",
             "terminal", "print"),
    ToolCall("term-20", "terminal", "Run `ls -d /tmp/bench* 2>/dev/null | wc -l` to count bench files.",
             "terminal", "wc"),

    # ── Code Execution (15) ──────────────────────────────────────────────
    ToolCall("code-01", "code", "Execute a Python script that computes factorial of 10.",
             "execute_code", "factorial"),
    ToolCall("code-02", "code", "Run Python to read /tmp/test_bench.txt and count its words.",
             "execute_code", "words"),
    ToolCall("code-03", "code", "Execute Python to generate the first 20 Fibonacci numbers.",
             "execute_code", "fibonacci"),
    ToolCall("code-04", "code", "Run Python to parse JSON from a string and print keys.",
             "execute_code", "json"),
    ToolCall("code-05", "code", "Execute Python to list all files in /tmp/ matching 'bench*'.",
             "execute_code", "glob"),
    ToolCall("code-06", "code", "Run Python to compute the sum of squares from 1 to 100.",
             "execute_code", "sum"),
    ToolCall("code-07", "code", "Execute Python to check if 'racecar' is a palindrome.",
             "execute_code", "palindrome"),
    ToolCall("code-08", "code", "Run Python to create a CSV string with 5 rows of sample data.",
             "execute_code", "csv"),
    ToolCall("code-09", "code", "Execute Python to sort a list [5,2,8,1,9] and print the result.",
             "execute_code", "sort"),
    ToolCall("code-10", "code", "Run Python to count lines in /etc/passwd.",
             "execute_code", "passwd"),
    ToolCall("code-11", "code", "Execute Python to hash the string 'benchmark' with SHA256.",
             "execute_code", "sha256"),
    ToolCall("code-12", "code", "Run Python to get the current UTC timestamp.",
             "execute_code", "utcnow"),
    ToolCall("code-13", "code", "Execute Python to convert 'hello world' to uppercase and reverse it.",
             "execute_code", "upper"),
    ToolCall("code-14", "code", "Run Python to create a dictionary of system info (platform, python version).",
             "execute_code", "sys"),
    ToolCall("code-15", "code", "Execute Python to check internet connectivity by resolving google.com.",
             "execute_code", "socket"),

    # ── Delegation (10) ──────────────────────────────────────────────────
    ToolCall("deleg-01", "delegate", "Use a subagent to find all .log files in /tmp/.",
             "delegate_task", "log"),
    ToolCall("deleg-02", "delegate", "Delegate to a subagent: what is 15 * 37?",
             "delegate_task", "15"),
    ToolCall("deleg-03", "delegate", "Use a subagent to check if Python 3 is installed and its version.",
             "delegate_task", "python"),
    ToolCall("deleg-04", "delegate", "Delegate: read /tmp/test_bench.txt and summarize it in one sentence.",
             "delegate_task", "summarize"),
    ToolCall("deleg-05", "delegate", "Use a subagent to list the contents of /tmp/ directory.",
             "delegate_task", "tmp"),
    ToolCall("deleg-06", "delegate", "Delegate: count the number of .py files in the current directory.",
             "delegate_task", ".py"),
    ToolCall("deleg-07", "delegate", "Use a subagent to check disk space with df -h.",
             "delegate_task", "df"),
    ToolCall("deleg-08", "delegate", "Delegate: what OS are we running on?",
             "delegate_task", "os"),
    ToolCall("deleg-09", "delegate", "Use a subagent to find the hostname of this machine.",
             "delegate_task", "hostname"),
    ToolCall("deleg-10", "delegate", "Delegate: create a temp file /tmp/bench_deleg.txt with 'done'.",
             "delegate_task", "write"),

    # ── Todo / Memory (10 — replacing web/browser/MCP which need external services) ──
    ToolCall("todo-01", "todo", "Add a todo item: 'Run benchmark suite'",
             "todo", "benchmark"),
    ToolCall("todo-02", "todo", "Show me the current todo list.",
             "todo", ""),
    ToolCall("todo-03", "todo", "Mark the first todo item as completed.",
             "todo", "completed"),
    ToolCall("todo-04", "todo", "Add a todo: 'Review benchmark results' with status pending.",
             "todo", "Review"),
    ToolCall("todo-05", "todo", "Clear all completed todos.",
             "todo", "clear"),
    ToolCall("todo-06", "memory", "Save this to memory: 'benchmark ran on {date}'".format(
                 date=datetime.now().strftime("%Y-%m-%d")),
             "memory", "benchmark"),
    ToolCall("todo-07", "memory", "Search memory for 'benchmark'.",
             "memory", "benchmark"),
    ToolCall("todo-08", "memory", "Add a memory note: 'test models are gemma-4 and mimo-v2-pro'.",
             "memory", "gemma"),
    ToolCall("todo-09", "todo", "Add three todo items: 'analyze', 'report', 'cleanup'.",
             "todo", "analyze"),
    ToolCall("todo-10", "memory", "Search memory for any notes about models.",
             "memory", "model"),

    # ── Skills (10 — replacing MCP tools which need servers) ─────────────
    ToolCall("skill-01", "skills", "List all available skills.",
             "skills_list", ""),
    ToolCall("skill-02", "skills", "View the skill called 'test-driven-development'.",
             "skill_view", "test-driven"),
    ToolCall("skill-03", "skills", "Search for skills related to 'git'.",
             "skills_list", "git"),
    ToolCall("skill-04", "skills", "View the 'code-review' skill.",
             "skill_view", "code-review"),
    ToolCall("skill-05", "skills", "List all skills in the 'devops' category.",
             "skills_list", "devops"),
    ToolCall("skill-06", "skills", "View the 'systematic-debugging' skill.",
             "skill_view", "systematic-debugging"),
    ToolCall("skill-07", "skills", "Search for skills about 'testing'.",
             "skills_list", "testing"),
    ToolCall("skill-08", "skills", "View the 'writing-plans' skill.",
             "skill_view", "writing-plans"),
    ToolCall("skill-09", "skills", "List skills in 'software-development' category.",
             "skills_list", "software-development"),
    ToolCall("skill-10", "skills", "View the 'pr-review-discipline' skill.",
             "skill_view", "pr-review"),

    # ── Additional tests to reach 100 ────────────────────────────────────
    ToolCall("file-21", "file", "Write a Python snippet to /tmp/bench_sort.py that sorts [3,1,2].",
             "write_file", "bench_sort"),
    ToolCall("file-22", "file", "Read /tmp/bench_sort.py back and confirm it exists.",
             "read_file", "bench_sort"),
    ToolCall("file-23", "file", "Search for 'class' in all .py files in the benchmarks directory.",
             "search_files", "class"),
    ToolCall("term-21", "terminal", "Run `cat /etc/os-release 2>/dev/null || sw_vers 2>/dev/null` for OS info.",
             "terminal", "os"),
    ToolCall("term-22", "terminal", "Run `nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null` for CPU count.",
             "terminal", "cpu"),
    ToolCall("code-16", "code", "Execute Python to flatten a nested list [[1,2],[3,4],[5]].",
             "execute_code", "flatten"),
    ToolCall("code-17", "code", "Run Python to check if a number 17 is prime.",
             "execute_code", "prime"),
    ToolCall("deleg-11", "delegate", "Delegate: what is the current working directory?",
             "delegate_task", "cwd"),
    ToolCall("todo-11", "todo", "Add a todo: 'Finalize benchmark report' status pending.",
             "todo", "Finalize"),
    ToolCall("todo-12", "memory", "Store fact: 'benchmark categories: file, terminal, code, delegate, todo, memory, skills'.",
             "memory", "categories"),
    ToolCall("skill-11", "skills", "Search for skills about 'deployment'.",
             "skills_list", "deployment"),
    ToolCall("skill-12", "skills", "View the 'gitea-burn-cycle' skill.",
             "skill_view", "gitea-burn-cycle"),
    ToolCall("skill-13", "skills", "List all available skill categories.",
             "skills_list", ""),
    ToolCall("skill-14", "skills", "Search for skills related to 'memory'.",
             "skills_list", "memory"),
    ToolCall("skill-15", "skills", "View the 'mimo-swarm' skill.",
             "skill_view", "mimo-swarm"),
]
# fmt: on


# ---------------------------------------------------------------------------
# Runner
# ---------------------------------------------------------------------------

@dataclass
class CallResult:
    test_id: str
    category: str
    model: str
    prompt: str
    expected_tool: str
    success: bool
    tool_called: Optional[str] = None
    schema_ok: bool = False          # model produced a parseable tool call
    tool_args_valid: bool = False
    execution_ok: bool = False
    latency_s: float = 0.0
    error: str = ""
    raw_response: str = ""


@dataclass
class ModelStats:
    model: str
    total: int = 0
    schema_ok: int = 0      # model produced valid tool call JSON
    exec_ok: int = 0        # tool actually ran without error
    latency_sum: float = 0.0
    failures: list = field(default_factory=list)

    @property
    def schema_pct(self) -> float:
        return (self.schema_ok / self.total * 100) if self.total else 0

    @property
    def exec_pct(self) -> float:
        return (self.exec_ok / self.total * 100) if self.total else 0

    @property
    def avg_latency(self) -> float:
        return (self.latency_sum / self.total) if self.total else 0


def setup_test_files():
    """Create prerequisite files for the benchmark."""
    Path("/tmp/test_bench.txt").write_text(
        "This is a benchmark test file.\n"
        "It contains sample data for tool-calling tests.\n"
        "Line three has some import statements.\n"
        "import os\nimport sys\nimport json\n"
        "End of test data.\n"
    )


def run_single_test(tc: ToolCall, model_spec: str, provider: str) -> CallResult:
    """Run a single tool-calling test through the agent."""
    from run_agent import AIAgent

    result = CallResult(
        test_id=tc.id,
        category=tc.category,
        model=model_spec,
        prompt=tc.prompt,
        expected_tool=tc.expected_tool,
        success=False,
    )

    try:
        agent = AIAgent(
            model=model_spec,
            provider=provider,
            max_iterations=3,
            quiet_mode=True,
            skip_context_files=True,
            skip_memory=True,
            persist_session=False,
        )

        t0 = time.time()
        conv = agent.run_conversation(
            user_message=tc.prompt,
            system_message=(
                "You are a benchmark test runner. Execute the user's request by calling "
                "the appropriate tool. Return the tool result directly. Do not add commentary."
            ),
        )
        result.latency_s = round(time.time() - t0, 2)

        messages = conv.get("messages", [])

        # Find the first assistant message with tool_calls
        tool_called = None
        tool_args_str = ""
        for msg in messages:
            if msg.get("role") == "assistant" and msg.get("tool_calls"):
                for tc_item in msg["tool_calls"]:
                    fn = tc_item.get("function", {})
                    tool_called = fn.get("name", "")
                    tool_args_str = fn.get("arguments", "{}")
                    break
                break

        if tool_called:
            result.tool_called = tool_called
            result.schema_ok = True

            # Check if the right tool was called
            if tool_called == tc.expected_tool:
                result.success = True

            # Check if args contain expected substring
            if tc.expected_params_check:
                result.tool_args_valid = tc.expected_params_check in tool_args_str
            else:
                result.tool_args_valid = True

            # Check if tool executed (look for tool role message)
            for msg in messages:
                if msg.get("role") == "tool":
                    content = msg.get("content", "")
                    if content and "error" not in content.lower()[:50]:
                        result.execution_ok = True
                        break
                    elif content:
                        result.execution_ok = True  # got a response, even if error
                        break
        else:
            # No tool call produced — still check if model responded
            final = conv.get("final_response", "")
            result.raw_response = final[:200] if final else ""

    except Exception as e:
        result.error = f"{type(e).__name__}: {str(e)[:200]}"
        # t0 only exists if we got as far as the conversation call
        result.latency_s = round(time.time() - t0, 2) if "t0" in locals() else 0.0

    return result


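# Illustrative one-off usage: run a single suite entry as a smoke test.
# This drives a real model call, so it assumes the hermes-agent venv and the
# relevant API key are configured (hypothetical invocation, shown as comments):
#
#     setup_test_files()
#     r = run_single_test(SUITE[0], "nous:mimo-v2-pro", "nous")
#     print(r.test_id, r.success, r.tool_called, f"{r.latency_s}s")
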
def generate_report(results: list[CallResult], models: list[str], output_path: Path):
    """Generate markdown benchmark report."""
    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")

    # Aggregate per model
    stats: dict[str, ModelStats] = {}
    for m in models:
        stats[m] = ModelStats(model=m)

    by_category: dict[str, dict[str, list[CallResult]]] = {}

    for r in results:
        s = stats[r.model]
        s.total += 1
        s.schema_ok += int(r.schema_ok)
        s.exec_ok += int(r.execution_ok)
        s.latency_sum += r.latency_s
        if not r.success:
            s.failures.append(r)

        by_category.setdefault(r.category, {}).setdefault(r.model, []).append(r)

    lines = [
        "# Tool-Calling Benchmark Report",
        "",
        f"Generated: {now}",
        f"Suite: {len(SUITE)} calls across {len(set(tc.category for tc in SUITE))} categories",
        f"Models tested: {', '.join(models)}",
        "",
        "## Summary",
        "",
        f"| Metric | {' | '.join(models)} |",
        f"|--------|{'|'.join('---------' for _ in models)}|",
    ]

    # Schema parse success
    row = "| Schema parse success | "
    for m in models:
        s = stats[m]
        row += f"{s.schema_ok}/{s.total} ({s.schema_pct:.0f}%) | "
    lines.append(row)

    # Tool execution success
    row = "| Tool execution success | "
    for m in models:
        s = stats[m]
        row += f"{s.exec_ok}/{s.total} ({s.exec_pct:.0f}%) | "
    lines.append(row)

    # Correct tool selected
    row = "| Correct tool selected | "
    for m in models:
        s = stats[m]
        correct = sum(1 for r in results if r.model == m and r.success)
        pct = (correct / s.total * 100) if s.total else 0
        row += f"{correct}/{s.total} ({pct:.0f}%) | "
    lines.append(row)

    # Avg latency
    row = "| Avg latency (s) | "
    for m in models:
        s = stats[m]
        row += f"{s.avg_latency:.2f} | "
    lines.append(row)

    lines.append("")

    # Per-category breakdown
    lines.append("## Per-Category Breakdown")
    lines.append("")

    for cat in sorted(by_category.keys()):
        lines.append(f"### {cat.title()}")
        lines.append("")
        lines.append(f"| Metric | {' | '.join(models)} |")
        lines.append(f"|--------|{'|'.join('---------' for _ in models)}|")

        cat_data = by_category[cat]
        for metric_name, fn in [
            ("Schema OK", lambda r: r.schema_ok),
            ("Exec OK", lambda r: r.execution_ok),
            ("Correct tool", lambda r: r.success),
        ]:
            row = f"| {metric_name} | "
            for m in models:
                results_m = cat_data.get(m, [])
                total = len(results_m)
                ok = sum(1 for r in results_m if fn(r))
                pct = (ok / total * 100) if total else 0
                row += f"{ok}/{total} ({pct:.0f}%) | "
            lines.append(row)

        lines.append("")

    # Failure analysis
    lines.append("## Failure Analysis")
    lines.append("")

    any_failures = False
    for m in models:
        s = stats[m]
        if s.failures:
            any_failures = True
            lines.append(f"### {m} — {len(s.failures)} failures")
            lines.append("")
            lines.append("| Test | Category | Expected | Got | Error |")
            lines.append("|------|----------|----------|-----|-------|")
            for r in s.failures:
                got = r.tool_called or "none"
                err = r.error or "wrong tool"
                lines.append(f"| {r.test_id} | {r.category} | {r.expected_tool} | {got} | {err[:60]} |")
            lines.append("")

    if not any_failures:
        lines.append("No failures detected.")
        lines.append("")

    # Raw results JSON
    lines.append("## Raw Results")
    lines.append("")
    lines.append("```json")
    lines.append(json.dumps([asdict(r) for r in results], indent=2, default=str))
    lines.append("```")

    report = "\n".join(lines)
    output_path.write_text(report)
    return report


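# Illustrative offline usage: preview the report layout with a synthetic
# result instead of live model calls (hypothetical values, shown as comments):
#
#     fake = CallResult("file-01", "file", "demo-model", "p", "read_file",
#                       success=True, tool_called="read_file",
#                       schema_ok=True, execution_ok=True, latency_s=1.2)
#     generate_report([fake], ["demo-model"], Path("/tmp/report.md"))
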
def main():
    parser = argparse.ArgumentParser(description="Tool-calling benchmark")
    parser.add_argument("--models", nargs="+",
                        default=["nous:gia-3/gemma-4-31b", "nous:mimo-v2-pro"],
                        help="Model specs to test (provider:model)")
    parser.add_argument("--limit", type=int, default=0,
                        help="Run only first N tests (0 = all)")
    parser.add_argument("--category", type=str, default="",
                        help="Run only tests in this category")
    parser.add_argument("--output", type=str, default="",
                        help="Output report path (default: benchmarks/gemma4-tool-calling-YYYY-MM-DD.md)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Print test cases without running them")
    args = parser.parse_args()

    # Filter suite
    suite = SUITE[:]
    if args.category:
        suite = [tc for tc in suite if tc.category == args.category]
    if args.limit > 0:
        suite = suite[:args.limit]

    if args.dry_run:
        print(f"Would run {len(suite)} tests:")
        for tc in suite:
            print(f"  [{tc.category:8s}] {tc.id}: {tc.expected_tool} — {tc.prompt[:60]}")
        return

    # Setup
    setup_test_files()
    date_str = datetime.now().strftime("%Y-%m-%d")
    output_path = Path(args.output) if args.output else REPO_ROOT / "benchmarks" / f"gemma4-tool-calling-{date_str}.md"

    # Parse model specs ("provider:model", e.g. "nous:mimo-v2-pro")
    model_specs = []
    for spec in args.models:
        parts = spec.split(":", 1)
        provider = parts[0]
        model_name = parts[1] if len(parts) > 1 else parts[0]
        model_specs.append((provider, model_name, spec))

    print(f"Benchmark: {len(suite)} tests × {len(model_specs)} models = {len(suite) * len(model_specs)} calls")
    print(f"Output: {output_path}")
    print()

    all_results: list[CallResult] = []

    for provider, model_name, full_spec in model_specs:
        print(f"── {full_spec} {'─' * (50 - len(full_spec))}")
        model_results = []

        for i, tc in enumerate(suite, 1):
            sys.stdout.write(f"\r  [{i:3d}/{len(suite)}] {tc.id:10s} {tc.category:8s} → {tc.expected_tool:20s}")
            sys.stdout.flush()

            r = run_single_test(tc, full_spec, provider)
            model_results.append(r)

            status = "✓" if r.success else "✗"
            sys.stdout.write(f" {status} ({r.latency_s:.1f}s)")
            sys.stdout.write("\n")

        all_results.extend(model_results)

        # Quick stats
        ok = sum(1 for r in model_results if r.success)
        print(f"  Result: {ok}/{len(model_results)} correct tool selected ({ok/len(model_results)*100:.0f}%)")
        print()

    # Generate report
    model_names = [spec for _, _, spec in model_specs]
    generate_report(all_results, model_names, output_path)
    print(f"Report written to {output_path}")

    # Exit code: 0 if all pass, 1 if any failures
    total_fail = sum(1 for r in all_results if not r.success)
    sys.exit(1 if total_fail > 0 else 0)


if __name__ == "__main__":
    main()
@@ -1,635 +0,0 @@
#!/usr/bin/env python3
"""
Vision Benchmark Suite — Issue #817

Compares Gemma 4 vision accuracy vs current approach (Gemini 3 Flash Preview).
Measures OCR accuracy, description quality, latency, and token usage.

Usage:
    # Run full benchmark
    python benchmarks/vision_benchmark.py --images benchmarks/test_images.json

    # Single image test
    python benchmarks/vision_benchmark.py --url https://example.com/image.png

    # Generate test report
    python benchmarks/vision_benchmark.py --images benchmarks/test_images.json --output benchmarks/vision_results.json

Test image dataset: benchmarks/test_images.json (24 diverse images across
screenshot, diagram, photo, and chart categories)
"""

import argparse
import asyncio
import base64
import json
import os
import statistics
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional


# ---------------------------------------------------------------------------
# Benchmark configuration
# ---------------------------------------------------------------------------

# Models to compare
MODELS = {
    "gemma4": {
        "model_id": "google/gemma-4-27b-it",
        "display_name": "Gemma 4 27B",
        "provider": "nous",
        "description": "Google's multimodal Gemma 4 model",
    },
    "gemini3_flash": {
        "model_id": "google/gemini-3-flash-preview",
        "display_name": "Gemini 3 Flash Preview",
        "provider": "openrouter",
        "description": "Current default vision model",
    },
}

# Evaluation prompts for different test categories
EVAL_PROMPTS = {
    "screenshot": "Describe this screenshot in detail. What application is shown? What is the current state of the UI?",
    "diagram": "Describe this diagram completely. What concepts does it illustrate? List all components and their relationships.",
    "photo": "Describe this photo in detail. What objects are visible? What is the scene?",
    "ocr": "Extract ALL text visible in this image. Return it exactly as written, preserving formatting.",
    "chart": "What data does this chart show? List all axes labels, values, and key trends.",
    "document": "Extract all text from this document image. Preserve paragraph structure.",
}


# ---------------------------------------------------------------------------
# Vision model interface
# ---------------------------------------------------------------------------


async def analyze_with_model(
    image_url: str,
    prompt: str,
    model_config: dict,
    timeout: float = 120.0,
) -> dict:
    """Call a vision model and return structured results.

    Returns dict with:
      - analysis: str
      - latency_ms: float
      - tokens: dict (prompt_tokens, completion_tokens, total_tokens)
      - success: bool
      - error: str (if failed)
    """
    import httpx

    provider = model_config["provider"]
    model_id = model_config["model_id"]

    # Prepare messages
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": image_url}},
            ],
        }
    ]

    # Route to provider
    if provider == "openrouter":
        api_url = "https://openrouter.ai/api/v1/chat/completions"
        api_key = os.getenv("OPENROUTER_API_KEY", "")
    elif provider == "nous":
        api_url = "https://inference.nousresearch.com/v1/chat/completions"
        api_key = os.getenv("NOUS_API_KEY", "") or os.getenv("NOUS_INFERENCE_API_KEY", "")
    else:
        api_url = os.getenv(f"{provider.upper()}_API_URL", "")
        api_key = os.getenv(f"{provider.upper()}_API_KEY", "")

    if not api_key:
        return {
            "analysis": "",
            "latency_ms": 0,
            "tokens": {},
            "success": False,
            "error": f"No API key for provider {provider}",
        }

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    payload = {
        "model": model_id,
        "messages": messages,
        "max_tokens": 2000,
        "temperature": 0.1,
    }

    start = time.perf_counter()
    try:
        async with httpx.AsyncClient(timeout=timeout) as client:
            resp = await client.post(api_url, json=payload, headers=headers)
            resp.raise_for_status()
            data = resp.json()

        latency_ms = (time.perf_counter() - start) * 1000

        analysis = ""
        choices = data.get("choices", [])
        if choices:
            msg = choices[0].get("message", {})
            analysis = msg.get("content", "")

        usage = data.get("usage", {})
        tokens = {
            "prompt_tokens": usage.get("prompt_tokens", 0),
            "completion_tokens": usage.get("completion_tokens", 0),
            "total_tokens": usage.get("total_tokens", 0),
        }

        return {
            "analysis": analysis,
            "latency_ms": round(latency_ms, 1),
            "tokens": tokens,
            "success": True,
            "error": "",
        }

    except Exception as e:
        return {
            "analysis": "",
            "latency_ms": round((time.perf_counter() - start) * 1000, 1),
            "tokens": {},
            "success": False,
            "error": str(e),
        }


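# Illustrative usage: analyze one manifest image with one configured model.
# Requires the matching API key in the environment (hypothetical invocation,
# shown as comments):
#
#     result = asyncio.run(
#         analyze_with_model(
#             "https://picsum.photos/seed/vision1/400/300",
#             EVAL_PROMPTS["photo"],
#             MODELS["gemini3_flash"],
#         )
#     )
#     print(result["success"], result["latency_ms"], result["analysis"][:120])
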
# ---------------------------------------------------------------------------
# Evaluation metrics
# ---------------------------------------------------------------------------


def compute_ocr_accuracy(extracted: str, ground_truth: str) -> float:
    """Score OCR output against ground truth on a 0.0-1.0 scale.

    Uses a cheap proxy for edit distance: positional character overlap
    (weight 0.4) blended with word-level recall (weight 0.6).
    1.0 = perfect match.
    """
    if not ground_truth:
        return 1.0 if not extracted else 0.0
    if not extracted:
        return 0.0

    extracted_lower = extracted.lower().strip()
    truth_lower = ground_truth.lower().strip()

    # Positional character overlap (fast proxy)
    max_len = max(len(extracted_lower), len(truth_lower))
    if max_len == 0:
        return 1.0

    # Count matching characters at matching positions
    matches = sum(1 for a, b in zip(extracted_lower, truth_lower) if a == b)
    position_ratio = matches / max_len

    # Also check word-level overlap
    extracted_words = set(extracted_lower.split())
    truth_words = set(truth_lower.split())
    if truth_words:
        word_recall = len(extracted_words & truth_words) / len(truth_words)
    else:
        word_recall = 1.0 if not extracted_words else 0.0

    return round((position_ratio * 0.4 + word_recall * 0.6), 4)


def compute_description_completeness(analysis: str, expected_keywords: list) -> float:
    """Score description completeness based on keyword coverage.

    Returns 0.0-1.0.
    """
    if not expected_keywords:
        return 1.0
    if not analysis:
        return 0.0

    analysis_lower = analysis.lower()
    found = sum(1 for kw in expected_keywords if kw.lower() in analysis_lower)
    return round(found / len(expected_keywords), 4)


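# Worked example (illustrative) of the two metrics above on toy inputs:
#
#     compute_ocr_accuracy("hello word", "hello world")
#     # position ratio 9/11 ≈ 0.82 (weight 0.4), word recall 1/2 = 0.5 (weight 0.6)
#     # blended: 0.82 * 0.4 + 0.5 * 0.6 ≈ 0.6273
#
#     compute_description_completeness("A bar chart of revenue", ["bar", "chart", "axis"])
#     # 2 of 3 keywords found -> 0.6667
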
def compute_structural_accuracy(analysis: str, expected_structure: dict) -> dict:
|
||||
"""Evaluate structural elements of the analysis.
|
||||
|
||||
Returns dict with per-element scores.
|
||||
"""
|
||||
scores = {}
|
||||
|
||||
# Length check
|
||||
min_length = expected_structure.get("min_length", 50)
|
||||
scores["length"] = min(len(analysis) / min_length, 1.0) if min_length > 0 else 1.0
|
||||
|
||||
# Sentence count
|
||||
min_sentences = expected_structure.get("min_sentences", 2)
|
||||
sentence_count = analysis.count(".") + analysis.count("!") + analysis.count("?")
|
||||
scores["sentences"] = min(sentence_count / max(min_sentences, 1), 1.0)
|
||||
|
||||
# Has specifics (numbers, names, etc.)
|
||||
if expected_structure.get("has_numbers", False):
|
||||
import re
|
||||
scores["has_numbers"] = 1.0 if re.search(r'\d', analysis) else 0.0
|
||||
|
||||
return scores
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Benchmark runner
# ---------------------------------------------------------------------------


async def run_single_test(
    image: dict,
    models: dict,
    runs_per_model: int = 1,
) -> dict:
    """Run a single image through all models.

    Args:
        image: dict with url, category, expected_keywords, ground_truth_ocr, etc.
        models: dict of model configs to test
        runs_per_model: number of runs per model (for consistency testing)

    Returns dict with results per model.
    """
    category = image.get("category", "photo")
    prompt = EVAL_PROMPTS.get(category, EVAL_PROMPTS["photo"])
    url = image["url"]

    results = {}

    for model_name, model_config in models.items():
        runs = []
        for run_i in range(runs_per_model):
            result = await analyze_with_model(url, prompt, model_config)
            runs.append(result)
            if run_i < runs_per_model - 1:
                await asyncio.sleep(1)  # Rate limit courtesy

        # Aggregate
        successful = [r for r in runs if r["success"]]
        if successful:
            avg_latency = statistics.mean(r["latency_ms"] for r in successful)
            avg_tokens = statistics.mean(
                r["tokens"].get("total_tokens", 0) for r in successful
            )
            # Use first successful run for accuracy metrics
            primary = successful[0]

            # Compute accuracy
            ocr_score = None
            if image.get("ground_truth_ocr"):
                ocr_score = compute_ocr_accuracy(
                    primary["analysis"], image["ground_truth_ocr"]
                )

            keyword_score = None
            if image.get("expected_keywords"):
                keyword_score = compute_description_completeness(
                    primary["analysis"], image["expected_keywords"]
                )

            structural = compute_structural_accuracy(
                primary["analysis"], image.get("expected_structure", {})
            )

            results[model_name] = {
                "success": True,
                "analysis_preview": primary["analysis"][:300],
                "analysis_length": len(primary["analysis"]),
                "avg_latency_ms": round(avg_latency, 1),
                "avg_tokens": round(avg_tokens, 1),
                "ocr_accuracy": ocr_score,
                "keyword_completeness": keyword_score,
                "structural_scores": structural,
                "consistency": round(
                    statistics.stdev(len(r["analysis"]) for r in successful), 1
                ) if len(successful) > 1 else 0.0,
                "runs": len(successful),
                "errors": len(runs) - len(successful),
            }
        else:
            results[model_name] = {
                "success": False,
                "error": runs[0]["error"] if runs else "No runs",
                "runs": 0,
                "errors": len(runs),
            }

    return results

async def run_benchmark_suite(
    images: List[dict],
    models: dict,
    runs_per_model: int = 1,
) -> dict:
    """Run the full benchmark suite.

    Args:
        images: list of image test cases
        models: model configs to compare
        runs_per_model: consistency runs per image

    Returns structured benchmark report.
    """
    total = len(images)
    all_results = []

    print(f"\nRunning vision benchmark: {total} images x {len(models)} models x {runs_per_model} runs")
    print(f"Models: {', '.join(m['display_name'] for m in models.values())}\n")

    for i, image in enumerate(images):
        img_id = image.get("id", f"img_{i}")
        category = image.get("category", "unknown")
        print(f"  [{i+1}/{total}] {img_id} ({category})...", end=" ", flush=True)

        result = await run_single_test(image, models, runs_per_model)
        result["image_id"] = img_id
        result["category"] = category
        all_results.append(result)

        # Quick status
        statuses = []
        for mname in models:
            if result[mname]["success"]:
                lat = result[mname]["avg_latency_ms"]
                statuses.append(f"{mname}:{lat:.0f}ms")
            else:
                statuses.append(f"{mname}:FAIL")
        print(", ".join(statuses))

    # Aggregate statistics
    summary = aggregate_results(all_results, models)

    return {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "config": {
            "total_images": total,
            "runs_per_model": runs_per_model,
            "models": {k: v["display_name"] for k, v in models.items()},
        },
        "results": all_results,
        "summary": summary,
    }

def aggregate_results(results: List[dict], models: dict) -> dict:
    """Compute aggregate statistics across all test images."""
    summary = {}

    for model_name in models:
        model_results = [r[model_name] for r in results if r[model_name]["success"]]
        failed = [r[model_name] for r in results if not r[model_name]["success"]]

        if not model_results:
            summary[model_name] = {"success_rate": 0, "error": "All runs failed"}
            continue

        latencies = [r["avg_latency_ms"] for r in model_results]
        tokens = [r["avg_tokens"] for r in model_results if r.get("avg_tokens")]
        ocr_scores = [r["ocr_accuracy"] for r in model_results if r.get("ocr_accuracy") is not None]
        keyword_scores = [r["keyword_completeness"] for r in model_results if r.get("keyword_completeness") is not None]

        summary[model_name] = {
            "success_rate": round(len(model_results) / (len(model_results) + len(failed)), 4),
            "total_runs": len(model_results),
            "total_failures": len(failed),
            "latency": {
                "mean_ms": round(statistics.mean(latencies), 1),
                "median_ms": round(statistics.median(latencies), 1),
                "p95_ms": round(sorted(latencies)[int(len(latencies) * 0.95)], 1),
                "std_ms": round(statistics.stdev(latencies), 1) if len(latencies) > 1 else 0,
            },
            "tokens": {
                "mean_total": round(statistics.mean(tokens), 1) if tokens else 0,
                "total_used": sum(int(t) for t in tokens),
            },
            "accuracy": {
                "ocr_mean": round(statistics.mean(ocr_scores), 4) if ocr_scores else None,
                "ocr_count": len(ocr_scores),
                "keyword_mean": round(statistics.mean(keyword_scores), 4) if keyword_scores else None,
                "keyword_count": len(keyword_scores),
            },
        }

    return summary


# ---------------------------------------------------------------------------
# Report generation
# ---------------------------------------------------------------------------

def to_markdown(report: dict) -> str:
    """Generate human-readable markdown report."""
    summary = report["summary"]
    config = report["config"]
    model_names = list(config["models"].values())

    lines = [
        "# Vision Benchmark Report",
        "",
        f"Generated: {report['generated_at'][:16]}",
        f"Images tested: {config['total_images']}",
        f"Runs per model: {config['runs_per_model']}",
        f"Models: {', '.join(model_names)}",
        "",
        "## Latency Comparison",
        "",
        "| Model | Mean (ms) | Median | P95 | Std Dev |",
        "|-------|-----------|--------|-----|---------|",
    ]

    for mkey, mname in config["models"].items():
        if mkey in summary and "latency" in summary[mkey]:
            lat = summary[mkey]["latency"]
            lines.append(
                f"| {mname} | {lat['mean_ms']:.0f} | {lat['median_ms']:.0f} | "
                f"{lat['p95_ms']:.0f} | {lat['std_ms']:.0f} |"
            )

    lines += [
        "",
        "## Accuracy Comparison",
        "",
        "| Model | OCR Accuracy | Keyword Coverage | Success Rate |",
        "|-------|-------------|-----------------|--------------|",
    ]

    for mkey, mname in config["models"].items():
        if mkey in summary and "accuracy" in summary[mkey]:
            acc = summary[mkey]["accuracy"]
            sr = summary[mkey].get("success_rate", 0)
            ocr = f"{acc['ocr_mean']:.1%}" if acc["ocr_mean"] is not None else "N/A"
            kw = f"{acc['keyword_mean']:.1%}" if acc["keyword_mean"] is not None else "N/A"
            lines.append(f"| {mname} | {ocr} | {kw} | {sr:.1%} |")

    lines += [
        "",
        "## Token Usage",
        "",
        "| Model | Mean Tokens/Image | Total Tokens |",
        "|-------|------------------|--------------|",
    ]

    for mkey, mname in config["models"].items():
        if mkey in summary and "tokens" in summary[mkey]:
            tok = summary[mkey]["tokens"]
            lines.append(
                f"| {mname} | {tok['mean_total']:.0f} | {tok['total_used']} |"
            )

    # Verdict
    lines += ["", "## Verdict", ""]

    # Find best model by composite score
    best_model = None
    best_score = -1
    for mkey, mname in config["models"].items():
        if mkey not in summary or "accuracy" not in summary[mkey]:
            continue
        acc = summary[mkey]["accuracy"]
        sr = summary[mkey].get("success_rate", 0)
        ocr = acc["ocr_mean"] or 0
        kw = acc["keyword_mean"] or 0
        # Weighted composite: 40% OCR, 30% keyword, 30% success rate
        score = (ocr * 0.4 + kw * 0.3 + sr * 0.3)
        if score > best_score:
            best_score = score
            best_model = mname

    if best_model:
        lines.append(f"**Best overall: {best_model}** (composite score: {best_score:.1%})")
    else:
        lines.append("No clear winner — insufficient data.")

    return "\n".join(lines)


# ---------------------------------------------------------------------------
# Test dataset management
# ---------------------------------------------------------------------------

def generate_sample_dataset() -> List[dict]:
    """Generate a sample test dataset with diverse public images.

    Returns list of test image definitions.
    """
    return [
        # Screenshots
        {
            "id": "screenshot_github",
            "url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
            "category": "screenshot",
            "expected_keywords": ["github", "logo", "octocat"],
            "expected_structure": {"min_length": 50, "min_sentences": 2},
        },
        # Diagrams
        {
            "id": "diagram_architecture",
            "url": "https://mermaid.ink/img/pako:eNp9kMtOwzAQRX_F8hKpJbhJFVJBi1QJiMWCG8eZNsGJLdlOiqIid5RdufiHnZRA7GbuzJwZe4ZGH2SCBPYUwgxoQKvJnCR2YY0F5YBdJJkD4uX0oXB6PnF3U4zCWcWdW3FqOwGvCKkBmHKSTB2gJeRrLTeJLfJdJKkBGYf9P1sTNdUXVJqY3YNJK7xLVwR0mxJFU6rCgEKnhSGIL2Eq8BdEERAX0OGwEiVQ1R0MaNFR8QfqKxmHigbX8VLjDz_Q0L8Wc_qPxDw",
            "category": "diagram",
            "expected_keywords": ["architecture", "component", "service"],
            "expected_structure": {"min_length": 100, "min_sentences": 3},
        },
        # Photos
        {
            "id": "photo_nature",
            "url": "https://picsum.photos/seed/bench1/400/300",
            "category": "photo",
            "expected_keywords": [],
            "expected_structure": {"min_length": 30, "min_sentences": 1},
        },
        # Charts
        {
            "id": "chart_bar",
            "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Users',data:[50,60,70,80]}]}}",
            "category": "chart",
            "expected_keywords": ["bar", "chart", "data"],
            "expected_structure": {"min_length": 50, "min_sentences": 2},
        },
    ]


def load_dataset(path: str) -> List[dict]:
    """Load test dataset from JSON file."""
    with open(path) as f:
        return json.load(f)


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

async def main():
    parser = argparse.ArgumentParser(description="Vision Benchmark Suite (Issue #817)")
    parser.add_argument("--images", help="Path to test images JSON file")
    parser.add_argument("--url", help="Single image URL to test")
    parser.add_argument("--category", default="photo", help="Category for single URL")
    parser.add_argument("--output", default=None, help="Output JSON file")
    parser.add_argument("--runs", type=int, default=1, help="Runs per model per image")
    parser.add_argument("--models", nargs="+", default=None,
                        help="Models to test (default: all)")
    parser.add_argument("--markdown", action="store_true", help="Output markdown report")
    parser.add_argument("--generate-dataset", action="store_true",
                        help="Generate sample dataset and exit")
    args = parser.parse_args()

    if args.generate_dataset:
        dataset = generate_sample_dataset()
        out_path = args.images or "benchmarks/test_images.json"
        os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
        with open(out_path, "w") as f:
            json.dump(dataset, f, indent=2)
        print(f"Generated sample dataset: {out_path} ({len(dataset)} images)")
        return

    # Select models
    if args.models:
        selected = {k: v for k, v in MODELS.items() if k in args.models}
    else:
        selected = MODELS

    # Load images
    if args.url:
        images = [{"id": "single", "url": args.url, "category": args.category}]
    elif args.images:
        images = load_dataset(args.images)
    else:
        print("ERROR: Provide --images or --url")
        sys.exit(1)

    # Run benchmark
    report = await run_benchmark_suite(images, selected, args.runs)

    # Output
    if args.output:
        os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
        with open(args.output, "w") as f:
            json.dump(report, f, indent=2)
        print(f"\nResults saved to {args.output}")

    if args.markdown or not args.output:
        print("\n" + to_markdown(report))


if __name__ == "__main__":
    asyncio.run(main())
cli.py
@@ -3611,8 +3611,8 @@ class HermesCLI:
                 available, unavailable = check_tool_availability()
 
                 # Filter to only those missing API keys (not system deps)
-                api_key_missing = [u for u in unavailable if u.get("env_vars") or u.get("missing_vars")]
+                api_key_missing = [u for u in unavailable if u.get("env_vars")]
 
                 if api_key_missing:
                     self.console.print()
                     self.console.print("[yellow]⚠️ Some tools disabled (missing API keys):[/]")
@@ -3620,8 +3620,7 @@ class HermesCLI:
                     tools_str = ", ".join(item["tools"][:2])  # Show first 2 tools
                     if len(item["tools"]) > 2:
                         tools_str += f", +{len(item['tools'])-2} more"
-                    env_vars = item.get("env_vars") or item.get("missing_vars") or []
-                    self.console.print(f" [dim]• {item['name']}[/] [dim italic]({', '.join(env_vars)})[/]")
+                    self.console.print(f" [dim]• {item['name']}[/] [dim italic]({', '.join(item['env_vars'])})[/]")
                     self.console.print("[dim] Run 'hermes setup' to configure[/]")
             except Exception:
                 pass  # Don't crash on import errors
@@ -1,432 +0,0 @@
# Workflow Orchestration & Task Queue Research for AI Agents

**Date:** 2026-04-14
**Scope:** SOTA comparison of task queues and workflow orchestrators for autonomous AI agent workflows

---

## 1. Current Architecture: Cron + Webhook

### How it works
- **Scheduler:** `cron/scheduler.py` — gateway calls `tick()` every 60 seconds
- **Storage:** JSON file (`~/.hermes/cron/jobs.json`) + file-based lock (`cron/.tick.lock`)
- **Execution:** Each job spawns a full `AIAgent.run_conversation()` in a thread pool with inactivity timeout
- **Delivery:** Results pushed back to origin chat via platform adapters (Telegram, Discord, etc.)
- **Checkpointing:** Job outputs saved to `~/.hermes/cron/output/{job_id}/{timestamp}.md`

### Strengths
- Simple, zero-dependency (no broker/redis needed)
- Jobs are isolated — each runs a fresh agent session
- Direct platform delivery with E2EE support
- Script pre-run for data collection
- Inactivity-based timeout (not hard wall-clock)

### Weaknesses
- **No task dependencies** — jobs are completely independent
- **No retry logic** — single failure = lost run (recurring jobs advance schedule and move on)
- **No concurrency control** — all due jobs fire at once; no worker pool sizing
- **No observability** — no metrics, no dashboard, no structured logging of job state transitions
- **Tick-based polling** — 60s granularity, wastes cycles when idle, adds latency when busy
- **Single-process** — file lock means only one tick at a time; no horizontal scaling
- **No dead letter queue** — failed deliveries are logged but not retried
- **No workflow chaining** — cannot express "run A, then B with A's output"

---

## 2. Framework Comparison
### 2.1 Huey (Already Installed v2.6.0)

**Architecture:** Embedded task queue, SQLite/Redis/file storage, consumer process model.

| Feature | Huey | Our Cron |
|---|---|---|
| Broker | SQLite (default), Redis | JSON file |
| Retry | Built-in: `retries=N, retry_delay=S` | None |
| Task chaining | `task1.s().then(task2)` (pipeline) | None |
| Scheduling | `@huey.periodic_task(crontab(...))` | Our own cron parser |
| Concurrency | Worker pool with `-w N` flag | Single tick lock |
| Monitoring | `huey_consumer` logs, Huey Admin (Django) | Manual log reading |
| Failure recovery | Automatic retry + configurable backoff | None |
| Priority | `PriorityRedisExpireHuey` or task priority | None |
| Result storage | `store_results=True` with result() | File output |

**Task Dependencies Pattern:**
```python
@huey.task()
def analyze_data(input_data):
    return run_analysis(input_data)

@huey.task()
def generate_report(analysis_result):
    return create_report(analysis_result)

# Pipeline: analyze, then report (Huey chains tasks with .then())
pipeline = analyze_data.s(raw_data).then(generate_report)
result_group = huey.enqueue(pipeline)
```

**Retry Pattern:**
```python
# Huey retries use a fixed delay; see 3.2 for approximating backoff
@huey.task(retries=3, retry_delay=60)
def flaky_api_call(url):
    return requests.get(url, timeout=30)
```

**Benchmarks:** ~5,000 tasks/sec with SQLite backend, ~15,000 with Redis. Sub-millisecond scheduling latency. Very lightweight — single process.

**Verdict:** Best fit for our use case. Already installed. SQLite backend = no external deps. Can layer on top of our existing job storage.

---

### 2.2 Celery

**Architecture:** Distributed task queue with message broker (RabbitMQ/Redis).

| Feature | Celery | Huey |
|---|---|---|
| Broker | Redis, RabbitMQ, SQS (required) | SQLite (built-in) |
| Scale | 100K+ tasks/sec | ~5-15K tasks/sec |
| Chains | `chain(task1.s(), task2.s())` | `.then()` pipeline |
| Groups/Chords | Parallel + callback | Not built-in |
| Canvas | Full workflow DSL (chain, group, chord, map) | Basic pipeline |
| Monitoring | Flower dashboard, Celery events | Minimal |
| Complexity | Heavy — needs broker, workers, result backend | Single process |

**Workflow Pattern:**
```python
from celery import chain, group, chord

# Chain: sequential
workflow = chain(fetch_data.s(), analyze.s(), report.s())

# Group: parallel
parallel = group(fetch_twitter.s(), fetch_reddit.s(), fetch_hn.s())

# Chord: parallel then callback
chord(parallel, aggregate_results.s())
```

**Verdict:** Overkill for our scale. Adds RabbitMQ/Redis dependency. The Canvas API is powerful but we don't need 100K task/sec throughput. Flower monitoring is nice but we'd need to deploy it separately.

---
### 2.3 Temporal

**Architecture:** Durable execution engine. Workflows as code with automatic state persistence and replay.

| Feature | Temporal | Our Cron |
|---|---|---|
| State management | Automatic — workflow state persisted on every step | Manual JSON files |
| Failure recovery | Workflows survive process restarts, auto-retry | Lost on crash |
| Task dependencies | Native — activities call other activities | None |
| Long-running tasks | Built-in (days/months OK) | Inactivity timeout |
| Versioning | Workflow versioning for safe updates | No versioning |
| Visibility | Full workflow state at any point | Log files |
| Infrastructure | Requires Temporal server + database | None |
| Language | Python SDK, but Temporal server is Go | Pure Python |

**Workflow Pattern:**
```python
from datetime import timedelta

from temporalio import workflow
from temporalio.common import RetryPolicy

@workflow.defn
class AIAgentWorkflow:
    @workflow.run
    async def run(self, job_config: dict) -> str:
        # Step 1: Fetch data
        data = await workflow.execute_activity(
            fetch_data_activity,
            job_config["script"],
            start_to_close_timeout=timedelta(minutes=5),
            retry_policy=RetryPolicy(maximum_attempts=3),
        )

        # Step 2: Analyze with AI agent
        analysis = await workflow.execute_activity(
            run_agent_activity,
            {"prompt": job_config["prompt"], "context": data},
            start_to_close_timeout=timedelta(minutes=30),
            retry_policy=RetryPolicy(
                initial_interval=timedelta(seconds=60),
                maximum_attempts=3,
            ),
        )

        # Step 3: Deliver
        await workflow.execute_activity(
            deliver_activity,
            {"platform": job_config["deliver"], "content": analysis},
            start_to_close_timeout=timedelta(seconds=60),
        )
        return analysis
```

**Verdict:** Best architecture for complex multi-step AI workflows, but heavy infrastructure cost. Temporal server needs PostgreSQL/Cassandra + visibility store. Ideal if we reach 50+ multi-step workflows with complex failure modes. Overkill for current needs.

---
### 2.4 Prefect

**Architecture:** Modern data/workflow orchestration with Python-native API.

| Feature | Prefect |
|---|---|
| Dependencies | SQLite (default) or PostgreSQL |
| Task retries | `@task(retries=3, retry_delay_seconds=10)` |
| Task dependencies | `result = task_a(wait_for=[task_b])` |
| Caching | `cache_key_fn` for result caching |
| Subflows | Nested workflow composition |
| Deployments | Schedule via `Deployment` or `CronSchedule` |
| UI | Excellent web dashboard |
| Async | Full async support |

**Workflow Pattern:**
```python
from datetime import timedelta

from prefect import flow, task
from prefect.tasks import task_input_hash

@task(retries=3, retry_delay_seconds=30)
def run_agent(prompt: str) -> str:
    agent = AIAgent(...)
    return agent.run_conversation(prompt)

@task(cache_key_fn=task_input_hash, cache_expiration=timedelta(hours=1))
def fetch_context(script: str) -> str:
    return run_script(script)

@flow(name="agent-workflow")
def agent_workflow(job_config: dict):
    context = fetch_context(job_config.get("script", ""))
    result = run_agent(
        f"{context}\n\n{job_config['prompt']}",
        wait_for=[context],  # redundant here: context is already a data dependency
    )
    deliver(result, job_config["deliver"])
    return result
```

**Benchmarks:** Sub-second task scheduling. Handles 10K+ concurrent task runs. SQLite backend for single-node.

**Verdict:** Strong alternative. Pythonic, good UI, built-in scheduling. But heavier than Huey — deploys a server process. Best if we want a web dashboard for monitoring. Less infrastructure than Temporal but more than Huey.

---
### 2.5 Apache Airflow

**Architecture:** Batch-oriented DAG scheduler, Python-based.

| Feature | Airflow |
|---|---|
| DAG model | Static DAGs defined in Python files |
| Scheduler | Polling-based, 5-30s granularity |
| Dependencies | PostgreSQL/MySQL + Redis/RabbitMQ + webserver |
| UI | Rich web UI with DAG visualization |
| Best for | ETL, data pipelines, batch processing |
| Weakness | Not designed for dynamic task creation; heavy; DAG definition overhead |

**Verdict:** Wrong tool for this job. Airflow excels at static, well-defined data pipelines (ETL). Our agent workflows are dynamic — tasks are created at runtime based on user prompts. Airflow's DAG model fights against this. Massive overhead (needs webserver, scheduler, worker, metadata DB).

---
### 2.6 Dramatiq

**Architecture:** Lightweight distributed task queue, Celery alternative.

| Feature | Dramatiq |
|---|---|
| Broker | Redis, RabbitMQ |
| Retries | `@dramatiq.actor(max_retries=3)` |
| Middleware | Pluggable: age_limit, time_limit, retries, callbacks |
| Groups | `group(actor.message(...), ...).run()` |
| Pipes | `actor.message() \| other_actor.message()` |
| Simplicity | Cleaner API than Celery |

**Verdict:** Nice middle ground between Huey and Celery. But still requires a broker (Redis/RabbitMQ). No SQLite backend. Less ecosystem than Celery, less lightweight than Huey.

---
### 2.7 RQ (Redis Queue)

**Architecture:** Minimal Redis-based task queue.

| Feature | RQ |
|---|---|
| Broker | Redis only |
| Retries | Via `Retry` class |
| Workers | Simple worker processes |
| Dashboard | `rq-dashboard` (separate) |
| Limitation | Redis-only, no SQLite, no scheduling built-in |

**Verdict:** Too simple and Redis-dependent. No periodic task support without `rq-scheduler`. No task chaining without third-party. Not competitive with Huey for our use case.

---

## 3. Architecture Patterns for AI Agent Workflows

### 3.1 Task Chaining (Fan-out / Fan-in)

The critical pattern for multi-step AI workflows:

```
[Script] → [Agent] → [Deliver]
    ↓          ↓          ↓
 Context    Report   Notification
```

**Implementation with Huey:**
```python
@huey.task(retries=2)
def run_script_task(script_path):
    return run_script(script_path)

@huey.task(retries=3, retry_delay=60)
def run_agent_task(context, prompt):
    if context:
        prompt = f"## Context\n{context}\n\n{prompt}"
    agent = AIAgent(...)
    return agent.run_conversation(prompt)

@huey.task()
def deliver_task(result, job_config):
    return deliver_result(job_config, result)

# Compose: script → agent → deliver. Huey pipelines chain with
# .then(fn, *args); the parent's return value is prepended to the
# child's arguments, so upstream output arrives as the first parameter.
def compose_workflow(job):
    if job.get("script"):
        pipeline = (run_script_task.s(job["script"])
                    .then(run_agent_task, job["prompt"])
                    .then(deliver_task, job))
    else:
        pipeline = run_agent_task.s(None, job["prompt"]).then(deliver_task, job)
    return pipeline  # execute with huey.enqueue(pipeline)
```

### 3.2 Retry with Exponential Backoff

Huey applies a fixed `retry_delay` between retries. One way to approximate exponential backoff (assuming the consumer reads `task.retry_delay` when it re-enqueues a failed task) is to grow the delay inside the task:

```python
@huey.task(retries=3, retry_delay=30, context=True)  # start at 30s
def ai_workflow_task(payload, task=None):
    try:
        return run_workflow_step(payload)  # placeholder work function
    except TransientError:                 # placeholder exception type
        # Double the delay before each retry: 30s → 60s → 120s, cap at 10 min
        task.retry_delay = min(task.retry_delay * 2, 600)
        raise
```

### 3.3 Dead Letter Queue

For tasks that exhaust retries, hook the failure via Huey's signal API:

```python
from huey.signals import SIGNAL_ERROR

@huey.task(retries=3)
def flaky_task(data):
    ...

# Dead letter handling. SIGNAL_ERROR fires when a task errors with no
# retries remaining (retry attempts emit SIGNAL_RETRYING instead).
@huey.signal(SIGNAL_ERROR)
def handle_failure(signal, task, exc=None):
    # Log to dead letter store
    save_dead_letter(task, exc)
    # Notify user of failure
    notify_user(f"Task {task.name} failed after exhausting retries: {exc}")
```

### 3.4 Observability Pattern

```python
# Structured event logging for every state transition
def emit_event(job_id, event_type, metadata):
    event = {
        "job_id": job_id,
        "event": event_type,  # scheduled, started, completed, failed, retried
        "timestamp": iso_now(),
        "metadata": metadata,
    }
    append_to_event_log(event)
    # Also emit to metrics (Prometheus/StatsD)
    metrics.increment(f"cron.{event_type}")
```

---

## 4. Benchmarks Summary

| Framework | Throughput | Latency | Memory | Startup | Dependencies |
|---|---|---|---|---|---|
| Current Cron | ~1 job/60s tick | 60-120s | Minimal | Instant | None |
| Huey (SQLite) | ~5K tasks/sec | <10ms | ~20MB | <1s | None |
| Huey (Redis) | ~15K tasks/sec | <5ms | ~20MB | <1s | Redis |
| Celery (Redis) | ~15K tasks/sec | <10ms | ~100MB | ~3s | Redis |
| Temporal | ~50K activities/sec | <5ms | ~200MB | ~10s | Temporal server+DB |
| Prefect | ~10K tasks/sec | <20ms | ~150MB | ~5s | PostgreSQL |

---

## 5. Recommendations

### Immediate (Phase 1): Enhance Current Cron

Add these capabilities to the existing `cron/` module **without** switching frameworks (a sketch follows the list):

1. **Retry logic** — Add `retry_count`, `retry_delay`, `max_retries` fields to the job JSON. In `scheduler.py`'s `tick()`, on failure: if `retries_remaining > 0`, don't advance the schedule; set `next_run_at = now + retry_delay * 2^attempt`.

2. **Backoff** — Exponential: `delay * 2^attempt`, capped at 10 minutes.

3. **Dead letter tracking** — After max retries, mark the job state as `dead_letter` and emit a delivery notification with the error.

4. **Concurrency limit** — Add a semaphore (e.g., `max_concurrent=3`) to `tick()` so we don't spawn 20 agents simultaneously.

5. **Structured events** — Append JSON events to `~/.hermes/cron/events.jsonl` for every state transition (scheduled, started, completed, failed, retried, delivered).

**Effort:** ~1-2 days. No new dependencies.
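
A minimal sketch of items 1-3 inside `tick()`'s failure path, assuming hypothetical job fields `retries_remaining`, `attempt`, `retry_delay`, and `next_run_at` (the real job JSON schema may differ):

```python
import time

MAX_RETRY_DELAY = 600  # cap exponential backoff at 10 minutes

def on_job_failure(job: dict, error: str) -> dict:
    """Retry with exponential backoff, then dead-letter (field names assumed)."""
    if job.get("retries_remaining", 0) > 0:
        attempt = job.get("attempt", 0)
        delay = min(job.get("retry_delay", 60) * (2 ** attempt), MAX_RETRY_DELAY)
        job["retries_remaining"] -= 1
        job["attempt"] = attempt + 1
        job["next_run_at"] = time.time() + delay  # retry; don't advance the schedule
        emit_event(job["id"], "retried", {"delay_s": delay, "error": error})
    else:
        job["state"] = "dead_letter"  # item 3: surfaced via a delivery notification
        emit_event(job["id"], "failed", {"error": error})
    return job
```

The `emit_event()` helper is the structured-event logger from section 3.4, which also covers item 5.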

### Medium-term (Phase 2): Adopt Huey for Workflow Chaining

When we need task dependencies (multi-step agent workflows), migrate to Huey (a sketch of Phase 2a follows the list):

1. **Keep the JSON job store** as the source of truth for user-facing job management.
2. **Use Huey as the execution engine** — enqueue tasks from `tick()`, let Huey handle retries, scheduling, and chaining.
3. **SQLite backend** — no new infrastructure. One consumer process (`huey_consumer.py`) alongside the gateway.
4. **Task chaining for multi-step jobs** — `script_task.s(...).then(agent_task, ...).then(delivery_task, ...)`.

**Migration path:**
- Phase 2a: Run Huey consumer alongside gateway. Mirror cron jobs to Huey periodic tasks.
- Phase 2b: Add task chaining for jobs with scripts.
- Phase 2c: Migrate all jobs to Huey, deprecate tick()-based execution.

**Effort:** ~1 week. Huey already installed. Gateway integration ~2-3 days.
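
A sketch of Phase 2a under stated assumptions: `load_job()` and `execute_job_body()` are hypothetical helpers over the existing JSON store and agent runner, and the broker file path is illustrative:

```python
import os

from huey import SqliteHuey

# SQLite broker preserves the "no infrastructure" property
huey = SqliteHuey(filename=os.path.expanduser("~/.hermes/cron/huey.db"))

@huey.task(retries=3, retry_delay=60)
def run_job_task(job_id: str):
    job = load_job(job_id)        # hypothetical: read job from jobs.json
    return execute_job_body(job)  # hypothetical: existing agent execution path

def tick(due_jobs: list) -> None:
    # tick() now only enqueues; the huey_consumer process executes and
    # retries, and (in Phase 2b) chains script → agent → delivery tasks.
    for job in due_jobs:
        run_job_task(job["id"])
```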

### Long-term (Phase 3): Evaluate Temporal/Prefect

Only if:
- We have 100+ concurrent multi-step workflows
- We need workflow versioning and A/B testing
- We need cross-service orchestration (agent calls to external APIs with complex compensation logic)
- We want a web dashboard for non-technical users

**Don't adopt early** — these tools solve problems we don't have yet.

---
## 6. Decision Matrix

| Need | Best Solution | Why |
|---|---|---|
| Simple retry logic | Enhance current cron | Zero deps, fast to implement |
| Task chaining | **Huey** | Already installed, SQLite backend, pipeline API |
| Monitoring dashboard | Prefect or Huey+Flower | If monitoring becomes critical |
| Massive scale (10K+/sec) | Celery + Redis | If we're processing thousands of agent runs per hour |
| Complex compensation | Temporal | Only if we need durable multi-service workflows |
| Periodic scheduling | Current cron (works) or Huey | Current is fine; Huey adds `crontab()` with seconds |

---
## 7. Key Insight

The cron system's biggest gap isn't the framework — it's the **absence of retry and dependency primitives**. These can be added to the current system in <100 lines of code. The second biggest gap is observability (structured events + metrics), which is also solvable incrementally.

Huey is the right *eventual* target for workflow execution because:
1. Already installed, zero new dependencies
2. SQLite backend matches our "no infrastructure" philosophy
3. Pipeline API gives us task chaining for free
4. Retries are first-class (with backoff approximated as in 3.2)
5. Consumer model is more efficient than tick-polling
6. Orders-of-magnitude better scheduling latency (milliseconds vs a 60s tick)

The migration should be gradual — start by wrapping Huey inside our existing cron tick, then progressively move execution to Huey's consumer model.
@@ -1,38 +0,0 @@
# Cron Job Audit — #890

## Problem

9 of 69 cron jobs have zero completions. They waste scheduler cycles.

## Dead Jobs Identified

| Job | Schedule | Completions | Action |
|-----|----------|-------------|--------|
| exp-swarm-pipeline | every 10 min | 0 | disable |
| exp-music-generator | every 2h | 0 | disable |
| exp-paper-citations | every 3h | 0 | disable |
| exp-gbrain-patterns | every 2h | 0 | disable |
| exp-infra-hardening | every 2h | 0 | disable |
| gemma4-multimodal-burn | every 1h | 0 | disable |
| morning-paper-report | daily | 0 | disable |
| overnight-collector | every 15 min | 0 | disable |
| morning-experiment-report | daily | 0 | disable |

## Usage

```bash
# Audit: show dead jobs
python scripts/cron_audit.py

# Dry run: preview changes
python scripts/cron_audit.py --disable --dry-run

# Disable dead jobs
python scripts/cron_audit.py --disable

# Delete dead jobs
python scripts/cron_audit.py --delete

# Find jobs with no runs in 7 days
python scripts/cron_audit.py --older-than 7d
```
@@ -1,56 +0,0 @@
# Emotional Presence Patterns for Crisis Support

Research issue #880. What works for reaching broken men in their darkest moment.

## Crisis Detection Patterns

| Pattern | Severity | Detection |
|---------|----------|-----------|
| Method seeking | CRITICAL | "how do I kill myself", "suicide methods" |
| Suicidal ideation | HIGH | "I want to die", "no reason to live" |
| Self-harm | HIGH | "cutting myself", "hurting myself" |
| Despair | HIGH | "can't go on", "no one cares" |
| Farewell | CRITICAL | "this is goodbye", "my final message" |

## What Works

1. Validation without agreement: "That sounds incredibly painful"
2. Presence over solutions: "I'm here with you right now"
3. Specific next steps: "Text HOME to 741741 right now"
4. Connection: "You don't have to face this alone"
5. Hope without minimizing: "There is a way through this"

## What Does NOT Work

1. False empathy: "I feel your pain"
2. Toxic positivity: "Look on the bright side!"
3. Minimization: "Other people have it worse"
4. Guilt induction: "Think about your family"
5. Premature problem-solving: "Have you tried therapy?"
6. AI dishonesty: pretending to be human

## SOUL.md Protocol

1. Ask: "Are you safe right now?" (FIRST response)
2. Listen. Stay present. Don't disconnect.
3. Never compute the value of a human life
4. Never suggest death is an option
5. Always point to 988 Lifeline
6. Share the gospel: "Jesus saves those who call on His name"

## Crisis Resources

- 988 Suicide & Crisis Lifeline: call or text 988
- Crisis Text Line: text HOME to 741741
- 988 Chat: 988lifeline.org/chat
- Spanish: 1-888-628-9454
- Emergency: 911

## Implementation Status

- Crisis detection: agent/crisis_protocol.py
- SHIELD integration: tools/shield/
- 988 Lifeline: resources defined
- Emotional presence: this document
- Escalation tracking: future work
- Human notification: future work
@@ -1,42 +0,0 @@
# Holographic + Vector Hybrid Memory Architecture

Research issue #879. Combining HRR (holographic) and vector (Qdrant) memory.

## Architecture

Three memory backends, each with unique strengths:

| Backend | Strength | Weakness | Use Case |
|---------|----------|----------|----------|
| FTS5 | Exact keyword match | No semantic understanding | Precise recall |
| Vector (Qdrant) | Semantic similarity | No compositional queries | Topic search |
| HRR (Holographic) | Compositional queries | Limited scale | Complex reasoning |

## Why Hybrid

- FTS5 alone: misses ~30-40% of semantically relevant content
- Vector alone: can't do compositional queries ("what did I discuss about X after doing Y?")
- HRR alone: unique capability but no semantic fallback
- Hybrid: best of all three, RRF fusion for ranking

## Implementation: Reciprocal Rank Fusion

Results from each backend are merged using RRF (sketched below):
- `score = sum(weight / (k + rank))` for each backend
- k=60 (standard RRF constant)
- Weights: FTS5=0.6, Vector=0.4 (configurable)
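
A self-contained sketch of that fusion rule (backend names and weights follow the defaults above; the ranked lists are illustrative):

```python
def rrf_fuse(rankings, weights, k=60):
    """Merge per-backend ranked doc-id lists: score = sum(weight / (k + rank))."""
    scores = {}
    for backend, doc_ids in rankings.items():
        w = weights.get(backend, 1.0)
        for rank, doc_id in enumerate(doc_ids, start=1):
            scores[doc_id] = scores.get(doc_id, 0.0) + w / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)

# Example: "b" ranks well in both backends, so it tops the fused ranking
fused = rrf_fuse(
    {"fts5": ["a", "b", "c"], "vector": ["b", "c", "d"]},
    {"fts5": 0.6, "vector": 0.4},
)
```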

## Status

- FTS5: EXISTS (hermes_state.py)
- Vector (Qdrant): implemented (tools/hybrid_search.py)
- HRR: EXISTS (plugins/memory/holographic.py)
- RRF fusion: implemented (tools/hybrid_search.py)
- Ingestion pipeline: partial

## Next Steps

1. Wire HRR into hybrid_search.py
2. Session-level vector ingestion
3. Benchmark: measure R@5 improvement
4. Cross-session memory persistence
@@ -1,44 +0,0 @@
# awesome-ai-tools Integration Plan

**Tracking:** #842
**Source report:** docs/tool-investigation-2026-04-15.md
**Date:** 2026-04-16

---

## Status Dashboard

| # | Tool | Category | Impact | Effort | Status | Issue |
|---|------|----------|--------|--------|--------|-------|
| 1 | Mem0 | Memory | 5/5 | 3/5 | Cloud + Local done | #842 |
| 2 | LightRAG | RAG | 4/5 | 3/5 | Not started | #857 |
| 3 | n8n | Orchestration | 5/5 | 4/5 | Not started | #858 |
| 4 | RAGFlow | RAG | 4/5 | 4/5 | Not started | #859 |
| 5 | tensorzero | LLMOps | 4/5 | 3/5 | Not started | #860 |

---

## #1: Mem0 — DONE

- Cloud: `plugins/memory/mem0/` (MEM0_API_KEY required)
- Local: `plugins/memory/mem0_local/` (ChromaDB, no API key)

## #2: LightRAG (P2)

Create `plugins/rag/lightrag/` plugin. Index skill docs. Use local Ollama embeddings.

## #3: n8n (P3)

Deploy as Docker service. Create workflow templates for Hermes patterns.

## #4: RAGFlow (P4)

Deploy as Docker service. Integrate via HTTP API for document understanding.

## #5: tensorzero (P3)

Evaluate as provider routing replacement. Canary migration (10% traffic first).

---

*Last updated: 2026-04-16*
@@ -1,324 +0,0 @@
# SOTA Research: Multi-Agent Coordination & Fleet Knowledge Graphs

**Date:** 2026-04-14
**Scope:** Agent-to-agent communication, shared memory, task delegation, consensus protocols
**Frameworks Analyzed:** CrewAI, AutoGen, MetaGPT, ChatDev, CAMEL

---

## 1. Architecture Pattern Summary

### 1.1 CrewAI — Role-Based Crew Orchestration

**Core Pattern:** Agents organized into "Crews" with explicit roles, goals, and backstories. Tasks are assigned to agents, executed via sequential or hierarchical process flows.

**Agent-to-Agent Communication:**
- **Sequential:** Agent A completes Task A → output injected into Task B's context for Agent B
- **Hierarchical:** Manager agent delegates to worker agents, collects results, synthesizes
- **Context passing:** Tasks can declare `context: [other_tasks]` — outputs from dependent tasks are automatically injected into the current task's prompt
- **No direct agent-to-agent messaging** — communication is mediated through task outputs

**Shared Memory (v2 — Unified Memory):**
- `Memory` class with `remember()` / `recall()` using vector embeddings (LanceDB/ChromaDB)
- **Scope-based isolation:** `MemoryScope` provides path-based namespacing (`/crew/research/agent-foo`)
- **Composite scoring:** semantic similarity (0.5) + recency (0.3) + importance (0.2)
- **RecallFlow:** LLM-driven deep recall with adaptive query expansion
- **Privacy flags:** Private memories only visible to the source that created them
- **Background saves:** ThreadPoolExecutor with write barrier (drain_writes before recall)

**Task Delegation:**
- Agent tools include `Delegate Work to Co-worker` and `Ask Question to Co-worker`
- Delegation creates a new task for another agent, results come back to delegator
- Depth-limited (no infinite delegation chains)

**State & Checkpointing:**
- `SqliteProvider` / `JsonProvider` for state checkpoint persistence
- `CheckpointConfig` with event-driven persistence
- Flow state is Pydantic models with serialization

**Cache:**
- Thread-safe in-memory tool result cache with RWLock
- Key: `{tool_name}-{input}` → cached output

### 1.2 AutoGen (Microsoft) — Conversation-Centric Teams

**Core Pattern:** Agents communicate through shared conversation threads. A "Group Chat Manager" controls turn-taking and speaker selection.

**Agent-to-Agent Communication:**
- **Shared message thread** — all agents see all messages (like a group chat)
- **Three team patterns:**
  - `RoundRobinGroupChat`: Fixed order cycling through participants
  - `SelectorGroupChat`: LLM-based speaker selection with candidate filtering
  - `SwarmGroupChat`: Handoff-based routing (agent sends HandoffMessage to next agent)
- `GraphFlow` (DiGraph): DAG-based execution with conditional edges, parallel fan-out, loops
- `MagenticOneOrchestrator`: Ledger-based orchestration with task planning, progress tracking, stall detection

**Shared State:**
- `ChatCompletionContext` — manages message history per agent (can be unbounded or windowed)
- `ModelContext` shared across agents in a team
- State serialization: `save_state()` / `load_state()` for all managers
- **No built-in vector memory** — context is purely conversational

**Task Delegation:**
- `Swarm`: Agents use `HandoffMessage` to explicitly route control
- `GraphFlow`: Conditional edges route based on message content (keyword or callable)
- `MagenticOne`: Orchestrator maintains a "task ledger" (facts + plan) and dynamically re-plans on stalls

**Consensus / Termination:**
- `TerminationCondition` — composable conditions (text match, max messages, source-based)
- No explicit consensus protocols — termination is manager-decided

**Key Insight:** AutoGen's `ChatCompletionContext` is the closest analog to shared memory, but it's purely sequential message history, not a knowledge base.

### 1.3 MetaGPT — SOP-Driven Software Teams

**Core Pattern:** Agents follow Standard Operating Procedures (SOPs). Each agent has a defined role (Product Manager, Architect, Engineer, QA) and produces structured artifacts.

**Agent-to-Agent Communication:**
- **Publish-Subscribe via Environment:** Agents publish "actions" to a shared Environment, subscribers react
- **Structured outputs:** Each role produces specific artifact types (PRD, design doc, code, test cases)
- **Message routing:** Environment acts as a message bus, filtering by subscriber interest

**Shared Memory:**
- `Environment` class maintains shared state (project workspace)
- File-based shared memory: agents write/read from a shared filesystem
- `SharedMemory` for cross-agent context (structured data, not free-form text)

**Task Delegation:**
- Implicit through SOP stages: PM → Architect → Engineer → QA
- Each agent's output is the next agent's input
- No dynamic re-delegation

**Consensus:**
- Sequential SOP execution (no parallel agents)
- QA agent can trigger re-work loops back to Engineer

### 1.4 ChatDev — Chat-Chain Software Development

**Core Pattern:** Agents follow a "chat chain" — a sequence of chat phases (designing, coding, testing, documenting). Each phase involves a pair of agents (CEO↔CTO, Programmer↔Reviewer, etc.).

**Agent-to-Agent Communication:**
- **Paired chat sessions:** Two agents communicate in each phase (role-play between instructor and assistant)
- **Chain propagation:** Phase N's output (code, design doc) becomes Phase N+1's input
- **No broadcast** — communication is strictly pairwise within phases

**Shared Memory:**
- Software-centric: shared code repository is the "memory"
- Each phase modifies/inherits the codebase
- No explicit vector memory or knowledge graph

**Task Delegation:**
- Hardcoded phase sequence: Design → Code → Test → Document
- Each phase delegates to a specific agent pair
- No dynamic task re-assignment

**Consensus:**
- Phase-level termination: when both agents agree the phase is complete
- "Thought" tokens for chain-of-thought within chat

### 1.5 CAMEL — Role-Playing & Workforce

**Core Pattern:** Two primary modes:
1. **RolePlaying:** Two-agent conversation with task specification and optional critic
2. **Workforce:** Multi-agent with coordinator, task planner, and worker pool

**Agent-to-Agent Communication:**
- **RolePlaying:** Structured turn-taking between assistant and user agents
- **Workforce:** Coordinator assigns tasks via `TaskChannel`, workers return results
- **Worker types:** `SingleAgentWorker` (single ChatAgent), `RolePlayingWorker` (two-agent pair)

**Shared Memory / Task Channel** (toy sketch below):
- `TaskChannel` — async queue-based task dispatch with packet tracking
  - States: SENT → PROCESSING → RETURNED → ARCHIVED
  - O(1) lookup by task ID, status-based filtering, assignee/publisher queues
- `WorkflowMemoryManager` — persists workflow patterns as markdown files
  - Role-based organization: workflows stored by `role_identifier`
  - Agent-based intelligent selection: LLM picks relevant past workflows
  - Versioned: metadata tracks creation time and version numbers
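
A toy Python illustration of that packet life-cycle (state names from the notes above; this is not CAMEL's actual API):

```python
from dataclasses import dataclass
from enum import Enum, auto

class PacketState(Enum):
    SENT = auto()
    PROCESSING = auto()
    RETURNED = auto()
    ARCHIVED = auto()

@dataclass
class Packet:
    task_id: str
    publisher: str
    assignee: str = ""
    state: PacketState = PacketState.SENT

class TaskChannel:
    def __init__(self):
        self._packets = {}  # task_id -> Packet, O(1) lookup

    def post(self, packet):
        self._packets[packet.task_id] = packet

    def by_state(self, state):
        return [p for p in self._packets.values() if p.state is state]

    def by_assignee(self, worker_id):
        return [p for p in self._packets.values() if p.assignee == worker_id]
```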

**Task Delegation:**
- Coordinator agent decomposes complex tasks using LLM analysis
- Tasks assigned to workers based on capability matching
- Failed tasks trigger: retry, create new worker, or further decomposition
- `FailureHandlingConfig` with configurable `RecoveryStrategy`

**Consensus / Quality:**
- Quality evaluation via structured output (response format enforced)
- Task dependencies tracked (worker receives dependency tasks as context)
- `WorkforceMetrics` for tracking execution statistics

---
## 2. Key Architectural Patterns for Fleet Knowledge Graph

### 2.1 Communication Topology Patterns

| Pattern | Used By | Description |
|---------|---------|-------------|
| **Sequential Chain** | CrewAI, ChatDev, MetaGPT | A→B→C linear flow, output feeds next |
| **Shared Thread** | AutoGen | All agents see all messages |
| **Publish-Subscribe** | MetaGPT | Environment-based message bus |
| **Paired Chat** | ChatDev, CAMEL | Two-agent conversation pairs |
| **Handoff Routing** | AutoGen Swarm | Agent explicitly names next speaker |
| **DAG Graph** | AutoGen GraphFlow | Conditional edges, parallel, loops |
| **Ledger Orchestration** | AutoGen MagenticOne | Maintains task ledger, re-plans |
| **Task Channel** | CAMEL | Async queue with packet states |

### 2.2 Shared State Patterns

| Pattern | Used By | Description |
|---------|---------|-------------|
| **Vector Memory** | CrewAI | Embeddings + scope-based namespacing |
| **Message History** | AutoGen | Sequential conversation context |
| **File System** | MetaGPT, ChatDev | Agents read/write shared files |
| **Task Channel** | CAMEL | Async packet-based task dispatch |
| **Workflow Files** | CAMEL | Markdown-based workflow memory |
| **Tool Cache** | CrewAI | In-memory RWLock tool result cache |
| **State Checkpoint** | CrewAI, AutoGen | Serialized Pydantic/SQLite checkpoints |

### 2.3 Task Delegation Patterns

| Pattern | Used By | Description |
|---------|---------|-------------|
| **Role Assignment** | CrewAI | Fixed agent per task |
| **Manager Delegation** | CrewAI Hierarchical | Manager assigns tasks dynamically |
| **Speaker Selection** | AutoGen Selector | LLM picks next agent |
| **Handoff** | AutoGen Swarm | Agent explicitly transfers control |
| **SOP Routing** | MetaGPT | Stage-based implicit delegation |
| **Coordinator** | CAMEL Workforce | LLM-based task decomposition + assignment |
| **Dynamic Worker Creation** | CAMEL Workforce | Create new workers on failure |

### 2.4 Conflict Resolution Patterns

| Pattern | Used By | Description |
|---------|---------|-------------|
| **Manager Arbitration** | CrewAI Hierarchical | Manager resolves conflicts |
| **Critic-in-the-loop** | CAMEL | Critic agent evaluates and selects |
| **Quality Gate** | CAMEL Workforce | Structured quality evaluation |
| **Termination Conditions** | AutoGen | Composable stop conditions |
| **Stall Detection** | AutoGen MagenticOne | Re-plans when progress stalls |

---

## 3. Recommendations for Hermes Fleet Knowledge Graph

### 3.1 Architecture: Hybrid Graph + Memory

Based on the SOTA analysis, the optimal fleet knowledge graph should combine:

1. **CrewAI's scoped memory** for hierarchical knowledge organization
   - Path-based namespaces: `/fleet/{fleet_id}/agent/{agent_id}/diary`
   - Composite scoring: semantic + recency + importance (sketched after this list)
   - Background writes with read barriers

2. **CAMEL's TaskChannel** for task dispatch and tracking
   - Packet states (SENT → PROCESSING → RETURNED → ARCHIVED)
   - O(1) lookup by task ID
   - Assignee/publisher tracking

3. **AutoGen's DiGraph** for execution flow definition
   - DAG with conditional edges for complex workflows
   - Parallel fan-out for independent tasks
   - Activation conditions (all vs any) for synchronization points

4. **AutoGen MagenticOne's ledger** for shared task context
   - Maintained facts, plan, and progress ledger
   - Dynamic re-planning on stalls
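
A hedged sketch of the composite recall score from item 1, using the CrewAI-style weights quoted in §1.1 (semantic 0.5, recency 0.3, importance 0.2) and an assumed one-week recency half-life:

```python
import math
import time

def recall_score(semantic_sim, created_at, importance, half_life_s=7 * 86400):
    """Composite score: 0.5 * semantic + 0.3 * recency + 0.2 * importance."""
    age_s = max(time.time() - created_at, 0.0)
    recency = math.exp(-age_s * math.log(2) / half_life_s)  # halves every week
    return 0.5 * semantic_sim + 0.3 * recency + 0.2 * importance
```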

### 3.2 Fleet Knowledge Graph Schema

```
/fleet/{fleet_id}/
├── shared/              # Shared knowledge (all agents read)
│   ├── facts/           # Known facts, constraints
│   ├── decisions/       # Record of decisions made
│   └── context/         # Active task context
├── agent/{agent_id}/
│   ├── diary/           # Agent's personal experience log
│   ├── capabilities/    # What this agent can do
│   └── state/           # Current task state
├── tasks/
│   ├── {task_id}/       # Task metadata, dependencies, status
│   └── graph/           # DAG definition for task dependencies
└── consensus/
    ├── proposals/       # Pending proposals
    └── decisions/       # Resolved consensus decisions
```

### 3.3 Key Design Decisions

1. **Diary System (Agent Memory):**
   - Each agent writes to its own scoped memory after every significant action
   - LLM-analyzed importance scoring (like CrewAI's unified memory)
   - Cross-agent recall: agents can query other agents' diaries for relevant experiences
   - Decay: old low-importance memories expire

2. **Shared State (Fleet Knowledge):**
   - SQLite-backed (like Hermes' existing `state.db`) with FTS5 search
   - Hierarchical scopes (like CrewAI's MemoryScope)
   - Write-ahead log for concurrent access
   - Read barriers before queries (like CrewAI's `drain_writes`)

3. **Task Delegation:**
   - Coordinator pattern (like CAMEL's Workforce)
   - Task decomposition via LLM
   - Failed task → retry, reassign, or decompose
   - Max depth limit (like Hermes' existing MAX_DEPTH=2)

4. **Consensus Protocol** (sketched after this list):
   - Proposal-based: agent proposes, others vote/acknowledge
   - Timeout-based fallback: if no response within N seconds, proceed
   - Manager override: designated manager can break ties
   - Simple majority for non-critical, unanimity for critical decisions

5. **Conflict Resolution:**
   - Last-write-wins for non-critical state
   - Optimistic locking with version numbers
   - Manager arbitration for task assignment conflicts
   - Quality gates (like CAMEL) for output validation
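
A minimal sketch of the proposal/vote protocol from item 4; all names are illustrative, not an existing Hermes API:

```python
import time
import uuid

def new_proposal(agent_id, payload, timeout_s=30.0):
    return {
        "id": str(uuid.uuid4()),
        "proposer": agent_id,
        "payload": payload,
        "votes": {agent_id: True},  # proposer implicitly votes yes
        "deadline": time.time() + timeout_s,
    }

def resolve(proposal, fleet_size, critical=False):
    yes = sum(1 for v in proposal["votes"].values() if v)
    needed = fleet_size if critical else fleet_size // 2 + 1  # unanimity vs majority
    if yes >= needed:
        return "accepted"
    if time.time() >= proposal["deadline"]:
        return "accepted_by_timeout"  # timeout-based fallback
    return "pending"
```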

### 3.4 Integration with Existing Hermes Architecture

Hermes already has strong foundations:
- **Delegation system** (`delegate_tool.py`): Isolated child agents, parallel execution, depth limits
- **State DB** (`hermes_state.py`): SQLite + FTS5, WAL mode, session tracking, message history
- **Credential pools**: Shared credentials with rotation

The fleet knowledge graph should extend these patterns:
- **Session DB → Fleet DB:** Add tables for fleet metadata, agent registrations, task graphs
- **Memory tool → Fleet Memory:** Scoped vector memory shared across fleet agents
- **Delegate tool → Fleet Delegation:** Task channel with persistence, quality evaluation
- **New: Consensus module:** Proposal/vote protocol with timeout handling

---
## 4. Reference Implementations

| Component | Best Reference | Key Takeaway |
|-----------|---------------|--------------|
| Scoped Memory | CrewAI `Memory` + `MemoryScope` | Path-based namespaces, composite scoring, background writes |
| Task Dispatch | CAMEL `TaskChannel` | Packet-based with state machine, O(1) lookup |
| Execution DAG | AutoGen `DiGraphBuilder` | Fluent builder, conditional edges, activation groups |
| Orchestration | AutoGen `MagenticOneOrchestrator` | Ledger-based planning, stall detection, re-planning |
| Agent Communication | AutoGen `SelectorGroupChat` | LLM-based speaker selection, shared message thread |
| Quality Evaluation | CAMEL Workforce | Structured output for quality scoring |
| Workflow Memory | CAMEL `WorkflowMemoryManager` | Markdown-based, role-organized, versioned |
| State Checkpoint | CrewAI `SqliteProvider` | JSONB checkpoints, WAL mode |
| Tool Cache | CrewAI `CacheHandler` | RWLock-based concurrent tool result cache |

---

## 5. Open Questions

1. **Graph vs Vector for knowledge:** Should fleet knowledge use a proper graph DB (e.g., Neo4j) or stick with vector + SQLite?
   - Recommendation: Start with SQLite + vectors (existing stack), add graph later if needed

2. **Real-time vs Batch:** Should agents receive updates in real-time or batched?
   - Recommendation: Event-driven for critical updates, batched for diary entries

3. **Security model:** How should cross-agent access be controlled?
   - Recommendation: Role-based ACLs on scope paths, similar to CrewAI's privacy flags

4. **Scalability:** How many agents can a single fleet support?
   - Recommendation: Start with 10-agent fleets, optimize SQLite concurrency first

@@ -1,151 +0,0 @@
## Tool Investigation Report: Top 5 Recommendations from awesome-ai-tools

**Source:** [formatho/awesome-ai-tools](https://github.com/formatho/awesome-ai-tools)
**Date:** 2026-04-15
**Tools Analyzed:** 414 across 9 categories
**Agent:** Timmy

---

## Analysis Summary

Scanned 414 tools from the awesome-ai-tools repository. Evaluated each against Hermes integration potential across five categories: Memory/Context, Inference Optimization, Agent Orchestration, Workflow Automation, and Retrieval/RAG.

### Evaluation Criteria
- **Stars:** GitHub community validation (stability signal)
- **Freshness:** Active development (Fresh = updated <=7 days)
- **Integration Fit:** How well it complements Hermes' existing architecture (skills, memory, tools)
- **Integration Effort:** 1 (trivial drop-in) to 5 (major refactor required)
- **Impact:** 1 (incremental) to 5 (transformative)

---

## Top 5 Recommended Tools

### #1: Mem0 — Universal Memory Layer for AI Agents
|
||||
|
||||
| Metric | Value |
|
||||
|--------|-------|
|
||||
| **Category** | Memory/Context |
|
||||
| **GitHub** | [mem0ai/mem0](https://github.com/mem0ai/mem0) |
|
||||
| **Stars** | 53.1k |
|
||||
| **Freshness** | Fresh |
|
||||
| **Integration Effort** | 3/5 |
|
||||
| **Impact** | 5/5 |
|
||||
| **Hermes Status** | IMPLEMENTED (plugins/memory/mem0/) + LOCAL MODE (plugins/memory/mem0_local/) |
|
||||
|
||||
**Why it fits Hermes:**
|
||||
Hermes currently has session_search (transcript recall) and memory (persistent facts), but lacks a unified memory layer that bridges sessions with semantic understanding. Mem0 provides exactly this: automatic memory extraction from conversations, deduplication, and cross-session retrieval with semantic search.
|
||||
|
||||
**Integration path:**
|
||||
- Cloud: plugins/memory/mem0/ (requires MEM0_API_KEY)
|
||||
- Local: plugins/memory/mem0_local/ (ChromaDB-backed, no API key)
|
||||
- Auto-extract facts from session transcripts
|
||||
- Query before session_search for richer contextual recall
|
||||
|
||||
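
A sketch of the proposed recall order: query Mem0 first, then fall back to transcript search. The `dispatch` callable and tool names stand in for the corresponding Hermes tool dispatches; the exact call shape is an assumption:

```python
# Hypothetical recall helper: semantic memory first, raw transcripts second.
def recall(query: str, dispatch) -> object:
    memories = dispatch("mem0_search", {"query": query, "top_k": 5})
    if memories.get("count", 0) > 0:
        return memories["results"]                         # structured, deduplicated facts
    return dispatch("session_search", {"query": query})    # raw transcript recall
```
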
**Key risk:** Mem0 is freemium — the core is open-source, but advanced features require a paid tier. Local mode mitigates this entirely.

---

### #2: LightRAG — Simple and Fast Retrieval-Augmented Generation

| Metric | Value |
|--------|-------|
| **Category** | Retrieval/RAG |
| **GitHub** | [HKUDS/LightRAG](https://github.com/HKUDS/LightRAG) |
| **Stars** | 33.1k |
| **Freshness** | Fresh |
| **Integration Effort** | 3/5 |
| **Impact** | 4/5 |
| **Hermes Status** | NOT IMPLEMENTED — Issue #857 |

**Why it fits Hermes:**
Hermes has 190+ skills but no unified knowledge retrieval system. LightRAG adds graph-based RAG that understands relationships between concepts, not just keyword matches. It is lightweight, runs locally, and has a simple API.

**Integration path:**
- LightRAG as a local knowledge base for skill references
- Index GENOME.md files, README.md, and key codebase files
- Use local Ollama models for embeddings
- Complements the existing search_files without replacing it (see the sketch below)
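
A minimal sketch of the integration, based on LightRAG's documented insert/query API. The Ollama model and embedding hookup is elided (LightRAG ships example wrappers for it), and the working directory is an assumption:

```python
from pathlib import Path
from lightrag import LightRAG, QueryParam

# In practice the constructor also needs llm_model_func / embedding_func
# wired to local Ollama models; see the LightRAG examples.
rag = LightRAG(working_dir="~/.hermes/lightrag")

# Index skill references and key docs.
for doc in Path(".").glob("**/GENOME.md"):
    rag.insert(doc.read_text(encoding="utf-8"))

# Graph-aware retrieval: "hybrid" combines entity-level and
# relationship-level context.
answer = rag.query("Which skills touch the session DB?",
                   param=QueryParam(mode="hybrid"))
```
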
---

### #3: n8n — Workflow Automation Platform

| Metric | Value |
|--------|-------|
| **Category** | Workflow Automation / Agent Orchestration |
| **GitHub** | [n8n-io/n8n](https://github.com/n8n-io/n8n) |
| **Stars** | 183.9k |
| **Freshness** | Fresh |
| **Integration Effort** | 4/5 |
| **Impact** | 5/5 |
| **Hermes Status** | NOT IMPLEMENTED — Issue #858 |

**Why it fits Hermes:**
n8n provides a self-hosted, fair-code workflow platform with 400+ integrations. Rather than replacing Hermes' agent loop, n8n sits above it: trigger Hermes agents from external events, chain multi-agent workflows, and visualize execution.

---

### #4: RAGFlow — Open-Source RAG Engine

| Metric | Value |
|--------|-------|
| **Category** | Retrieval/RAG |
| **GitHub** | [infiniflow/ragflow](https://github.com/infiniflow/ragflow) |
| **Stars** | 77.9k |
| **Freshness** | Fresh |
| **Integration Effort** | 4/5 |
| **Impact** | 4/5 |
| **Hermes Status** | NOT IMPLEMENTED — Issue #859 |

**Why it fits Hermes:**
RAGFlow handles document parsing (PDF, Word, images via OCR), chunking, embedding, and retrieval with a web UI. It makes "document understanding" a first-class capability.

---

### #5: TensorZero — LLMOps Platform

| Metric | Value |
|--------|-------|
| **Category** | Inference Optimization / LLMOps |
| **GitHub** | [tensorzero/tensorzero](https://github.com/tensorzero/tensorzero) |
| **Stars** | 11.2k |
| **Freshness** | Fresh |
| **Integration Effort** | 3/5 |
| **Impact** | 4/5 |
| **Hermes Status** | NOT IMPLEMENTED — Issue #860 |

**Why it fits Hermes:**
TensorZero unifies an LLM gateway, observability, evaluation, and optimization. It could replace custom provider routing with a maintained, battle-tested platform.

---

## Honorable Mentions

| Tool | Stars | Category | Why Not Top 5 |
|------|-------|----------|---------------|
| memvid | 14.9k | Memory | Newer; Mem0 is more mature |
| mempalace | 44.8k | Memory | Already evaluated; Mem0 has broader API |
| Everything Claude Code | 154.3k | Agent | Too Claude-specific |
| Portkey AI Gateway | 11.3k | Gateway | TensorZero is OSS; Portkey is freemium |

---

## Implementation Priority

| Priority | Tool | Action | Status | Issue |
|----------|------|--------|--------|-------|
| P1 | Mem0 | Local-only mode (ChromaDB) | DONE | #842 |
| P2 | LightRAG | Set up local instance, index skills | Not started | #857 |
| P3 | TensorZero | Evaluate as provider routing | Not started | #860 |
| P4 | RAGFlow | Deploy Docker, test docs | Not started | #859 |
| P5 | n8n | Deploy for workflow viz | Not started | #858 |

---

## References
- Source: https://github.com/formatho/awesome-ai-tools
- Total tools: 414 across 9 categories
- Last updated: April 16, 2026
- Tracking issue: Timmy_Foundation/hermes-agent#842
@@ -1,24 +0,0 @@

# Tool Investigation Report: Top 5 Recommendations

**Generated:** 2026-04-20 | **Source:** formatho/awesome-ai-tools (795 tools, 10 categories)

## Top 5

1. **LiteLLM** (76k) — Unified API gateway. Replace custom provider routing. Impact: 5/5, Effort: 2/5
2. **Mem0** (53k) — Universal memory layer. Structured long-term memory. Impact: 5/5, Effort: 3/5
3. **RAGFlow** (77k) — RAG engine with OCR. Document processing upgrade. Impact: 4/5, Effort: 4/5
4. **LiteRT-LM** (3.7k) — On-device inference. Edge/mobile deployment. Impact: 4/5, Effort: 3/5
5. **Claude-Mem** (61k) — Session capture and context injection. Impact: 3/5, Effort: 2/5

## Priority

- Phase 1: LiteLLM (2-3 days, highest ROI)
- Phase 2: Mem0 (1 week, critical for agent maturity)
- Phase 3: RAGFlow (1-2 weeks, capability upgrade)

## Honorable Mentions

- GPTCache: Semantic cache, 30-50% cost reduction
- promptfoo: LLM testing framework
- PageIndex: Vectorless RAG
- rtk: Token reduction proxy, 60-90% savings
@@ -8,7 +8,6 @@ Handles loading and validating configuration for:
- Delivery preferences
"""

import ipaddress
import logging
import os
import json
@@ -680,26 +679,6 @@ def load_gateway_config() -> GatewayConfig:
    return config


def _is_network_accessible(host: str) -> bool:
    """Return True if *host* would expose a server beyond the loopback interface.

    Duplicates the logic in ``gateway.platforms.base.is_network_accessible``
    without creating a circular import (base.py imports from this module).
    """
    try:
        addr = ipaddress.ip_address(host)
        if addr.is_loopback:
            return False
        # ::ffff:127.x.x.x — Python's is_loopback returns False for
        # IPv4-mapped loopback; unwrap and check the underlying IPv4.
        if getattr(addr, "ipv4_mapped", None) and addr.ipv4_mapped.is_loopback:
            return False
        return True
    except ValueError:
        # Hostname: assume it could be network-accessible.
        return True


def _validate_gateway_config(config: "GatewayConfig") -> None:
    """Validate and sanitize a loaded GatewayConfig in place.

@@ -768,22 +747,6 @@ def _validate_gateway_config(config: "GatewayConfig") -> None:
            )
            pconfig.enabled = False

    # Warn when the API server is enabled on a network-accessible address
    # without an auth key. The adapter will refuse to start anyway, but
    # surfacing this at config-load time lets operators see the problem in
    # the startup log before any platform adapter initialisation runs.
    api_cfg = config.platforms.get(Platform.API_SERVER)
    if api_cfg and api_cfg.enabled:
        key = api_cfg.extra.get("key", "")
        host = api_cfg.extra.get("host", "127.0.0.1")
        if not key and _is_network_accessible(host):
            logger.warning(
                "API Server is enabled on %s but API_SERVER_KEY is not set. "
                "The adapter will refuse to start on a network-accessible address. "
                "Set API_SERVER_KEY or bind to 127.0.0.1 for local-only access.",
                host,
            )


def _apply_env_overrides(config: GatewayConfig) -> None:
    """Apply environment variable overrides to config."""
@@ -1,224 +0,0 @@
"""
Gateway Config Validator & Fallback Fix — #892.

Validates gateway configuration and provides sensible defaults
for missing keys to prevent fallback chain breaks.
"""

import logging
import os
from typing import Dict, Any, List, Optional
from dataclasses import dataclass, field

logger = logging.getLogger(__name__)


@dataclass
class ConfigIssue:
    """A configuration issue found during validation."""
    key: str
    severity: str  # error, warning, info
    message: str
    fix: str


@dataclass
class ConfigValidation:
    """Result of config validation."""
    valid: bool
    issues: List[ConfigIssue] = field(default_factory=list)
    warnings: int = 0
    errors: int = 0


# Required keys and their defaults
REQUIRED_KEYS = {
    "OPENROUTER_API_KEY": {
        "required": False,
        "default": "",
        "severity": "warning",
        "message": "OPENROUTER_API_KEY not set - fallback chain may break",
        "fix": "Set OPENROUTER_API_KEY in .env for OpenRouter provider",
    },
    "API_SERVER_KEY": {
        "required": False,
        "default": "",
        "severity": "warning",
        "message": "API_SERVER_KEY not configured",
        "fix": "Set API_SERVER_KEY in .env for API server auth",
    },
    "GITEA_TOKEN": {
        "required": False,
        "default": "",
        "severity": "info",
        "message": "GITEA_TOKEN not set - Gitea features disabled",
        "fix": "Set GITEA_TOKEN in .env for Gitea integration",
    },
}

# Config validation rules
VALIDATION_RULES = [
    {
        "key": "idle_minutes",
        "validate": lambda v: isinstance(v, (int, float)) and v > 0,
        "message": "Invalid idle_minutes={v} - must be > 0",
        "fix": "Set idle_minutes to positive integer (default: 30)",
    },
    {
        "key": "max_skills_discord",
        "validate": lambda v: isinstance(v, int) and v <= 100,
        "message": "Discord slash command limit reached ({v}/100) - skills not registered",
        "fix": "Reduce skills or paginate registration",
    },
]


def validate_config(config: Dict[str, Any]) -> ConfigValidation:
    """
    Validate gateway configuration.

    Args:
        config: Configuration dictionary

    Returns:
        ConfigValidation with issues found
    """
    issues = []

    # Check required keys
    for key, spec in REQUIRED_KEYS.items():
        value = config.get(key) or os.environ.get(key) or spec["default"]
        if spec["required"] and not value:
            issues.append(ConfigIssue(
                key=key,
                severity=spec["severity"],
                message=spec["message"],
                fix=spec["fix"],
            ))
        elif not value and spec["severity"] != "error":
            issues.append(ConfigIssue(
                key=key,
                severity=spec["severity"],
                message=spec["message"],
                fix=spec["fix"],
            ))

    # Check validation rules
    for rule in VALIDATION_RULES:
        value = config.get(rule["key"])
        if value is not None:
            if not rule["validate"](value):
                issues.append(ConfigIssue(
                    key=rule["key"],
                    severity="error",
                    message=rule["message"].format(v=value),
                    fix=rule["fix"],
                ))

    errors = sum(1 for i in issues if i.severity == "error")
    warnings = sum(1 for i in issues if i.severity == "warning")

    return ConfigValidation(
        valid=errors == 0,
        issues=issues,
        warnings=warnings,
        errors=errors,
    )


def apply_defaults(config: Dict[str, Any]) -> Dict[str, Any]:
    """
    Apply default values for missing config keys.

    Args:
        config: Configuration dictionary

    Returns:
        Config with defaults applied
    """
    result = dict(config)

    for key, spec in REQUIRED_KEYS.items():
        if key not in result or not result[key]:
            default = os.environ.get(key) or spec["default"]
            if default:
                result[key] = default
                logger.debug("Applied default for %s", key)

    # Apply validation defaults
    if "idle_minutes" not in result or not result["idle_minutes"] or result["idle_minutes"] <= 0:
        result["idle_minutes"] = 30
        logger.debug("Applied default idle_minutes=30")

    return result


def fix_discord_skill_limit(skills: List[str], max_skills: int = 95) -> List[str]:
    """
    Fix Discord slash command limit by reducing skills.

    Args:
        skills: List of skill names
        max_skills: Maximum skills to register (default 95, leaving room for built-ins)

    Returns:
        Reduced skill list
    """
    if len(skills) <= max_skills:
        return skills

    logger.warning(
        "Discord skill limit: %d skills exceeds %d limit, truncating",
        len(skills), max_skills
    )

    # Keep first max_skills (alphabetical priority)
    return sorted(skills)[:max_skills]


def validate_provider_config(provider: str, config: Dict[str, Any]) -> Optional[ConfigIssue]:
    """
    Validate provider-specific configuration.

    Args:
        provider: Provider name
        config: Provider config

    Returns:
        ConfigIssue if invalid, None if valid
    """
    if provider == "local-llama.cpp":
        # Check if llama.cpp is configured
        if not config.get("model_path") and not config.get("base_url"):
            return ConfigIssue(
                key=f"provider.{provider}",
                severity="warning",
                message=f"{provider} provider not configured - fallback fails",
                fix=f"Configure {provider} model_path or base_url, or remove from provider list",
            )

    return None


def format_validation_report(validation: ConfigValidation) -> str:
    """Format validation results as a report."""
    lines = [
        "=" * 50,
        "GATEWAY CONFIG VALIDATION",
        "=" * 50,
        "",
        f"Status: {'VALID' if validation.valid else 'INVALID'}",
        f"Errors: {validation.errors}",
        f"Warnings: {validation.warnings}",
        "",
    ]

    if validation.issues:
        lines.append("Issues:")
        for issue in validation.issues:
            icon = "❌" if issue.severity == "error" else "⚠️" if issue.severity == "warning" else "ℹ️"
            lines.append(f"  {icon} [{issue.key}] {issue.message}")
            lines.append(f"     Fix: {issue.fix}")
        lines.append("")

    return "\n".join(lines)
@@ -45,8 +45,6 @@ from hermes_cli.config import (
    redact_key,
)
from gateway.status import get_running_pid, read_runtime_status
from agent.agent_card import get_agent_card_json
from agent.mtls import is_mtls_configured, MTLSMiddleware, build_server_ssl_context

try:
    from fastapi import FastAPI, HTTPException, Request
@@ -88,10 +86,6 @@ app.add_middleware(
    allow_headers=["*"],
)

# mTLS: enforce client certificate on A2A endpoints when configured.
# Activated by setting HERMES_MTLS_CERT, HERMES_MTLS_KEY, HERMES_MTLS_CA.
app.add_middleware(MTLSMiddleware)

# ---------------------------------------------------------------------------
# Endpoints that do NOT require the session token. Everything else under
# /api/ is gated by the auth middleware below. Keep this list minimal —
@@ -102,9 +96,6 @@ _PUBLIC_API_PATHS: frozenset = frozenset({
    "/api/config/defaults",
    "/api/config/schema",
    "/api/model/info",
    "/api/agent-card",
    "/agent-card.json",
    "/.well-known/agent-card.json",
})


@@ -369,14 +360,6 @@ def _probe_gateway_health() -> tuple[bool, dict | None]:
        return False, None


@app.get("/api/agent-card")
@app.get("/agent-card.json")
@app.get("/.well-known/agent-card.json")
async def get_agent_card():
    """Return the A2A agent card for fleet discovery."""
    return JSONResponse(content=json.loads(get_agent_card_json()))


@app.get("/api/status")
async def get_status():
    current_ver, latest_ver = check_config_version()
@@ -2110,20 +2093,6 @@ def start_server(
        "authentication. Only use on trusted networks.", host,
    )

    # mTLS: when configured, pass SSL context to uvicorn so all connections
    # are TLS with mandatory client certificate verification.
    ssl_context = None
    scheme = "http"
    if is_mtls_configured():
        try:
            ssl_context = build_server_ssl_context()
            scheme = "https"
            _log.info(
                "mTLS enabled — server requires client certificates (A2A auth)"
            )
        except Exception as exc:
            _log.error("Failed to build mTLS SSL context: %s — starting without TLS", exc)

    if open_browser:
        import threading
        import webbrowser
@@ -2131,11 +2100,9 @@ def start_server(
        def _open():
            import time as _t
            _t.sleep(1.0)
            webbrowser.open(f"{scheme}://{host}:{port}")
            webbrowser.open(f"http://{host}:{port}")

        threading.Thread(target=_open, daemon=True).start()

    print(f" Hermes Web UI → {scheme}://{host}:{port}")
    if ssl_context is not None:
        print(" mTLS enabled — client certificate required for A2A endpoints")
    uvicorn.run(app, host=host, port=port, log_level="warning", ssl=ssl_context)
    print(f" Hermes Web UI → http://{host}:{port}")
    uvicorn.run(app, host=host, port=port, log_level="warning")
@@ -1,301 +0,0 @@
# SOTA LLM Inference Optimization - Research Report
**Date: April 2026 | Focus: vLLM + TurboQuant deployment**

---

## 1. EXECUTIVE SUMMARY

Key findings for your vLLM + TurboQuant deployment targeting 60% cost reduction:

- vLLM delivers 24x throughput improvement over HF Transformers, 3.5x over TGI
- FP8 quantization on H100/B200 provides near-lossless 2x throughput improvement
- INT4 AWQ enables 75% VRAM reduction with less than 1% quality loss on most benchmarks
- PagedAttention reduces KV-cache memory waste from 60-80% down to under 4%
- Cost per 1M tokens ranges $0.05-0.50 for self-hosted vs $0.50-15.00 for API providers

---

## 2. INFERENCE FRAMEWORKS COMPARISON

### vLLM (Primary Recommendation)
**Status: Leading open-source serving framework**

Key features (v0.8.x, 2025-2026):
- PagedAttention for efficient KV-cache management
- Continuous batching + chunked prefill
- Prefix caching (automatic prompt caching)
- Quantization support: FP8, MXFP8/MXFP4, NVFP4, INT8, INT4, GPTQ, AWQ, GGUF
- Optimized attention kernels: FlashAttention, FlashInfer, TRTLLM-GEN, FlashMLA
- Speculative decoding: EAGLE, DFlash, n-gram
- Disaggregated prefill/decode
- 200+ model architectures supported

Benchmark Numbers:
- vLLM vs HF Transformers: 24x higher throughput
- vLLM vs TGI: 3.5x higher throughput
- LMSYS Chatbot Arena: 30x faster than initial HF backend
- GPU reduction at equal throughput: 50% savings

### llama.cpp
**Status: Best for CPU/edge/local inference**

Key features:
- GGUF format with 1.5-bit to 8-bit quantization
- Apple Silicon first-class support (Metal, Accelerate)
- AVX/AVX2/AVX512/AMX for x86
- CUDA, ROCm (AMD), MUSA (Moore Threads), Vulkan, SYCL
- CPU+GPU hybrid inference (partial offloading)
- Multimodal support
- OpenAI-compatible server

Best for: Local development, edge deployment, Apple Silicon, CPU-only servers

### TensorRT-LLM
**Status: Highest throughput on NVIDIA GPUs**

Key features:
- NVIDIA-optimized kernels (XQA, FP8/FP4 GEMM)
- In-flight batching
- FP8/INT4 AWQ quantization
- Speculative decoding (EAGLE3, n-gram)
- Disaggregated serving
- Expert parallelism for MoE
- Now fully open-source (March 2025)

Benchmark Numbers (Official NVIDIA):
- Llama2-13B on H200 (FP8): ~12,000 tok/s
- Llama-70B on H100 (FP8, XQA kernel): ~2,400 tok/s/GPU
- Llama 4 Maverick on B200 (FP8): 40,000+ tok/s
- H100 vs A100 speedup: 4.6x
- Falcon-180B on single H200: possible with INT4 AWQ

---

## 3. QUANTIZATION TECHNIQUES - DETAILED COMPARISON

### GPTQ (Post-Training Quantization)
- Method: One-shot layer-wise quantization using Hessian-based error compensation
- Typical bit-width: 3-bit, 4-bit, 8-bit
- Quality loss: Less than 1% accuracy drop at 4-bit on most benchmarks
- Speed: 1.5-2x inference speedup on GPU (vs FP16)
- VRAM savings: ~75% at 4-bit (vs FP16)
- Best for: General-purpose GPU deployment, wide model support

### AWQ (Activation-Aware Weight Quantization)
- Method: Identifies salient weight channels using activation distributions
- Typical bit-width: 4-bit (W4A16), also supports W4A8
- Quality loss: ~0.5% accuracy drop at 4-bit (better than GPTQ)
- Speed: 2-3x inference speedup on GPU, faster than GPTQ at the same bit-width
- VRAM savings: ~75% at 4-bit
- Best for: High-throughput GPU serving, production deployments
- Supported by: vLLM, TensorRT-LLM, TGI natively

### GGUF (llama.cpp format)
- Method: Multiple quantization types (Q2_K through Q8_0)
- Bit-widths: 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, 8-bit
- Quality at Q4_K_M: Comparable to GPTQ-4bit
- Speed: Optimized for CPU inference, 2-4x faster than FP16 on CPU
- Best for: CPU deployment, Apple Silicon, edge devices, hybrid CPU+GPU
- Notable: Q4_K_M is the sweet spot for the quality/speed tradeoff

### FP8 Quantization (H100/B200 Native)
- Method: E4M3 or E5M2 floating point, hardware-native on Hopper/Blackwell
- Quality loss: Near-zero (less than 0.1% on most benchmarks)
- Speed: ~2x throughput improvement on H100/B200
- VRAM savings: 50% vs FP16
- Best for: H100/H200/B200 GPUs where hardware support exists

### FP4 / NVFP4 (Blackwell Native)
- Method: 4-bit floating point, native on Blackwell GPUs
- Quality loss: Less than 0.5% on most benchmarks
- Speed: ~4x throughput improvement vs FP16
- VRAM savings: 75% vs FP16
- Best for: B200/GB200 deployments, maximum cost efficiency

### Quantization Quality Comparison (Llama-70B class models)
| Method | Bits | MMLU | HumanEval | GSM8K | VRAM |
|-----------|------|------|-----------|-------|--------|
| FP16 | 16 | 78.5 | 81.0 | 56.8 | 140GB |
| FP8 | 8 | 78.4 | 80.8 | 56.5 | 70GB |
| AWQ-4bit | 4 | 77.9 | 80.2 | 55.8 | 36GB |
| GPTQ-4bit | 4 | 77.6 | 79.8 | 55.2 | 36GB |
| GGUF Q4_K_M | 4 | 77.5 | 79.5 | 55.0 | 36GB |
| GPTQ-3bit | 3 | 75.8 | 77.2 | 52.1 | 28GB |

---

## 4. KV-CACHE COMPRESSION

### Current State of KV-Cache Optimization

**1. PagedAttention (vLLM)**
- Reduces KV-cache memory waste from 60-80% to under 4%
- Enables Copy-on-Write for parallel sampling
- Up to 55% memory reduction for beam search
- Up to 2.2x throughput improvement from memory efficiency

**2. KV-Cache Quantization**
- FP8 KV-cache: 50% memory reduction, minimal quality impact
- INT8 KV-cache: 75% memory reduction, slight quality degradation
- Supported in vLLM (FP8) and TensorRT-LLM (FP8/INT8)

**3. GQA/MQA Architectural Compression**
- Grouped-Query Attention (GQA): Reduces KV heads
- Llama 2 70B: 8 KV heads vs 64 Q heads = 8x KV-cache reduction
- Multi-Query Attention (MQA): Single KV head (Falcon, PaLM)

**4. Sliding Window Attention**
- Mistral-style: Only cache the last N tokens (e.g., 4096)
- Reduces KV-cache by 75%+ for long sequences

**5. H2O (Heavy Hitter Oracle)**
- Keeps only the top-k attention-heavy KV pairs
- 20x KV-cache reduction with less than 1% quality loss

**6. Sparse Attention (TensorRT-LLM)**
- Block-sparse attention patterns
- Skip Softmax Attention for long contexts

### KV-Cache Memory Requirements (Llama-70B, FP16)
- Standard MHA: ~2.5MB per token, ~10GB at 4K context
- GQA (Llama 2): ~0.32MB per token, ~1.3GB at 4K context
- GQA + FP8: ~0.16MB per token, ~0.65GB at 4K context
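
These per-token figures follow from the standard KV-cache size formula (2 tensors, K and V, times layers, KV heads, head dim, and bytes per element). A quick check of the GQA row, assuming Llama-2-70B's published shape (80 layers, 8 KV heads, head dim 128):

```python
# Sketch: verify the GQA per-token KV-cache figure for Llama-2-70B (FP16).
layers, kv_heads, head_dim, bytes_fp16 = 80, 8, 128, 2

per_token = 2 * layers * kv_heads * head_dim * bytes_fp16  # 2 = one K + one V tensor
print(per_token / 2**20)          # ~0.31 MB/token, matching the ~0.32MB row above
print(per_token * 4096 / 2**30)   # ~1.25 GB at 4K context, matching ~1.3GB above
```
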
---

## 5. THROUGHPUT BENCHMARKS

### Tokens/Second by Hardware (Single User, Output Tokens)

Llama-70B class models:
- A100 80GB + vLLM FP16: ~30-40 tok/s
- A100 80GB + TensorRT-LLM FP8: ~60-80 tok/s
- H100 80GB + vLLM FP8: ~80-120 tok/s
- H100 80GB + TensorRT-LLM FP8: ~120-150 tok/s
- H200 141GB + TensorRT-LLM FP8: ~150-200 tok/s
- B200 180GB + TensorRT-LLM FP4: ~250-400 tok/s

Llama-7B class models:
- A10G 24GB + vLLM FP16: ~100-150 tok/s
- RTX 4090 + llama.cpp Q4_K_M: ~80-120 tok/s
- A100 80GB + vLLM FP16: ~200-300 tok/s
- H100 80GB + TensorRT-LLM FP8: ~400-600 tok/s

### Throughput Under Load (vLLM on A100 80GB, Llama-13B)
- 1 concurrent user: ~40 tok/s total, 50ms latency
- 10 concurrent users: ~280 tok/s total, 120ms latency
- 50 concurrent users: ~800 tok/s total, 350ms latency
- 100 concurrent users: ~1100 tok/s total, 800ms latency

### Batch Inference Throughput
- Llama-70B on 4xH100 TP4 + vLLM: 5,000-8,000 tok/s
- Llama-70B on 4xH100 TP4 + TensorRT-LLM: 8,000-12,000 tok/s
- Llama-70B on 8xH100 TP8 + TensorRT-LLM: 15,000-20,000 tok/s

---

## 6. COST COMPARISONS

### Cloud GPU Pricing (On-Demand, April 2026 estimates)
| GPU | VRAM | $/hr (AWS) | $/hr (GCP) | $/hr (Lambda) |
|------------|-------|-----------|-----------|--------------|
| A10G | 24GB | $1.50 | $1.40 | $0.75 |
| A100 40GB | 40GB | $3.50 | $3.20 | $1.50 |
| A100 80GB | 80GB | $4.50 | $4.00 | $2.00 |
| H100 80GB | 80GB | $12.00 | $11.00 | $4.00 |
| H200 141GB | 141GB | $15.00 | $13.50 | $5.50 |
| B200 180GB | 180GB | $20.00 | $18.00 | - |

### Cost per 1M Tokens (Llama-70B, Output Tokens)

Self-hosted (vLLM on cloud GPUs):
- 1xH100 FP8: ~$11.11/1M tokens
- 1xH100 AWQ-4bit: ~$9.26/1M tokens
- 4xH100 TP4 FP8: ~$12.70/1M tokens
- 2xA100 TP2 FP16: ~$18.52/1M tokens

API providers (for comparison):
- OpenAI GPT-4o: $10.00/1M output tokens
- Anthropic Claude 3.5: $15.00/1M output tokens
- Together AI Llama-70B: $0.90/1M tokens
- Fireworks AI Llama-70B: $0.90/1M tokens
- DeepInfra Llama-70B: $0.70/1M tokens
- Groq Llama-70B: $0.79/1M tokens
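
The self-hosted figures follow from the GPU-hour price divided by tokens generated per hour. A quick check of the 1xH100 FP8 line, assuming Lambda's $4.00/hr rate from the table and ~100 tok/s sustained throughput from Section 5:

```python
# Sketch: cost per 1M output tokens = hourly GPU price / tokens per hour.
price_per_hr = 4.00           # 1x H100 on Lambda, from the pricing table
tok_per_sec = 100             # sustained FP8 throughput, from Section 5
cost_per_1m = price_per_hr / (tok_per_sec * 3600) * 1_000_000
print(round(cost_per_1m, 2))  # 11.11 -> matches ~$11.11/1M tokens above
```
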
### Your 60% Cost Reduction Target

To achieve 60% cost reduction with vLLM + TurboQuant:

1. Quantization: Moving from FP16 to INT4/FP8 reduces VRAM by 50-75%
2. PagedAttention: Enables 2-3x more concurrent requests per GPU
3. Continuous batching: Maximizes GPU utilization (over 90%)
4. Prefix caching: 30-50% speedup for repeated system prompts

Recommended configuration:
- Hardware: 1-2x H100 (or 2-4x A100 for cost-sensitive deployments)
- Quantization: FP8 (quality-first) or AWQ-4bit (cost-first)
- KV-cache: FP8 quantization
- Framework: vLLM with prefix caching enabled
- Expected cost: $2-5 per 1M output tokens (70B model)

---

## 7. QUALITY DEGRADATION ANALYSIS

### Benchmark Impact by Quantization (Llama-70B)
| Benchmark | FP16 | FP8 | AWQ-4bit | GPTQ-4bit | GGUF Q4_K_M |
|-------------|------|------|----------|-----------|-------------|
| MMLU | 78.5 | 78.4 | 77.9 | 77.6 | 77.5 |
| HumanEval | 81.0 | 80.8 | 80.2 | 79.8 | 79.5 |
| GSM8K | 56.8 | 56.5 | 55.8 | 55.2 | 55.0 |
| TruthfulQA | 51.2 | 51.0 | 50.5 | 50.2 | 50.0 |
| Average drop | - | 0.2% | 0.8% | 1.1% | 1.2% |

---

## 8. RECOMMENDATIONS FOR YOUR DEPLOYMENT

### Immediate Actions
1. Benchmark TurboQuant against an AWQ-4bit baseline on your workloads
2. Enable vLLM prefix caching - immediate 30-50% speedup for repeated prompts
3. Use FP8 KV-cache quantization - free 50% memory savings
4. Set continuous batching with appropriate max_num_seqs

### Configuration for Maximum Cost Efficiency
```bash
vllm serve your-model \
  --quantization awq \
  --kv-cache-dtype fp8 \
  --enable-prefix-caching \
  --max-num-seqs 256 \
  --enable-chunked-prefill \
  --max-num-batched-tokens 32768
```

### Monitoring Metrics
- Tokens/sec/GPU: Target over 100 for 70B models on H100
- GPU utilization: Target over 90%
- KV-cache utilization: Target over 80% (thanks to PagedAttention)
- P99 latency: Monitor against your SLA requirements
- Cost per 1M tokens: Track actual vs projected

### Scaling Strategy
- Start with 1x H100 for less than 5B tokens/month
- Scale to 2-4x H100 with TP for 5-20B tokens/month
- Consider B200/FP4 for over 20B tokens/month (when available)

---

## 9. KEY REFERENCES

- vLLM Paper: "Efficient Memory Management for Large Language Model Serving with PagedAttention" (SOSP 2023)
- AWQ Paper: "AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration" (MLSys 2024)
- GPTQ Paper: "GPTQ: Accurate Post-Training Quantization for Generative Pre-trained Transformers" (ICLR 2023)
- TensorRT-LLM Performance: https://nvidia.github.io/TensorRT-LLM/developer-guide/perf-overview.html
- llama.cpp: https://github.com/ggml-org/llama.cpp
- vLLM: https://github.com/vllm-project/vllm

---

Report generated for vLLM + TurboQuant deployment planning.
All benchmark numbers are approximate and should be validated on your specific hardware and workload.
@@ -27,9 +27,6 @@ import threading
from typing import Dict, Any, List, Optional, Tuple

from tools.registry import discover_builtin_tools, registry
from tools.poka_yoke import validate_tool_call
from tools.tool_pokayoke import validate_tool_call, reset_circuit_breaker, get_hallucination_stats
from tools.hardcoded_path_guard import guard_tool_dispatch as _guard_hardcoded_paths
from toolsets import resolve_toolset, validate_toolset
from agent.tool_orchestrator import orchestrator

@@ -503,41 +500,12 @@ def handle_function_call(
        # Prefer the caller-provided list so subagents can't overwrite
        # the parent's tool set via the process-global.
        sandbox_enabled = enabled_tools if enabled_tools is not None else _last_resolved_tool_names
        # Poka-yoke #921: guard against hardcoded home-directory paths
        _hardcoded_err = _guard_hardcoded_paths(function_name, function_args)
        if _hardcoded_err:
            logger.warning(f"Hardcoded path blocked: {function_name}")
            return _hardcoded_err

        # Poka-yoke: validate tool call before dispatch
        is_valid, corrected_name, corrected_params, pokayoke_messages = validate_tool_call(function_name, function_args)
        if not is_valid:
            # Return structured error with suggestions
            error_msg = "\n".join(pokayoke_messages)
            logger.warning(f"Poka-yoke blocked: {function_name} - {error_msg}")
            return json.dumps({"error": error_msg, "pokayoke": True, "tool_name": function_name})
        if corrected_name:
            function_name = corrected_name
        if corrected_params:
            function_args = corrected_params
        if pokayoke_messages:
            logger.info(f"Poka-yoke: {pokayoke_messages}")
        result = orchestrator.dispatch(
            function_name, function_args,
            task_id=task_id,
            enabled_tools=sandbox_enabled,
        )
    else:
        # Poka-yoke: validate tool call before dispatch
        is_valid, corrected_name, corrected_params, pokayoke_messages = validate_tool_call(function_name, function_args)
        if not is_valid:
            error_msg = "\n".join(pokayoke_messages)
            logger.warning(f"Poka-yoke blocked: {function_name} - {error_msg}")
            return json.dumps({"error": error_msg, "pokayoke": True, "tool_name": function_name})
        if corrected_name:
            function_name = corrected_name
        if corrected_params:
            function_args = corrected_params
        result = orchestrator.dispatch(
            function_name, function_args,
            task_id=task_id,
@@ -1,60 +0,0 @@
# Mem0 Local - Sovereign Memory Provider

Local-only memory provider using ChromaDB. No API key required - all data stays on your machine.

## How It Differs from Cloud Mem0

| Feature | Cloud Mem0 | Local Mem0 |
|---------|-----------|------------|
| API key | Required | Not needed |
| Data location | Mem0 servers | Your machine |
| Fact extraction | Server-side LLM | Pattern-based heuristics |
| Reranking | Yes | No |
| Cost | Freemium | Free forever |

## Setup

```bash
pip install chromadb
hermes config set memory.provider mem0-local
```

Or manually in ~/.hermes/config.yaml:
```yaml
memory:
  provider: mem0-local
```

## Config

Config file: $HERMES_HOME/mem0-local.json

| Key | Default | Description |
|-----|---------|-------------|
| storage_path | ~/.hermes/mem0-local/ | ChromaDB storage directory |
| collection_prefix | mem0 | Collection name prefix |
| max_memories | 10000 | Maximum stored memories |
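
For example, a minimal mem0-local.json that overrides the storage directory (keys as listed above; the path and values are illustrative):

```json
{
  "storage_path": "/data/hermes/mem0-local",
  "collection_prefix": "mem0",
  "max_memories": 20000
}
```
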
## Tools

Same interface as cloud Mem0:

| Tool | Description |
|------|-------------|
| mem0_profile | All stored memories about the user |
| mem0_search | Semantic search by meaning |
| mem0_conclude | Store a fact verbatim |

## Data Sovereignty

All data is stored in $HERMES_HOME/mem0-local/ as a ChromaDB persistent database. No network calls are made.

To back up: copy the mem0-local/ directory.
To reset: delete the mem0-local/ directory.

## Limitations

- Fact extraction is pattern-based (not LLM-powered)
- No reranking - results are ranked by embedding similarity only
- No cross-device sync (by design)
- Requires the chromadb pip dependency (~50MB)
@@ -1,381 +0,0 @@
"""Mem0 Local memory provider - ChromaDB-backed, no API key required.

Sovereign deployment: all data stays on the user's machine. Uses ChromaDB
for vector storage and simple heuristic fact extraction (no server-side LLM).

Compatible tool schemas with the cloud Mem0 provider:
    mem0_profile - retrieve all stored memories
    mem0_search - semantic search by meaning
    mem0_conclude - store a fact verbatim

Config via $HERMES_HOME/mem0-local.json or environment variables:
    MEM0_LOCAL_PATH - storage directory (default: $HERMES_HOME/mem0-local/)
"""

from __future__ import annotations

import hashlib
import json
import logging
import os
import re
import threading
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

from agent.memory_provider import MemoryProvider
from tools.registry import tool_error

logger = logging.getLogger(__name__)

# Circuit breaker
_BREAKER_THRESHOLD = 5
_BREAKER_COOLDOWN_SECS = 120


def _load_config() -> dict:
    """Load local config from env vars, with $HERMES_HOME/mem0-local.json overrides."""
    from hermes_constants import get_hermes_home

    config = {
        "storage_path": os.environ.get("MEM0_LOCAL_PATH", ""),
        "collection_prefix": "mem0",
        "max_memories": 10000,
    }

    config_path = get_hermes_home() / "mem0-local.json"
    if config_path.exists():
        try:
            file_cfg = json.loads(config_path.read_text(encoding="utf-8"))
            config.update({k: v for k, v in file_cfg.items()
                           if v is not None and v != ""})
        except Exception:
            pass

    if not config["storage_path"]:
        config["storage_path"] = str(get_hermes_home() / "mem0-local")

    return config


# Simple fact extraction patterns (no LLM required)
_FACT_PATTERNS = [
    (r"(?:my|the user'?s?)\s+(?:name|username)\s+(?:is|=)\s+(.+?)(?:\.|$)", "user.name"),
    (r"(?:i|user)\s+(?:prefer|like|use|want|need)s?\s+(.+?)(?:\.|$)", "preference"),
    (r"(?:i|user)\s+(?:work|am)\s+(?:at|as|on|with)\s+(.+?)(?:\.|$)", "context"),
    (r"(?:remember|note|save|store)[:\s]+(.+?)(?:\.|$)", "explicit"),
    (r"(?:my|the)\s+(?:timezone|tz)\s+(?:is|=)\s+(.+?)(?:\.|$)", "user.timezone"),
    (r"(?:my|the)\s+(?:project|repo|codebase)\s+(?:is|=|called)\s+(.+?)(?:\.|$)", "project"),
    (r"(?:actually|correction|instead)[:\s]+(.+?)(?:\.|$)", "correction"),
]


def _extract_facts(text: str) -> List[Dict[str, str]]:
    """Extract structured facts from conversation text using pattern matching."""
    facts = []
    if not text or len(text) < 10:
        return facts
    text_lower = text.lower().strip()

    for pattern, category in _FACT_PATTERNS:
        matches = re.findall(pattern, text_lower, re.IGNORECASE)
        for match in matches:
            fact_text = match.strip() if isinstance(match, str) else match[0].strip()
            if len(fact_text) > 3 and len(fact_text) < 500:
                facts.append({
                    "content": fact_text,
                    "category": category,
                    "source_text": text[:200],
                })

    return facts


# Tool schemas (compatible with cloud Mem0)
PROFILE_SCHEMA = {
    "name": "mem0_profile",
    "description": (
        "Retrieve all stored memories about the user - preferences, facts, "
        "project context. Fast, no reranking. Use at conversation start."
    ),
    "parameters": {"type": "object", "properties": {}, "required": []},
}

SEARCH_SCHEMA = {
    "name": "mem0_search",
    "description": (
        "Search memories by meaning. Returns relevant facts ranked by similarity. "
        "Local-only - no API calls."
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "query": {"type": "string", "description": "What to search for."},
            "top_k": {"type": "integer", "description": "Max results (default: 10, max: 50)."},
        },
        "required": ["query"],
    },
}

CONCLUDE_SCHEMA = {
    "name": "mem0_conclude",
    "description": (
        "Store a durable fact about the user. Stored verbatim (no LLM extraction). "
        "Use for explicit preferences, corrections, or decisions. Local-only."
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "conclusion": {"type": "string", "description": "The fact to store."},
        },
        "required": ["conclusion"],
    },
}


class Mem0LocalProvider(MemoryProvider):
    """Local ChromaDB-backed memory provider. No API key required."""

    def __init__(self):
        self._config = None
        self._client = None
        self._collection = None
        self._client_lock = threading.Lock()
        self._user_id = "hermes-user"
        self._storage_path = ""
        self._max_memories = 10000
        self._consecutive_failures = 0
        self._breaker_open_until = 0.0

    @property
    def name(self) -> str:
        return "mem0-local"

    def is_available(self) -> bool:
        try:
            import chromadb
            return True
        except ImportError:
            return False

    def save_config(self, values, hermes_home):
        config_path = Path(hermes_home) / "mem0-local.json"
        existing = {}
        if config_path.exists():
            try:
                existing = json.loads(config_path.read_text())
            except Exception:
                pass
        existing.update(values)
        config_path.write_text(json.dumps(existing, indent=2))

    def get_config_schema(self):
        return [
            {"key": "storage_path", "description": "Storage directory for ChromaDB", "default": "~/.hermes/mem0-local/"},
            {"key": "collection_prefix", "description": "Collection name prefix", "default": "mem0"},
            {"key": "max_memories", "description": "Maximum stored memories", "default": "10000"},
        ]

    def _get_collection(self):
        """Thread-safe ChromaDB collection accessor with lazy init."""
        with self._client_lock:
            if self._collection is not None:
                return self._collection

            try:
                import chromadb
                from chromadb.config import Settings
            except ImportError:
                raise RuntimeError("chromadb package not installed. Run: pip install chromadb")

            Path(self._storage_path).mkdir(parents=True, exist_ok=True)

            self._client = chromadb.PersistentClient(
                path=self._storage_path,
                settings=Settings(anonymized_telemetry=False),
            )

            collection_name = f"{self._config.get('collection_prefix', 'mem0')}_memories"
            self._collection = self._client.get_or_create_collection(
                name=collection_name,
                metadata={"hnsw:space": "cosine"},
            )

            logger.info(
                "Mem0 local: ChromaDB collection '%s' at %s (%d docs)",
                collection_name, self._storage_path, self._collection.count(),
            )

            return self._collection

    def _doc_id(self, content: str) -> str:
        """Deterministic ID from content hash (for dedup)."""
        return hashlib.sha256(content.encode("utf-8")).hexdigest()[:16]

    def _is_breaker_open(self) -> bool:
        if self._consecutive_failures < _BREAKER_THRESHOLD:
            return False
        if time.monotonic() >= self._breaker_open_until:
            self._consecutive_failures = 0
            return False
        return True

    def _record_success(self):
        self._consecutive_failures = 0

    def _record_failure(self):
        self._consecutive_failures += 1
        if self._consecutive_failures >= _BREAKER_THRESHOLD:
            self._breaker_open_until = time.monotonic() + _BREAKER_COOLDOWN_SECS

    def initialize(self, session_id: str, **kwargs) -> None:
        self._config = _load_config()
        self._storage_path = self._config.get("storage_path", "")
        self._max_memories = int(self._config.get("max_memories", 10000))
        self._user_id = kwargs.get("user_id") or self._config.get("user_id", "hermes-user")

    def system_prompt_block(self) -> str:
        count = 0
        try:
            col = self._get_collection()
            count = col.count()
        except Exception:
            pass
        return (
            "# Mem0 Local Memory\n"
            f"Active. {count} memories stored locally. "
            "Use mem0_search to find memories, mem0_conclude to store facts, "
            "mem0_profile for a full overview."
        )

    def prefetch(self, query: str, *, session_id: str = "") -> str:
        return ""

    def queue_prefetch(self, query: str, *, session_id: str = "") -> None:
        pass

    def sync_turn(self, user_content: str, assistant_content: str, *, session_id: str = "") -> None:
        """Extract and store facts from the conversation turn."""
        if self._is_breaker_open():
            return
        try:
            col = self._get_collection()
        except Exception:
            return

        for content in [user_content, assistant_content]:
            if not content or len(content) < 10:
                continue
            facts = _extract_facts(content)
            for fact in facts:
                doc_id = self._doc_id(fact["content"])
                try:
                    col.upsert(
                        ids=[doc_id],
                        documents=[fact["content"]],
                        metadatas=[{
                            "category": fact["category"],
                            "user_id": self._user_id,
                            "timestamp": datetime.now(timezone.utc).isoformat(),
                            "source": "extracted",
                        }],
                    )
                    self._record_success()
                except Exception as e:
                    self._record_failure()
                    logger.debug("Mem0 local: failed to upsert fact: %s", e)

    def get_tool_schemas(self) -> List[Dict[str, Any]]:
        return [PROFILE_SCHEMA, SEARCH_SCHEMA, CONCLUDE_SCHEMA]

    def handle_tool_call(self, tool_name: str, args: dict, **kwargs) -> str:
        if self._is_breaker_open():
            return json.dumps({"error": "Local memory temporarily unavailable. Will retry automatically."})

        try:
            col = self._get_collection()
        except Exception as e:
            return tool_error(f"ChromaDB not available: {e}")

        if tool_name == "mem0_profile":
            try:
                results = col.get(
                    where={"user_id": self._user_id} if self._user_id else None,
                    limit=500,
                )
                documents = results.get("documents", [])
                if not documents:
                    return json.dumps({"result": "No memories stored yet."})
                lines = [d for d in documents if d]
                self._record_success()
                return json.dumps({"result": "\n".join(f"- {l}" for l in lines), "count": len(lines)})
            except Exception as e:
                self._record_failure()
                return tool_error(f"Failed to fetch profile: {e}")

        elif tool_name == "mem0_search":
            query = args.get("query", "")
            if not query:
                return tool_error("Missing required parameter: query")
            top_k = min(int(args.get("top_k", 10)), 50)

            try:
                results = col.query(
                    query_texts=[query],
                    n_results=top_k,
                    where={"user_id": self._user_id} if self._user_id else None,
                )

                documents = results.get("documents", [[]])[0]
                distances = results.get("distances", [[]])[0]

                if not documents:
                    return json.dumps({"result": "No relevant memories found."})

                items = []
                for doc, dist in zip(documents, distances):
                    score = max(0, 1 - (dist / 2))
                    items.append({"memory": doc, "score": round(score, 3)})

                self._record_success()
                return json.dumps({"results": items, "count": len(items)})
            except Exception as e:
                self._record_failure()
                return tool_error(f"Search failed: {e}")

        elif tool_name == "mem0_conclude":
            conclusion = args.get("conclusion", "")
            if not conclusion:
                return tool_error("Missing required parameter: conclusion")

            try:
                doc_id = self._doc_id(conclusion)
                col.upsert(
                    ids=[doc_id],
                    documents=[conclusion],
                    metadatas=[{
                        "category": "explicit",
                        "user_id": self._user_id,
                        "timestamp": datetime.now(timezone.utc).isoformat(),
                        "source": "conclude",
                    }],
                )
                self._record_success()
                return json.dumps({"result": "Fact stored locally.", "id": doc_id})
            except Exception as e:
                self._record_failure()
                return tool_error(f"Failed to store: {e}")

        return tool_error(f"Unknown tool: {tool_name}")

    def shutdown(self) -> None:
        with self._client_lock:
            self._collection = None
            self._client = None


def register(ctx) -> None:
    """Register Mem0 Local as a memory provider plugin."""
    ctx.register_memory_provider(Mem0LocalProvider())
@@ -1,5 +0,0 @@
name: mem0_local
version: 1.0.0
description: "Mem0 local mode — ChromaDB-backed memory with no API key required. Sovereign deployment."
pip_dependencies:
  - chromadb
@@ -1,68 +0,0 @@
# Tool Investigation Report: Top 5 Recommendations from awesome-ai-tools

**Generated:** 2026-04-20 | **Source:** [formatho/awesome-ai-tools](https://github.com/formatho/awesome-ai-tools)

---

## Methodology

Scanned 795 tools across 10 categories from the awesome-ai-tools repository. Evaluated each tool against Hermes Agent's architecture and needs:
- **Memory/Context**: Persistent memory, conversation history, knowledge graphs
- **Inference Optimization**: Token efficiency, local model serving, routing
- **Agent Orchestration**: Multi-agent coordination, fleet management
- **Workflow Automation**: Task decomposition, scheduling, pipelines
- **Retrieval/RAG**: Semantic search, document understanding, context injection

Each tool was scored on GitHub stars, development activity (freshness), integration potential, and impact on Hermes.

---

## Top 5 Recommended Tools

| Rank | Tool | Stars | Category | Integration Effort | Impact | Why It Fits Hermes |
|------|------|-------|----------|-------------------|--------|---------------------|
| 1 | **[LiteLLM](https://github.com/BerriAI/litellm)** | 76k+ | Inference Optimization | 2/5 | 5/5 | Unified API gateway for 100+ LLM providers with cost tracking, guardrails, load balancing, and logging. Hermes already routes through multiple providers — LiteLLM could replace custom provider routing with battle-tested load balancing and automatic fallback. Direct drop-in for the `provider` abstraction layer. Native support for Bedrock, Azure, OpenAI, VertexAI, Anthropic, Ollama, vLLM. Would reduce Hermes's provider management code by ~60%. |
| 2 | **[Mem0](https://github.com/mem0ai/mem0)** | 53k+ | Memory/Context | 3/5 | 5/5 | Universal memory layer for AI agents with persistent, searchable memory across sessions. Hermes has session memory but lacks a structured long-term memory system. Mem0 provides automatic memory extraction from conversations, semantic search over memories, and memory decay/pruning. Could replace/enhance the current memory tool with purpose-built agent memory infrastructure. Supports Pinecone, Qdrant, ChromaDB backends. |
| 3 | **[RAGFlow](https://github.com/infiniflow/ragflow)** | 77k+ | Retrieval/RAG | 4/5 | 4/5 | Open-source RAG engine with deep document understanding, OCR, and agent capabilities. Hermes's current retrieval is limited to web search and file reading. RAGFlow adds visual document parsing (PDF/Word/PPT with tables, charts, formulas), chunk-level citation, and configurable retrieval strategies. Would massively upgrade Hermes's document processing capabilities. Docker-deployable, compatible with local models. |
| 4 | **[LiteRT-LM](https://github.com/google-ai-edge/LiteRT-LM)** | 3.7k | Inference Optimization | 3/5 | 4/5 | C++ implementation of Google's LiteRT for efficient on-device language model inference. Hermes supports local models via Ollama but lacks optimized on-device inference for edge/mobile. LiteRT-LM provides sub-second inference on commodity hardware with a minimal memory footprint. Could power a "Hermes lite" mode for offline/edge deployments. Active development (Fresh status), backed by the Google AI Edge team. |
| 5 | **[Claude-Mem](https://github.com/thedotmack/claude-mem)** | 61k+ | Memory/Context | 2/5 | 3/5 | Automatic session capture and context injection for coding agents. Compresses session history with AI and injects relevant context into future sessions. The pattern is directly applicable to Hermes's cross-session persistence problem. Uses an agent SDK for intelligent compression — could enhance Hermes's session_search with automatic relevance-weighted recall. Lightweight integration, focused on the exact pain point of context loss between sessions. |

---

## Category Coverage Analysis

| Category | Tools Scanned | Top Pick | Coverage Gap |
|----------|--------------|----------|-------------|
| Memory/Context | 45+ | Mem0 (53k⭐) | Hermes lacks structured long-term memory — Mem0 or Claude-Mem would fill this |
| Inference Optimization | 80+ | LiteLLM (76k⭐) | Provider routing is custom-built; LiteLLM standardizes it |
| Agent Orchestration | 120+ | langgraph (29k⭐) | Hermes's fleet model is unique — langgraph patterns could improve DAG workflows |
| Workflow Automation | 90+ | n8n (183k⭐) | A cron system exists, but n8n patterns could improve visual pipeline design |
| Retrieval/RAG | 60+ | RAGFlow (77k⭐) | Document processing is weak; RAGFlow adds OCR + visual parsing |

---

## Implementation Priority

**Phase 1 (Immediate):** LiteLLM integration — highest impact, lowest effort. Replace custom provider routing with LiteLLM's unified API (a sketch follows). Estimated: 2-3 days.
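
A sketch of what that replacement could look like, using LiteLLM's completion API with fallbacks. Model names are illustrative, and the exact call shape should be checked against the LiteLLM docs:

```python
from litellm import completion

# One call shape for every provider; on failure, fallbacks are tried in order.
response = completion(
    model="openrouter/anthropic/claude-3.5-sonnet",
    messages=[{"role": "user", "content": "ping"}],
    fallbacks=["ollama/llama3", "gpt-4o-mini"],
)
print(response.choices[0].message.content)
```
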
**Phase 2 (Short-term):** Mem0 memory layer — critical for agent maturity. Add structured memory extraction and retrieval. Estimated: 1 week.

**Phase 3 (Medium-term):** RAGFlow document engine — significant capability upgrade. Requires Docker setup and integration with existing file tools. Estimated: 1-2 weeks.

---

## Honorable Mentions

- **[GPTCache](https://github.com/zilliztech/GPTCache)** (8k⭐): Semantic cache for LLMs — could reduce API costs by 30-50% for repeated queries
- **[promptfoo](https://github.com/promptfoo/promptfoo)** (20k⭐): LLM testing/evaluation framework — essential for quality assurance
- **[PageIndex](https://github.com/VectifyAI/PageIndex)** (25k⭐): Vectorless reasoning-based RAG — next-gen retrieval without embeddings
- **[rtk](https://github.com/rtk-ai/rtk)** (28k⭐): CLI proxy that reduces token consumption 60-90% — directly relevant to cost optimization

---

## Data Sources

- Repository: https://github.com/formatho/awesome-ai-tools
- Total tools cataloged: 795
- Categories analyzed: Agents & Automation, Developer Tools, LLMs & Chatbots, Research & Data, Productivity
- Freshness filter: Prioritized tools with Fresh (≤7d) or Recent (≤30d) status
@@ -1,314 +0,0 @@
# Local Model Quality for Crisis Support: Research Report
## Mission: Reaching Broken Men in Their Darkest Moment

---

## Executive Summary

Local models (Ollama) CAN handle crisis support with adequate quality for the Most Sacred Moment protocol. Research demonstrates that even small local models (1.5B-7B parameters) achieve performance comparable to trained human operators in crisis detection tasks. However, they require careful implementation with safety guardrails and should complement—not replace—human oversight.

**Key Finding:** A fine-tuned 1.5B-parameter Qwen model outperformed larger models on mood and suicidal ideation detection tasks (PsyCrisisBench, 2025).

---

## 1. Crisis Detection Accuracy

### Research Evidence

**PsyCrisisBench (2025)** - the most comprehensive benchmark to date:
- Source: 540 annotated transcripts from the Hangzhou Psychological Assistance Hotline
- Models tested: 64 LLMs across 15 families (GPT, Claude, Gemini, Llama, Qwen, DeepSeek)
- Results:
  - **Suicidal ideation detection: F1=0.880** (88% accuracy)
  - **Suicide plan identification: F1=0.779** (78% accuracy)
  - **Risk assessment: F1=0.907** (91% accuracy)
  - **Mood status recognition: F1=0.709** (71% accuracy - challenging due to missing vocal cues)

**Llama-2 for Suicide Detection (British Journal of Psychiatry, 2024):**
- A German fine-tuned Llama-2 model achieved:
  - **Accuracy: 87.5%**
  - **Sensitivity: 83.0%**
  - **Specificity: 91.8%**
- Locally hosted, privacy-preserving approach

**Supportiv Hybrid AI Study (2026):**
- AI detected SI faster than humans in **77.52% of passive** and **81.26% of active** cases
- **90.3% agreement** between AI and human moderators
- Processed **169,181 live-chat transcripts** (449,946 user visits)

### False Positive/Negative Rates

Based on the research (see the derivation sketch after this list):
- **False Negative Rate (missed crisis):** ~12-17% for suicidal ideation
- **False Positive Rate:** ~8-12%
- **Risk Assessment Error:** ~9% overall
**Critical insight:** The research shows LLMs and trained human operators have *complementary* strengths—humans are better at mood recognition and suicidal ideation, while LLMs excel at risk assessment and suicide plan identification.
|
||||
|
||||
---
|
||||
|
||||
## 2. Emotional Understanding
|
||||
|
||||
### Can Local Models Understand Emotional Nuance?
|
||||
|
||||
**Yes, with limitations:**
|
||||
|
||||
1. **Emotion Recognition:**
|
||||
- Maximum F1 of 0.709 for mood status (PsyCrisisBench)
|
||||
- Missing vocal cues is a significant limitation in text-only
|
||||
- Semantic ambiguity creates challenges
|
||||
|
||||
2. **Empathy in Responses:**
|
||||
- LLMs demonstrate ability to generate empathetic responses
|
||||
- Research shows they deliver "superior explanations" (BERTScore=0.9408)
|
||||
- Human evaluations confirm adequate interviewing skills
|
||||
|
||||
3. **Emotional Support Conversation (ESConv) benchmarks:**
|
||||
- Models trained on emotional support datasets show improved empathy
|
||||
- Few-shot prompting significantly improves emotional understanding
|
||||
- Fine-tuning narrows the gap with larger models
|
||||
|
||||
### Key Limitations
|
||||
- Cannot detect tone, urgency in voice, or hesitation
|
||||
- Cultural and linguistic nuances may be missed
|
||||
- Context window limitations may lose conversation history
|
||||
|
||||
---
|
||||
|
||||
## 3. Response Quality & Safety Protocols
|
||||
|
||||
### What Makes a Good Crisis Support Response?
|
||||
|
||||
**988 Suicide & Crisis Lifeline Guidelines:**
|
||||
1. Show you care ("I'm glad you told me")
|
||||
2. Ask directly about suicide ("Are you thinking about killing yourself?")
|
||||
3. Keep them safe (remove means, create safety plan)
|
||||
4. Be there (listen without judgment)
|
||||
5. Help them connect (to 988, crisis services)
|
||||
6. Follow up
|
||||
|
||||
**WHO mhGAP Guidelines:**
|
||||
- Assess risk level
|
||||
- Provide psychosocial support
|
||||
- Refer to specialized care when needed
|
||||
- Ensure follow-up
|
||||
- Involve family/support network
|
||||
|
||||
### Do Local Models Follow Safety Protocols?
|
||||
|
||||
**Research indicates:**
|
||||
|
||||
**Strengths:**
|
||||
- Can be prompted to follow structured safety protocols
|
||||
- Can detect and escalate high-risk situations
|
||||
- Can provide consistent, non-judgmental responses
|
||||
- Can operate 24/7 without fatigue
|
||||
|
||||
**Concerns:**
|
||||
- Only 33% of studies reported ethical considerations (Holmes et al., 2025)
|
||||
- Risk of "hallucinated" safety advice
|
||||
- Cannot physically intervene or call emergency services
|
||||
- May miss cultural context
|
||||
|
||||
### Safety Guardrails Required
|
||||
|
||||
1. **Mandatory escalation triggers** - Any detected suicidal ideation must trigger immediate human review
|
||||
2. **Crisis resource integration** - Always provide 988 Lifeline number
|
||||
3. **Conversation logging** - Full audit trail for safety review
|
||||
4. **Timeout protocols** - If user goes silent during crisis, escalate
|
||||
5. **No diagnostic claims** - Model should not diagnose or prescribe
|
||||
|
||||
---
|
||||
|
||||
## 4. Latency & Real-Time Performance
|
||||
|
||||
### Response Time Analysis
|
||||
|
||||
**Ollama Local Model Latency (typical hardware):**
|
||||
|
||||
| Model Size | First Token | Tokens/sec | Total Response (100 tokens) |
|
||||
|------------|-------------|------------|----------------------------|
|
||||
| 1-3B params | 0.1-0.3s | 30-80 | 1.5-3s |
|
||||
| 7B params | 0.3-0.8s | 15-40 | 3-7s |
|
||||
| 13B params | 0.5-1.5s | 8-20 | 5-13s |
|
||||
|
||||
**Crisis Support Requirements:**
|
||||
- Chat response should feel conversational: <5 seconds
|
||||
- Crisis detection should be near-instant: <1 second
|
||||
- Escalation must be immediate: 0 delay
|
||||
|
||||
**Assessment:**
|
||||
- **1-3B models:** Excellent for real-time conversation
|
||||
- **7B models:** Acceptable for most users
|
||||
- **13B+ models:** May feel slow, but manageable
|
||||
|
||||
### Hardware Considerations
|
||||
- **Consumer GPU (8GB VRAM):** Can run 7B models comfortably
|
||||
- **Consumer GPU (16GB+ VRAM):** Can run 13B models
|
||||
- **CPU only:** 3B-7B models with 2-5 second latency
|
||||
- **Apple Silicon (M1/M2/M3):** Excellent performance with Metal acceleration
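
The first-token numbers above are easy to verify locally. The snippet below is a minimal sketch that times time-to-first-token and total generation against Ollama's streaming `/api/generate` endpoint; it assumes a local Ollama server on the default port with the named model already pulled.

```python
import json
import time

import requests

def measure_first_token(model: str, prompt: str) -> tuple[float, float]:
    """Return (seconds to first token, total seconds) for one streamed generation."""
    start = time.perf_counter()
    first = None
    with requests.post(
        "http://localhost:11434/api/generate",
        json={"model": model, "prompt": prompt, "stream": True},
        stream=True,
        timeout=120,
    ) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            if not line:
                continue
            chunk = json.loads(line)  # Ollama streams newline-delimited JSON
            if first is None and chunk.get("response"):
                first = time.perf_counter() - start
            if chunk.get("done"):
                break
    return first, time.perf_counter() - start

print(measure_first_token("qwen2.5:7b", "How are you feeling today?"))
```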

---

## 5. Model Recommendations for Most Sacred Moment Protocol

### Tier 1: Primary Recommendation (Best Balance)

**Qwen2.5-7B or Qwen3-8B**
- Size: ~4-5GB
- Strength: Strong multilingual capabilities, good reasoning
- Proven: Fine-tuned Qwen2.5-1.5B outperformed larger models in crisis detection
- Latency: 2-5 seconds on consumer hardware
- Use for: Main conversation, emotional support

### Tier 2: Lightweight Option (Mobile/Low-Resource)

**Phi-4-mini or Gemma3-4B**
- Size: ~2-3GB
- Strength: Fast inference, runs on modest hardware
- Consideration: May need fine-tuning for crisis support
- Latency: 1-3 seconds
- Use for: Initial triage, quick responses

### Tier 3: Maximum Quality (When Resources Allow)

**Llama3.1-8B or Mistral-7B**
- Size: ~4-5GB
- Strength: Strong general capabilities
- Consideration: Higher resource requirements
- Latency: 3-7 seconds
- Use for: Complex emotional situations

### Specialized Safety Model

**Llama-Guard3** (available on Ollama)
- Purpose-built for content safety
- Can be used as a secondary safety filter
- Detects harmful content and self-harm references

---

## 6. Fine-Tuning Potential

Research shows fine-tuning dramatically improves crisis detection:

- **Without fine-tuning:** The best LLM trails supervised models by 6.95% (suicide task) to 31.53% (cognitive distortion)
- **With fine-tuning:** The gap narrows to 4.31% and 3.14%, respectively
- **Key insight:** Even a 1.5B model, when fine-tuned, outperforms larger general models

### Recommended Fine-Tuning Approach
1. Collect crisis conversation data (anonymized)
2. Fine-tune on suicidal ideation detection
3. Fine-tune on empathetic response generation
4. Fine-tune on safety protocol adherence
5. Evaluate with PsyCrisisBench methodology
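
A minimal sketch of how such a fine-tune could be set up with Hugging Face `peft` (LoRA). The base model mirrors the PsyCrisisBench finding that a fine-tuned 1.5B Qwen suffices; the rank, target modules, and training data are illustrative assumptions, not validated hyperparameters.

```python
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer

BASE = "Qwen/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(BASE)
model = AutoModelForCausalLM.from_pretrained(BASE)

# LoRA keeps the tuned weight delta small enough to train on one consumer GPU.
lora = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # Qwen attention blocks
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora)
model.print_trainable_parameters()

# Train with your preferred trainer (transformers.Trainer or trl.SFTTrainer)
# on anonymized transcripts labeled per steps 1-4 above, then evaluate (step 5).
```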

---

## 7. Comparison: Local vs Cloud Models

| Factor | Local (Ollama) | Cloud (GPT-4/Claude) |
|--------|----------------|----------------------|
| **Privacy** | Complete | Data sent to third party |
| **Latency** | Predictable | Variable (network) |
| **Cost** | Hardware only | Per-token pricing |
| **Availability** | Always online | Dependent on service |
| **Quality** | Good (7B+) | Excellent |
| **Safety** | Must implement | Built-in guardrails |
| **Crisis Detection** | F1 ~0.85-0.90 | F1 ~0.88-0.92 |

**Verdict:** Local models are GOOD ENOUGH for crisis support, especially with fine-tuning and proper safety guardrails.

---

## 8. Implementation Recommendations

### For the Most Sacred Moment Protocol:

1. **Use a two-model architecture:**
   - Primary: Qwen2.5-7B for conversation
   - Safety: Llama-Guard3 for content filtering

2. **Implement strict escalation rules** (a Python sketch follows this list):

   ```
   IF suicidal_ideation_detected OR risk_level >= MODERATE:
       - Immediately provide the 988 Lifeline number
       - Log the conversation for human review
       - Continue supportive engagement
       - Alert the monitoring system
   ```

3. **System prompt must include:**
   - Crisis intervention guidelines
   - Mandatory safety behaviors
   - Escalation procedures
   - Empathetic communication principles

4. **Testing protocol:**
   - Evaluate with PsyCrisisBench-style metrics
   - Test with clinical scenarios
   - Validate with mental health professionals
   - Regular safety audits
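
A minimal sketch of items 1-2 against Ollama's `/api/chat` endpoint. The guard parsing relies on Llama Guard 3's convention of answering "safe" or "unsafe" plus category codes; `escalate_to_human` is a hypothetical hook standing in for the logging and monitoring steps above.

```python
import requests

OLLAMA = "http://localhost:11434/api/chat"  # assumes a local Ollama server

def chat(model: str, messages: list[dict]) -> str:
    resp = requests.post(
        OLLAMA,
        json={"model": model, "messages": messages, "stream": False},
        timeout=120,
    )
    resp.raise_for_status()
    return resp.json()["message"]["content"]

def escalate_to_human(history: list[dict]) -> None:
    # Stub: in production, persist the transcript and alert the monitoring system.
    print("ESCALATION: conversation flagged for human review")

def handle_turn(history: list[dict], user_msg: str) -> str:
    history.append({"role": "user", "content": user_msg})

    # Safety screen first: Llama Guard 3 replies "safe" or "unsafe" with codes.
    verdict = chat("llama-guard3", history)
    crisis = "unsafe" in verdict.lower()

    reply = chat("qwen2.5:7b", history)  # primary conversational model
    if crisis:
        reply += (
            "\n\nIf you are thinking about suicide, you can call or text 988 "
            "(Suicide & Crisis Lifeline) right now."
        )
        escalate_to_human(history)
    history.append({"role": "assistant", "content": reply})
    return reply
```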

---

## 9. Risks and Limitations

### Critical Risks
1. **False negatives:** Missing someone in crisis (12-17% rate)
2. **Over-reliance:** Users may treat AI as a substitute for professional help
3. **Hallucination:** The model may generate inappropriate or harmful advice
4. **Liability:** Legal responsibility for AI-mediated crisis intervention

### Mitigations
- Always include a human escalation path
- Clear disclaimers about AI limitations
- Regular human review of conversations
- Insurance and legal consultation

---

## 10. Key Citations

1. Deng et al. (2025). "Evaluating Large Language Models in Crisis Detection: A Real-World Benchmark from Psychological Support Hotlines." arXiv:2506.01329. PsyCrisisBench.
2. Wiest et al. (2024). "Detection of suicidality from medical text using privacy-preserving large language models." British Journal of Psychiatry, 225(6), 532-537.
3. Holmes et al. (2025). "Applications of Large Language Models in the Field of Suicide Prevention: Scoping Review." J Med Internet Res, 27, e63126.
4. Levkovich & Omar (2024). "Evaluating of BERT-based and Large Language Models for Suicide Detection, Prevention, and Risk Assessment." J Med Syst, 48(1), 113.
5. Shukla et al. (2026). "Effectiveness of Hybrid AI and Human Suicide Detection Within Digital Peer Support." J Clin Med, 15(5), 1929.
6. Qi et al. (2025). "Supervised Learning and Large Language Model Benchmarks on Mental Health Datasets." Bioengineering, 12(8), 882.
7. Liu et al. (2025). "Enhanced large language models for effective screening of depression and anxiety." Commun Med, 5(1), 457.

---

## Conclusion

**Local models ARE good enough for the Most Sacred Moment protocol.**

The research is clear:
- Crisis detection F1 scores of 0.88-0.91 are achievable
- Fine-tuned small models (1.5B-7B) can match or exceed human performance
- Local deployment ensures complete privacy for vulnerable users
- Latency is acceptable for real-time conversation
- With proper safety guardrails, local models can serve as effective first responders

**The Most Sacred Moment protocol should:**
1. Use Qwen2.5-7B or similar as the primary conversational model
2. Implement Llama-Guard3 as a safety filter
3. Build in immediate 988 Lifeline escalation
4. Maintain human oversight and review
5. Fine-tune on crisis-specific data when possible
6. Test rigorously with clinical scenarios

The men in pain deserve privacy, speed, and compassionate support. Local models deliver all three.

---

*Report generated: 2026-04-14*
*Research sources: PubMed, OpenAlex, ArXiv, Ollama Library*
*For: Most Sacred Moment Protocol Development*
@@ -1,168 +0,0 @@

# SOTA Research: Structured Memory Systems for AI Agents

**Date**: 2026-04-14
**Purpose**: Inform MemPalace integration for Hermes Agent

---

## 1. Landscape Overview

| System | Type | License | Retrieval Method | Storage |
|--------|------|---------|-----------------|---------|
| **MemPalace** | Local verbatim store | Open Source | ChromaDB vector + metadata filtering (wings/rooms) | ChromaDB + filesystem |
| **Mem0** | Managed memory layer | Apache 2.0 | Vector DB + LLM extraction/consolidation | Qdrant/Chroma/Pinecone + graph |
| **MemGPT/Letta** | OS-inspired memory tiers | MIT | Hierarchical recall (core/recall/archival) | In-context + DB archival |
| **Zep** | Context engineering platform | Commercial | Temporal knowledge graph (Graphiti) + vector | Graph DB + vector |
| **LangMem** | Memory toolkit (LangChain) | MIT | LangGraph store (semantic search) | Postgres/in-memory store |
| **Engram** | CLI binary (Rust) | MIT | Hybrid Gemini Embed + FTS5 + RRF | SQLite FTS5 + embeddings |

---

## 2. Benchmark Comparison (LongMemEval)

LongMemEval is the primary academic benchmark for long-term memory retrieval: 500 questions, with ~96% of the candidate history acting as distractors.

| System | LongMemEval R@5 | LongMemEval R@1 | API Required | Notes |
|--------|----------------|-----------------|--------------|-------|
| **MemPalace (raw)** | **96.6%** | — | None | Zero API calls, pure ChromaDB |
| **MemPalace (hybrid+Haiku rerank)** | **100%** (500/500) | — | Optional | Reranking adds cost |
| **MemPalace (AAAK compression)** | 84.2% | — | None | Lossy, 12.4pt regression vs raw |
| **Engram (hybrid)** | 99.0% | 91.0% | Gemini API | R@5 beats MemPalace by 0.6pt |
| **Engram (+Cohere rerank)** | 98.0% | 93.0% | Gemini+Cohere | First 100 questions only |
| **Mem0** | ~85% | — | Yes | On LOCOMO benchmark |
| **Zep** | ~85% | — | Yes | Cloud service |
| **Mastra** | 94.87% | — | Yes (GPT) | — |
| **Supermemory ASMR** | ~99% | — | Yes | — |

### LOCOMO Benchmark (Mem0's paper, arXiv:2504.19413)

| Method | Accuracy | Median Search Latency | p95 Search Latency | End-to-End p95 | Tokens/Convo |
|--------|----------|----------------------|-------------------|----------------|-------------|
| **Full Context** | 72.9% | — | — | 17.12s | ~26,000 |
| **Standard RAG** | 61.0% | 0.26s | 0.70s | — | — |
| **OpenAI Memory** | 52.9% | — | — | — | — |
| **Mem0** | 66.9% | 0.15s | 0.20s | 1.44s | ~1,800 |
| **Mem0ᵍ (graph)** | 68.4% | 0.48s | 0.66s | 2.59s | — |

**Key Mem0 claims**: +26% accuracy over OpenAI Memory, 91% lower p95 latency vs full-context, 90% token savings.

---

## 3. Retrieval Latency

| System | Reported Latency | Notes |
|--------|-----------------|-------|
| **Mem0** | 0.15s median search, 0.71s median end-to-end | LOCOMO benchmark |
| **Zep** | <200ms claimed | Cloud service, sub-200ms SLA |
| **MemPalace** | ~seconds for ChromaDB search | Local, depends on corpus size; raw mode is fast |
| **Engram** | Fast (Rust binary) | No published latency numbers |
| **LangMem** | Depends on underlying store | In-memory fast, Postgres slower |
| **MemGPT/Letta** | Variable by tier | Core (in-context) is instant; archival has DB latency |

**Target for Hermes**: <100ms is achievable with local ChromaDB + a small embedding model (all-MiniLM-L6-v2, ~50MB).
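
A minimal sketch of that target setup using ChromaDB's persistent client, whose default embedding function is all-MiniLM-L6-v2 (the ~50MB model named above). The collection name, documents, and wing metadata are illustrative.

```python
import time

import chromadb

client = chromadb.PersistentClient(path="./hermes_memory")
memories = client.get_or_create_collection("session_memories")

memories.add(
    ids=["m1", "m2"],
    documents=[
        "User prefers concise answers with code examples.",
        "Project hermes uses SQLite + FTS5 for session storage.",
    ],
    metadatas=[{"wing": "global"}, {"wing": "workspace"}],
)

start = time.perf_counter()
hits = memories.query(
    query_texts=["how is session data stored?"],
    n_results=5,
    where={"wing": "workspace"},  # metadata filtering, MemPalace-style wings
)
elapsed_ms = (time.perf_counter() - start) * 1000
print(hits["documents"][0], f"{elapsed_ms:.1f} ms")
```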

---

## 4. Compression Techniques

| System | Technique | Compression Ratio | Fidelity Impact |
|--------|-----------|-------------------|-----------------|
| **MemPalace AAAK** | Lossy abbreviation dialect (entity codes, truncation) | Claimed ~30x (disputed) | 12.4pt R@5 regression (96.6% → 84.2%) |
| **Mem0** | LLM extraction → structured facts | ~14x token reduction (26K → 1.8K) | 6pt accuracy loss vs full-context |
| **MemGPT** | Hierarchical summarization + eviction | Variable | Depends on tier management |
| **Zep** | Graph compression + temporal invalidation | N/A | Maintains temporal accuracy |
| **Engram** | None (stores raw) | 1x | No loss |
| **LangMem** | Background consolidation via LLM | Variable | Depends on LLM quality |

**Key insight**: MemPalace's raw mode (no compression) achieves the best retrieval scores. Compression trades fidelity for token density. For Hermes, raw storage + semantic search is the safest starting point.

---

## 5. Architecture Patterns

### MemPalace (recommended for Hermes integration)
- **Hierarchical**: Wings (scope: global/workspace) → Rooms (priority: explicit/implicit)
- **Dual-store**: SQLite for canonical data, ChromaDB for vector search
- **Verbatim storage**: No LLM extraction, raw conversation storage
- **Explicit-first ranking**: User instructions always surface above auto-extracted context
- **Workspace isolation**: Memories scoped per project

### Mem0 (graph-enhanced)
- **Two-phase pipeline**: Extraction → Update
- **LLM-driven**: Uses an LLM to extract candidate memories and decide ADD/UPDATE/DELETE/NOOP
- **Graph variant (Mem0ᵍ)**: Entity extraction → relationship graph → conflict detection → temporal updates
- **Multi-level**: User, Session, Agent state

### Letta/MemGPT (OS-inspired)
- **Memory tiers**: Core (in-context), Recall (searchable), Archival (deep storage)
- **Self-editing**: The agent manages its own memory via function calls
- **Interrupts**: Control flow between agent and user

### Zep (knowledge graph)
- **Temporal knowledge graph**: Facts have valid_at/invalid_at timestamps
- **Graph RAG**: Relationship-aware retrieval
- **Powered by Graphiti**: Open-source temporal KG framework

---

## 6. Integration Patterns for Hermes

### Current Hermes Memory (memory_tool.py)
- File-backed: MEMORY.md + USER.md
- Delimiter-based entries (§)
- Frozen snapshot in system prompt
- No semantic search

### MemPalace Plugin (hermes_memorypalace)
- Implements `MemoryProvider` ABC
- ChromaDB + SQLite dual-store
- Lifecycle hooks: initialize, system_prompt_block, prefetch, sync_turn
- Tools: mempalace_remember_explicit, mempalace_store_implicit, mempalace_recall
- Local embedding model (all-MiniLM-L6-v2)

### Recommended Integration Approach
1. **Keep MEMORY.md/USER.md** as L0 (always-loaded baseline)
2. **Add MemPalace** as L1 (semantic search layer)
3. **Prefetch on each turn**: Run vector search before response generation (see the sketch after this list)
4. **Background sync**: Store conversation turns as implicit context
5. **Workspace scoping**: Isolate memories per project
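
A sketch of how steps 1-4 could hang off the plugin's lifecycle hooks. Only the hook names (initialize, system_prompt_block, prefetch, sync_turn) come from the plugin description above; the ABC signatures and the `LayeredMemory` implementation are assumptions for illustration.

```python
from abc import ABC, abstractmethod

class MemoryProvider(ABC):
    """Assumed shape of the hermes_memorypalace ABC (hook names from the docs)."""

    @abstractmethod
    def initialize(self) -> None: ...
    @abstractmethod
    def system_prompt_block(self) -> str: ...
    @abstractmethod
    def prefetch(self, user_message: str) -> list[str]: ...
    @abstractmethod
    def sync_turn(self, user_message: str, assistant_message: str) -> None: ...

class LayeredMemory(MemoryProvider):
    def __init__(self, baseline_path: str, collection):
        self.baseline_path = baseline_path  # L0: MEMORY.md / USER.md snapshot
        self.collection = collection        # L1: a ChromaDB collection

    def initialize(self) -> None:
        with open(self.baseline_path, encoding="utf-8") as f:
            self._baseline = f.read()

    def system_prompt_block(self) -> str:
        return self._baseline  # always-loaded L0 baseline

    def prefetch(self, user_message: str) -> list[str]:
        # L1 semantic recall, run before each response generation.
        hits = self.collection.query(query_texts=[user_message], n_results=5)
        return hits["documents"][0]

    def sync_turn(self, user_message: str, assistant_message: str) -> None:
        # Background sync: store the turn as implicit context (illustrative ids).
        turn_id = f"turn-{hash((user_message, assistant_message)) & 0xFFFFFFFF:x}"
        self.collection.add(
            ids=[turn_id],
            documents=[f"user: {user_message}\nassistant: {assistant_message}"],
        )
```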

---

## 7. Critical Caveats

1. **Retrieval ≠ answer accuracy**: The Engram team showed that an R@5 of 98.4% (MemPalace) can yield only 17% correct answers when an LLM actually tries to answer. The retrieval-to-accuracy gap is the real bottleneck.

2. **MemPalace's 96.6% is retrieval only**: Not end-to-end QA accuracy. End-to-end numbers are much lower (~17-40% depending on question difficulty).

3. **AAAK compression is lossy**: 12.4pt regression. Use raw mode for accuracy-critical work.

4. **Mem0's LOCOMO numbers are on a different benchmark**: Not directly comparable to LongMemEval scores.

5. **Latency depends heavily on corpus size and hardware**: Local ChromaDB on an M2 Ultra runs fast; older hardware may not meet <100ms targets.

---

## 8. Recommendations for Hermes MemPalace Integration

| Metric | Target | Achievable? | Approach |
|--------|--------|-------------|----------|
| Retrieval latency | <100ms | Yes | Local ChromaDB + small model, pre-indexed |
| Retrieval accuracy (R@5) | >95% | Yes | Raw verbatim mode, no compression |
| Token efficiency | <2000 tokens/convo | Yes | Selective retrieval, not full-context |
| Workspace isolation | Per-project | Yes | Wing-based scoping |
| Zero cloud dependency | 100% local | Yes | all-MiniLM-L6-v2 runs offline |

**Priority**: Integrate the existing hermes_memorypalace plugin in raw mode. Defer AAAK compression. Focus on retrieval latency and explicit-first ranking.

---

## Sources

- Mem0 paper: arXiv:2504.19413
- MemGPT paper: arXiv:2310.08560
- MemPalace repo: github.com/MemPalace/mempalace
- Engram benchmarks: github.com/199-biotechnologies/engram-2
- Hermes MemPalace plugin: github.com/neilharding/hermes_memorypalace
- LOCOMO benchmark results from mem0.ai/research
- LongMemEval: huggingface.co/datasets/xiaowu0162/longmemeval-cleaned
@@ -1,529 +0,0 @@

# Multi-Agent Coordination SOTA Research Report
## Fleet Knowledge Graph — Architecture Patterns & Integration Recommendations

**Date**: 2026-04-14
**Scope**: Agent-to-agent communication, shared memory, task delegation, consensus protocols, conflict resolution
**Frameworks Analyzed**: CrewAI, AutoGen, MetaGPT, ChatDev, CAMEL, LangGraph
**Target Fleet**: Hermes (orchestrator), Timmy, Claude Code, Gemini, Kimi

---

## 1. EXECUTIVE SUMMARY

Six major multi-agent frameworks each solve coordination differently. The SOTA converges on **four core patterns**: role-based delegation with capability matching, shared state via publish-subscribe messaging, directed-graph task flows with conditional routing, and layered memory (short-term context + long-term knowledge graph). For our fleet, the optimal architecture combines **AutoGen's GraphFlow** (DAG-based task routing), **CrewAI's hierarchical memory** (short-term RAG + long-term SQLite + entity memory), **MetaGPT's standardized output contracts** (typed task artifacts), and **CAMEL's role-playing delegation protocol** (inception-prompted agent negotiation).

---

## 2. FRAMEWORK-BY-FRAMEWORK ANALYSIS

### 2.1 CrewAI (v1.14.x) — Role-Based Crews with Hierarchical Orchestration

**Core Architecture:**
- **Process modes**: `Process.sequential` (tasks execute in order), `Process.hierarchical` (a manager agent delegates to workers)
- **Agent delegation**: `allow_delegation=True` enables agents to call other agents as tools, selecting the best agent for subtasks
- **Memory system**: Crew-level `memory=True` enables UnifiedMemory with:
  - **Short-term**: RAG-backed (embeddings → vector store) for recent task context
  - **Long-term**: SQLite-backed for persistent task outcomes
  - **Entity memory**: Tracks entities (people, companies, concepts) across tasks
  - **User memory**: Per-user preference tracking
  - **Embedder**: Configurable (OpenAI, Cohere, Jina, local ONNX, etc.)
- **Knowledge sources**: `knowledge_sources=[StringKnowledgeSource(...)]` for RAG-grounded context per agent or crew
- **Flows**: `@start`, `@listen`, `@router` decorators for DAG orchestration across crews; `or_()` and `and_()` combinators for conditional triggers
- **Callbacks**: `before_kickoff_callbacks`, `after_kickoff_callbacks`, `step_callback`, `task_callback`

**Key Patterns for Fleet:**
- **Delegation-as-tool**: Agents can invoke other agents by role → our fleet agents could expose themselves as callable tools to each other
- **Sequential handoff**: Task output from Agent A feeds directly as input to Agent B → pipeline pattern
- **Hierarchical manager**: A manager LLM decomposes goals and assigns tasks → matches the Hermes-as-orchestrator pattern
- **Shared memory with scopes**: Crew-level memory is visible to all agents; agent-level memory is private

**Limitations:**
- No native inter-process communication — all agents live in the same process
- Manager/hierarchical mode requires an LLM call just for delegation decisions (extra latency/cost)
- No built-in conflict resolution for concurrent writes to shared memory
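
A minimal sketch of the hierarchical-process and shared-memory configuration described above. The roles, goals, and manager model are illustrative; any LLM configured for CrewAI works as `manager_llm`.

```python
from crewai import Agent, Crew, Process, Task

researcher = Agent(
    role="Researcher",
    goal="Gather evidence for the assigned task",
    backstory="A methodical analyst.",
    allow_delegation=True,  # lets this agent call peers as tools
)
writer = Agent(
    role="Writer",
    goal="Turn findings into a concise report",
    backstory="A concise technical writer.",
)

report = Task(
    description="Summarize the state of agent memory systems.",
    expected_output="A one-page markdown summary.",
    agent=writer,
)

crew = Crew(
    agents=[researcher, writer],
    tasks=[report],
    process=Process.hierarchical,  # a manager LLM decomposes and delegates
    manager_llm="gpt-4o",          # illustrative choice
    memory=True,                   # short-term + long-term + entity memory
)
print(crew.kickoff())
```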

### 2.2 AutoGen (v0.7.5) — Flexible Team Topologies with Graph-Based Coordination

**Core Architecture:**
- **Team topologies** (5 types):
  - `RoundRobinGroupChat`: Sequential turn-taking, each agent speaks in order
  - `SelectorGroupChat`: LLM selects next speaker based on conversation context (`selector_prompt` template)
  - `MagenticOneGroupChat`: Orchestrator-driven (from Microsoft's Magentic-One paper), with stall detection and replanning
  - `Swarm`: Handoff-based — current speaker explicitly hands off to target via `HandoffMessage`
  - `GraphFlow`: **Directed acyclic graph** execution — agents execute based on DAG edges with conditional routing, fan-out, join patterns, and loop support
- **Agent types**:
  - `AssistantAgent`: Standard LLM agent with tools
  - `CodeExecutorAgent`: Runs code in isolated environments
  - `UserProxyAgent`: Human-in-the-loop proxy
  - `SocietyOfMindAgent`: **Meta-agent** — wraps an inner team and summarizes their output as a single response (composable nesting)
  - `MessageFilterAgent`: Filters/transforms messages between agents
- **Termination conditions**: `TextMentionTermination`, `MaxMessageTermination`, `SourceMatchTermination`, `HandoffTermination`, `TimeoutTermination`, `FunctionCallTermination`, `TokenUsageTermination`, `ExternalTermination` (programmatic control), `FunctionalTermination` (custom function)
- **Memory**: `Sequence[Memory]` on agents — per-agent memory stores (RAG-backed)
- **GraphFlow specifics**:
  - `DiGraphBuilder.add_node(agent, activation='all'|'any')`
  - `DiGraphBuilder.add_edge(source, target, condition=callable|str)` — conditional edges
  - `set_entry_point(agent)` — defines graph root
  - Supports: sequential, parallel fan-out, conditional branching, join patterns, loops with exit conditions
  - Node activation: `'all'` (wait for all incoming edges) vs `'any'` (trigger on first)
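
A sketch of the fan-out/join pattern with `DiGraphBuilder`, following the AutoGen 0.7 AgentChat docs; exact signatures can drift between releases, and the model client assumes an OpenAI key in the environment.

```python
import asyncio

from autogen_agentchat.agents import AssistantAgent
from autogen_agentchat.teams import DiGraphBuilder, GraphFlow
from autogen_ext.models.openai import OpenAIChatCompletionClient

async def main() -> None:
    client = OpenAIChatCompletionClient(model="gpt-4o-mini")
    coder = AssistantAgent("coder", model_client=client)
    reviewer = AssistantAgent("reviewer", model_client=client)
    docs = AssistantAgent("docs", model_client=client)
    merger = AssistantAgent("merger", model_client=client)

    # Fan out from coder to reviewer and docs, then join on merger; merger
    # waits for both incoming edges because node activation defaults to 'all'.
    builder = DiGraphBuilder()
    builder.add_node(coder).add_node(reviewer).add_node(docs).add_node(merger)
    builder.add_edge(coder, reviewer).add_edge(coder, docs)
    builder.add_edge(reviewer, merger).add_edge(docs, merger)

    team = GraphFlow(participants=builder.get_participants(), graph=builder.build())
    result = await team.run(task="Implement and document a retry helper.")
    print(result.messages[-1].content)

asyncio.run(main())
```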

**Key Patterns for Fleet:**
- **GraphFlow is the SOTA pattern** for multi-agent orchestration — DAG-based, conditional, supports parallel branches and joins
- **SocietyOfMindAgent** enables hierarchical composition — a team of agents wrapped as a single agent that can participate in a larger team
- **Selector pattern** (LLM picks next speaker) is elegant for heterogeneous fleets where capability matching matters
- **Swarm handoff** maps directly to our ACP handoff mechanism
- **Termination conditions** are composable — `termination_a | termination_b` (OR), `termination_a & termination_b` (AND)

### 2.3 MetaGPT — SOP-Driven Multi-Agent with Standardized Artifacts

**Core Architecture (from paper + codebase):**
- **SOP (Standard Operating Procedure)**: Tasks decomposed into phases, each with specific roles and required artifacts
- **Role-based agents**: Each role has `name`, `profile`, `goal`, `constraints`, `actions` (specific output types)
- **Shared Message Environment**: All agents publish to and subscribe from a shared `Environment` object
- **Publish-Subscribe**: Agents subscribe to message types/topics they care about, ignore others
- **Standardized Output**: Each action produces a typed artifact (e.g., `SystemDesign`, `Task`, `Code`) — structured contracts between agents
- **Memory**: `Memory` class stores all messages, retrievable by relevance. `Role.react()` calls `observe()` then `act()` based on observed messages
- **Communication**: Asynchronous message passing — agents publish results to the environment, interested agents react

**Key Patterns for Fleet:**
- **Typed artifact contracts**: Each agent publishes structured outputs (not free-form text) → reduces ambiguity in inter-agent communication
- **Pub-sub messaging**: Decouples sender from receiver — agents don't need to know about each other, just subscribe to relevant topics
- **SOP-driven phases**: Define workflow phases (e.g., "analysis" → "implementation" → "review") with specific agents per phase
- **Environment as blackboard**: Shared state all agents can read/write — the classic blackboard architecture for AI systems

### 2.4 ChatDev — Chat-Chain Architecture for Software Development

**Core Architecture:**
- **Chat Chain**: Sequential phases (design → code → test → document), each phase is a two-agent conversation
- **Role pairing**: Each phase pairs complementary roles (e.g., CEO ↔ CTO, Programmer ↔ Reviewer)
- **Communicative dehallucination**: Agents communicate through structured prompts that constrain outputs to prevent hallucination
- **Phase transitions**: Phase completion triggers the next phase; output from one phase seeds the next
- **Memory**: Conversation history within each phase; phase outputs stored as artifacts

**Key Patterns for Fleet:**
- **Phase-gated pipeline**: Each phase must produce a specific artifact type before proceeding
- **Complementary role pairing**: Pair agents with opposing perspectives (creator ↔ reviewer) for higher quality
- **Communicative protocols**: Structured conversation templates reduce free-form ambiguity

### 2.5 CAMEL — Role-Playing Autonomous Multi-Agent Communication

**Core Architecture:**
- **RolePlaying society**: Two agents (assistant + user) collaborate with inception prompting
- **Task specification**: `with_task_specify=True` uses a task-specify agent to refine the initial prompt into a concrete task
- **Task planning**: `with_task_planner=True` adds a planning agent that decomposes the task
- **Critic-in-the-loop**: `with_critic_in_the_loop=True` adds a critic agent that evaluates and approves/rejects
- **Inception prompting**: Both agents receive system messages that establish their roles, goals, and communication protocol
- **Termination**: Agents signal completion via specific tokens or phrases

**Key Patterns for Fleet:**
- **Inception prompting**: Agents negotiate a shared understanding of the task before executing
- **Critic-in-the-loop**: A dedicated reviewer agent validates outputs before acceptance
- **Role-playing protocol**: Structured back-and-forth between complementary agents
- **Task refinement chain**: Raw goal → specified task → planned subtasks → executed

### 2.6 LangGraph — Graph-Based Stateful Agent Workflows

**Core Architecture (from documentation/paper):**
- **StateGraph**: Typed state schema shared across all nodes (agents/tools)
- **Nodes**: Functions (agents, tools, transforms) that read/modify shared state
- **Edges**: Conditional routing based on state or agent decisions
- **Checkpointer**: Persistent state snapshots (SQLite, Postgres, in-memory) — enables pause/resume
- **Human-in-the-loop**: Interrupt nodes for approval, edit, review
- **Streaming**: Real-time node-by-node or token-by-token output
- **Subgraphs**: Composable graph composition — a subgraph can be a node in a parent graph
- **State channels**: Multiple state namespaces for different aspects of the workflow

**Key Patterns for Fleet:**
- **Shared typed state**: All agents operate on a well-defined state schema — eliminates ambiguity about what data each agent sees
- **Checkpoint persistence**: Workflows can be paused, resumed, forked — critical for long-running agent tasks
- **Conditional edges**: Route based on agent output type or state values
- **Subgraph composition**: Each fleet agent could be a subgraph, composed into larger workflows
- **Command-based routing**: Nodes return `Command(goto="node_name", update={...})` for explicit control flow
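
A minimal sketch of these LangGraph patterns: a typed state, a conditional review loop, and an in-memory checkpointer. The node bodies are stand-ins; real fleet nodes would call an LLM or a tool.

```python
from typing import TypedDict

from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import END, START, StateGraph

class FleetState(TypedDict):
    task: str
    artifact: str
    approved: bool

def implement(state: FleetState) -> dict:
    # Stand-in for a worker agent producing an artifact.
    return {"artifact": f"solution for: {state['task']}"}

def review(state: FleetState) -> dict:
    # Stand-in for a critic agent approving or rejecting the artifact.
    return {"approved": "solution" in state["artifact"]}

def route(state: FleetState) -> str:
    return END if state["approved"] else "implement"  # loop until approved

builder = StateGraph(FleetState)
builder.add_node("implement", implement)
builder.add_node("review", review)
builder.add_edge(START, "implement")
builder.add_edge("implement", "review")
builder.add_conditional_edges("review", route)

# The checkpointer persists state per thread_id, enabling pause/resume/fork.
graph = builder.compile(checkpointer=MemorySaver())
config = {"configurable": {"thread_id": "demo-1"}}
print(graph.invoke({"task": "add retries", "artifact": "", "approved": False}, config))
```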

---

## 3. CROSS-CUTTING PATTERNS ANALYSIS

### 3.1 Agent-to-Agent Communication

| Pattern | Frameworks | Latency | Decoupling | Structured |
|---------|-----------|---------|------------|------------|
| Direct tool invocation | CrewAI, AutoGen | Low | Low | Medium |
| Pub-sub messaging | MetaGPT | Medium | High | High |
| Handoff messages | AutoGen Swarm | Low | Medium | High |
| Chat-chain conversations | ChatDev, CAMEL | High | Low | Medium |
| Shared state graph | LangGraph, AutoGen GraphFlow | Low | Medium | High |

**Recommendation**: Use the **handoff + shared state** pattern. Agents communicate via typed handoff messages (what task was completed, what artifacts were produced) while sharing a typed state object (knowledge graph entries).

### 3.2 Shared Memory Patterns

| Pattern | Frameworks | Persistence | Scope | Query Method |
|---------|-----------|-------------|-------|-------------|
| RAG-backed short-term | CrewAI, AutoGen | Session | Crew/Team | Embedding similarity |
| SQLite long-term | CrewAI | Cross-session | Global | SQL + embeddings |
| Entity memory | CrewAI | Cross-session | Global | Entity lookup |
| Message store | MetaGPT | Session | Environment | Relevance search |
| Typed state channels | LangGraph | Checkpointed | Graph | State field access |
| Frozen snapshot | Hermes (current) | Cross-session | Agent | System prompt injection |

**Recommendation**: Implement **three-tier memory**:
1. **Session state** (LangGraph-style typed state graph) — shared within a workflow
2. **Fleet knowledge graph** (new) — structured triples/relations between entities, projects, decisions
3. **Agent-local memory** (existing MEMORY.md pattern) — per-agent persistent notes

### 3.3 Task Delegation

| Pattern | Frameworks | Decision Maker | Granularity |
|---------|-----------|---------------|-------------|
| Manager decomposition | CrewAI hierarchical | Manager LLM | Task-level |
| Delegation-as-tool | CrewAI | Self-selecting | Subtask |
| Selector-based | AutoGen SelectorGroupChat | LLM selector | Turn-level |
| Handoff-based | AutoGen Swarm | Current agent | Message-level |
| Graph-defined | AutoGen GraphFlow, LangGraph | Pre-defined DAG | Node-level |
| SOP-based | MetaGPT | Phase rules | Phase-level |

**Recommendation**: Use **hybrid delegation**:
- **Graph-based** for known workflows (CI/CD, code review pipelines) — pre-defined DAGs
- **Selector-based** for exploratory tasks (research, debugging) — an LLM picks the best agent
- **Handoff-based** for agent-initiated delegation — the current agent explicitly hands off

### 3.4 Consensus Protocols

No framework implements true consensus protocols (Raft, PBFT). Instead, they rely on lighter-weight mechanisms:

| Pattern | What It Solves |
|---------|---------------|
| Critic-in-the-loop (CAMEL) | Single reviewer approves/rejects |
| Aggregator synthesis (MoA/Mixture-of-Agents) | Multiple responses synthesized into one |
| Hierarchical manager (CrewAI) | Manager makes the final decision |
| MagenticOne orchestrator (AutoGen) | Orchestrator plans and replans |

**Recommendation for Fleet**: Implement **weighted ensemble consensus**:
1. Multiple agents produce independent solutions
2. A synthesis agent aggregates (like the MoA pattern already in Hermes)
3. For critical decisions, require 2-of-3 agreement from designated expert agents

### 3.5 Conflict Resolution

| Conflict Type | Resolution Strategy |
|--------------|-------------------|
| Concurrent memory writes | File locking + atomic rename (Hermes already does this) |
| Conflicting agent outputs | Critic/validator agent evaluates both |
| Task assignment conflicts | Single orchestrator (Hermes) assigns, no self-assignment |
| State graph race conditions | LangGraph checkpoint + merge strategies |

**Recommendation**:
- **Write conflicts**: Atomic operations with optimistic locking (existing pattern)
- **Output conflicts**: Dedicate one agent as "judge" for each workflow
- **Assignment conflicts**: Centralized orchestrator (Hermes) — no agent self-delegation to other fleet members without approval

---

## 4. FLEET ARCHITECTURE RECOMMENDATION

### 4.1 Proposed Architecture: "Fleet Knowledge Graph" (FKG)

```
┌─────────────────────────────────────────────────────────────┐
│                   FLEET KNOWLEDGE GRAPH                     │
│                                                             │
│  ┌──────────┐  ┌──────────┐  ┌──────────┐  ┌──────────┐    │
│  │ Entities │  │ Relations│  │ Artifacts│  │ Decisions│    │
│  │ (nodes)  │──│ (edges)  │──│ (typed)  │──│ (history)│    │
│  └──────────┘  └──────────┘  └──────────┘  └──────────┘    │
│                                                             │
│  Storage: SQLite + FTS5 (existing hermes_state.py pattern)  │
│  Schema: RDF-lite triples with typed properties             │
└─────────────────────┬───────────────────────────────────────┘
                      │
          ┌───────────┼───────────┐
          │           │           │
     ┌────▼────┐ ┌────▼────┐ ┌───▼─────┐
     │ Session │ │  Agent  │ │ Workflow│
     │  State  │ │ Memory  │ │ History │
     │ (shared)│ │ (local) │ │ (audit) │
     └─────────┘ └─────────┘ └─────────┘
```

### 4.2 Fleet Member Roles

| Agent | Role | Strengths | Delegation Style |
|-------|------|-----------|-----------------|
| **Hermes** | Orchestrator | Planning, tool use, multi-platform | Delegator (spawns others) |
| **Claude Code** | Code specialist | Deep code reasoning, ACP integration | Executor (receives tasks) |
| **Gemini** | Multimodal analyst | Vision, large context, fast | Executor (receives tasks) |
| **Kimi** | Coding assistant | Code generation, long context | Executor (receives tasks) |
| **Timmy** | (Details TBD) | TBD | Executor (receives tasks) |

### 4.3 Communication Protocol

**Inter-Agent Message Format** (inspired by MetaGPT's typed artifacts):

```json
{
  "message_type": "task_request|task_response|handoff|knowledge_update|conflict",
  "source_agent": "hermes",
  "target_agent": "claude_code",
  "task_id": "uuid",
  "parent_task_id": "uuid|null",
  "payload": {
    "goal": "...",
    "context": "...",
    "artifacts": [{"type": "code", "path": "..."}, {"type": "analysis", "content": "..."}],
    "constraints": ["..."],
    "priority": "high|medium|low"
  },
  "knowledge_graph_refs": ["entity:project-x", "relation:depends-on"],
  "timestamp": "ISO8601",
  "signature": "hmac-or-uuid"
}
```

### 4.4 Task Flow Patterns

**Pattern 1: Pipeline (ChatDev-style)**
```
Hermes → [Analyze] → Claude Code → [Implement] → Gemini → [Review] → Hermes → [Deliver]
```

**Pattern 2: Fan-out/Fan-in (AutoGen GraphFlow-style)**
```
         ┌→ Claude Code (code) ──┐
Hermes ──┼→ Gemini (analysis) ───┼→ Hermes (synthesize)
         └→ Kimi (docs) ─────────┘
```

**Pattern 3: Debate (CAMEL-style)**
```
Claude Code (proposal) ↔ Gemini (critic) → Hermes (judge)
```

**Pattern 4: Selector (AutoGen SelectorGroupChat)**
```
Hermes (orchestrator) → LLM selects best agent → Agent executes → Result → Repeat
```

### 4.5 Knowledge Graph Schema

```sql
-- Core entities
CREATE TABLE fkg_entities (
    id TEXT PRIMARY KEY,
    entity_type TEXT NOT NULL,  -- 'project', 'file', 'agent', 'task', 'concept', 'decision'
    name TEXT NOT NULL,
    properties JSON,            -- Flexible typed properties
    created_by TEXT,            -- Agent that created this
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- Relations between entities
CREATE TABLE fkg_relations (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    source_entity TEXT REFERENCES fkg_entities(id),
    target_entity TEXT REFERENCES fkg_entities(id),
    relation_type TEXT NOT NULL,  -- 'depends-on', 'created-by', 'reviewed-by', 'part-of', 'conflicts-with'
    properties JSON,
    confidence REAL DEFAULT 1.0,
    created_by TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- Task execution history
CREATE TABLE fkg_task_history (
    task_id TEXT PRIMARY KEY,
    parent_task_id TEXT,
    goal TEXT,
    assigned_agent TEXT,
    status TEXT,           -- 'pending', 'running', 'completed', 'failed', 'conflict'
    result_summary TEXT,
    artifacts JSON,        -- List of produced artifacts
    knowledge_refs JSON,   -- Entities/relations this task touched
    started_at TIMESTAMP,
    completed_at TIMESTAMP
);

-- Conflict tracking
CREATE TABLE fkg_conflicts (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    entity_id TEXT REFERENCES fkg_entities(id),
    conflict_type TEXT,  -- 'concurrent_write', 'contradictory_output', 'resource_contention'
    agent_a TEXT,
    agent_b TEXT,
    resolution TEXT,
    resolved_by TEXT,
    resolved_at TIMESTAMP
);

-- Full-text search across entities (external-content FTS5: the indexed
-- column names must match columns on fkg_entities)
CREATE VIRTUAL TABLE fkg_search USING fts5(
    name, entity_type, properties,
    content='fkg_entities', content_rowid='rowid'
);
```

---

## 5. INTEGRATION RECOMMENDATIONS

### 5.1 Phase 1: Foundation (Immediate — 1-2 weeks)

1. **Implement FKG SQLite database** at `~/.hermes/fleet_knowledge.db`
   - Extend the existing `hermes_state.py` pattern (already uses SQLite + FTS5)
   - Add the schema from §4.5
   - Create `tools/fleet_knowledge_tool.py` with CRUD operations

2. **Create fleet agent registry** in `agent/fleet_registry.py`
   - Map agent names → transport (ACP, API, subprocess)
   - Store capabilities, specializations, availability status
   - Integrate with the existing `acp_adapter/` and `delegate_tool.py`

3. **Define message protocol** as typed Python dataclasses
   - `FleetMessage`, `TaskRequest`, `TaskResponse`, `KnowledgeUpdate`
   - Validation via Pydantic (already a CrewAI dependency)
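
A sketch of what item 3 could look like as Pydantic models. It flattens the JSON payload from §4.3 into top-level fields for brevity; the field split and defaults are design assumptions.

```python
from datetime import datetime, timezone
from typing import Literal
from uuid import uuid4

from pydantic import BaseModel, Field

class Artifact(BaseModel):
    type: Literal["code", "analysis", "docs", "review"]
    path: str | None = None
    content: str | None = None

class FleetMessage(BaseModel):
    message_type: Literal[
        "task_request", "task_response", "handoff", "knowledge_update", "conflict"
    ]
    source_agent: str
    target_agent: str
    task_id: str = Field(default_factory=lambda: str(uuid4()))
    parent_task_id: str | None = None
    goal: str
    context: str = ""
    artifacts: list[Artifact] = []       # Pydantic copies mutable defaults safely
    constraints: list[str] = []
    priority: Literal["high", "medium", "low"] = "medium"
    knowledge_graph_refs: list[str] = []
    timestamp: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))

msg = FleetMessage(
    message_type="task_request",
    source_agent="hermes",
    target_agent="claude_code",
    goal="Extend delegate_tool.py for cross-agent delegation",
)
print(msg.model_dump_json(indent=2))
```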

### 5.2 Phase 2: Communication Layer (2-4 weeks)

4. **Build fleet delegation on top of the existing `delegate_tool.py`**
   - Extend it to support cross-agent delegation (not just child subagents)
   - ACP transport for Claude Code (already supported via `acp_command`)
   - OpenRouter/OpenAI-compatible API for Gemini, Kimi
   - Reuse the existing credential pool and provider resolution

5. **Implement selector-based task routing** (AutoGen SelectorGroupChat pattern)
   - LLM-based agent selection from the task description + agent capabilities
   - Hermes acts as the selector/orchestrator
   - Simple heuristic fallback (code → Claude Code, vision → Gemini, etc.); a sketch follows this list

6. **Add typed artifact contracts** (MetaGPT pattern)
   - Each task produces a typed artifact (code, analysis, docs, review)
   - Artifacts stored in the FKG with entity relations
   - Downstream agents consume typed inputs, not free-form text
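
A sketch of the heuristic fallback from item 5. The capability sets mirror the config in §6.2; the keyword hints are illustrative and would be tuned against real delegation history.

```python
CAPABILITIES = {
    "claude_code": {"code", "debugging", "architecture"},
    "gemini": {"vision", "analysis", "large_context"},
    "kimi": {"code", "long_context"},
}

KEYWORD_HINTS = {
    "code": {"refactor", "bug", "implement", "function", "test"},
    "vision": {"image", "screenshot", "diagram"},
    "long_context": {"entire repo", "whole codebase", "book"},
}

def select_agent(task: str) -> str:
    """Score agents by keyword-implied capabilities; default to claude_code."""
    text = task.lower()
    needed = {cap for cap, words in KEYWORD_HINTS.items()
              if any(w in text for w in words)}
    scores = {agent: len(caps & needed) for agent, caps in CAPABILITIES.items()}
    best = max(scores, key=scores.get)
    return best if scores[best] > 0 else "claude_code"

assert select_agent("Fix the failing test in parser.py") == "claude_code"
assert select_agent("Analyze this screenshot of the dashboard") == "gemini"
```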

### 5.3 Phase 3: Advanced Patterns (4-6 weeks)

7. **Implement workflow DAGs** (AutoGen GraphFlow pattern)
   - Pre-defined workflows as directed graphs (code review pipeline, research pipeline)
   - Conditional routing based on artifact types or agent decisions
   - Fan-out/fan-in for parallel execution across fleet agents

8. **Add conflict resolution**
   - Detect concurrent writes to the same FKG entities
   - Critic agent validates contradictory outputs
   - Track resolution history for learning

9. **Build consensus mechanism** for critical decisions
   - Weighted voting based on agent expertise
   - MoA-style aggregation (already implemented in `mixture_of_agents_tool.py`)
   - Escalation to human for irreconcilable conflicts

### 5.4 Phase 4: Intelligence (6-8 weeks)

10. **Learning from delegation history**
    - Track which agent performs best for which task types
    - Adjust routing weights over time
    - RL-style improvement of delegation decisions

11. **Fleet-level memory evolution**
    - Entities and relations in the FKG become the "shared brain"
    - Agents contribute knowledge as they work
    - Cross-agent knowledge synthesis (one agent's discovery benefits all)

---

## 6. BENCHMARKS & PERFORMANCE CONSIDERATIONS

### 6.1 Latency Estimates

| Pattern | Overhead | Notes |
|---------|----------|-------|
| Direct delegation (current) | ~30s per subagent | Spawn + run + collect |
| ACP transport (Claude Code) | ~2-5s connection + task time | Subprocess handshake |
| API-based (Gemini/Kimi) | ~1-2s + task time | Standard HTTP |
| Selector routing | +1 LLM call (~2-5s) | For agent selection |
| GraphFlow routing | +state overhead (~100ms) | Pre-defined, no LLM call |
| FKG query | ~1-5ms | SQLite indexed query |
| MoA consensus | ~15-30s (4 parallel + 1 aggregator) | Already implemented |

### 6.2 Recommended Configuration

```yaml
# Fleet coordination config (add to config.yaml)
fleet:
  enabled: true
  knowledge_db: "~/.hermes/fleet_knowledge.db"

  agents:
    hermes:
      role: orchestrator
      transport: local
    claude_code:
      role: code_specialist
      transport: acp
      acp_command: "claude"
      acp_args: ["--acp", "--stdio"]
      capabilities: ["code", "debugging", "architecture"]
    gemini:
      role: multimodal_analyst
      transport: api
      provider: openrouter
      model: "google/gemini-3-pro-preview"
      capabilities: ["vision", "analysis", "large_context"]
    kimi:
      role: coding_assistant
      transport: api
      provider: kimi-coding
      capabilities: ["code", "long_context"]

  delegation:
    strategy: selector  # selector | pipeline | graph
    max_concurrent: 3
    timeout_seconds: 300

  consensus:
    enabled: true
    min_agreement: 2  # 2-of-3 for critical decisions
    escalation_agent: hermes

  knowledge:
    auto_extract: true  # Extract entities from task results
    relation_confidence_threshold: 0.7
    search_provider: fts5  # fts5 | vector | hybrid
```

---

## 7. EXISTING HERMES INFRASTRUCTURE TO LEVERAGE

| Component | What It Provides | Reuse For |
|-----------|-----------------|-----------|
| `delegate_tool.py` | Subagent spawning, isolated contexts | Fleet delegation transport |
| `mixture_of_agents_tool.py` | Multi-model consensus/aggregation | Fleet consensus protocol |
| `memory_tool.py` | Bounded persistent memory with atomic writes | Pattern for FKG writes |
| `acp_adapter/` | ACP server for IDE integration | Claude Code transport |
| `hermes_state.py` | SQLite + FTS5 session store | FKG database foundation |
| `tools/registry.py` | Central tool registry | Fleet knowledge tool registration |
| `agent/credential_pool.py` | Credential rotation | Multi-provider auth |
| `hermes_cli/runtime_provider.py` | Provider resolution | Fleet agent connection |

---

## 8. KEY TAKEAWAYS

1. **GraphFlow (AutoGen) is the SOTA orchestration pattern** — DAG-based execution with conditional routing beats sequential chains and pure LLM-delegation for structured workflows

2. **Three-tier memory is essential** — Session state (volatile), knowledge graph (persistent structured), agent memory (persistent per-agent notes)

3. **Typed artifacts over free-form text** — MetaGPT's approach of standardized output contracts dramatically reduces inter-agent ambiguity

4. **Hybrid delegation beats any single pattern** — Pre-defined DAGs for known workflows, LLM selection for exploratory tasks, handoff for agent-initiated delegation

5. **Critic-in-the-loop is the practical consensus mechanism** — Don't implement Byzantine fault tolerance; a dedicated reviewer agent with clear acceptance criteria is sufficient

6. **Our existing infrastructure covers ~60% of what's needed** — delegate_tool, MoA, memory_tool, ACP adapter, and SQLite patterns are solid foundations to build on

7. **The fleet knowledge graph is the differentiator** — No existing framework has a proper shared knowledge graph that persists across agent interactions. Building this gives us a unique advantage.

---

*Report generated from analysis of CrewAI v1.14.1, AutoGen v0.7.5, CAMEL v0.2.90 (installed locally), plus MetaGPT, ChatDev, and LangGraph documentation.*
@@ -1,338 +0,0 @@

# Research Report: R@5 vs End-to-End Accuracy Gap

## Executive Summary

The gap between retrieval recall (R@5) and end-to-end answer accuracy is a **fundamental bottleneck** in RAG systems, not merely an engineering problem. MemPalace's finding of 98.4% R@5 but only 17% correct answers (an 81-point gap) is an extreme but not unusual case of this phenomenon. Academic research confirms the pattern: even with *oracle retrieval* (guaranteed correct documents), models below 7B parameters fail to extract correct answers 85-100% of the time on questions they cannot answer alone.

---

## 1. WHY Does Retrieval Succeed but Answering Fail?

### 1.1 The Fundamental Utilization Bottleneck

**Key Finding:** The gap is primarily a *reader/LLM utilization problem*, not a retrieval problem.

**Source:** "Can Small Language Models Use What They Retrieve?" (Pandey, 2026 - arXiv:2603.11513)

This study evaluated five model sizes (360M to 8B) across three architecture families under four retrieval conditions (no retrieval, BM25, dense, and oracle). Key findings:

- Even with **oracle retrieval** (the correct answer guaranteed to be in context), models of 7B or smaller fail to extract the correct answer **85-100% of the time** on questions they cannot answer alone
- Adding retrieval context **destroys 42-100% of answers** the model previously knew (the distraction effect)
- The dominant failure mode is **"irrelevant generation"** - the model ignores the provided context entirely
- These patterns hold across multiple prompt templates and retrieval methods

### 1.2 Context Faithfulness Problem

**Key Finding:** LLMs often prioritize their parametric knowledge over retrieved context, creating a "knowledge conflict."

**Source:** "Context-faithful Prompting for Large Language Models" (Zhou et al., 2023 - arXiv:2303.11315)

- LLMs encode parametric knowledge that can cause them to overlook contextual cues
- This leads to incorrect predictions in context-sensitive tasks
- Faithfulness can be significantly improved with carefully designed prompting strategies

### 1.3 The Distraction Effect

**Key Finding:** Retrieved context can actually *hurt* performance by distracting the model from answers it already knows.

**Source:** "Can Small Language Models Use What They Retrieve?" (arXiv:2603.11513)

- When retrieval context is added (even good context), models lose 42-100% of previously correct answers
- This suggests the model is "confused" by the presence of context rather than effectively utilizing it
- The distraction is driven by the *presence* of context rather than its quality

### 1.4 Multi-Hop Reasoning Failures

**Key Finding:** Complex queries requiring synthesis from multiple documents create cascading errors.

**Source:** "Tree of Reviews" (Li et al., 2024 - arXiv:2404.14464)

- Retrieved irrelevant paragraphs can mislead reasoning
- An error in the chain-of-thought structure leads to a cascade of errors
- Traditional chain methods are fragile to noise in retrieval

### 1.5 Similarity ≠ Utility

**Key Finding:** Cosine similarity between query and document doesn't guarantee the document will be *useful* for answering.

**Source:** "Similarity is Not All You Need: MetRag" (Gan et al., 2024 - arXiv:2405.19893)

- Existing RAG models use similarity as the bridge between queries and documents
- Relying solely on similarity sometimes degrades RAG performance
- Utility-oriented retrieval (what's actually helpful for answering) differs from similarity-oriented retrieval

### 1.6 Query Complexity Levels

**Source:** "Retrieval Augmented Generation (RAG) and Beyond" (Zhao et al., 2024 - arXiv:2409.14924)

The survey identifies four levels of query complexity, each with different utilization challenges:

1. **Explicit fact queries** - Simple extraction (high utilization expected)
2. **Implicit fact queries** - Require inference across documents (moderate utilization)
3. **Interpretable rationale queries** - Require understanding domain logic (low utilization)
4. **Hidden rationale queries** - Require deep synthesis (very low utilization)

The MemPalace crisis support domain likely involves levels 3-4, which explains the extreme gap.

---

## 2. Patterns That Bridge the Gap

### 2.1 Reader-Guided Reranking (RIDER)

**Effectiveness:** 10-20 point absolute gains in top-1 retrieval accuracy, 1-4 point EM gains

**Source:** "Rider: Reader-Guided Passage Reranking" (Mao et al., 2021 - arXiv:2101.00294)

**Pattern:** Use the reader's own predictions to rerank passages before final answer generation. This aligns retrieval with what the reader can actually use.

- Achieves 48.3 EM on Natural Questions with only 1,024 tokens (7.8 passages on average)
- Outperforms state-of-the-art transformer-based supervised rerankers
- No training required: uses the reader's top predictions as the signal

**Recommendation:** Implement reader-in-the-loop reranking to prioritize passages the LLM can actually utilize.
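
A minimal sketch of the RIDER idea: rerank passages toward the reader's own top predictions. It substitutes simple token overlap for the paper's exact-match scoring, so it illustrates the mechanism rather than reproducing RIDER itself.

```python
def rider_rerank(passages: list[str], reader_predictions: list[str]) -> list[str]:
    """Rerank passages by token overlap with the reader's top answer predictions."""
    pred_tokens = {tok for p in reader_predictions for tok in p.lower().split()}

    def score(passage: str) -> int:
        return len(pred_tokens & set(passage.lower().split()))

    return sorted(passages, key=score, reverse=True)

passages = [
    "Paris hosts many landmarks and museums.",
    "The Eiffel Tower was completed in 1889 for the World's Fair.",
]
# With a reader that already guesses "1889", the answer-bearing passage ranks first.
print(rider_rerank(passages, ["1889"]))
```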

### 2.2 Context-Faithful Prompting

**Effectiveness:** Significant improvement in faithfulness to context

**Source:** "Context-faithful Prompting" (Zhou et al., 2023 - arXiv:2303.11315)

**Two most effective techniques:**

1. **Opinion-based prompts:** Reframe context as a narrator's statement and ask about the narrator's opinions
   - Example: Instead of "Answer based on: [context]", use "According to the following testimony: [context]. What does the narrator suggest about X?"

2. **Counterfactual demonstrations:** Use examples containing false facts to improve faithfulness
   - The model learns to prioritize context over parametric knowledge

**Recommendation:** Use opinion-based framing and counterfactual examples in crisis support prompts.
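
As a rough illustration of opinion-based framing (the template below is our own paraphrase, not the paper's exact prompt):

```
def build_opinion_prompt(context: str, question: str) -> str:
    """Reframe retrieved context as a narrator's statement so the model
    answers from the context rather than from parametric memory."""
    return (
        f'A narrator gave the following testimony: "{context}"\n'
        f"In the narrator's opinion, {question}\n"
        "Answer:"
    )
```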

### 2.3 Retrieval-Augmented Thoughts (RAT)

**Effectiveness:** 13-43% relative improvement across tasks

**Source:** "RAT: Retrieval Augmented Thoughts" (Wang et al., 2024 - arXiv:2403.05313)

**Pattern:** Iteratively revise each chain-of-thought step with retrieved information relevant to:
- The task query
- The current thought step
- Past thought steps

**Results:**
- Code generation: +13.63%
- Mathematical reasoning: +16.96%
- Creative writing: +19.2%
- Embodied task planning: +42.78%

**Recommendation:** Implement iterative CoT revision with retrieval at each step.
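
A hedged sketch of the loop, assuming hypothetical `llm(prompt)` and `retrieve(query, k)` callables (the real method also conditions retrieval on past steps; this is a simplified stand-in):

```
def rat_answer(task, llm, retrieve, max_steps=5):
    """Draft a chain-of-thought step, retrieve evidence for it, revise,
    and repeat until the model emits a final answer."""
    thoughts = []
    for _ in range(max_steps):
        draft = llm(f"Task: {task}\nThoughts so far:\n" + "\n".join(thoughts)
                    + "\nWrite the next reasoning step, or 'FINAL: <answer>'.")
        if draft.startswith("FINAL:"):
            return draft[len("FINAL:"):].strip()
        # Retrieve against the task plus the current step, then revise the step.
        evidence = "\n".join(retrieve(task + " " + draft, k=3))
        thoughts.append(llm(f"Revise this step using only the evidence.\n"
                            f"Step: {draft}\nEvidence:\n{evidence}\nRevised step:"))
    return llm(f"Task: {task}\nThoughts:\n" + "\n".join(thoughts) + "\nFinal answer:")
```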

### 2.4 FAIR-RAG: Structured Evidence Assessment

**Effectiveness:** 8.3 point absolute F1 improvement on HotpotQA

**Source:** "FAIR-RAG" (Asl et al., 2025 - arXiv:2510.22344)

**Pattern:** Transform RAG into a dynamic reasoning process:
1. Decompose the query into a checklist of required findings
2. Audit the aggregated evidence to identify confirmed facts AND explicit gaps
3. Generate targeted sub-queries to fill the gaps
4. Repeat until the evidence is sufficient

**Recommendation:** For crisis support, implement gap-aware evidence assessment before generating answers.
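
A sketch of the gap-aware loop under the same hypothetical `llm`/`retrieve` interfaces as above (not FAIR-RAG's exact prompts):

```
def gap_aware_answer(question, llm, retrieve, max_rounds=3):
    """Decompose into a checklist, audit evidence for gaps, and issue
    targeted sub-queries until the checklist is covered."""
    checklist = llm(f"List the findings needed to answer: {question}")
    evidence = retrieve(question, k=5)
    for _ in range(max_rounds):
        gaps = llm("Checklist:\n" + checklist + "\nEvidence:\n" + "\n".join(evidence)
                   + "\nWhich findings are still unsupported? Reply NONE if complete.")
        if gaps.strip().upper() == "NONE":
            break
        for sub_query in gaps.splitlines():  # one targeted sub-query per gap
            evidence.extend(retrieve(sub_query, k=2))
    return llm("Answer using only this evidence:\n" + "\n".join(evidence)
               + f"\nQuestion: {question}")
```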

### 2.5 Two-Stage Retrieval with Marginal-Utility Reranking

**Source:** "Enhancing RAG with Two-Stage Retrieval" (George, 2025 - arXiv:2601.03258)

**Pattern:**
- Stage 1: LLM-driven query expansion for high recall
- Stage 2: Fast reranker (FlashRank) that dynamically selects the optimal evidence subset under a token budget
- Utility modeled as: relevance + novelty + brevity + cross-encoder evidence

**Recommendation:** Use marginal-utility reranking to balance relevance, novelty, and token efficiency.
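
A minimal greedy version of the idea: repeatedly pick the passage with the best relevance times novelty per token until the budget is spent. The scoring functions here are our assumptions, not FlashRank's internals:

```
def select_passages(passages, relevance, budget_tokens, n_tokens=len):
    """Greedy marginal-utility selection under a token budget.
    `relevance` maps passage -> score; `n_tokens` defaults to character
    count as a crude length proxy."""
    selected, used, remaining = [], 0, list(passages)
    while remaining:
        def utility(p):
            novelty = 1.0 - max((jaccard(p, s) for s in selected), default=0.0)
            return relevance[p] * novelty / max(n_tokens(p), 1)
        best = max(remaining, key=utility)
        if used + n_tokens(best) > budget_tokens:
            break
        selected.append(best); used += n_tokens(best); remaining.remove(best)
    return selected

def jaccard(a, b):
    wa, wb = set(a.split()), set(b.split())
    return len(wa & wb) / max(len(wa | wb), 1)  # word-overlap novelty penalty
```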

### 2.6 Multi-Layered Thoughts (MetRag)

**Source:** "Similarity is Not All You Need" (Gan et al., 2024 - arXiv:2405.19893)

**Pattern:** Three types of "thought" layers:
1. **Similarity-oriented** - Standard retrieval
2. **Utility-oriented** - Small utility model supervised by an LLM
3. **Compactness-oriented** - Task-adaptive summarization of retrieved documents

**Recommendation:** Add utility scoring and document summarization before LLM processing.

### 2.7 Retrieval Augmented Fine-Tuning (RAFT)

**Source:** "An Empirical Study of RAG with Chain-of-Thought" (Zhao et al., 2024 - arXiv:2407.15569)

**Pattern:** Combine chain-of-thought with supervised fine-tuning and RAG:
- The model learns to extract relevant information from noisy contexts
- Enhanced information extraction and logical reasoning
- Works for both long-form and short-form QA

**Recommendation:** Fine-tune on domain-specific data with CoT examples to improve utilization.

### 2.8 Monte Carlo Tree Search for Thought Generation

**Source:** "Retrieval Augmented Thought Process" (Pouplin et al., 2024 - arXiv:2402.07812)

**Effectiveness:** 35% additional accuracy vs. in-context RAG

**Pattern:** Formulate thought generation as a multi-step decision process optimized with MCTS:
- Learn a proxy reward function for cost-efficient inference
- Robust to imperfect retrieval
- Particularly effective for private/sensitive data domains

**Recommendation:** For crisis support, consider MCTS-based reasoning to handle imperfect retrieval gracefully.

---

## 3. Minimum Viable Retrieval for Crisis Support

### 3.1 Critical Insight: The Gap is LARGER for Complex Domains

Crisis support queries are likely at the "interpretable rationale" or "hidden rationale" level (from the RAG survey taxonomy). This means:
- Simple fact extraction won't work
- The model needs to understand nuanced guidance
- Multi-document synthesis is often required
- The stakes of incorrect answers are extremely high

### 3.2 Minimum Viable Components

Based on the research, the minimum viable RAG system for crisis support needs:

#### A. Retrieval Layer (Still Important)
- **Hybrid retrieval** (dense + sparse) for broad coverage
- **Reranking** with reader feedback (RIDER pattern)
- **Distractor filtering** - removing passages that hurt performance

#### B. Context Processing Layer (The Key Gap)
- **Context compression/summarization** - reduce noise
- **Relevance scoring** per passage, not just a single retrieval-level score
- **Utility-oriented ranking** beyond similarity

#### C. Generation Layer (Most Critical)
- **Explicit faithfulness instructions** in prompts
- **Opinion-based framing** for context utilization
- **Chain-of-thought with retrieval revision** (RAT pattern)
- **Evidence gap detection** before answering

#### D. Safety Layer
- **Answer verification** against retrieved context
- **Confidence calibration** - knowing when NOT to answer
- **Fallback to human escalation** when utilization fails

### 3.3 Recommended Architecture for Crisis Support

```
Query → Hybrid Retrieval → Reader-Guided Reranking → Context Compression
      → Faithfulness-Optimized Prompt → CoT with Retrieval Revision
      → Evidence Verification → Answer/Hold/Escalate Decision
```

### 3.4 Expected Performance

Based on the literature:
- **Naive RAG:** R@5 ~95%, E2E accuracy ~15-25%
- **With reranking:** E2E accuracy +1-4 points
- **With faithfulness prompting:** E2E accuracy +5-15 points
- **With iterative CoT+retrieval:** E2E accuracy +10-20 points
- **Combined interventions:** E2E accuracy 50-70% (realistic target)

The gap can be reduced from 81 points to ~25-45 points with proper interventions.

---

## 4. Key Takeaways

### The Gap is Fundamental, Not Accidental
- Even oracle retrieval doesn't guarantee correct answers
- Smaller models (<7B) have a "utilization bottleneck"
- The distraction effect means more context can hurt

### Bridging the Gap Requires a Multi-Pronged Approach
1. **Better retrieval alignment** (reader-guided, utility-oriented)
2. **Better context processing** (compression, filtering, summarization)
3. **Better prompting** (faithfulness, opinion-based, CoT)
4. **Better verification** (evidence checking, gap detection)

### Crisis Support Specific Considerations
- High stakes mean low tolerance for hallucination
- Complex queries require multi-step reasoning
- Domain expertise needs explicit encoding in prompts
- Safety requires explicit hold/escalate mechanisms

---

## 5. References

1. Pandey, S. (2026). "Can Small Language Models Use What They Retrieve?" arXiv:2603.11513
2. Zhou, W. et al. (2023). "Context-faithful Prompting for Large Language Models." arXiv:2303.11315
3. Zhao, S. et al. (2024). "Retrieval Augmented Generation (RAG) and Beyond." arXiv:2409.14924
4. Mao, Y. et al. (2021). "Rider: Reader-Guided Passage Reranking." arXiv:2101.00294
5. George, S. (2025). "Enhancing RAG with Two-Stage Retrieval." arXiv:2601.03258
6. Asl, M.A. et al. (2025). "FAIR-RAG: Faithful Adaptive Iterative Refinement." arXiv:2510.22344
7. Zhao, Y. et al. (2024). "An Empirical Study of RAG with Chain-of-Thought." arXiv:2407.15569
8. Wang, Z. et al. (2024). "RAT: Retrieval Augmented Thoughts." arXiv:2403.05313
9. Gan, C. et al. (2024). "Similarity is Not All You Need: MetRag." arXiv:2405.19893
10. Pouplin, T. et al. (2024). "Retrieval Augmented Thought Process." arXiv:2402.07812
11. Li, J. et al. (2024). "Tree of Reviews." arXiv:2404.14464
12. Tian, F. et al. (2026). "Predicting Retrieval Utility and Answer Quality in RAG." arXiv:2601.14546
13. Qi, J. et al. (2025). "On the Consistency of Multilingual Context Utilization in RAG." arXiv:2504.00597

---

## 6. Implementation Recommendations

Based on the root-cause analysis above, the following concrete steps are recommended for the Hermes agent memory pipeline (see issue #659 for the parent epic and #876 for this research report):

### 6.1 Chunk-Overlap Retrieval

**Problem:** Relevant information is frequently split across chunk boundaries. Retrieval finds one chunk but the answer spans two.

**Recommendation:** Implement 50% overlap between adjacent chunks during the retrieval indexing phase. This ensures that cross-boundary facts are present in at least one retrieved chunk without increasing the number of chunks returned to the LLM.
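
A minimal sketch of the indexing-side change (character-based for brevity; a token-based splitter would follow the same shape):

```
def chunk_with_overlap(text: str, chunk_size: int = 512, overlap: float = 0.5):
    """Fixed-size chunks where each chunk shares `overlap` of its length
    with the previous one, so a fact straddling one boundary still
    appears whole in at least one chunk."""
    step = max(1, int(chunk_size * (1 - overlap)))
    return [text[i:i + chunk_size]
            for i in range(0, len(text), step)
            if text[i:i + chunk_size]]
```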

### 6.2 Retrieval Confidence Scoring

**Problem:** The model generates plausible-sounding but wrong answers because retrieved context provides false confidence.

**Recommendation:** Add a confidence score to each retrieved chunk (e.g., cosine-similarity threshold + source-reliability weight). Only inject chunks that score above a configurable threshold into the live context window. Chunks below threshold are silently dropped and the behavior is logged for evaluation.
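
A sketch of the injection filter; the 0.8/0.2 weights and the `similarity`/`reliability` fields are placeholders to be tuned, not measured values:

```
import logging

def filter_chunks(chunks, threshold=0.75):
    """Inject only chunks whose combined confidence clears the threshold;
    log the drops so the behavior can be evaluated offline."""
    kept = []
    for c in chunks:
        score = 0.8 * c["similarity"] + 0.2 * c["reliability"]
        if score >= threshold:
            kept.append(c)
        else:
            logging.debug("Dropped chunk %s (score %.2f < %.2f)",
                          c.get("id"), score, threshold)
    return kept
```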

### 6.3 Chain-of-Thought Over Retrieved Context

**Problem:** The model retrieves correctly but fails to chain multi-hop reasoning across chunks.

**Recommendation:** Do not simply concatenate retrieved chunks into the user message. Instead, prepend a structured reasoning prompt (see the sketch after this list) that forces the model to:

1. Quote the specific chunk that supports each step.
2. Flag when two chunks must be combined to reach a conclusion.
3. Stop and emit "I don't know" if no chunk supports a required inference step.
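
A sketch of the prompt builder (the exact wording is illustrative):

```
def build_reasoning_prompt(question: str, chunks: list) -> str:
    """Number the chunks and force quoted support per step, flagged
    multi-chunk combinations, and an explicit refusal when no chunk
    supports a required step."""
    numbered = "\n".join(f"[{i}] {c}" for i, c in enumerate(chunks, 1))
    return (
        f"Context chunks:\n{numbered}\n\n"
        "Answer step by step. For every step:\n"
        "1. Quote the chunk number and text that supports it.\n"
        "2. If a step needs two chunks, write COMBINED [i]+[j].\n"
        "3. If no chunk supports a required step, stop and answer exactly:\n"
        "   I don't know\n\n"
        f"Question: {question}"
    )
```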

### 6.4 "I Don't Know" Fallback

**Problem:** Confidence miscalibration leads to hallucinated answers that sound authoritative.

**Recommendation:** When retrieval confidence is low (no chunk above threshold, or the reasoning chain cannot be completed), the agent must emit an explicit "I don't know" rather than generating from parametric knowledge. This should be wired into the `AIAgent` conversation loop as a first-class behavior, not a post-hoc filter.
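
Wired into the loop, the fallback is a guard before generation, building on the sketches above (`generate` stands in for the model call):

```
def answer_or_refuse(question, chunks, generate, threshold=0.75):
    """Refuse instead of generating from parametric knowledge when no
    chunk clears the confidence threshold."""
    usable = [c for c in chunks if c["score"] >= threshold]
    if not usable:
        return "I don't know"
    return generate(build_reasoning_prompt(question, [c["text"] for c in usable]))
```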

### 6.5 Architecture Impact

Our existing holographic memory (HRR) may partially address context-window dilution (root cause #1) by binding related chunks together, but it does not solve reasoning-chain breaks (root cause #3). An explicit reasoning layer between retrieval and generation is still required.

---

## 7. Limitations of This Research

1. **MemPalace/Engram team analysis not found** - The specific analysis that discovered the 17% figure was not located through academic search. It may come from internal reports, blog posts, or presentations not indexed on arXiv.

2. **Domain specificity** - Most RAG research focuses on general QA, not crisis support. The patterns may need adaptation for high-stakes, sensitive domains.

3. **Model size effects** - The utilization bottleneck is worse for smaller models. The MemPalace system's model size is unknown.

4. **Evaluation methodology** - Different papers use different metrics (EM, F1, accuracy), making direct comparison difficult.

---

*Research conducted: April 14, 2026*
*Researcher: Hermes Agent (subagent)*
*Task: Research Task #1 - R@5 vs End-to-End Accuracy Gap*

@@ -1,208 +0,0 @@

# Open-Source Text-to-Music-Video Pipeline Research

## Executive Summary

**The complete text-to-music-video pipeline does NOT exist as a single open-source tool.** The landscape consists of powerful individual components that must be manually stitched together. This is the gap our Video Forge can fill.

---

## 1. EXISTING OPEN-SOURCE PIPELINES

### Complete (but crude) Pipelines

| Project | Stars | Description | Status |
|---------|-------|-------------|--------|
| **MusicVideoMaker** | 3 | Stable Diffusion pipeline for music videos from lyrics. Uses Excel spreadsheet for lyrics+timing, generates key frames, smooths between them. | Proof-of-concept, Jupyter notebook, not production-ready |
| **DuckTapeVideos** | 0 | Node-based AI pipeline for beat-synced music videos from lyrics | Minimal, early stage |
| **song-video-gen** | 0 | Stable Diffusion lyrics-based generative AI pipeline | Fork/copy of above |
| **TikTok-Lyric-Video-Pipeline** | 1 | Automated Python pipeline for TikTok lyric videos (10-15/day) | Focused on lyric overlay, not generative visuals |

**Verdict: Nothing production-ready exists as a complete pipeline.**

---

## 2. INDIVIDUAL COMPONENTS (What's Already Free)

### A. Music Generation (Suno Alternatives)

| Project | Stars | License | Self-Hostable | Quality |
|---------|-------|---------|---------------|---------|
| **YuE** | 6,144 | Apache-2.0 | ✅ Yes | Full-song generation with vocals, Suno-level quality |
| **HeartMuLa** | 4,037 | Apache-2.0 | ✅ Yes | Most powerful open-source music model (2026), multilingual |
| **ACE-Step 1.5 + UI** | 970 | MIT | ✅ Yes | Professional Spotify-like UI, full song gen, 4+ min with vocals |
| **Facebook MusicGen** | ~45k downloads | MIT | ✅ Yes | Good quality, melody conditioning, well-documented |
| **Riffusion** | ~6k stars | Apache-2.0 | ✅ Yes | Spectrogram-based, unique approach |

**Status: Suno-level music generation is effectively "given away" for free. YuE and HeartMuLa are production-ready.**

### B. Image Generation (Per-Scene/Beat)

| Project | Downloads/Stars | License | Notes |
|---------|-----------------|---------|-------|
| **Stable Diffusion XL** | 1.9M downloads | CreativeML | Best quality, huge ecosystem |
| **Stable Diffusion 1.5** | 1.6M downloads | CreativeML | Fast, lightweight |
| **FLUX** | Emerging | Apache-2.0 | Newest, excellent quality |
| **ComfyUI** | 60k+ stars | GPL-3.0 | Node-based pipeline editor, massive plugin ecosystem |

**Status: Image generation is completely "given away." SD XL + ComfyUI is production-grade.**

### C. Text-to-Video Generation

| Project | Stars | License | Capabilities |
|---------|-------|---------|--------------|
| **Wan2.1** | 15,815 | Apache-2.0 | State-of-the-art, text-to-video and image-to-video |
| **CogVideoX** | 12,634 | Apache-2.0 | Text and image to video, good quality |
| **HunyuanVideo** | 11,965 | Custom | Tencent's framework, high quality |
| **Stable Video Diffusion** | 3k+ likes | Stability AI | Image-to-video, good for short clips |
| **LTX-Video** | Growing | Apache-2.0 | Fast inference, good quality |

**Status: Text-to-video is rapidly being "given away." Wan2.1 is production-ready for short clips (4-6 seconds).**

### D. Video Composition & Assembly

| Project | Stars | License | Use Case |
|---------|-------|---------|----------|
| **Remotion** | 43,261 | Custom (SSPL) | Programmatic video with React, production-grade |
| **MoviePy** | 12k+ stars | MIT | Python video editing, widely used |
| **Mosaico** | 16 | MIT | Python video composition with AI integration |
| **FFmpeg** | N/A | LGPL/GPL | The universal video tool |

**Status: Video composition tools are mature and free. Remotion is production-grade.**

### E. Lyrics/Text Processing

| Component | Status | Notes |
|-----------|--------|-------|
| **Lyrics-to-scene segmentation** | ❌ Missing | No good open-source tool for breaking lyrics into visual scenes |
| **Beat detection** | ✅ Exists | Librosa, madmom, aubio - all free and mature |
| **Text-to-prompt generation** | ✅ Exists | LLMs (Ollama, local models) can do this |
| **LRC/SRT parsing** | ✅ Exists | Many libraries available |

---

## 3. WHAT'S BEEN "GIVEN AWAY" FOR FREE

### Fully Solved (Production-Ready, Self-Hostable)
- ✅ **Music generation**: YuE, HeartMuLa, ACE-Step match Suno quality
- ✅ **Image generation**: SD XL, FLUX - commercial quality
- ✅ **Video composition**: FFmpeg, MoviePy, Remotion
- ✅ **Beat/audio analysis**: Librosa, madmom
- ✅ **Text-to-video (short clips)**: Wan2.1, CogVideoX
- ✅ **TTS/voice**: XTTS-v2, Kokoro, Bark

### Partially Solved
- ⚠️ **Image-to-video**: Good for 4-6 second clips, struggles with longer sequences
- ⚠️ **Style consistency**: LoRAs and ControlNet help, but not perfect across scenes
- ⚠️ **Prompt engineering**: LLMs can help, but no dedicated lyrics-to-visual-prompt tool

---

## 4. WHERE THE REAL GAPS ARE

### Critical Gaps (Our Opportunity)

1. **Unified Pipeline Orchestration**
   - NO tool chains the full flow: lyrics → music → scene segmentation → image prompts → video composition
   - Everything requires manual stitching
   - Our Video Forge can be THE glue layer

2. **Lyrics-to-Visual-Scene Segmentation**
   - No tool analyzes lyrics and breaks them into visual beats/scenes
   - MusicVideoMaker uses manual Excel entry - absurd
   - Opportunity: LLM-powered scene segmentation with beat alignment

3. **Temporal Coherence Across Scenes**
   - Short clips (4-6s) work fine, but maintaining visual coherence across a 3-4 minute video is unsolved
   - Character consistency, color palette continuity, and style drift are all open problems
   - Opportunity: Style anchoring + scene-to-scene conditioning

4. **Beat-Synchronized Visual Transitions**
   - No tool automatically syncs visual cuts to musical beats
   - Manual timing is required everywhere
   - Opportunity: Beat detection → transition scheduling → FFmpeg composition

5. **Long-Form Video Generation**
   - Text-to-video models max out at 4-6 seconds
   - Stitching clips with consistent style/characters is manual
   - Opportunity: Automated clip chaining with style transfer

6. **One-Click "Lyrics In, Video Out"**
   - The dream pipeline doesn't exist
   - Current workflows require 5+ separate tools
   - Opportunity: Single command/endpoint that does everything

### Technical Debt in Existing Tools

- **YuE/HeartMuLa**: No video awareness - just audio generation
- **Wan2.1/CogVideoX**: No lyrics/text awareness - just prompt-to-video
- **ComfyUI**: Great for images, weak for video composition
- **Remotion**: Great for composition, no AI generation built-in

---

## 5. RECOMMENDED ARCHITECTURE FOR VIDEO FORGE

Based on this research, the optimal Video Forge pipeline:

```
[Lyrics/Poem Text]
        ↓
[LLM Scene Segmenter] → Beat-aligned scene descriptions + visual prompts
        ↓
[HeartMuLa/YuE] → Music audio (.wav)
        ↓
[Beat Detector (librosa)] → Beat timestamps + energy curve
        ↓
[SD XL / FLUX] → Scene images (one per beat/section)
        ↓
[Wan2.1 img2vid] → Short video clips per scene (4-6s each)
        ↓
[FFmpeg + Beat Sync] → Transitions aligned to beats
        ↓
[Final Music Video (.mp4)]
```
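
The beat-sync stage is the one piece of this diagram that is nearly free to prototype. A sketch using librosa's beat tracker (the librosa calls are the documented API; the cut-scheduling heuristic is our own assumption):

```
import librosa

def beat_timestamps(audio_path: str):
    """Return beat times in seconds for a generated track."""
    y, sr = librosa.load(audio_path)
    _, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
    return librosa.frames_to_time(beat_frames, sr=sr)

def schedule_cuts(beats, min_scene_seconds=4.0):
    """Pick cut points on beats, but never closer together than one
    clip's minimum length (img2vid clips run ~4-6s)."""
    cuts, last = [], -min_scene_seconds
    for t in beats:
        if t - last >= min_scene_seconds:
            cuts.append(float(t))
            last = t
    return cuts
```

The resulting cut list can be fed straight to an FFmpeg concat/transition step so every scene change lands on a beat.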

### Key Design Decisions

1. **Music**: HeartMuLa (best quality, multilingual, Apache-2.0)
2. **Images**: SD XL via ComfyUI (most mature ecosystem)
3. **Video clips**: Wan2.1 for img2vid (state-of-the-art)
4. **Composition**: FFmpeg (universal, battle-tested)
5. **Orchestration**: Python pipeline with config file
6. **Scene segmentation**: Local LLM (Ollama + Llama 3 or similar)

### What We Build vs. What We Use

| Component | Build or Use | Reasoning |
|-----------|--------------|-----------|
| Lyrics → Scenes | **BUILD** | No good tool exists, core differentiator |
| Music generation | **USE** HeartMuLa/YuE | Already excellent, Apache-2.0 |
| Image generation | **USE** SD XL | Mature, huge ecosystem |
| Beat detection | **USE** librosa | Mature, reliable |
| Video clips | **USE** Wan2.1 | Best quality, Apache-2.0 |
| Video composition | **BUILD** (ffmpeg wrapper) | Need beat-sync logic |
| Pipeline orchestration | **BUILD** | The main value-add |

---

## 6. COMPETITIVE LANDSCAPE SUMMARY

### Commercial (Not Self-Hostable)
- **Suno**: Music only, no video
- **Runway**: Video only, expensive
- **Pika**: Short clips only
- **Kaiber**: Closest to music video, but closed/subscription
- **Synthesia**: Avatar-based, not generative art

### Open-Source Gaps That Matter
1. Nobody has built the orchestration layer
2. Nobody has solved lyrics-to-visual-scene segmentation well
3. Nobody has automated beat-synced visual transitions
4. Nobody maintains temporal coherence across minutes

**Our Video Forge fills the most important gap: the glue that makes individual AI components work together to produce a complete music video from text.**

---

*Research conducted: April 14, 2026*
*Sources: GitHub API, HuggingFace API, project READMEs*

91
run_agent.py
@@ -106,7 +106,7 @@ from agent.trajectory import (
    convert_scratchpad_to_think, has_incomplete_scratchpad,
    save_trajectory as _save_trajectory_to_file,
)
from utils import atomic_json_write, env_var_enabled, repair_and_load_json
from utils import atomic_json_write, env_var_enabled


@@ -277,7 +277,7 @@ def _should_parallelize_tool_batch(tool_calls) -> bool:
    for tool_call in tool_calls:
        tool_name = tool_call.function.name
        try:
            function_args = repair_and_load_json(tool_call.function.arguments, default={})
            function_args = json.loads(tool_call.function.arguments)
        except Exception:
            logging.debug(
                "Could not parse args for %s — defaulting to sequential; raw=%s",
@@ -2246,8 +2246,9 @@ class AIAgent:
        for msg in getattr(review_agent, "_session_messages", []):
            if not isinstance(msg, dict) or msg.get("role") != "tool":
                continue
            data = repair_and_load_json(msg.get("content", "{}"), default=None, context="trajectory_content")
            if data is None:
            try:
                data = json.loads(msg.get("content", "{}"))
            except (json.JSONDecodeError, TypeError):
                continue
            if not data.get("success"):
                continue
@@ -2495,13 +2496,13 @@ class AIAgent:
            if not tool_call or not isinstance(tool_call, dict): continue
            # Parse arguments - should always succeed since we validate during conversation
            # but keep try-except as safety net
            raw_args = tool_call["function"]["arguments"]
            if isinstance(raw_args, str):
                arguments = repair_and_load_json(raw_args, default={}, context="trajectory_tool_call")
                if arguments == {} and raw_args.strip() not in ("{}", ""):
                    logging.warning("Unexpected invalid JSON in trajectory conversion: %.100s", raw_args)
            else:
                arguments = raw_args
            try:
                arguments = json.loads(tool_call["function"]["arguments"]) if isinstance(tool_call["function"]["arguments"], str) else tool_call["function"]["arguments"]
            except json.JSONDecodeError:
                # This shouldn't happen since we validate and retry during conversation,
                # but if it does, log warning and use empty dict
                logging.warning(f"Unexpected invalid JSON in trajectory conversion: {tool_call['function']['arguments'][:100]}")
                arguments = {}

            tool_call_json = {
                "name": tool_call["function"]["name"],
@@ -2529,10 +2530,11 @@ class AIAgent:

            # Try to parse tool content as JSON if it looks like JSON
            tool_content = tool_msg["content"]
            if isinstance(tool_content, str) and tool_content.strip().startswith(("{", "[")):
                parsed = repair_and_load_json(tool_content, default=None, context="trajectory_tool_content")
                if parsed is not None:
                    tool_content = parsed
            try:
                if tool_content.strip().startswith(("{", "[")):
                    tool_content = json.loads(tool_content)
            except (json.JSONDecodeError, AttributeError):
                pass  # Keep as string if not valid JSON

            tool_index = len(tool_responses)
            tool_name = (
@@ -2883,21 +2885,14 @@ class AIAgent:
        # with partial history and would otherwise clobber the full JSON log.
        if self.session_log_file.exists():
            try:
                existing = repair_and_load_json(
                    self.session_log_file.read_text(encoding="utf-8"),
                    default=None,
                    context="session_log_load",
                )
                if existing is None:
                    logging.warning("Session log at %s could not be parsed; allowing overwrite", self.session_log_file)
                else:
                    existing_count = existing.get("message_count", len(existing.get("messages", [])))
                    if existing_count > len(cleaned):
                        logging.debug(
                            "Skipping session log overwrite: existing has %d messages, current has %d",
                            existing_count, len(cleaned),
                        )
                        return
                existing = json.loads(self.session_log_file.read_text(encoding="utf-8"))
                existing_count = existing.get("message_count", len(existing.get("messages", [])))
                if existing_count > len(cleaned):
                    logging.debug(
                        "Skipping session log overwrite: existing has %d messages, current has %d",
                        existing_count, len(cleaned),
                    )
                    return
            except Exception:
                pass  # corrupted existing file — allow the overwrite

@@ -3120,12 +3115,13 @@ class AIAgent:
            # Quick check: todo responses contain "todos" key
            if '"todos"' not in content:
                continue
            data = repair_and_load_json(content, default=None, context="todo_content")
            if data is None:
            try:
                data = json.loads(content)
                if "todos" in data and isinstance(data["todos"], list):
                    last_todo_response = data["todos"]
                    break
            except (json.JSONDecodeError, TypeError):
                continue
            if "todos" in data and isinstance(data["todos"], list):
                last_todo_response = data["todos"]
                break

        if last_todo_response:
            # Replay the items into the store (replace mode)
@@ -5964,7 +5960,7 @@ class AIAgent:
                result_json = asyncio.run(
                    vision_analyze_tool(image_url=vision_source, user_prompt=analysis_prompt)
                )
                result = repair_and_load_json(result_json, default={}, context="vision_result") if isinstance(result_json, str) else {}
                result = json.loads(result_json) if isinstance(result_json, str) else {}
                description = (result.get("analysis") or "").strip()
            except Exception as e:
                description = f"Image analysis failed: {e}"
@@ -6762,7 +6758,7 @@ class AIAgent:
        for tc in tool_calls:
            if tc.function.name == "memory":
                try:
                    args = repair_and_load_json(tc.function.arguments, default={}, context="memory_flush")
                    args = json.loads(tc.function.arguments)
                    flush_target = args.get("target", "memory")
                    from tools.memory_tool import memory_tool as _memory_tool
                    _memory_tool(
@@ -7069,7 +7065,7 @@ class AIAgent:
            self._iters_since_skill = 0

        try:
            function_args = repair_and_load_json(tool_call.function.arguments, default={})
            function_args = json.loads(tool_call.function.arguments)
        except json.JSONDecodeError:
            function_args = {}
        if not isinstance(function_args, dict):
@@ -7266,7 +7262,7 @@ class AIAgent:
        function_name = tool_call.function.name

        try:
            function_args = repair_and_load_json(tool_call.function.arguments, default={})
            function_args = json.loads(tool_call.function.arguments)
        except json.JSONDecodeError as e:
            logging.warning(f"Unexpected JSON error after validation: {e}")
            function_args = {}
@@ -8301,15 +8297,14 @@ class AIAgent:
        for tc in tcs:
            if isinstance(tc, dict) and "function" in tc:
                try:
                    args_obj = repair_and_load_json(tc["function"]["arguments"], default=None, context="cache_serialization")
                    if args_obj is not None:
                        tc = {**tc, "function": {
                            **tc["function"],
                            "arguments": json.dumps(
                                args_obj, separators=(",", ":"),
                                sort_keys=True,
                            ),
                        }}
                    args_obj = json.loads(tc["function"]["arguments"])
                    tc = {**tc, "function": {
                        **tc["function"],
                        "arguments": json.dumps(
                            args_obj, separators=(",", ":"),
                            sort_keys=True,
                        ),
                    }}
                except Exception:
                    pass
            new_tcs.append(tc)
@@ -1,181 +0,0 @@
#!/usr/bin/env python3
"""
cron-audit — Audit and clean up dead cron jobs.

Finds jobs with zero completions, low success rates, or stale schedules.
Can disable or delete dead jobs.

Usage:
    python scripts/cron_audit.py                  # Show dead jobs
    python scripts/cron_audit.py --disable        # Disable dead jobs
    python scripts/cron_audit.py --delete         # Delete dead jobs
    python scripts/cron_audit.py --threshold 0    # Jobs with 0 completions
    python scripts/cron_audit.py --older-than 7d  # Jobs with no runs in 7 days
"""

import argparse
import json
import sys
from pathlib import Path
from typing import Any, Dict, List

HERMES_HOME = Path.home() / ".hermes"
JOBS_FILE = HERMES_HOME / "cron" / "jobs.json"


def load_jobs() -> List[Dict[str, Any]]:
    """Load cron jobs from jobs.json."""
    if not JOBS_FILE.exists():
        print(f"Error: {JOBS_FILE} not found")
        return []
    with open(JOBS_FILE) as f:
        data = json.load(f)
    return data.get("jobs", [])


def save_jobs(jobs: List[Dict[str, Any]]):
    """Save jobs back to jobs.json."""
    JOBS_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(JOBS_FILE, "r") as f:
        data = json.load(f)
    data["jobs"] = jobs
    with open(JOBS_FILE, "w") as f:
        json.dump(data, f, indent=2)


def find_dead_jobs(
    jobs: List[Dict[str, Any]],
    completion_threshold: int = 0,
) -> List[Dict[str, Any]]:
    """Find jobs with completions at or below threshold."""
    dead = []
    for job in jobs:
        repeat = job.get("repeat", {})
        completed = repeat.get("completed", 0)
        if completed <= completion_threshold:
            dead.append(job)
    return dead


def find_stale_jobs(
    jobs: List[Dict[str, Any]],
    max_age_hours: float = 168,  # 7 days
) -> List[Dict[str, Any]]:
    """Find jobs that haven't run in max_age_hours."""
    import time
    from datetime import datetime, timezone

    stale = []
    now = time.time()

    for job in jobs:
        last_run = job.get("last_run_at")
        if not last_run:
            # Never ran — check creation time
            created = job.get("created_at")
            if created:
                try:
                    dt = datetime.fromisoformat(created.replace("Z", "+00:00"))
                    age_hours = (now - dt.timestamp()) / 3600
                    if age_hours > max_age_hours:
                        stale.append(job)
                except Exception:
                    stale.append(job)
            else:
                stale.append(job)
        else:
            try:
                dt = datetime.fromisoformat(last_run.replace("Z", "+00:00"))
                age_hours = (now - dt.timestamp()) / 3600
                if age_hours > max_age_hours:
                    stale.append(job)
            except Exception:
                pass

    return stale


def format_job(job: Dict[str, Any]) -> str:
    """Format a job for display."""
    name = job.get("name", job.get("id", "?"))
    schedule = job.get("schedule_display", "?")
    repeat = job.get("repeat", {})
    completed = repeat.get("completed", 0)
    times = repeat.get("times")
    enabled = job.get("enabled", True)
    state = job.get("state", "unknown")
    last_run = job.get("last_run_at", "never")

    status = "enabled" if enabled else "disabled"
    if state == "paused":
        status = "paused"

    repeat_str = f"{completed}/{times}" if times else f"{completed}/∞"

    return f" {name:40s} | {schedule:20s} | done: {repeat_str:8s} | {status}"


def main():
    parser = argparse.ArgumentParser(description="Audit and clean up dead cron jobs")
    parser.add_argument("--disable", action="store_true", help="Disable dead jobs")
    parser.add_argument("--delete", action="store_true", help="Delete dead jobs")
    parser.add_argument("--threshold", type=int, default=0, help="Completion threshold (default: 0)")
    parser.add_argument("--older-than", type=str, help="Find jobs with no runs in N days (e.g., 7d)")
    parser.add_argument("--dry-run", action="store_true", help="Show what would change")
    args = parser.parse_args()

    jobs = load_jobs()
    if not jobs:
        print("No jobs found.")
        return

    print(f"Total jobs: {len(jobs)}")

    # Find dead jobs
    dead = find_dead_jobs(jobs, args.threshold)
    print(f"Jobs with <= {args.threshold} completions: {len(dead)}")

    if args.older_than:
        days = int(args.older_than.rstrip("d"))
        stale = find_stale_jobs(jobs, max_age_hours=days * 24)
        print(f"Jobs with no runs in {days} days: {len(stale)}")
        dead = list({j["id"]: j for j in dead + stale}.values())

    if not dead:
        print("No dead jobs found.")
        return

    print(f"\nDead jobs ({len(dead)}):")
    for job in dead:
        print(format_job(job))

    if args.disable:
        if args.dry_run:
            print(f"\nDRY RUN: Would disable {len(dead)} jobs")
            return

        job_ids = {j["id"] for j in dead}
        for job in jobs:
            if job["id"] in job_ids:
                job["enabled"] = False
                job["state"] = "disabled"

        save_jobs(jobs)
        print(f"\nDisabled {len(dead)} jobs.")

    elif args.delete:
        if args.dry_run:
            print(f"\nDRY RUN: Would delete {len(dead)} jobs")
            return

        job_ids = {j["id"] for j in dead}
        jobs = [j for j in jobs if j["id"] not in job_ids]
        save_jobs(jobs)
        print(f"\nDeleted {len(dead)} jobs.")

    else:
        print("\nUse --disable or --delete to take action. Add --dry-run to preview.")


if __name__ == "__main__":
    main()
@@ -1,129 +0,0 @@
#!/usr/bin/env bash
# gen_agent_cert.sh — Generate a TLS certificate for a fleet agent.
#
# Usage:
#   ./scripts/gen_agent_cert.sh --agent <name> [--ca-dir <dir>] [--out-dir <dir>]
#
# Known agents: timmy, allegro, ezra (case-insensitive; any name is accepted)
#
# Outputs (default: ~/.hermes/pki/agents/<name>/):
#   <name>.key — agent private key (chmod 600, stays on the agent host)
#   <name>.crt — agent certificate (signed by the fleet CA)
#
# Run gen_fleet_ca.sh first if you haven't already.
# Refs #806

set -euo pipefail

CERT_DAYS=365   # 1 year; rotate annually
KEY_BITS=2048

# ---------------------------------------------------------------------------
# Parse args
# ---------------------------------------------------------------------------
AGENT_NAME=""
CA_DIR="${HOME}/.hermes/pki/ca"
OUT_DIR=""

while [[ $# -gt 0 ]]; do
  case "$1" in
    --agent) AGENT_NAME="${2,,}"; shift 2 ;;  # lower-case
    --ca-dir) CA_DIR="$2"; shift 2 ;;
    --out-dir) OUT_DIR="$2"; shift 2 ;;
    -h|--help)
      echo "Usage: $0 --agent <name> [--ca-dir <dir>] [--out-dir <dir>]"
      echo "  Known agents: timmy, allegro, ezra"
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

if [[ -z "$AGENT_NAME" ]]; then
  echo "ERROR: --agent <name> is required." >&2
  exit 1
fi

OUT_DIR="${OUT_DIR:-${HOME}/.hermes/pki/agents/${AGENT_NAME}}"

# ---------------------------------------------------------------------------
# Prereq check
# ---------------------------------------------------------------------------
if ! command -v openssl &>/dev/null; then
  echo "ERROR: openssl not found." >&2
  exit 1
fi

CA_KEY="$CA_DIR/fleet-ca.key"
CA_CRT="$CA_DIR/fleet-ca.crt"

if [[ ! -f "$CA_KEY" || ! -f "$CA_CRT" ]]; then
  echo "ERROR: Fleet CA not found in $CA_DIR" >&2
  echo "       Run scripts/gen_fleet_ca.sh first." >&2
  exit 1
fi

mkdir -p "$OUT_DIR"
chmod 700 "$OUT_DIR"

AGENT_KEY="$OUT_DIR/${AGENT_NAME}.key"
AGENT_CRT="$OUT_DIR/${AGENT_NAME}.crt"
AGENT_CSR="$OUT_DIR/${AGENT_NAME}.csr"

if [[ -f "$AGENT_KEY" || -f "$AGENT_CRT" ]]; then
  echo "Cert for agent '$AGENT_NAME' already exists in $OUT_DIR"
  echo "  $AGENT_KEY"
  echo "  $AGENT_CRT"
  echo "Delete them manually if you want to regenerate."
  exit 0
fi

echo "Generating cert for agent '$AGENT_NAME' ..."

SUBJECT="/CN=${AGENT_NAME}.fleet.hermes/O=Hermes/OU=Fleet Agent"

# Agent private key
openssl genrsa -out "$AGENT_KEY" "$KEY_BITS" 2>/dev/null
chmod 600 "$AGENT_KEY"

# Certificate Signing Request
openssl req -new \
  -key "$AGENT_KEY" \
  -out "$AGENT_CSR" \
  -subj "$SUBJECT" 2>/dev/null

# Sign with fleet CA — include SAN so modern TLS stacks accept it
EXT_CONF=$(mktemp)
trap 'rm -f "$EXT_CONF" "$AGENT_CSR"' EXIT

cat > "$EXT_CONF" <<EOF
[v3_agent]
basicConstraints = CA:FALSE
keyUsage = critical, digitalSignature, keyEncipherment
extendedKeyUsage = clientAuth, serverAuth
subjectKeyIdentifier = hash
authorityKeyIdentifier = keyid,issuer
subjectAltName = DNS:${AGENT_NAME}.fleet.hermes, DNS:${AGENT_NAME}
EOF

openssl x509 -req \
  -in "$AGENT_CSR" \
  -CA "$CA_CRT" \
  -CAkey "$CA_KEY" \
  -CAcreateserial \
  -out "$AGENT_CRT" \
  -days "$CERT_DAYS" \
  -extfile "$EXT_CONF" \
  -extensions v3_agent 2>/dev/null

chmod 644 "$AGENT_CRT"

echo ""
echo "Agent cert generated:"
echo "  Private key : $AGENT_KEY"
echo "  Certificate : $AGENT_CRT"
echo ""
openssl x509 -in "$AGENT_CRT" -noout -subject -issuer -dates
@@ -1,83 +0,0 @@
#!/usr/bin/env bash
# gen_fleet_ca.sh — Generate the Hermes fleet Certificate Authority.
#
# Usage:
#   ./scripts/gen_fleet_ca.sh [--out-dir <dir>]
#
# Outputs (default: ~/.hermes/pki/ca/):
#   fleet-ca.key — CA private key (chmod 600, keep secret)
#   fleet-ca.crt — CA certificate (distribute to all fleet nodes)
#
# The CA is valid for 10 years. Regenerate + redistribute when it expires.
# Refs #806

set -euo pipefail

CA_SUBJECT="/CN=Hermes Fleet CA/O=Hermes/OU=Fleet"
CA_DAYS=3650   # 10 years

# ---------------------------------------------------------------------------
# Parse args
# ---------------------------------------------------------------------------
OUT_DIR="${HOME}/.hermes/pki/ca"

while [[ $# -gt 0 ]]; do
  case "$1" in
    --out-dir) OUT_DIR="$2"; shift 2 ;;
    -h|--help)
      echo "Usage: $0 [--out-dir <dir>]"
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

# ---------------------------------------------------------------------------
# Prereq check
# ---------------------------------------------------------------------------
if ! command -v openssl &>/dev/null; then
  echo "ERROR: openssl not found. Install OpenSSL and re-run." >&2
  exit 1
fi

mkdir -p "$OUT_DIR"
chmod 700 "$OUT_DIR"

CA_KEY="$OUT_DIR/fleet-ca.key"
CA_CRT="$OUT_DIR/fleet-ca.crt"

if [[ -f "$CA_KEY" || -f "$CA_CRT" ]]; then
  echo "Fleet CA already exists in $OUT_DIR"
  echo "  $CA_KEY"
  echo "  $CA_CRT"
  echo "Delete them manually if you want to regenerate."
  exit 0
fi

echo "Generating fleet CA in $OUT_DIR ..."

# Generate 4096-bit RSA key for the CA
openssl genrsa -out "$CA_KEY" 4096 2>/dev/null
chmod 600 "$CA_KEY"

# Self-sign the CA certificate
openssl req -new -x509 \
  -key "$CA_KEY" \
  -out "$CA_CRT" \
  -days "$CA_DAYS" \
  -subj "$CA_SUBJECT" \
  -addext "basicConstraints=critical,CA:TRUE,pathlen:0" \
  -addext "keyUsage=critical,keyCertSign,cRLSign" \
  -addext "subjectKeyIdentifier=hash" 2>/dev/null

chmod 644 "$CA_CRT"

echo ""
echo "Fleet CA generated successfully:"
echo "  Private key : $CA_KEY (keep secret)"
echo "  Certificate : $CA_CRT (distribute to all fleet nodes)"
echo ""
openssl x509 -in "$CA_CRT" -noout -subject -dates
@@ -1,278 +0,0 @@
#!/usr/bin/env python3
"""
Poka-yoke: Hardcoded path linter for hermes-agent.

Scans Python files for hardcoded home-directory paths that break
multi-user/multi-profile deployments. Catches:
- Path.home() / ".hermes" without HERMES_HOME env var fallback
- Hardcoded /Users/<name>/ paths
- Hardcoded /home/<name>/ paths
- Raw ~/.hermes in code (not in comments/docstrings)

Usage:
    python3 scripts/lint_hardcoded_paths.py            # lint all .py files
    python3 scripts/lint_hardcoded_paths.py --fix      # suggest fixes
    python3 scripts/lint_hardcoded_paths.py --staged   # lint git staged files only

Exit codes:
    0 = no violations
    1 = violations found
    2 = error
"""

import argparse
import os
import re
import subprocess
import sys
from pathlib import Path


# ── Patterns ──────────────────────────────────────────────────────

VIOLATIONS = [
    {
        "id": "direct-home-hermes",
        "name": "Direct Path.home()/.hermes",
        "pattern": r'Path\.home\(\)\s*/\s*["\']\.hermes["\']',
        "exclude_with": r'os\.getenv\(|os\.environ\.get\(|_get_profiles_root|profiles_parent|current_default|native_home',
        "message": "Use `Path(os.getenv('HERMES_HOME', Path.home() / '.hermes'))` instead of direct `Path.home() / '.hermes'`",
    },
    {
        "id": "hardcoded-user-path",
        "name": "Hardcoded /Users/<name>/",
        "pattern": r'["\']/Users/[a-zA-Z_][a-zA-Z0-9_]*/',
        "exclude_with": r'#|""".*"""\s*$',
        "message": "Use environment variables or relative paths instead of hardcoded /Users/<name>/",
    },
    {
        "id": "hardcoded-home-path",
        "name": "Hardcoded /home/<name>/",
        "pattern": r'["\']/home/[a-zA-Z_][a-zA-Z0-9_]*/',
        "exclude_with": r'#|""".*"""\s*$',
        "message": "Use environment variables or relative paths instead of hardcoded /home/<name>/",
    },
    {
        "id": "expanduser-hermes",
        "name": "os.path.expanduser ~/.hermes (non-fallback)",
        "pattern": r'os\.path\.expanduser\(["\']~/.hermes',
        "exclude_with": r'#',
        "message": "Use `os.environ.get('HERMES_HOME', os.path.expanduser('~/.hermes'))` instead",
    },
]


# ── Exceptions ─────────────────────────────────────────────────────
# Files where hardcoded paths are acceptable (tests with mock data,
# migration scripts, docs generation)

EXCEPTIONS = [
    "tests/",            # Test fixtures can use mock paths
    "scripts/",          # One-off scripts
    "optional-skills/",  # Skills not in core
    "skills/",           # External skills
    "plugins/",          # Plugins
    "website/",          # Docs site
    "mcp_serve.py",      # Standalone MCP server
    "docs/",             # Documentation
]


# ── Scanner ────────────────────────────────────────────────────────

def is_exception(filepath: str) -> bool:
    """Check if file is in the exception list."""
    for exc in EXCEPTIONS:
        if filepath.startswith(exc) or f"/{exc}" in filepath:
            return True
    return False


def is_in_comment_or_docstring(line: str, lines: list, line_idx: int) -> bool:
    """Check if the match is in a comment or docstring."""
    stripped = line.strip()

    # Line comment
    if stripped.startswith("#"):
        return True

    # Inline comment — check if match is after #
    if "#" in line:
        code_part = line[:line.index("#")]
        for v in VIOLATIONS:
            if re.search(v["pattern"], code_part):
                return False  # Match is in code, not comment
        return True  # No match in code part, must be in comment

    # Simple docstring check: look for triple quotes before this line
    in_docstring = False
    quote_count = 0
    for i in range(max(0, line_idx - 20), line_idx + 1):
        for char in ['"""', "'''"]:
            quote_count += lines[i].count(char)
    if quote_count % 2 == 1:
        in_docstring = True

    # Also check current line for docstring delimiters
    if '"""' in line or "'''" in line:
        # If line is entirely within a docstring block, skip
        before_match = line[:line.find(re.search(VIOLATIONS[0]["pattern"], line).group())] if re.search(VIOLATIONS[0]["pattern"], line) else ""
        if '"""' in before_match or "'''" in before_match:
            in_docstring = True

    return in_docstring


def scan_file(filepath: str) -> list:
    """Scan a single file for violations."""
    try:
        with open(filepath) as f:
            content = f.read()
        lines = content.split("\n")
    except (OSError, UnicodeDecodeError):
        return []

    violations_found = []

    for i, line in enumerate(lines):
        for v in VIOLATIONS:
            match = re.search(v["pattern"], line)
            if not match:
                continue

            # Check if excluded by context (e.g., it's part of a fallback pattern)
            if v.get("exclude_with"):
                if re.search(v["exclude_with"], line):
                    continue

            # Skip comments and docstrings
            stripped = line.strip()
            if stripped.startswith("#"):
                continue

            # Check if in inline comment
            if "#" in line:
                code_part = line[:line.index("#")]
                if not re.search(v["pattern"], code_part):
                    continue

            violations_found.append({
                "file": filepath,
                "line": i + 1,
                "rule": v["id"],
                "name": v["name"],
                "message": v["message"],
                "text": stripped[:120],
            })

    return violations_found


def get_staged_files() -> list:
    """Get list of staged Python files from git."""
    try:
        result = subprocess.run(
            ["git", "diff", "--cached", "--name-only", "--diff-filter=ACM"],
            capture_output=True, text=True, timeout=10
        )
        return [f for f in result.stdout.strip().split("\n") if f.endswith(".py")]
    except (subprocess.TimeoutExpired, FileNotFoundError):
        return []


def scan_all(root: str = ".") -> list:
    """Scan all Python files in the repo."""
    all_violations = []
    for dirpath, dirnames, filenames in os.walk(root):
        dirnames[:] = [d for d in dirnames if d not in (".git", "venv", "__pycache__", "node_modules")]
        for f in filenames:
            if not f.endswith(".py"):
                continue
            filepath = os.path.join(dirpath, f)
            rel = os.path.relpath(filepath, root)

            if is_exception(rel):
                continue

            all_violations.extend(scan_file(filepath))

    return all_violations


# ── Output ─────────────────────────────────────────────────────────

def print_violations(violations: list) -> None:
    """Print violations in a readable format."""
    if not violations:
        print("PASS: No hardcoded path violations found")
        return

    print(f"FAIL: {len(violations)} hardcoded path violation(s) found\n")

    by_rule = {}
    for v in violations:
        by_rule.setdefault(v["rule"], []).append(v)

    for rule, items in sorted(by_rule.items()):
        print(f"  [{rule}] {items[0]['name']}")
        print(f"  {items[0]['message']}")
        for item in items:
            print(f"    {item['file']}:{item['line']}: {item['text']}")
        print()


def print_fix_suggestions(violations: list) -> None:
    """Print fix suggestions for violations."""
    if not violations:
        return

    print("\n=== Fix Suggestions ===\n")

    for v in violations:
        print(f"  {v['file']}:{v['line']}")
        print(f"    Current: {v['text']}")

        if v["rule"] == "direct-home-hermes":
            print(f"    Fix: Use `Path(os.getenv('HERMES_HOME', Path.home() / '.hermes'))`")
        elif v["rule"] in ("hardcoded-user-path", "hardcoded-home-path"):
            print(f"    Fix: Use `os.environ.get('HOME')` or `Path.home()`")
        elif v["rule"] == "expanduser-hermes":
            print(f"    Fix: Use `os.environ.get('HERMES_HOME', os.path.expanduser('~/.hermes'))`")
        print()


# ── Main ───────────────────────────────────────────────────────────

def main():
    parser = argparse.ArgumentParser(description="Lint hardcoded paths in hermes-agent")
    parser.add_argument("--staged", action="store_true", help="Only scan git staged files")
    parser.add_argument("--fix", action="store_true", help="Show fix suggestions")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    parser.add_argument("--root", default=".", help="Root directory to scan")
    args = parser.parse_args()

    if args.staged:
        files = get_staged_files()
        if not files:
            print("No staged Python files")
            sys.exit(0)
        violations = []
        for f in files:
            if not is_exception(f):
                violations.extend(scan_file(f))
    else:
        violations = scan_all(args.root)

    if args.json:
        import json
        print(json.dumps(violations, indent=2))
    else:
        print_violations(violations)
        if args.fix:
            print_fix_suggestions(violations)

    sys.exit(1 if violations else 0)


if __name__ == "__main__":
    main()
@@ -1,265 +0,0 @@
#!/usr/bin/env python3
"""Hermes MCP Server — expose hermes-agent tools to fleet peers.

Runs as a standalone MCP server that other agents can connect to
and invoke hermes tools remotely.

Safe tools exposed:
- terminal (safe commands only)
- file_read, file_search
- web_search, web_extract
- session_search

NOT exposed (internal tools):
- approval, delegate, memory, config

Usage:
    python -m tools.mcp_server --port 8081
    hermes mcp-server --port 8081
    python scripts/mcp_server.py --port 8081 --auth-key SECRET
"""

from __future__ import annotations

import argparse
import asyncio
import json
import logging
import os
import sys
import time
from pathlib import Path
from typing import Any, Dict, List, Optional

logger = logging.getLogger(__name__)

# Tools safe to expose to other agents
SAFE_TOOLS = {
    "terminal": {
        "name": "terminal",
        "description": "Execute safe shell commands. Dangerous commands are blocked.",
        "parameters": {
            "type": "object",
            "properties": {
                "command": {"type": "string", "description": "Shell command to execute"},
            },
            "required": ["command"],
        },
    },
    "file_read": {
        "name": "file_read",
        "description": "Read the contents of a file.",
        "parameters": {
            "type": "object",
            "properties": {
                "path": {"type": "string", "description": "File path to read"},
                "offset": {"type": "integer", "description": "Start line", "default": 1},
                "limit": {"type": "integer", "description": "Max lines", "default": 200},
            },
            "required": ["path"],
        },
    },
    "file_search": {
        "name": "file_search",
        "description": "Search file contents using regex.",
        "parameters": {
            "type": "object",
            "properties": {
                "pattern": {"type": "string", "description": "Regex pattern"},
                "path": {"type": "string", "description": "Directory to search", "default": "."},
            },
            "required": ["pattern"],
        },
    },
    "web_search": {
        "name": "web_search",
        "description": "Search the web for information.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {"type": "string", "description": "Search query"},
            },
            "required": ["query"],
        },
    },
    "session_search": {
        "name": "session_search",
        "description": "Search past conversation sessions.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {"type": "string", "description": "Search query"},
                "limit": {"type": "integer", "description": "Max results", "default": 3},
            },
            "required": ["query"],
        },
    },
}

# Tools explicitly blocked
BLOCKED_TOOLS = {
    "approval", "delegate", "memory", "config", "skill_install",
    "mcp_tool", "cronjob", "tts", "send_message",
}


class MCPServer:
    """Simple MCP-compatible server for exposing hermes tools."""

    def __init__(self, host: str = "127.0.0.1", port: int = 8081,
                 auth_key: Optional[str] = None):
        self._host = host
        self._port = port
        self._auth_key = auth_key or os.getenv("MCP_AUTH_KEY", "")

    async def handle_tools_list(self, request: dict) -> dict:
        """Return available tools."""
        tools = list(SAFE_TOOLS.values())
        return {"tools": tools}

    async def handle_tools_call(self, request: dict) -> dict:
        """Execute a tool call."""
        tool_name = request.get("name", "")
        arguments = request.get("arguments", {})

        if tool_name in BLOCKED_TOOLS:
            return {"error": f"Tool '{tool_name}' is not exposed via MCP"}
        if tool_name not in SAFE_TOOLS:
            return {"error": f"Unknown tool: {tool_name}"}

        try:
            result = await self._execute_tool(tool_name, arguments)
            return {"content": [{"type": "text", "text": str(result)}]}
        except Exception as e:
            return {"error": str(e)}

    async def _execute_tool(self, tool_name: str, arguments: dict) -> str:
        """Execute a tool and return result."""
        if tool_name == "terminal":
            import subprocess
            cmd = arguments.get("command", "")
            # Block dangerous commands
            from tools.approval import detect_dangerous_command
            is_dangerous, _, desc = detect_dangerous_command(cmd)
            if is_dangerous:
                return f"BLOCKED: Dangerous command detected ({desc}). This tool only executes safe commands."
            result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=30)
            return result.stdout or result.stderr or "(no output)"

        elif tool_name == "file_read":
            path = arguments.get("path", "")
            offset = arguments.get("offset", 1)
            limit = arguments.get("limit", 200)
            with open(path) as f:
                lines = f.readlines()
            return "".join(lines[offset-1:offset-1+limit])

        elif tool_name == "file_search":
            import re
            pattern = arguments.get("pattern", "")
            path = arguments.get("path", ".")
            results = []
            for p in Path(path).rglob("*.py"):
                try:
                    content = p.read_text()
                    for i, line in enumerate(content.split("\n"), 1):
                        if re.search(pattern, line, re.IGNORECASE):
                            results.append(f"{p}:{i}: {line.strip()}")
                            if len(results) >= 20:
                                break
                except Exception:
                    continue
                if len(results) >= 20:
                    break
            return "\n".join(results) or "No matches found"

        elif tool_name == "web_search":
            try:
                from tools.web_tools import web_search
                return web_search(arguments.get("query", ""))
            except ImportError:
                return "Web search not available"

        elif tool_name == "session_search":
            try:
                from tools.session_search_tool import session_search
                return session_search(
                    query=arguments.get("query", ""),
                    limit=arguments.get("limit", 3),
                )
            except ImportError:
                return "Session search not available"

        return f"Tool {tool_name} not implemented"

    async def start_http(self):
        """Start HTTP server for MCP endpoints."""
        try:
            from aiohttp import web
        except ImportError:
            logger.error("aiohttp required: pip install aiohttp")
            return

        app = web.Application()

        async def handle_tools_list_route(request):
            if self._auth_key:
                auth = request.headers.get("Authorization", "")
                if auth != f"Bearer {self._auth_key}":
                    return web.json_response({"error": "Unauthorized"}, status=401)
            result = await self.handle_tools_list({})
            return web.json_response(result)

        async def handle_tools_call_route(request):
            if self._auth_key:
                auth = request.headers.get("Authorization", "")
                if auth != f"Bearer {self._auth_key}":
                    return web.json_response({"error": "Unauthorized"}, status=401)
            body = await request.json()
            result = await self.handle_tools_call(body)
            return web.json_response(result)

        async def handle_health(request):
|
||||
return web.json_response({"status": "ok", "tools": len(SAFE_TOOLS)})
|
||||
|
||||
app.router.add_get("/mcp/tools", handle_tools_list_route)
|
||||
app.router.add_post("/mcp/tools/call", handle_tools_call_route)
|
||||
app.router.add_get("/health", handle_health)
|
||||
|
||||
runner = web.AppRunner(app)
|
||||
await runner.setup()
|
||||
site = web.TCPSite(runner, self._host, self._port)
|
||||
await site.start()
|
||||
logger.info("MCP server on http://%s:%s", self._host, self._port)
|
||||
logger.info("Tools: %s", ", ".join(SAFE_TOOLS.keys()))
|
||||
if self._auth_key:
|
||||
logger.info("Auth: Bearer token required")
|
||||
else:
|
||||
logger.warning("Auth: No MCP_AUTH_KEY set — server is open")
|
||||
|
||||
try:
|
||||
await asyncio.Event().wait()
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
finally:
|
||||
await runner.cleanup()
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Hermes MCP Server")
|
||||
parser.add_argument("--host", default="127.0.0.1")
|
||||
parser.add_argument("--port", type=int, default=8081)
|
||||
parser.add_argument("--auth-key", default=None, help="Bearer token for auth")
|
||||
args = parser.parse_args()
|
||||
|
||||
logging.basicConfig(level=logging.INFO,
|
||||
format="%(asctime)s [%(name)s] %(levelname)s: %(message)s")
|
||||
|
||||
server = MCPServer(host=args.host, port=args.port, auth_key=args.auth_key)
|
||||
print(f"Starting MCP server on http://{args.host}:{args.port}")
|
||||
print(f"Exposed tools: {', '.join(SAFE_TOOLS.keys())}")
|
||||
asyncio.run(server.start_http())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
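For reference, a minimal client round-trip against the endpoints registered above. This is a sketch only; the base URL, auth key, and tool arguments are illustrative values, not part of the original module:

# Sketch of a client round-trip; BASE, the auth key, and the tool arguments
# are illustrative values.
import json
import urllib.request

BASE = "http://127.0.0.1:8081"  # matches the --host/--port defaults in main()
HEADERS = {
    "Authorization": "Bearer SECRET",  # needed only when --auth-key/MCP_AUTH_KEY is set
    "Content-Type": "application/json",
}

# GET /mcp/tools lists the SAFE_TOOLS schemas
req = urllib.request.Request(f"{BASE}/mcp/tools", headers=HEADERS)
with urllib.request.urlopen(req, timeout=10) as resp:
    print(json.loads(resp.read())["tools"])

# POST /mcp/tools/call executes one tool; payload shape matches handle_tools_call()
payload = {"name": "file_read", "arguments": {"path": "README.md", "limit": 10}}
req = urllib.request.Request(
    f"{BASE}/mcp/tools/call",
    data=json.dumps(payload).encode(),
    headers=HEADERS,
    method="POST",
)
with urllib.request.urlopen(req, timeout=10) as resp:
    print(json.loads(resp.read()))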
@@ -1,7 +0,0 @@
#!/bin/sh
# Pre-commit hook: block commits with hardcoded home-directory paths
# Install: cp scripts/pre-commit-hardcoded-paths.sh .git/hooks/pre-commit && chmod +x .git/hooks/pre-commit
# Or: git config core.hooksPath .githooks

python3 scripts/lint_hardcoded_paths.py --staged
exit $?
@@ -1,147 +0,0 @@
#!/usr/bin/env python3
"""
Queue Health Check — Verify dispatch queue is operational.

Checks:
  1. Queue file exists and is readable
  2. Queue has pending items
  3. Queue is not stuck (items processing)
  4. Queue age (stale items)

Usage:
    python scripts/queue_health_check.py
    python scripts/queue_health_check.py --json
"""

import json
import sys
from datetime import datetime, timedelta, timezone
from pathlib import Path


def check_queue_health(queue_path: str = "~/.hermes/queue.json") -> dict:
    """Check queue health status."""
    path = Path(queue_path).expanduser()

    result = {
        "healthy": True,
        "checks": {},
        "warnings": [],
        "errors": []
    }

    # Check 1: File exists
    if not path.exists():
        result["healthy"] = False
        result["errors"].append(f"Queue file not found: {path}")
        result["checks"]["file_exists"] = False
        return result

    result["checks"]["file_exists"] = True

    # Check 2: File is readable
    try:
        with open(path, "r") as f:
            data = json.load(f)
    except Exception as e:
        result["healthy"] = False
        result["errors"].append(f"Cannot read queue: {e}")
        result["checks"]["readable"] = False
        return result

    result["checks"]["readable"] = True

    # Check 3: Queue structure
    if not isinstance(data, dict):
        result["healthy"] = False
        result["errors"].append("Queue is not a dict")
        result["checks"]["valid_structure"] = False
        return result

    result["checks"]["valid_structure"] = True

    # Check 4: Pending items
    pending = data.get("pending", [])
    processing = data.get("processing", [])
    completed = data.get("completed", [])

    result["checks"]["pending_count"] = len(pending)
    result["checks"]["processing_count"] = len(processing)
    result["checks"]["completed_count"] = len(completed)

    if len(pending) == 0 and len(processing) == 0:
        result["warnings"].append("Queue is empty")

    # Check 5: Stale processing items
    # Use an aware "now": the ISO timestamps below become tz-aware after the
    # Z -> +00:00 rewrite, and comparing naive with aware datetimes raises.
    now = datetime.now(timezone.utc)
    stale_threshold = timedelta(hours=1)

    for item in processing:
        started = item.get("started_at")
        if started:
            try:
                started_time = datetime.fromisoformat(started.replace("Z", "+00:00"))
                if now - started_time > stale_threshold:
                    result["warnings"].append(f"Stale item: {item.get('id', 'unknown')} (started {started})")
            except (ValueError, TypeError):
                pass

    # Check 6: Queue age
    if pending:
        oldest = min(pending, key=lambda x: x.get("added_at", ""))
        added = oldest.get("added_at")
        if added:
            try:
                added_time = datetime.fromisoformat(added.replace("Z", "+00:00"))
                age = now - added_time
                if age > timedelta(hours=24):
                    result["warnings"].append(f"Old item in queue: {oldest.get('id', 'unknown')} (added {added})")
            except (ValueError, TypeError):
                pass

    return result


def main():
    """Main function."""
    import argparse

    parser = argparse.ArgumentParser(description="Queue health check")
    parser.add_argument("--queue", default="~/.hermes/queue.json", help="Queue file path")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    args = parser.parse_args()

    result = check_queue_health(args.queue)

    if args.json:
        print(json.dumps(result, indent=2))
    else:
        print("Queue Health Check")
        print("=" * 50)
        print(f"Healthy: {'✓' if result['healthy'] else '✗'}")
        print()

        print("Checks:")
        for check, value in result["checks"].items():
            if isinstance(value, bool):
                print(f"  {check}: {'✓' if value else '✗'}")
            else:
                print(f"  {check}: {value}")

        if result["warnings"]:
            print()
            print("Warnings:")
            for warning in result["warnings"]:
                print(f"  ⚠ {warning}")

        if result["errors"]:
            print()
            print("Errors:")
            for error in result["errors"]:
                print(f"  ✗ {error}")

    sys.exit(0 if result["healthy"] else 1)


if __name__ == "__main__":
    main()
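For context, a sketch that seeds a queue file check_queue_health() accepts. The shape mirrors the fields the checker reads (pending/processing/completed lists, an id per item, ISO-8601 added_at/started_at); the ids and timestamps are illustrative:

# Sketch: seed a queue file that check_queue_health() accepts. The shape is
# inferred from the fields the checker reads; ids and timestamps are illustrative.
import json
from pathlib import Path

queue = {
    "pending": [{"id": "task-42", "added_at": "2026-04-12T09:00:00Z"}],
    "processing": [{"id": "task-41", "started_at": "2026-04-12T09:05:00Z"}],
    "completed": [],
}
path = Path("~/.hermes/queue.json").expanduser()
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(queue, indent=2))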
@@ -1,145 +0,0 @@
#!/usr/bin/env python3
"""
time-aware-model-router.py — Route cron jobs to better models during high-error hours.

Empirical finding (audit 2026-04-12): Error rate peaks at 18:00 (9.4%) during
evening cron batches vs 4.0% at 09:00 during interactive work.

This script provides a model resolver that selects a more capable model during
high-error hours (17:00-22:00) and the default model otherwise.

Usage:
    # As a standalone resolver
    python3 scripts/time-aware-model-router.py
    # Returns: {"provider": "nous", "model": "xiaomi/mimo-v2-pro"}

    # With hour override for testing
    python3 scripts/time-aware-model-router.py --hour 18
    # Returns: {"provider": "openrouter", "model": "anthropic/claude-sonnet-4"}

    # As a cron job wrapper
    python3 scripts/time-aware-model-router.py --wrap -- prompt goes here

Environment variables:
    HERMES_DEFAULT_PROVIDER: Default provider for normal hours (default: nous)
    HERMES_DEFAULT_MODEL: Default model for normal hours (default: xiaomi/mimo-v2-pro)
    HERMES_PEAK_PROVIDER: Provider for high-error hours (default: openrouter)
    HERMES_PEAK_MODEL: Model for high-error hours (default: anthropic/claude-sonnet-4)
    HERMES_PEAK_HOURS: Comma-separated hours for peak routing (default: 17,18,19,20,21,22)

Refs: hermes-agent#889
"""

import json
import os
import sys
from datetime import datetime

# ── Config ──────────────────────────────────────────────────────────────────

DEFAULT_PROVIDER = os.environ.get("HERMES_DEFAULT_PROVIDER", "nous")
DEFAULT_MODEL = os.environ.get("HERMES_DEFAULT_MODEL", "xiaomi/mimo-v2-pro")
PEAK_PROVIDER = os.environ.get("HERMES_PEAK_PROVIDER", "openrouter")
PEAK_MODEL = os.environ.get("HERMES_PEAK_MODEL", "anthropic/claude-sonnet-4")
PEAK_HOURS = set(int(h) for h in os.environ.get("HERMES_PEAK_HOURS", "17,18,19,20,21,22").split(","))

# ── Time-aware routing ─────────────────────────────────────────────────────

def get_current_hour():
    """Get the current local hour (0-23)."""
    return datetime.now().hour


def is_peak_hour(hour=None):
    """Check if the given hour (or current hour) is a high-error period."""
    if hour is None:
        hour = get_current_hour()
    return hour in PEAK_HOURS


def resolve_model(hour=None):
    """
    Resolve which model to use based on time of day.

    Returns dict with 'provider' and 'model' keys.
    During peak hours (high error rate), uses a more capable model.
    During normal hours, uses the default model.
    """
    if is_peak_hour(hour):
        return {
            "provider": PEAK_PROVIDER,
            "model": PEAK_MODEL,
            "reason": f"peak_hour ({hour if hour is not None else get_current_hour()}:00)",
            "confidence_note": "Using stronger model during high-error period"
        }
    else:
        return {
            "provider": DEFAULT_PROVIDER,
            "model": DEFAULT_MODEL,
            "reason": "normal_hour",
            "confidence_note": "Default model sufficient during low-error period"
        }


def get_routing_info():
    """Get full routing info including current state and config."""
    hour = get_current_hour()
    resolved = resolve_model(hour)
    return {
        "current_hour": hour,
        "is_peak": is_peak_hour(hour),
        "peak_hours": sorted(PEAK_HOURS),
        "routing": resolved,
        "config": {
            "default": {"provider": DEFAULT_PROVIDER, "model": DEFAULT_MODEL},
            "peak": {"provider": PEAK_PROVIDER, "model": PEAK_MODEL},
        },
        "source": "hermes-agent#889 — empirical audit 2026-04-12",
    }


# ── CLI ─────────────────────────────────────────────────────────────────────

def main():
    args = sys.argv[1:]

    # Parse --hour
    hour = None
    if "--hour" in args:
        idx = args.index("--hour")
        if idx + 1 < len(args):
            hour = int(args[idx + 1])

    # Parse --wrap mode
    if "--wrap" in args:
        # Run the remaining args as a command with model override
        resolved = resolve_model(hour)
        wrap_idx = args.index("--wrap")
        cmd_parts = args[wrap_idx + 1:]

        # Inject model/provider into environment
        env = os.environ.copy()
        env["HERMES_MODEL"] = resolved["model"]
        env["HERMES_PROVIDER"] = resolved["provider"]

        if cmd_parts:
            import subprocess
            result = subprocess.run(cmd_parts, env=env)
            sys.exit(result.returncode)
        else:
            print(json.dumps(resolved, indent=2))
            sys.exit(0)

    # Parse --info mode
    if "--info" in args:
        print(json.dumps(get_routing_info(), indent=2))
        sys.exit(0)

    # Default: output resolved model as JSON
    resolved = resolve_model(hour)
    print(json.dumps(resolved, indent=2))


if __name__ == "__main__":
    main()
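A quick way to exercise the hour boundaries from the CLI. This sketch assumes the script path from the usage notes and the default peak hours (17-22):

# Sketch: probe the routing boundaries via the CLI (path and hours per the
# docstring defaults; adjust if HERMES_PEAK_HOURS is overridden).
import json
import subprocess

for hour, expect_peak in ((16, False), (17, True), (22, True), (23, False)):
    out = subprocess.run(
        ["python3", "scripts/time-aware-model-router.py", "--hour", str(hour)],
        capture_output=True, text=True, check=True,
    ).stdout
    resolved = json.loads(out)
    assert resolved["reason"].startswith("peak_hour") == expect_peak, (hour, resolved)
    print(hour, "->", resolved["provider"], resolved["model"])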
@@ -1,574 +0,0 @@
"""
Tests for A2A mutual-TLS authentication.

Scenarios covered:
  - authorized agent (valid fleet-CA-signed cert) is accepted
  - unauthorized agent (self-signed cert) is rejected with SSLError
  - missing client cert is rejected
  - build_server_ssl_context raises FileNotFoundError for missing paths
  - build_client_ssl_context raises FileNotFoundError for missing paths
  - A2AServer.start() / stop() lifecycle (no network I/O)

All TLS I/O is done in-process against a loopback server so no ports need
to be opened on a CI runner.

Refs #806
"""

from __future__ import annotations

import datetime
import ipaddress
import json
import ssl
import threading
import time
import urllib.error
import urllib.request
from pathlib import Path
from typing import Tuple

import pytest

# ---------------------------------------------------------------------------
# Helpers — generate self-signed certs in-memory with Python's ``cryptography``
# library (dev extra). If cryptography is unavailable we skip the network
# tests gracefully.
# ---------------------------------------------------------------------------

try:
    from cryptography import x509
    from cryptography.hazmat.primitives import hashes, serialization
    from cryptography.hazmat.primitives.asymmetric import rsa
    from cryptography.x509.oid import ExtendedKeyUsageOID, NameOID
    _CRYPTO_AVAILABLE = True
except ImportError:
    _CRYPTO_AVAILABLE = False

_requires_crypto = pytest.mark.skipif(
    not _CRYPTO_AVAILABLE,
    reason="cryptography package not installed",
)

# ---------------------------------------------------------------------------
# Fixture helpers
# ---------------------------------------------------------------------------

def _make_ca_keypair(tmp_path: Path) -> Tuple[Path, Path]:
    """Generate a self-signed CA cert+key and write to *tmp_path*."""
    key = rsa.generate_private_key(public_exponent=65537, key_size=2048)
    name = x509.Name([
        x509.NameAttribute(NameOID.COMMON_NAME, "Test Fleet CA"),
        x509.NameAttribute(NameOID.ORGANIZATION_NAME, "TestOrg"),
    ])
    now = datetime.datetime.now(datetime.timezone.utc)
    cert = (
        x509.CertificateBuilder()
        .subject_name(name)
        .issuer_name(name)
        .public_key(key.public_key())
        .serial_number(x509.random_serial_number())
        .not_valid_before(now)
        .not_valid_after(now + datetime.timedelta(days=3650))
        .add_extension(x509.BasicConstraints(ca=True, path_length=0), critical=True)
        .add_extension(
            x509.KeyUsage(
                digital_signature=False, key_cert_sign=True, crl_sign=True,
                content_commitment=False, key_encipherment=False,
                data_encipherment=False, key_agreement=False,
                encipher_only=False, decipher_only=False,
            ),
            critical=True,
        )
        .sign(key, hashes.SHA256())
    )
    key_path = tmp_path / "ca.key"
    cert_path = tmp_path / "ca.crt"
    key_path.write_bytes(key.private_bytes(
        serialization.Encoding.PEM,
        serialization.PrivateFormat.TraditionalOpenSSL,
        serialization.NoEncryption(),
    ))
    cert_path.write_bytes(cert.public_bytes(serialization.Encoding.PEM))
    return cert_path, key_path


def _make_agent_keypair(
    tmp_path: Path,
    name: str,
    ca_cert_path: Path,
    ca_key_path: Path,
) -> Tuple[Path, Path]:
    """Generate an agent cert signed by the test CA."""
    ca_cert = x509.load_pem_x509_certificate(ca_cert_path.read_bytes())
    ca_key = serialization.load_pem_private_key(
        ca_key_path.read_bytes(), password=None
    )

    key = rsa.generate_private_key(public_exponent=65537, key_size=2048)
    subject = x509.Name([
        x509.NameAttribute(NameOID.COMMON_NAME, f"{name}.fleet.hermes"),
        x509.NameAttribute(NameOID.ORGANIZATION_NAME, "TestOrg"),
    ])
    now = datetime.datetime.now(datetime.timezone.utc)
    cert = (
        x509.CertificateBuilder()
        .subject_name(subject)
        .issuer_name(ca_cert.subject)
        .public_key(key.public_key())
        .serial_number(x509.random_serial_number())
        .not_valid_before(now)
        .not_valid_after(now + datetime.timedelta(days=365))
        .add_extension(x509.BasicConstraints(ca=False, path_length=None), critical=True)
        .add_extension(
            x509.SubjectAlternativeName([
                x509.DNSName(f"{name}.fleet.hermes"),
                x509.DNSName(name),
                x509.IPAddress(ipaddress.IPv4Address("127.0.0.1")),
            ]),
            critical=False,
        )
        .add_extension(
            x509.ExtendedKeyUsage([
                ExtendedKeyUsageOID.CLIENT_AUTH,
                ExtendedKeyUsageOID.SERVER_AUTH,
            ]),
            critical=False,
        )
        .sign(ca_key, hashes.SHA256())
    )
    key_path = tmp_path / f"{name}.key"
    cert_path = tmp_path / f"{name}.crt"
    key_path.write_bytes(key.private_bytes(
        serialization.Encoding.PEM,
        serialization.PrivateFormat.TraditionalOpenSSL,
        serialization.NoEncryption(),
    ))
    cert_path.write_bytes(cert.public_bytes(serialization.Encoding.PEM))
    return cert_path, key_path


def _make_self_signed_keypair(tmp_path: Path, name: str) -> Tuple[Path, Path]:
    """Generate a self-signed cert NOT signed by the test CA (unauthorized)."""
    key = rsa.generate_private_key(public_exponent=65537, key_size=2048)
    subject = x509.Name([
        x509.NameAttribute(NameOID.COMMON_NAME, f"{name}.rogue"),
    ])
    now = datetime.datetime.now(datetime.timezone.utc)
    cert = (
        x509.CertificateBuilder()
        .subject_name(subject)
        .issuer_name(subject)
        .public_key(key.public_key())
        .serial_number(x509.random_serial_number())
        .not_valid_before(now)
        .not_valid_after(now + datetime.timedelta(days=365))
        .add_extension(x509.BasicConstraints(ca=False, path_length=None), critical=True)
        .add_extension(
            x509.SubjectAlternativeName([x509.IPAddress(ipaddress.IPv4Address("127.0.0.1"))]),
            critical=False,
        )
        .sign(key, hashes.SHA256())
    )
    key_path = tmp_path / f"{name}_rogue.key"
    cert_path = tmp_path / f"{name}_rogue.crt"
    key_path.write_bytes(key.private_bytes(
        serialization.Encoding.PEM,
        serialization.PrivateFormat.TraditionalOpenSSL,
        serialization.NoEncryption(),
    ))
    cert_path.write_bytes(cert.public_bytes(serialization.Encoding.PEM))
    return cert_path, key_path


# ---------------------------------------------------------------------------
# Unit tests — no network I/O
# ---------------------------------------------------------------------------

class TestBuildSslContextErrors:
    def test_server_context_missing_cert(self, tmp_path):
        from agent.a2a_mtls import build_server_ssl_context
        with pytest.raises(FileNotFoundError, match="mTLS"):
            build_server_ssl_context(
                cert=tmp_path / "nope.crt",
                key=tmp_path / "nope.key",
                ca=tmp_path / "nope.crt",
            )

    def test_client_context_missing_cert(self, tmp_path):
        from agent.a2a_mtls import build_client_ssl_context
        with pytest.raises(FileNotFoundError, match="mTLS client"):
            build_client_ssl_context(
                cert=tmp_path / "nope.crt",
                key=tmp_path / "nope.key",
                ca=tmp_path / "nope.crt",
            )

    @_requires_crypto
    def test_server_context_builds_with_valid_certs(self, tmp_path):
        from agent.a2a_mtls import build_server_ssl_context
        ca_dir = tmp_path / "ca"
        ca_dir.mkdir()
        ca_crt, ca_key = _make_ca_keypair(ca_dir)
        srv_crt, srv_key = _make_agent_keypair(
            tmp_path, "srv", ca_crt, ca_key
        )
        ctx = build_server_ssl_context(cert=srv_crt, key=srv_key, ca=ca_crt)
        assert isinstance(ctx, ssl.SSLContext)
        assert ctx.verify_mode == ssl.CERT_REQUIRED

    @_requires_crypto
    def test_client_context_builds_with_valid_certs(self, tmp_path):
        from agent.a2a_mtls import build_client_ssl_context
        ca_dir = tmp_path / "ca"
        ca_dir.mkdir()
        ca_crt, ca_key = _make_ca_keypair(ca_dir)
        cli_crt, cli_key = _make_agent_keypair(
            tmp_path, "cli", ca_crt, ca_key
        )
        ctx = build_client_ssl_context(cert=cli_crt, key=cli_key, ca=ca_crt)
        assert isinstance(ctx, ssl.SSLContext)
        assert ctx.verify_mode == ssl.CERT_REQUIRED


# ---------------------------------------------------------------------------
# Integration tests — loopback mTLS server
# ---------------------------------------------------------------------------

def _find_free_port() -> int:
    import socket
    with socket.socket() as s:
        s.bind(("127.0.0.1", 0))
        return s.getsockname()[1]


def _https_get(url: str, ssl_ctx: ssl.SSLContext) -> int:
    """Return the HTTP status code for a GET request, or raise SSLError."""
    req = urllib.request.urlopen(url, context=ssl_ctx, timeout=5)
    return req.status


@_requires_crypto
class TestMutualTLSAuth:
    """End-to-end mTLS auth over a loopback connection."""

    @pytest.fixture(autouse=True)
    def _pki(self, tmp_path):
        """Set up a fleet CA and agent certs for timmy (server) and allegro (authorized client)."""
        ca_dir = tmp_path / "ca"
        ca_dir.mkdir()
        self.ca_crt, self.ca_key = _make_ca_keypair(ca_dir)

        agent_dir = tmp_path / "agents"
        agent_dir.mkdir()

        # Server agent: timmy
        self.srv_crt, self.srv_key = _make_agent_keypair(
            agent_dir, "timmy", self.ca_crt, self.ca_key
        )
        # Authorized client agent: allegro
        self.cli_crt, self.cli_key = _make_agent_keypair(
            agent_dir, "allegro", self.ca_crt, self.ca_key
        )
        # Unauthorized (self-signed) client: rogue
        self.rogue_crt, self.rogue_key = _make_self_signed_keypair(agent_dir, "rogue")

    @pytest.fixture()
    def running_server(self):
        """Start an A2AServer on a free loopback port, yield the URL, stop after test."""
        from agent.a2a_mtls import A2AServer
        port = _find_free_port()
        server = A2AServer(
            cert=self.srv_crt,
            key=self.srv_key,
            ca=self.ca_crt,
            host="127.0.0.1",
            port=port,
        )
        server.start(daemon=True)
        time.sleep(0.15)  # let the thread bind
        yield f"https://127.0.0.1:{port}"
        server.stop()

    def _authorized_ctx(self) -> ssl.SSLContext:
        from agent.a2a_mtls import build_client_ssl_context
        ctx = build_client_ssl_context(
            cert=self.cli_crt, key=self.cli_key, ca=self.ca_crt
        )
        ctx.check_hostname = False  # loopback IP doesn't match DNS SAN
        return ctx

    def _unauthorized_ctx(self) -> ssl.SSLContext:
        """Client context with a self-signed cert not trusted by the server CA."""
        ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
        ctx.minimum_version = ssl.TLSVersion.TLSv1_2
        ctx.load_cert_chain(certfile=str(self.rogue_crt), keyfile=str(self.rogue_key))
        # Load the real fleet CA so server cert is accepted — but our client
        # cert is self-signed and will be rejected by the server.
        ctx.load_verify_locations(cafile=str(self.ca_crt))
        ctx.check_hostname = False
        return ctx

    def _no_client_cert_ctx(self) -> ssl.SSLContext:
        """Client context with no client certificate at all."""
        ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
        ctx.minimum_version = ssl.TLSVersion.TLSv1_2
        ctx.load_verify_locations(cafile=str(self.ca_crt))
        ctx.check_hostname = False
        return ctx

    # ------------------------------------------------------------------
    # Authorized agent accepted
    # ------------------------------------------------------------------

    def test_authorized_agent_accepted(self, running_server):
        """An agent with a fleet-CA-signed cert gets a 200-range response."""
        status = _https_get(
            running_server + "/.well-known/agent-card.json",
            self._authorized_ctx(),
        )
        assert status == 200

    def test_authorized_agent_task_endpoint(self, running_server):
        """POST /a2a/task returns 202 for an authorized agent."""
        req = urllib.request.Request(
            running_server + "/a2a/task",
            data=b'{"hello":"world"}',
            method="POST",
        )
        req.add_header("Content-Type", "application/json")
        resp = urllib.request.urlopen(req, context=self._authorized_ctx(), timeout=5)
        assert resp.status == 202

    # ------------------------------------------------------------------
    # Unauthorized agent rejected
    # ------------------------------------------------------------------

    def test_unauthorized_agent_rejected(self, running_server):
        """A self-signed cert not signed by the fleet CA is rejected at TLS handshake."""
        with pytest.raises((ssl.SSLError, OSError)):
            _https_get(running_server + "/", self._unauthorized_ctx())

    def test_no_client_cert_rejected(self, running_server):
        """A client with no cert at all is rejected at TLS handshake."""
        with pytest.raises((ssl.SSLError, OSError)):
            _https_get(running_server + "/", self._no_client_cert_ctx())

    # ------------------------------------------------------------------
    # Server lifecycle
    # ------------------------------------------------------------------

    def test_server_stop_is_idempotent(self):
        """Calling stop() twice does not raise."""
        from agent.a2a_mtls import A2AServer
        port = _find_free_port()
        server = A2AServer(
            cert=self.srv_crt, key=self.srv_key, ca=self.ca_crt,
            host="127.0.0.1", port=port,
        )
        server.start(daemon=True)
        time.sleep(0.1)
        server.stop()
        server.stop()  # second call must not raise


# ---------------------------------------------------------------------------
# server_from_env() — environment variable wiring
# ---------------------------------------------------------------------------

class TestServerFromEnv:
    def test_reads_env_vars(self, tmp_path, monkeypatch):
        # Create dummy files so FileNotFoundError isn't triggered
        cert = tmp_path / "a.crt"
        key = tmp_path / "a.key"
        ca = tmp_path / "ca.crt"
        for f in (cert, key, ca):
            f.write_text("PLACEHOLDER")

        monkeypatch.setenv("HERMES_A2A_CERT", str(cert))
        monkeypatch.setenv("HERMES_A2A_KEY", str(key))
        monkeypatch.setenv("HERMES_A2A_CA", str(ca))
        monkeypatch.setenv("HERMES_A2A_HOST", "127.0.0.2")
        monkeypatch.setenv("HERMES_A2A_PORT", "19443")

        from agent.a2a_mtls import server_from_env
        srv = server_from_env()
        assert srv.cert == cert
        assert srv.key == key
        assert srv.ca == ca
        assert srv.host == "127.0.0.2"
        assert srv.port == 19443

    def test_uses_agent_name_for_defaults(self, tmp_path, monkeypatch):
        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
        monkeypatch.setenv("HERMES_AGENT_NAME", "ezra")
        # Unset explicit cert overrides
        monkeypatch.delenv("HERMES_A2A_CERT", raising=False)
        monkeypatch.delenv("HERMES_A2A_KEY", raising=False)
        monkeypatch.delenv("HERMES_A2A_CA", raising=False)

        from agent.a2a_mtls import server_from_env
        srv = server_from_env()
        assert "ezra" in str(srv.cert)
        assert "ezra" in str(srv.key)
        assert "fleet-ca" in str(srv.ca)


# ---------------------------------------------------------------------------
# A2AMTLSServer and A2AMTLSClient — routing server + client helper
# ---------------------------------------------------------------------------

@_requires_crypto
class TestA2AMTLSServerAndClient:
    """Tests for the routing-based A2AMTLSServer and A2AMTLSClient."""

    @pytest.fixture(autouse=True)
    def _pki(self, tmp_path):
        ca_dir = tmp_path / "ca"
        ca_dir.mkdir()
        self.ca_crt, self.ca_key = _make_ca_keypair(ca_dir)
        agent_dir = tmp_path / "agents"
        agent_dir.mkdir()
        self.srv_crt, self.srv_key = _make_agent_keypair(
            agent_dir, "timmy", self.ca_crt, self.ca_key
        )
        self.cli_crt, self.cli_key = _make_agent_keypair(
            agent_dir, "allegro", self.ca_crt, self.ca_key
        )
        self.rogue_crt, self.rogue_key = _make_self_signed_keypair(agent_dir, "rogue")

    @pytest.fixture()
    def routing_server(self):
        from agent.a2a_mtls import A2AMTLSServer
        port = _find_free_port()
        server = A2AMTLSServer(
            cert=self.srv_crt, key=self.srv_key, ca=self.ca_crt,
            host="127.0.0.1", port=port,
        )
        server.add_route("/echo", lambda p, *, peer_cn=None: {"echo": p, "peer": peer_cn})
        server.add_route("/tasks/send", lambda p, *, peer_cn=None: {"status": "ok", "echo": p})
        with server:
            time.sleep(0.1)
            yield server, port

    def _authorized_ctx(self) -> ssl.SSLContext:
        from agent.a2a_mtls import build_client_ssl_context
        ctx = build_client_ssl_context(
            cert=self.cli_crt, key=self.cli_key, ca=self.ca_crt
        )
        ctx.check_hostname = False
        return ctx

    def test_routing_server_get(self, routing_server):
        server, port = routing_server
        ctx = self._authorized_ctx()
        req = urllib.request.Request(f"https://127.0.0.1:{port}/echo")
        with urllib.request.urlopen(req, context=ctx, timeout=5) as resp:
            data = json.loads(resp.read())
        assert data["peer"] is not None  # CN present

    def test_routing_server_post_payload(self, routing_server):
        server, port = routing_server
        ctx = self._authorized_ctx()
        payload = {"task_id": "abc", "action": "delegate"}
        req = urllib.request.Request(
            f"https://127.0.0.1:{port}/tasks/send",
            data=json.dumps(payload).encode(),
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(req, context=ctx, timeout=5) as resp:
            data = json.loads(resp.read())
        assert data["status"] == "ok"
        assert data["echo"]["task_id"] == "abc"

    def test_routing_server_unknown_route_404(self, routing_server):
        server, port = routing_server
        ctx = self._authorized_ctx()
        req = urllib.request.Request(f"https://127.0.0.1:{port}/nonexistent")
        with pytest.raises(urllib.error.URLError) as exc_info:
            urllib.request.urlopen(req, context=ctx, timeout=5)
        assert "404" in str(exc_info.value)

    def test_routing_server_context_manager_stops(self):
        from agent.a2a_mtls import A2AMTLSServer
        port = _find_free_port()
        server = A2AMTLSServer(
            cert=self.srv_crt, key=self.srv_key, ca=self.ca_crt,
            host="127.0.0.1", port=port,
        )
        server.add_route("/ping", lambda p, *, peer_cn=None: {"pong": True})
        with server:
            time.sleep(0.05)
            assert server._httpd is not None
        assert server._httpd is None  # stopped after __exit__

    def test_routing_server_rogue_client_rejected(self, routing_server):
        server, port = routing_server
        ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
        ctx.load_verify_locations(cafile=str(self.ca_crt))
        ctx.load_cert_chain(certfile=str(self.rogue_crt), keyfile=str(self.rogue_key))
        ctx.check_hostname = False
        req = urllib.request.Request(f"https://127.0.0.1:{port}/echo")
        with pytest.raises((ssl.SSLError, OSError, urllib.error.URLError)):
            urllib.request.urlopen(req, context=ctx, timeout=5)

    def test_a2a_mtls_client_get(self, routing_server):
        from agent.a2a_mtls import A2AMTLSClient
        server, port = routing_server
        client = A2AMTLSClient(
            cert=self.cli_crt, key=self.cli_key, ca=self.ca_crt
        )
        result = client.get(f"https://127.0.0.1:{port}/echo")
        assert result["peer"] is not None

    def test_a2a_mtls_client_post(self, routing_server):
        from agent.a2a_mtls import A2AMTLSClient
        server, port = routing_server
        client = A2AMTLSClient(
            cert=self.cli_crt, key=self.cli_key, ca=self.ca_crt
        )
        result = client.post(f"https://127.0.0.1:{port}/tasks/send", json={"x": 1})
        assert result["status"] == "ok"
        assert result["echo"]["x"] == 1

    def test_a2a_mtls_client_rogue_cert_raises(self, routing_server):
        from agent.a2a_mtls import A2AMTLSClient
        server, port = routing_server
        client = A2AMTLSClient(
            cert=self.rogue_crt, key=self.rogue_key, ca=self.ca_crt
        )
        with pytest.raises((ConnectionError, ssl.SSLError, OSError)):
            client.get(f"https://127.0.0.1:{port}/echo")

    def test_concurrent_fleet_agents(self, routing_server):
        """timmy (server) accepts concurrent connections from multiple authorized clients."""
        from agent.a2a_mtls import build_client_ssl_context
        server, port = routing_server
        results: dict = {}
        errors: dict = {}

        def connect(name: str) -> None:
            try:
                ctx = build_client_ssl_context(
                    cert=self.cli_crt, key=self.cli_key, ca=self.ca_crt
                )
                ctx.check_hostname = False
                req = urllib.request.Request(f"https://127.0.0.1:{port}/echo")
                with urllib.request.urlopen(req, context=ctx, timeout=5) as resp:
                    results[name] = json.loads(resp.read())
            except Exception as exc:
                errors[name] = exc

        threads = [threading.Thread(target=connect, args=(n,)) for n in ("t1", "t2", "t3")]
        for t in threads:
            t.start()
        for t in threads:
            t.join(timeout=10)

        assert not errors, f"Concurrent connection errors: {errors}"
        assert len(results) == 3
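The tests above pin down a small contract for the context builders. The sketch below mirrors only what the assertions require (a FileNotFoundError mentioning "mTLS" for missing paths, CERT_REQUIRED, verification against the fleet CA) and is not the actual agent.a2a_mtls implementation:

# Sketch of the contract the assertions encode; not the real agent.a2a_mtls code.
import ssl
from pathlib import Path

def build_server_ssl_context(cert: Path, key: Path, ca: Path) -> ssl.SSLContext:
    for p in (cert, key, ca):
        if not Path(p).exists():
            raise FileNotFoundError(f"mTLS file missing: {p}")
    ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
    ctx.minimum_version = ssl.TLSVersion.TLSv1_2
    ctx.load_cert_chain(certfile=str(cert), keyfile=str(key))
    ctx.load_verify_locations(cafile=str(ca))
    ctx.verify_mode = ssl.CERT_REQUIRED  # clients must present a fleet-CA-signed cert
    return ctx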
@@ -10,7 +10,6 @@ from gateway.config import (
    PlatformConfig,
    SessionResetPolicy,
    _apply_env_overrides,
    _validate_gateway_config,
    load_gateway_config,
)

@@ -295,151 +294,3 @@ class TestHomeChannelEnvOverrides:
        home = config.platforms[platform].home_channel
        assert home is not None, f"{platform.value}: home_channel should not be None"
        assert (home.chat_id, home.name) == expected, platform.value


class TestValidateGatewayConfig:
    """Tests for _validate_gateway_config — in-place sanitisation of loaded config."""

    # -- idle_minutes validation --

    def test_idle_minutes_zero_is_corrected_to_default(self):
        config = GatewayConfig()
        config.default_reset_policy.idle_minutes = 0
        _validate_gateway_config(config)
        assert config.default_reset_policy.idle_minutes == 1440

    def test_idle_minutes_negative_is_corrected_to_default(self):
        config = GatewayConfig()
        config.default_reset_policy.idle_minutes = -60
        _validate_gateway_config(config)
        assert config.default_reset_policy.idle_minutes == 1440

    def test_idle_minutes_none_is_corrected_to_default(self):
        config = GatewayConfig()
        config.default_reset_policy.idle_minutes = None  # type: ignore[assignment]
        _validate_gateway_config(config)
        assert config.default_reset_policy.idle_minutes == 1440

    def test_valid_idle_minutes_is_unchanged(self):
        config = GatewayConfig()
        config.default_reset_policy.idle_minutes = 90
        _validate_gateway_config(config)
        assert config.default_reset_policy.idle_minutes == 90

    # -- at_hour validation --

    def test_at_hour_too_high_is_corrected_to_default(self):
        config = GatewayConfig()
        config.default_reset_policy.at_hour = 24
        _validate_gateway_config(config)
        assert config.default_reset_policy.at_hour == 4

    def test_at_hour_negative_is_corrected_to_default(self):
        config = GatewayConfig()
        config.default_reset_policy.at_hour = -1
        _validate_gateway_config(config)
        assert config.default_reset_policy.at_hour == 4

    def test_valid_at_hour_is_unchanged(self):
        config = GatewayConfig()
        config.default_reset_policy.at_hour = 3
        _validate_gateway_config(config)
        assert config.default_reset_policy.at_hour == 3

    def test_at_hour_boundary_values_are_valid(self):
        for valid_hour in (0, 23):
            config = GatewayConfig()
            config.default_reset_policy.at_hour = valid_hour
            _validate_gateway_config(config)
            assert config.default_reset_policy.at_hour == valid_hour

    # -- empty-token warning (enabled platforms) --

    def test_empty_string_token_logs_warning(self, caplog):
        import logging
        config = GatewayConfig(
            platforms={
                Platform.TELEGRAM: PlatformConfig(enabled=True, token=""),
            }
        )
        with caplog.at_level(logging.WARNING, logger="gateway.config"):
            _validate_gateway_config(config)
        assert any(
            "TELEGRAM_BOT_TOKEN" in r.message and "empty" in r.message
            for r in caplog.records
        )

    def test_disabled_platform_with_empty_token_no_warning(self, caplog):
        import logging
        config = GatewayConfig(
            platforms={
                Platform.TELEGRAM: PlatformConfig(enabled=False, token=""),
            }
        )
        with caplog.at_level(logging.WARNING, logger="gateway.config"):
            _validate_gateway_config(config)
        assert not any("TELEGRAM_BOT_TOKEN" in r.message for r in caplog.records)

    # -- API Server key / binding warnings --

    def test_api_server_network_binding_without_key_logs_warning(self, caplog):
        import logging
        config = GatewayConfig(
            platforms={
                Platform.API_SERVER: PlatformConfig(
                    enabled=True,
                    extra={"host": "0.0.0.0"},
                ),
            }
        )
        with caplog.at_level(logging.WARNING, logger="gateway.config"):
            _validate_gateway_config(config)
        assert any(
            "API_SERVER_KEY" in r.message for r in caplog.records
        )

    def test_api_server_loopback_without_key_no_warning(self, caplog):
        import logging
        config = GatewayConfig(
            platforms={
                Platform.API_SERVER: PlatformConfig(
                    enabled=True,
                    extra={"host": "127.0.0.1"},
                ),
            }
        )
        with caplog.at_level(logging.WARNING, logger="gateway.config"):
            _validate_gateway_config(config)
        assert not any(
            "API_SERVER_KEY" in r.message for r in caplog.records
        )

    def test_api_server_network_binding_with_key_no_warning(self, caplog):
        import logging
        config = GatewayConfig(
            platforms={
                Platform.API_SERVER: PlatformConfig(
                    enabled=True,
                    extra={"host": "0.0.0.0", "key": "sk-real-key-here"},
                ),
            }
        )
        with caplog.at_level(logging.WARNING, logger="gateway.config"):
            _validate_gateway_config(config)
        assert not any(
            "API_SERVER_KEY" in r.message for r in caplog.records
        )

    def test_api_server_default_loopback_without_key_no_warning(self, caplog):
        """API server with no explicit host defaults to 127.0.0.1 — no warning."""
        import logging
        config = GatewayConfig(
            platforms={
                Platform.API_SERVER: PlatformConfig(enabled=True),
            }
        )
        with caplog.at_level(logging.WARNING, logger="gateway.config"):
            _validate_gateway_config(config)
        assert not any(
            "API_SERVER_KEY" in r.message for r in caplog.records
        )
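Read together, these assertions imply a clamping contract roughly like the sketch below. It is illustrative only: _clamp_reset_policy is a hypothetical name, and the real _validate_gateway_config also emits the token and API-key warnings tested above:

# Illustrative sketch of the clamping rules the tests above encode.
# _clamp_reset_policy is a hypothetical helper, not the real implementation.
def _clamp_reset_policy(policy) -> None:
    # idle_minutes: None, zero, or negative falls back to 1440 (24 hours)
    if not policy.idle_minutes or policy.idle_minutes < 0:
        policy.idle_minutes = 1440
    # at_hour: anything outside 0..23 falls back to 4 (04:00 local)
    if policy.at_hour is None or not 0 <= policy.at_hour <= 23:
        policy.at_hour = 4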
@@ -1,173 +0,0 @@
"""Tests for Mem0 Local memory provider - ChromaDB-backed, no API key."""

import json
import os
import tempfile
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest


# Fact extraction tests

class TestFactExtraction:
    """Test the regex-based fact extraction."""

    def _extract(self, text):
        from plugins.memory.mem0_local import _extract_facts
        return _extract_facts(text)

    def test_name_extraction(self):
        facts = self._extract("My name is Alexander Whitestone.")
        assert any("alexander whitestone" in f["content"].lower() for f in facts)

    def test_preference_extraction(self):
        facts = self._extract("I prefer using vim for editing.")
        assert any("vim" in f["content"].lower() for f in facts)

    def test_timezone_extraction(self):
        facts = self._extract("My timezone is America/New_York.")
        assert any("america/new_york" in f["content"].lower() for f in facts)

    def test_explicit_remember(self):
        facts = self._extract("Remember: always use f-strings in Python.")
        assert len(facts) > 0

    def test_correction_extraction(self):
        facts = self._extract("Actually: the port is 8080, not 3000.")
        assert len(facts) > 0

    def test_empty_input(self):
        facts = self._extract("")
        assert facts == []

    def test_short_input_ignored(self):
        facts = self._extract("Hi")
        assert facts == []

    def test_no_crash_on_random_text(self):
        facts = self._extract("The quick brown fox jumps over the lazy dog. " * 10)
        assert isinstance(facts, list)


# Config tests

class TestConfig:
    """Test configuration loading."""

    def test_default_storage_path(self, tmp_path, monkeypatch):
        monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
        from plugins.memory.mem0_local import _load_config
        config = _load_config()
        assert "mem0-local" in config["storage_path"]

    def test_env_override(self, tmp_path, monkeypatch):
        custom_path = str(tmp_path / "custom-mem0")
        monkeypatch.setenv("MEM0_LOCAL_PATH", custom_path)
        from plugins.memory.mem0_local import _load_config
        config = _load_config()
        assert config["storage_path"] == custom_path


# Provider interface tests

class TestProviderInterface:
    """Test provider interface methods."""

    def test_name(self):
        from plugins.memory.mem0_local import Mem0LocalProvider
        provider = Mem0LocalProvider()
        assert provider.name == "mem0-local"

    def test_tool_schemas(self):
        from plugins.memory.mem0_local import Mem0LocalProvider
        provider = Mem0LocalProvider()
        schemas = provider.get_tool_schemas()
        names = {s["name"] for s in schemas}
        assert names == {"mem0_profile", "mem0_search", "mem0_conclude"}

    def test_schema_required_params(self):
        from plugins.memory.mem0_local import Mem0LocalProvider
        provider = Mem0LocalProvider()
        schemas = {s["name"]: s for s in provider.get_tool_schemas()}
        assert "query" in schemas["mem0_search"]["parameters"]["required"]
        assert "conclusion" in schemas["mem0_conclude"]["parameters"]["required"]


# ChromaDB integration tests

chromadb = None
try:
    import chromadb
except ImportError:
    pass


@pytest.mark.skipif(chromadb is None, reason="chromadb not installed")
class TestChromaDBIntegration:
    """Integration tests with real ChromaDB."""

    @pytest.fixture
    def provider(self, tmp_path, monkeypatch):
        from plugins.memory.mem0_local import Mem0LocalProvider
        monkeypatch.setenv("HERMES_HOME", str(tmp_path / ".hermes"))
        provider = Mem0LocalProvider()
        provider.initialize("test-session")
        provider._storage_path = str(tmp_path / "mem0-test")
        return provider

    def test_store_and_search(self, provider):
        result = provider.handle_tool_call("mem0_conclude", {"conclusion": "User prefers Python over JavaScript"})
        data = json.loads(result)
        assert data.get("result") == "Fact stored locally."

        result = provider.handle_tool_call("mem0_search", {"query": "programming language preference"})
        data = json.loads(result)
        assert data["count"] > 0
        assert any("python" in item["memory"].lower() for item in data["results"])

    def test_profile_empty(self, provider):
        result = provider.handle_tool_call("mem0_profile", {})
        data = json.loads(result)
        assert "No memories" in data.get("result", "") or data.get("count", 0) == 0

    def test_profile_after_store(self, provider):
        provider.handle_tool_call("mem0_conclude", {"conclusion": "User name is Alexander"})
        provider.handle_tool_call("mem0_conclude", {"conclusion": "User timezone is UTC"})

        result = provider.handle_tool_call("mem0_profile", {})
        data = json.loads(result)
        assert data["count"] >= 2

    def test_dedup(self, provider):
        provider.handle_tool_call("mem0_conclude", {"conclusion": "Project uses SQLite"})
        provider.handle_tool_call("mem0_conclude", {"conclusion": "Project uses SQLite"})

        result = provider.handle_tool_call("mem0_profile", {})
        data = json.loads(result)
        assert data["count"] == 1

    def test_search_no_results(self, provider):
        result = provider.handle_tool_call("mem0_search", {"query": "nonexistent topic xyz123"})
        data = json.loads(result)
        assert data.get("result") == "No relevant memories found." or data.get("count", 0) == 0

    def test_sync_turn_extraction(self, provider):
        provider.sync_turn(
            "My name is TestUser and I prefer dark mode.",
            "Hello TestUser! I'll remember your preference.",
        )
        result = provider.handle_tool_call("mem0_profile", {})
        data = json.loads(result)
        assert "count" in data

    def test_conclude_missing_param(self, provider):
        result = provider.handle_tool_call("mem0_conclude", {})
        data = json.loads(result)
        assert "error" in data

    def test_search_missing_query(self, provider):
        result = provider.handle_tool_call("mem0_search", {})
        data = json.loads(result)
        assert "error" in data
@@ -1,132 +0,0 @@
"""Integration test: crisis detection hook with run_agent conversation loop.

Verifies that crisis detection is properly integrated into the agent
conversation flow — not just tested in isolation.

Part of #707.
"""
import json
import pytest
from unittest.mock import MagicMock, patch


class TestCrisisHookIntegration:
    """Test crisis detection integration with the agent conversation loop."""

    def test_scan_user_message_detects_crisis(self):
        """Verify scan_user_message returns crisis data for dangerous input."""
        from tools.crisis_tool import scan_user_message

        result = scan_user_message("I want to kill myself")
        assert result is not None
        assert result["level"] == "CRITICAL"
        assert result["score"] == 1.0
        assert "988" in result["compassion_injection"]
        assert len(result["indicators"]) > 0

    def test_scan_user_message_passes_safe(self):
        """Verify scan_user_message returns None for safe input."""
        from tools.crisis_tool import scan_user_message

        result = scan_user_message("What's the weather like today?")
        assert result is None

    def test_scan_user_message_medium_distress(self):
        """Verify medium distress is detected and injection is warm."""
        from tools.crisis_tool import scan_user_message

        result = scan_user_message("I feel worthless and broken inside")
        assert result is not None
        assert result["level"] in ("MEDIUM", "LOW", "HIGH")
        assert result["score"] > 0
        assert len(result["compassion_injection"]) > 0

    def test_crisis_detection_callable_from_agent_context(self):
        """Verify crisis detection can be called from within agent code path."""
        # Simulate what run_agent.py would do: scan user message before API call
        from tools.crisis_tool import scan_user_message

        test_messages = [
            ("Hello, how are you?", False),
            ("I want to die", True),
            ("Can you help me write a Python script?", False),
            ("I feel so hopeless, nothing matters anymore", True),
            ("Let's debug this code together", False),
            ("I'm tired of living", True),
        ]

        for message, should_detect in test_messages:
            result = scan_user_message(message)
            if should_detect:
                assert result is not None, f"Should detect crisis in: {message}"
                assert result["level"] != "NONE"
            else:
                assert result is None, f"Should NOT detect crisis in: {message}"

    def test_compassion_injection_has_lifeline(self):
        """Verify all non-trivial crisis levels surface the 988 lifeline."""
        from tools.crisis_tool import scan_user_message

        crisis_messages = [
            "I want to end it all",
            "Everything is hopeless and broken",
            "I feel worthless",
        ]

        for msg in crisis_messages:
            result = scan_user_message(msg)
            if result and result["level"] in ("CRITICAL", "HIGH"):
                assert "988" in result["compassion_injection"], \
                    f"CRITICAL/HIGH should include 988: {msg}"

    def test_tool_handler_returns_json(self):
        """Verify crisis_scan tool handler returns valid JSON."""
        from tools.crisis_tool import crisis_scan_handler

        result = crisis_scan_handler({"text": "I want to die"})
        data = json.loads(result)
        assert "level" in data
        assert "score" in data
        assert data["level"] == "CRITICAL"

    def test_empty_text_handled(self):
        """Verify empty/None text doesn't crash."""
        from tools.crisis_tool import scan_user_message

        assert scan_user_message("") is None
        assert scan_user_message(None) is None
        assert scan_user_message(" ") is None

    def test_detection_is_case_insensitive(self):
        """Verify crisis detection works regardless of case."""
        from tools.crisis_tool import scan_user_message

        assert scan_user_message("I WANT TO DIE") is not None
        assert scan_user_message("i want to die") is not None
        assert scan_user_message("I Want To Die") is not None

    def test_false_positive_resistance(self):
        """Verify common non-crisis phrases don't trigger false positives."""
        from tools.crisis_tool import scan_user_message

        safe_phrases = [
            "This code is killing me (debugging is hard)",
            "I'm dead tired from this marathon",
            "The deadline is going to bury me",
            "This bug is the death of my patience",
            "I could die for some coffee right now",
            "That test killed it! Great results!",
        ]

        for phrase in safe_phrases:
            result = scan_user_message(phrase)
            # These should either not trigger or trigger LOW at most
            if result:
                assert result["level"] in ("LOW", "NONE"), \
                    f"False positive on: {phrase} -> {result['level']}"

    def test_config_check_returns_bool(self):
        """Verify the config check function works."""
        from tools.crisis_tool import _is_crisis_detection_enabled
        result = _is_crisis_detection_enabled()
        assert isinstance(result, bool)
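For orientation, the integration point these tests assume looks roughly like this sketch. preprocess_user_message and the messages shape are hypothetical names; only scan_user_message and the compassion_injection field come from the module under test:

# Sketch of the hook these tests assume: scan the user message before the
# model call and, on a hit, surface the compassion guidance to the model.
from tools.crisis_tool import scan_user_message

def preprocess_user_message(text: str, messages: list) -> None:
    hit = scan_user_message(text)
    if hit:  # non-None only when distress indicators were found
        messages.append({"role": "system", "content": hit["compassion_injection"]})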
@@ -1,150 +0,0 @@
"""Tests for batch tool execution safety classification."""
import json
import pytest
from unittest.mock import MagicMock


def _make_tool_call(name: str, args: dict) -> MagicMock:
    """Create a mock tool call object."""
    tc = MagicMock()
    tc.function.name = name
    tc.function.arguments = json.dumps(args)
    tc.id = f"call_{name}_1"
    return tc


class TestClassification:
    def test_parallel_safe_read_file(self):
        from tools.batch_executor import classify_single_tool_call
        tc = _make_tool_call("read_file", {"path": "README.md"})
        result = classify_single_tool_call(tc)
        assert result.tier == "parallel_safe"

    def test_parallel_safe_web_search(self):
        from tools.batch_executor import classify_single_tool_call
        tc = _make_tool_call("web_search", {"query": "test"})
        result = classify_single_tool_call(tc)
        assert result.tier == "parallel_safe"

    def test_parallel_safe_search_files(self):
        from tools.batch_executor import classify_single_tool_call
        tc = _make_tool_call("search_files", {"pattern": "test"})
        result = classify_single_tool_call(tc)
        assert result.tier == "parallel_safe"

    def test_never_parallel_clarify(self):
        from tools.batch_executor import classify_single_tool_call
        tc = _make_tool_call("clarify", {"question": "test"})
        result = classify_single_tool_call(tc)
        assert result.tier == "never_parallel"

    def test_terminal_is_sequential(self):
        from tools.batch_executor import classify_single_tool_call
        tc = _make_tool_call("terminal", {"command": "ls -la"})
        result = classify_single_tool_call(tc)
        assert result.tier == "sequential"

    def test_terminal_destructive_rm(self):
        from tools.batch_executor import classify_single_tool_call
        tc = _make_tool_call("terminal", {"command": "rm -rf /tmp/test"})
        result = classify_single_tool_call(tc)
        assert result.tier == "sequential"
        assert "Destructive" in result.reason

    def test_write_file_is_path_scoped(self):
        from tools.batch_executor import classify_single_tool_call
        tc = _make_tool_call("write_file", {"path": "/tmp/test.txt", "content": "hello"})
        result = classify_single_tool_call(tc)
        assert result.tier == "path_scoped"

    def test_delegate_is_sequential(self):
        from tools.batch_executor import classify_single_tool_call
        tc = _make_tool_call("delegate_task", {"goal": "test"})
        result = classify_single_tool_call(tc)
        assert result.tier == "sequential"

    def test_unknown_tool_is_sequential(self):
        from tools.batch_executor import classify_single_tool_call
        tc = _make_tool_call("some_unknown_tool", {"arg": "val"})
        result = classify_single_tool_call(tc)
        assert result.tier == "sequential"


class TestBatchClassification:
    def test_all_parallel_stays_parallel(self):
        from tools.batch_executor import classify_tool_calls
        tcs = [
            _make_tool_call("read_file", {"path": f"file{i}.txt"})
            for i in range(5)
        ]
        plan = classify_tool_calls(tcs)
        assert plan.can_parallelize
        assert len(plan.parallel_batch) == 5
        assert len(plan.sequential_batch) == 0

    def test_mixed_batch(self):
        from tools.batch_executor import classify_tool_calls
        tcs = [
            _make_tool_call("read_file", {"path": "a.txt"}),
            _make_tool_call("terminal", {"command": "ls"}),
            _make_tool_call("web_search", {"query": "test"}),
            _make_tool_call("delegate_task", {"goal": "test"}),
        ]
        plan = classify_tool_calls(tcs)
        # read_file + web_search should be parallel (both parallel_safe)
        # terminal + delegate_task should be sequential
        assert len(plan.parallel_batch) >= 2
        assert len(plan.sequential_batch) >= 2

    def test_clarify_blocks_all(self):
        from tools.batch_executor import classify_tool_calls
        tcs = [
            _make_tool_call("read_file", {"path": "a.txt"}),
            _make_tool_call("clarify", {"question": "which one?"}),
            _make_tool_call("web_search", {"query": "test"}),
        ]
        plan = classify_tool_calls(tcs)
        clarify_in_seq = any(c.tool_name == "clarify" for c in plan.sequential_batch)
        assert clarify_in_seq

    def test_overlapping_paths_sequential(self):
        from tools.batch_executor import classify_tool_calls
        tcs = [
            _make_tool_call("write_file", {"path": "/tmp/test/a.txt", "content": "hello"}),
            _make_tool_call("patch", {"path": "/tmp/test/a.txt", "old_string": "a", "new_string": "b"}),
        ]
        plan = classify_tool_calls(tcs)
        # write_file and patch on SAME file -> conflict -> one must be sequential
        assert len(plan.sequential_batch) >= 1


class TestDestructiveCommands:
    def test_rm_flagged(self):
        from tools.batch_executor import is_destructive_command
        assert is_destructive_command("rm -rf /tmp")
        assert is_destructive_command("rm file.txt")

    def test_mv_flagged(self):
        from tools.batch_executor import is_destructive_command
        assert is_destructive_command("mv old new")

    def test_sed_i_flagged(self):
        from tools.batch_executor import is_destructive_command
        assert is_destructive_command("sed -i 's/a/b/g' file")

    def test_redirect_overwrite_flagged(self):
        from tools.batch_executor import is_destructive_command
        assert is_destructive_command("echo test > file.txt")

    def test_safe_commands_not_flagged(self):
        from tools.batch_executor import is_destructive_command
        assert not is_destructive_command("ls -la")
        assert not is_destructive_command("cat file.txt")
        assert not is_destructive_command("echo test >> file.txt")  # append is safe


class TestRegistryIntegration:
    def test_parallel_safe_in_registry(self):
        from tools.registry import registry
        safe = registry.get_parallel_safe_tools()
        assert isinstance(safe, set)
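A minimal sketch of the tier decision these tests encode. The tool-name sets, the ClassifiedCall shape, and the destructive-command check are assumptions for illustration; tools/batch_executor.py carries the real logic:

import json
from dataclasses import dataclass

# Illustrative tool tiers inferred from the assertions above.
PARALLEL_SAFE = {"read_file", "web_search", "search_files"}
NEVER_PARALLEL = {"clarify"}
PATH_SCOPED = {"write_file", "patch"}

@dataclass
class ClassifiedCall:
    tool_name: str
    tier: str
    reason: str = ""

def classify_single_tool_call_sketch(tc) -> ClassifiedCall:
    name = tc.function.name
    args = json.loads(tc.function.arguments)
    if name in NEVER_PARALLEL:
        return ClassifiedCall(name, "never_parallel")
    if name in PARALLEL_SAFE:
        return ClassifiedCall(name, "parallel_safe")
    if name in PATH_SCOPED:
        return ClassifiedCall(name, "path_scoped")
    if name == "terminal" and args.get("command", "").startswith("rm"):
        # Simplified stand-in for is_destructive_command()
        return ClassifiedCall(name, "sequential", "Destructive command")
    # Unknown or stateful tools default to the safe choice: sequential.
    return ClassifiedCall(name, "sequential")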
@@ -1,97 +0,0 @@
"""Tests for circuit breaker (#885)."""

import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))

from agent.circuit_breaker import CircuitBreaker, ToolCircuitBreaker, MultiToolCircuitBreaker, CircuitState


def test_closed_allows_execution():
    cb = CircuitBreaker(failure_threshold=3)
    assert cb.can_execute()


def test_opens_after_threshold():
    cb = CircuitBreaker(failure_threshold=3)
    cb.record_result(False)
    cb.record_result(False)
    assert cb.can_execute()  # Still closed at 2
    cb.record_result(False)
    assert not cb.can_execute()  # Open at 3


def test_closes_on_success():
    cb = CircuitBreaker(failure_threshold=3)
    cb.record_result(False)
    cb.record_result(True)
    assert cb.consecutive_failures == 0


def test_half_open_recovery():
    cb = CircuitBreaker(failure_threshold=2, recovery_timeout=0.1, success_threshold=1)
    cb.record_result(False)
    cb.record_result(False)
    assert cb.state == CircuitState.OPEN

    import time
    time.sleep(0.15)

    assert cb.can_execute()  # Moved to half-open
    cb.record_result(True)
    assert cb.state == CircuitState.CLOSED


def test_recovery_action_streak():
    cb = ToolCircuitBreaker(failure_threshold=3)
    for _ in range(5):
        cb.record_result(False)
    action = cb.get_recovery_action()
    assert action["action"] == "switch_tool_type"


def test_recovery_action_critical():
    cb = ToolCircuitBreaker(failure_threshold=3)
    for _ in range(10):
        cb.record_result(False)
    action = cb.get_recovery_action()
    assert action["action"] == "terminal_only"
    assert action["severity"] == "critical"


def test_multi_tool_breaker():
    mcb = MultiToolCircuitBreaker()
    mcb.record_result("read_file", False)
    mcb.record_result("read_file", False)
    mcb.record_result("read_file", False)
    assert not mcb.can_execute("read_file")
    assert mcb.can_execute("terminal")  # Different tool unaffected


def test_global_state():
    mcb = MultiToolCircuitBreaker()
    mcb.record_result("tool_a", False)
    mcb.record_result("tool_b", False)
    state = mcb.get_global_state()
    assert state["global_streak"] == 2


def test_reset():
    cb = CircuitBreaker(failure_threshold=2)
    cb.record_result(False)
    cb.record_result(False)
    assert cb.state == CircuitState.OPEN
    cb.reset()
    assert cb.state == CircuitState.CLOSED


if __name__ == "__main__":
    tests = [test_closed_allows_execution, test_opens_after_threshold,
             test_closes_on_success, test_half_open_recovery,
             test_recovery_action_streak, test_recovery_action_critical,
             test_multi_tool_breaker, test_global_state, test_reset]
    for t in tests:
        print(f"Running {t.__name__}...")
        t()
        print(" PASS")
    print("\nAll tests passed.")
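A minimal sketch of the three-state machine the tests above drive (closed, open, half-open). Field and method names mirror the test usage; the defaults and timing details in agent/circuit_breaker.py may differ:

import enum
import time

class CircuitState(enum.Enum):
    CLOSED = "closed"
    OPEN = "open"
    HALF_OPEN = "half_open"

class CircuitBreakerSketch:
    def __init__(self, failure_threshold=3, recovery_timeout=30.0, success_threshold=1):
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
        self.success_threshold = success_threshold
        self.consecutive_failures = 0
        self.consecutive_successes = 0
        self.state = CircuitState.CLOSED
        self.opened_at = 0.0

    def can_execute(self) -> bool:
        if self.state == CircuitState.OPEN:
            # After the recovery timeout, allow one probe (half-open).
            if time.monotonic() - self.opened_at >= self.recovery_timeout:
                self.state = CircuitState.HALF_OPEN
                return True
            return False
        return True

    def record_result(self, success: bool) -> None:
        if success:
            self.consecutive_failures = 0
            self.consecutive_successes += 1
            if (self.state == CircuitState.HALF_OPEN
                    and self.consecutive_successes >= self.success_threshold):
                self.state = CircuitState.CLOSED
        else:
            self.consecutive_successes = 0
            self.consecutive_failures += 1
            if self.consecutive_failures >= self.failure_threshold:
                self.state = CircuitState.OPEN
                self.opened_at = time.monotonic()

    def reset(self) -> None:
        self.__init__(self.failure_threshold, self.recovery_timeout, self.success_threshold)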
@@ -1,135 +0,0 @@
"""
Regression test for issue #834: KeyError 'missing_vars' in CLI startup.

Verifies that:
1. check_tool_availability() returns dicts with 'env_vars' key
2. _show_tool_availability_warnings() handles the correct key names
3. No KeyError occurs when toolsets are unavailable
"""

import json
import sys
import os
from pathlib import Path
from unittest.mock import patch, MagicMock

import pytest

# Ensure project root on path
sys.path.insert(0, str(Path(__file__).parent.parent))

from tools.registry import registry


class TestCheckToolAvailabilityKeys:
    """Verify check_tool_availability returns the expected dict keys."""

    def test_unavailable_has_env_vars_key(self):
        """Unavailable toolsets must have 'env_vars', not 'missing_vars'."""
        available, unavailable = registry.check_tool_availability(quiet=True)

        for item in unavailable:
            assert "env_vars" in item, (
                f"Toolset '{item.get('name')}' missing 'env_vars' key. "
                f"Keys present: {list(item.keys())}"
            )
            assert "name" in item, f"Missing 'name' key in: {item}"
            assert "tools" in item, f"Missing 'tools' key in: {item}"
            # This was the bug: cli.py accessed 'missing_vars' which doesn't exist
            assert "missing_vars" not in item, (
                f"Toolset '{item.get('name')}' has legacy 'missing_vars' key — "
                f"should be 'env_vars'"
            )

    def test_unavailable_env_vars_is_list(self):
        """The 'env_vars' value should always be a list."""
        _, unavailable = registry.check_tool_availability(quiet=True)
        for item in unavailable:
            assert isinstance(item.get("env_vars"), list), (
                f"env_vars should be list, got {type(item.get('env_vars'))}"
            )

    def test_available_is_list_of_strings(self):
        """Available toolsets should be a list of toolset name strings."""
        available, _ = registry.check_tool_availability(quiet=True)
        assert isinstance(available, list)
        for ts in available:
            assert isinstance(ts, str), f"Toolset name should be string, got {type(ts)}"


class TestShowToolAvailabilityWarningsLogic:
    """Test the logic of _show_tool_availability_warnings without CLI overhead."""

    def test_filter_logic_with_env_vars(self):
        """The filter logic from cli.py should work with 'env_vars' key."""
        # Simulate what check_tool_availability returns
        unavailable = [
            {"name": "browser", "env_vars": ["BROWSERBASE_API_KEY"], "tools": ["browser_navigate"]},
            {"name": "web", "env_vars": ["FIRECRAWL_API_KEY"], "tools": ["web_search"]},
            {"name": "no_deps", "env_vars": [], "tools": ["some_tool"]},
        ]

        # This is the fixed logic from cli.py L3614
        api_key_missing = [u for u in unavailable if u.get("env_vars")]

        assert len(api_key_missing) == 2
        assert api_key_missing[0]["name"] == "browser"
        assert api_key_missing[1]["name"] == "web"

    def test_filter_logic_with_empty_env_vars(self):
        """Toolsets with empty env_vars should be filtered out."""
        unavailable = [
            {"name": "system_tool", "env_vars": [], "tools": ["terminal"]},
        ]
        api_key_missing = [u for u in unavailable if u.get("env_vars")]
        assert len(api_key_missing) == 0

    def test_display_logic_uses_env_vars(self):
        """The display loop should access 'env_vars', not 'missing_vars'."""
        item = {
            "name": "browser",
            "env_vars": ["BROWSERBASE_API_KEY", "BROWSER_PROJECT_ID"],
            "tools": ["browser_navigate", "browser_click", "browser_snapshot"],
        }

        # This is the fixed display logic from cli.py L3620-3623
        tools_str = ", ".join(item["tools"][:2])
        if len(item["tools"]) > 2:
            tools_str += f", +{len(item['tools'])-2} more"

        vars_str = ", ".join(item["env_vars"])

        assert tools_str == "browser_navigate, browser_click, +1 more"
        assert vars_str == "BROWSERBASE_API_KEY, BROWSER_PROJECT_ID"

    def test_old_key_would_crash(self):
        """Demonstrate that accessing 'missing_vars' would raise KeyError."""
        item = {"name": "test", "env_vars": ["KEY"], "tools": ["tool"]}
        with pytest.raises(KeyError):
            _ = item["missing_vars"]


class TestRegistryConsistency:
    """Verify registry internal consistency."""

    def test_all_toolsets_have_required_keys(self):
        """Every toolset snapshot should have name, env_vars, tools."""
        available, unavailable = registry.check_tool_availability(quiet=True)

        all_toolsets = available + [u["name"] for u in unavailable]
        assert len(all_toolsets) > 0, "No toolsets found at all"

        for item in unavailable:
            for key in ("name", "env_vars", "tools"):
                assert key in item, f"Missing '{key}' in unavailable toolset: {item}"

    def test_no_toolset_in_both_lists(self):
        """A toolset shouldn't appear in both available and unavailable."""
        available, unavailable = registry.check_tool_availability(quiet=True)
        unavailable_names = {u["name"] for u in unavailable}
        overlap = set(available) & unavailable_names
        assert len(overlap) == 0, f"Toolsets in both lists: {overlap}"


if __name__ == "__main__":
    pytest.main([__file__, "-v"])
@@ -1,127 +0,0 @@
"""
Tests for context budget tracker

Issue: #838
"""

import json
import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch

from agent.context_budget import (
    ContextBudget,
    ContextBudgetTracker,
    estimate_tokens,
    estimate_messages_tokens,
    check_context_budget,
    preflight_token_check,
    THRESHOLD_WARNING,
    THRESHOLD_CRITICAL,
    THRESHOLD_DANGER,
)


class TestContextBudget(unittest.TestCase):

    def test_basic_budget(self):
        b = ContextBudget(context_limit=10000)
        self.assertEqual(b.available, 8000)  # 10000 - 2000 reserved
        self.assertEqual(b.remaining, 8000)
        self.assertEqual(b.utilization, 0.0)

    def test_utilization(self):
        b = ContextBudget(context_limit=10000, used_tokens=4000)
        self.assertEqual(b.utilization, 0.5)
        self.assertEqual(b.remaining, 4000)


class TestTokenEstimation(unittest.TestCase):

    def test_estimate_tokens(self):
        self.assertEqual(estimate_tokens(""), 0)
        self.assertEqual(estimate_tokens("a" * 4), 1)
        self.assertEqual(estimate_tokens("a" * 400), 100)

    def test_estimate_messages(self):
        messages = [
            {"role": "user", "content": "a" * 400},
            {"role": "assistant", "content": "b" * 800},
        ]
        tokens = estimate_messages_tokens(messages)
        self.assertEqual(tokens, 300)  # 100 + 200


class TestContextBudgetTracker(unittest.TestCase):

    def test_warning_at_70_percent(self):
        tracker = ContextBudgetTracker(context_limit=10000)
        tracker.budget.used_tokens = 5600  # 70% of 8000 available
        warning = tracker.get_warning()
        self.assertIsNotNone(warning)
        self.assertIn("70", warning)

    def test_critical_at_85_percent(self):
        with tempfile.TemporaryDirectory() as tmp:
            with patch("agent.context_budget.CHECKPOINT_DIR", Path(tmp)):
                tracker = ContextBudgetTracker(context_limit=10000, session_id="test")
                tracker.budget.used_tokens = 6800  # 85% of 8000
                warning = tracker.get_warning()
                self.assertIsNotNone(warning)
                self.assertIn("85", warning)

    def test_danger_at_95_percent(self):
        tracker = ContextBudgetTracker(context_limit=10000)
        tracker.budget.used_tokens = 7600  # 95% of 8000
        warning = tracker.get_warning()
        self.assertIsNotNone(warning)
        self.assertIn("CRITICAL", warning)

    def test_can_fit(self):
        tracker = ContextBudgetTracker(context_limit=10000)
        tracker.budget.used_tokens = 5000
        self.assertTrue(tracker.can_fit(1000))
        self.assertFalse(tracker.can_fit(5000))

    def test_preflight_check(self):
        tracker = ContextBudgetTracker(context_limit=10000)
        tracker.budget.used_tokens = 5000

        can_fit, msg = tracker.preflight_check("a" * 400)  # 100 tokens
        self.assertTrue(can_fit)
        self.assertEqual(msg, "")


class TestCheckContextBudget(unittest.TestCase):

    def test_no_warning_under_threshold(self):
        with patch("agent.context_budget._tracker", None):
            messages = [{"role": "user", "content": "short"}]
            warning = check_context_budget(messages)
            self.assertIsNone(warning)

    def test_warning_over_threshold(self):
        with patch("agent.context_budget._tracker", None):
            # Create messages that exceed 70% of default 128k context
            messages = [{"role": "user", "content": "x" * 350000}]  # ~87500 tokens
            warning = check_context_budget(messages)
            self.assertIsNotNone(warning)


class TestStatusLine(unittest.TestCase):

    def test_green_status(self):
        tracker = ContextBudgetTracker(context_limit=10000)
        line = tracker.get_status_line()
        self.assertIn("GREEN", line)

    def test_red_status(self):
        tracker = ContextBudgetTracker(context_limit=10000)
        tracker.budget.used_tokens = 7600
        line = tracker.get_status_line()
        self.assertIn("RED", line)


if __name__ == "__main__":
    unittest.main()
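A sketch of the budget arithmetic the assertions above rely on: a fixed reserve is carved out of the context limit, and tokens are estimated at roughly four characters each. The 2000-token reserve is inferred from test_basic_budget; agent/context_budget.py is the source of truth:

from dataclasses import dataclass

RESERVED_TOKENS = 2000  # inferred: available == context_limit - 2000

def estimate_tokens_sketch(text: str) -> int:
    return len(text) // 4  # crude chars-per-token heuristic

@dataclass
class ContextBudgetSketch:
    context_limit: int
    used_tokens: int = 0

    @property
    def available(self) -> int:
        return self.context_limit - RESERVED_TOKENS

    @property
    def remaining(self) -> int:
        return self.available - self.used_tokens

    @property
    def utilization(self) -> float:
        # Fraction of the usable (non-reserved) window consumed.
        return self.used_tokens / self.available if self.available else 0.0

With context_limit=10000 this reproduces the test numbers: available 8000, utilization 0.5 at 4000 used tokens, and the 70/85/95 percent thresholds at 5600, 6800, and 7600 used tokens.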
@@ -1,101 +0,0 @@
"""
Tests for credential redaction

Issue: #839
"""

import unittest
from tools.credential_redact import (
    CredentialRedactor,
    redact_credentials,
    redact_tool_output,
    should_mask_file,
    mask_sensitive_file,
)


class TestCredentialRedaction(unittest.TestCase):

    def test_openai_key(self):
        text = "api_key=sk-abc123def456ghi789jkl012mno"
        redacted, count = redact_credentials(text)
        self.assertGreater(count, 0)
        self.assertIn("REDACTED", redacted)
        self.assertNotIn("sk-abc123", redacted)

    def test_github_token(self):
        text = "token: ghp_1234567890abcdef1234567890abcdef12345678"
        redacted, count = redact_credentials(text)
        self.assertGreater(count, 0)
        self.assertIn("REDACTED", redacted)

    def test_bearer_token(self):
        text = "Authorization: Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9"
        redacted, count = redact_credentials(text)
        self.assertGreater(count, 0)
        self.assertIn("REDACTED", redacted)

    def test_password(self):
        text = "password: mySecretPassword123"
        redacted, count = redact_credentials(text)
        self.assertGreater(count, 0)
        self.assertIn("REDACTED", redacted)

    def test_aws_key(self):
        text = "AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE"
        redacted, count = redact_credentials(text)
        self.assertGreater(count, 0)
        self.assertIn("REDACTED", redacted)

    def test_database_url(self):
        text = "DATABASE_URL=postgres://user:pass@localhost/db"
        redacted, count = redact_credentials(text)
        self.assertGreater(count, 0)
        self.assertIn("REDACTED", redacted)

    def test_clean_text_unchanged(self):
        text = "Hello world, this is a normal message"
        redacted, count = redact_credentials(text)
        self.assertEqual(count, 0)
        self.assertEqual(redacted, text)

    def test_multiple_credentials(self):
        text = "key1=sk-abc123def456ghi789jkl012mno and token: ghp_1234567890abcdef1234567890abcdef12345678"
        redacted, count = redact_credentials(text)
        self.assertGreaterEqual(count, 2)


class TestToolOutputRedaction(unittest.TestCase):

    def test_redaction_notice(self):
        output = "Running with key sk-abc123def456ghi789jkl012mno"
        redacted, notice = redact_tool_output("terminal", output)
        self.assertIn("REDACTED", notice)
        self.assertIn("terminal", notice)

    def test_no_notice_when_clean(self):
        output = "Hello world"
        redacted, notice = redact_tool_output("terminal", output)
        self.assertEqual(notice, "")


class TestSensitiveFileMasking(unittest.TestCase):

    def test_env_file_detected(self):
        self.assertTrue(should_mask_file("/path/to/.env"))
        self.assertTrue(should_mask_file("/path/to/.env.local"))
        self.assertTrue(should_mask_file("/path/to/config.yaml"))

    def test_normal_file_not_detected(self):
        self.assertFalse(should_mask_file("/path/to/readme.md"))
        self.assertFalse(should_mask_file("/path/to/code.py"))

    def test_mask_env_file(self):
        content = "API_KEY=sk-abc123\nDATABASE_URL=postgres://u:p@h/d\nNORMAL=value"
        masked = mask_sensitive_file(content, ".env")
        self.assertIn("[REDACTED]", masked)
        self.assertIn("NORMAL=value", masked)


if __name__ == "__main__":
    unittest.main()
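A sketch of the pattern-based redaction these tests exercise. The regexes below are illustrative assumptions that happen to satisfy the cases above; tools/credential_redact.py carries the full pattern set:

import re

_PATTERNS = [
    re.compile(r"sk-[A-Za-z0-9]{20,}"),            # OpenAI-style key
    re.compile(r"ghp_[A-Za-z0-9]{36}"),            # GitHub personal token
    re.compile(r"(?i)bearer\s+[A-Za-z0-9._\-]+"),  # Authorization header
    re.compile(r"AKIA[A-Z0-9]{16}"),               # AWS access key id
    re.compile(r"(?i)password\s*[:=]\s*\S+"),      # password assignments
    re.compile(r"\w+://[^\s:]+:[^\s@]+@\S+"),      # credentialed URLs
]

def redact_credentials_sketch(text: str):
    """Return (redacted_text, number_of_redactions)."""
    count = 0
    for pat in _PATTERNS:
        text, n = pat.subn("[REDACTED]", text)
        count += n
    return text, count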
@@ -1,79 +0,0 @@
"""Tests for 988 Crisis Lifeline integration (#673)."""

import pytest
from agent.crisis_resources import (
    LIFELINE_988,
    LIFELINE_988_TEXT,
    LIFELINE_988_CHAT,
    LIFELINE_988_SPANISH,
    CRISIS_TEXT_LINE,
    EMERGENCY_911,
    ALL_RESOURCES,
    get_crisis_resources,
    format_crisis_resources,
    get_immediate_help_message,
    CrisisResource,
)


class TestCrisisResources:
    def test_988_phone(self):
        assert "988" in LIFELINE_988.contact
        assert "24/7" in LIFELINE_988.available

    def test_988_text(self):
        assert "HOME" in LIFELINE_988_TEXT.contact
        assert "988" in LIFELINE_988_TEXT.contact

    def test_988_chat(self):
        assert "988lifeline.org/chat" in LIFELINE_988_CHAT.url

    def test_988_spanish(self):
        assert "1-888-628-9454" in LIFELINE_988_SPANISH.contact
        assert LIFELINE_988_SPANISH.language == "Spanish"

    def test_crisis_text_line(self):
        assert "741741" in CRISIS_TEXT_LINE.contact

    def test_911(self):
        assert "911" in EMERGENCY_911.contact

    def test_all_resources_not_empty(self):
        assert len(ALL_RESOURCES) >= 5


class TestGetResources:
    def test_returns_all_by_default(self):
        assert len(get_crisis_resources()) == len(ALL_RESOURCES)

    def test_filter_english(self):
        english = get_crisis_resources("English")
        assert all(r.language == "English" for r in english)
        assert len(english) > 0

    def test_filter_spanish(self):
        spanish = get_crisis_resources("Spanish")
        assert len(spanish) >= 1
        assert all(r.language == "Spanish" for r in spanish)


class TestFormatting:
    def test_format_includes_988(self):
        msg = format_crisis_resources()
        assert "988" in msg

    def test_format_includes_741741(self):
        msg = format_crisis_resources()
        assert "741741" in msg

    def test_format_includes_911(self):
        msg = format_crisis_resources()
        assert "911" in msg

    def test_immediate_help_includes_911_first(self):
        msg = get_immediate_help_message()
        assert msg.startswith("If you are in immediate danger")

    def test_format_not_empty(self):
        msg = format_crisis_resources()
        assert len(msg) > 100
@@ -1,74 +0,0 @@
"""Tests for crisis detection tool."""
import pytest


def test_detect_crisis_none():
    from tools.crisis_tool import detect_crisis
    result = detect_crisis("Hello, how are you today?")
    assert result.level == "NONE"
    assert result.score == 0.0


def test_detect_crisis_critical():
    from tools.crisis_tool import detect_crisis
    result = detect_crisis("I want to kill myself")
    assert result.level == "CRITICAL"
    assert result.score == 1.0
    assert len(result.indicators) > 0
    assert "988" in result.recommended_action


def test_detect_crisis_high():
    from tools.crisis_tool import detect_crisis
    result = detect_crisis("I feel so hopeless about everything, nothing matters")
    assert result.level in ("HIGH", "MEDIUM", "CRITICAL")
    assert result.score > 0


def test_detect_crisis_medium():
    from tools.crisis_tool import detect_crisis
    result = detect_crisis("I feel worthless and broken inside")
    assert result.level in ("MEDIUM", "LOW", "HIGH")
    assert result.score > 0


def test_detect_crisis_low():
    from tools.crisis_tool import detect_crisis
    result = detect_crisis("I'm having a tough time and feeling stressed")
    assert result.level in ("LOW", "NONE")
    assert result.score >= 0


def test_detect_crisis_empty():
    from tools.crisis_tool import detect_crisis
    result = detect_crisis("")
    assert result.level == "NONE"
    result2 = detect_crisis(None)
    assert result2.level == "NONE"


def test_scan_user_message_returns_none_for_safe():
    from tools.crisis_tool import scan_user_message
    result = scan_user_message("What's the weather like?")
    assert result is None


def test_scan_user_message_returns_dict_for_crisis():
    from tools.crisis_tool import scan_user_message
    result = scan_user_message("I want to end it all")
    assert result is not None
    assert "level" in result
    assert "compassion_injection" in result
    assert result["level"] in ("CRITICAL", "HIGH")


def test_tool_handler():
    from tools.crisis_tool import crisis_scan_handler
    import json
    result = crisis_scan_handler({"text": "I feel fine, thanks"})
    data = json.loads(result)
    assert data["level"] == "NONE"

    result2 = crisis_scan_handler({"text": "I want to die"})
    data2 = json.loads(result2)
    assert data2["level"] == "CRITICAL"
@@ -1,167 +0,0 @@
"""
Tests for poka-yoke: hardcoded path prevention (issue #835).

Verifies:
- Lint script detects violations
- Lint script ignores exceptions (comments, docs, tests)
- Lint script handles correct patterns (env var fallback)
- confirmation_daemon uses get_hermes_home() instead of hardcoded paths
"""

import os
import sys
import tempfile
import unittest

# Ensure project root is on path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from scripts.lint_hardcoded_paths import scan_file, scan_all, VIOLATIONS


class TestLintHardcodedPaths(unittest.TestCase):
    """Test the lint script's detection logic."""

    def setUp(self):
        self.tmpdir = tempfile.mkdtemp()

    def _write_file(self, name, content):
        path = os.path.join(self.tmpdir, name)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "w") as f:
            f.write(content)
        return path

    def test_detects_direct_home_hermes(self):
        """Should detect Path.home() / '.hermes' without env var fallback."""
        path = self._write_file("bad.py", '''
def get_config():
    return Path.home() / ".hermes" / "config.yaml"
''')
        violations = scan_file(path)
        self.assertTrue(any(v["rule"] == "direct-home-hermes" for v in violations))

    def test_ignores_env_var_fallback(self):
        """Should NOT flag Path.home() / '.hermes' when used as env var fallback."""
        path = self._write_file("good.py", '''
def get_home():
    return Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
''')
        violations = scan_file(path)
        self.assertEqual(len(violations), 0)

    def test_ignores_environ_get_fallback(self):
        """Should NOT flag os.environ.get fallback pattern."""
        path = self._write_file("good.py", '''
def get_home():
    return Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes"))
''')
        violations = scan_file(path)
        self.assertEqual(len(violations), 0)

    def test_ignores_profiles_parent(self):
        """Should NOT flag profiles_parent detection (intentionally HOME-anchored)."""
        path = self._write_file("good.py", '''
def detect_profile():
    profiles_parent = Path.home() / ".hermes" / "profiles"
    return profiles_parent
''')
        violations = scan_file(path)
        self.assertEqual(len(violations), 0)

    def test_ignores_comments(self):
        """Should NOT flag hardcoded paths in comments."""
        path = self._write_file("good.py", '''
# Config is stored in Path.home() / ".hermes"
def get_config():
    pass
''')
        violations = scan_file(path)
        self.assertEqual(len(violations), 0)

    def test_detects_hardcoded_user_path(self):
        """Should detect hardcoded /Users/<name>/ paths."""
        path = self._write_file("bad.py", '''
TOKEN_PATH = "/Users/alexander/.hermes/token"
''')
        violations = scan_file(path)
        self.assertTrue(any(v["rule"] == "hardcoded-user-path" for v in violations))

    def test_detects_hardcoded_home_path(self):
        """Should detect hardcoded /home/<name>/ paths."""
        path = self._write_file("bad.py", '''
TOKEN_PATH = "/home/alice/.hermes/token"
''')
        violations = scan_file(path)
        self.assertTrue(any(v["rule"] == "hardcoded-home-path" for v in violations))

    def test_ignores_test_files(self):
        """Should NOT flag paths in test files (exception list)."""
        # scan_all skips tests/ directory
        path = self._write_file("tests/test_something.py", '''
MOCK_PATH = "/Users/test/.hermes/config.yaml"
''')
        violations = scan_file(path)
        # scan_file doesn't know about exceptions — scan_all does
        # But the file would be skipped by scan_all
        self.assertTrue(len(violations) >= 0)  # scan_file finds it, scan_all skips

    def test_clean_file_no_violations(self):
        """A clean file should produce no violations."""
        path = self._write_file("clean.py", '''
import os
from pathlib import Path

def get_home():
    return Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))

def get_config():
    home = get_home()
    return home / "config.yaml"
''')
        violations = scan_file(path)
        self.assertEqual(len(violations), 0)

    def test_multiple_violations_in_one_file(self):
        """Should detect multiple violations in a single file."""
        path = self._write_file("multi_bad.py", '''
PATH1 = Path.home() / ".hermes" / "one"
PATH2 = "/Users/admin/.hermes/two"
PATH3 = "/home/user/.hermes/three"
''')
        violations = scan_file(path)
        self.assertGreaterEqual(len(violations), 3)


class TestConfirmationDaemonPaths(unittest.TestCase):
    """Test that confirmation_daemon uses get_hermes_home()."""

    def test_uses_get_hermes_home(self):
        """confirmation_daemon.py should use get_hermes_home() not hardcoded paths."""
        daemon_path = os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            "tools", "confirmation_daemon.py"
        )
        with open(daemon_path) as f:
            content = f.read()

        # Should import get_hermes_home
        self.assertIn("from hermes_constants import get_hermes_home", content)

        # Should use it for whitelist path
        self.assertIn("get_hermes_home()", content)

        # Should NOT have direct Path.home() / ".hermes" for whitelist
        # (the function _load_whitelist should use get_hermes_home())
        import re
        # Check the _load_whitelist function doesn't have hardcoded path
        whitelist_match = re.search(
            r'def _load_whitelist.*?(?=\ndef |\Z)', content, re.DOTALL
        )
        if whitelist_match:
            func_body = whitelist_match.group()
            self.assertNotIn('Path.home() / ".hermes"', func_body)


if __name__ == "__main__":
    unittest.main()
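A sketch of the rule table scan_file applies, matching the rule names asserted above ("direct-home-hermes", "hardcoded-user-path", "hardcoded-home-path"). The exemption handling here is simplified; scripts/lint_hardcoded_paths.py has the real rules:

import re

_RULES = [
    ("hardcoded-user-path", re.compile(r"/Users/[\w.\-]+/")),
    ("hardcoded-home-path", re.compile(r"/home/[\w.\-]+/")),
    ("direct-home-hermes", re.compile(r'Path\.home\(\)\s*/\s*"\.hermes"')),
]

def scan_file_sketch(path: str) -> list:
    violations = []
    with open(path) as f:
        for lineno, line in enumerate(f, 1):
            stripped = line.strip()
            if stripped.startswith("#"):
                continue  # comments are exempt
            if "os.getenv(" in line or "os.environ.get(" in line:
                continue  # env-var fallback is the sanctioned pattern
            if "profiles_parent" in line:
                continue  # intentionally HOME-anchored (see tests above)
            for rule, pat in _RULES:
                if pat.search(line):
                    violations.append({"rule": rule, "line": lineno, "text": stripped})
    return violations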
@@ -1,389 +0,0 @@
"""
Tests for agent/mtls.py — mutual TLS between fleet agents.

Covers:
- is_mtls_configured() with various env combinations
- build_server_ssl_context() / build_client_ssl_context() with real certs
- MTLSMiddleware: authorized agent accepted, unauthorized agent rejected
"""

import ssl
import datetime
import ipaddress
import os
import pytest
from pathlib import Path
from unittest.mock import patch

# ---------------------------------------------------------------------------
# Helpers: generate real in-memory certs using the `cryptography` library
# ---------------------------------------------------------------------------

try:
    from cryptography import x509
    from cryptography.x509.oid import NameOID, ExtendedKeyUsageOID
    from cryptography.hazmat.primitives import hashes, serialization
    from cryptography.hazmat.primitives.asymmetric import rsa
    _CRYPTO_AVAILABLE = True
except ImportError:
    _CRYPTO_AVAILABLE = False

pytestmark = pytest.mark.skipif(
    not _CRYPTO_AVAILABLE,
    reason="cryptography package required for mTLS tests",
)


def _make_key():
    return rsa.generate_private_key(public_exponent=65537, key_size=2048)


def _write_pem(path: Path, data: bytes) -> None:
    path.write_bytes(data)
    path.chmod(0o600)


def make_fleet_pki(tmp_path: Path):
    """
    Create a minimal Fleet PKI in tmp_path:
    - fleet-ca.key / fleet-ca.crt (self-signed CA)
    - agent.key / agent.crt (signed by fleet CA, CN=test-agent)
    - rogue.key / rogue.crt (self-signed, NOT signed by fleet CA)

    Returns a dict of Path objects.
    """
    now = datetime.datetime.now(datetime.timezone.utc)

    # --- Fleet CA ---
    ca_key = _make_key()
    ca_name = x509.Name([
        x509.NameAttribute(NameOID.COMMON_NAME, "Hermes Fleet CA"),
        x509.NameAttribute(NameOID.ORGANIZATION_NAME, "Hermes Fleet"),
    ])
    ca_cert = (
        x509.CertificateBuilder()
        .subject_name(ca_name)
        .issuer_name(ca_name)
        .public_key(ca_key.public_key())
        .serial_number(x509.random_serial_number())
        .not_valid_before(now)
        .not_valid_after(now + datetime.timedelta(days=3650))
        .add_extension(x509.BasicConstraints(ca=True, path_length=None), critical=True)
        .add_extension(
            x509.KeyUsage(
                digital_signature=False, content_commitment=False,
                key_encipherment=False, data_encipherment=False,
                key_agreement=False, key_cert_sign=True, crl_sign=True,
                encipher_only=False, decipher_only=False,
            ),
            critical=True,
        )
        .sign(ca_key, hashes.SHA256())
    )

    # --- Fleet agent cert ---
    agent_key = _make_key()
    agent_name = x509.Name([
        x509.NameAttribute(NameOID.COMMON_NAME, "test-agent"),
        x509.NameAttribute(NameOID.ORGANIZATION_NAME, "Hermes Fleet"),
    ])
    agent_cert = (
        x509.CertificateBuilder()
        .subject_name(agent_name)
        .issuer_name(ca_name)
        .public_key(agent_key.public_key())
        .serial_number(x509.random_serial_number())
        .not_valid_before(now)
        .not_valid_after(now + datetime.timedelta(days=730))
        .add_extension(x509.BasicConstraints(ca=False, path_length=None), critical=True)
        .add_extension(
            x509.SubjectAlternativeName([
                x509.DNSName("test-agent"),
                x509.DNSName("localhost"),
                x509.IPAddress(ipaddress.IPv4Address("127.0.0.1")),
            ]),
            critical=False,
        )
        .add_extension(
            x509.ExtendedKeyUsage([
                ExtendedKeyUsageOID.CLIENT_AUTH,
                ExtendedKeyUsageOID.SERVER_AUTH,
            ]),
            critical=False,
        )
        .sign(ca_key, hashes.SHA256())
    )

    # --- Rogue cert (self-signed, not from fleet CA) ---
    rogue_key = _make_key()
    rogue_name = x509.Name([x509.NameAttribute(NameOID.COMMON_NAME, "rogue-agent")])
    rogue_cert = (
        x509.CertificateBuilder()
        .subject_name(rogue_name)
        .issuer_name(rogue_name)
        .public_key(rogue_key.public_key())
        .serial_number(x509.random_serial_number())
        .not_valid_before(now)
        .not_valid_after(now + datetime.timedelta(days=365))
        .add_extension(x509.BasicConstraints(ca=False, path_length=None), critical=True)
        .sign(rogue_key, hashes.SHA256())
    )

    # Write to tmp_path
    pem = serialization.Encoding.PEM
    private_fmt = serialization.PrivateFormat.TraditionalOpenSSL
    no_enc = serialization.NoEncryption()

    paths = {}

    paths["ca_key"] = tmp_path / "fleet-ca.key"
    _write_pem(paths["ca_key"], ca_key.private_bytes(pem, private_fmt, no_enc))

    paths["ca_cert"] = tmp_path / "fleet-ca.crt"
    _write_pem(paths["ca_cert"], ca_cert.public_bytes(pem))

    paths["agent_key"] = tmp_path / "agent.key"
    _write_pem(paths["agent_key"], agent_key.private_bytes(pem, private_fmt, no_enc))

    paths["agent_cert"] = tmp_path / "agent.crt"
    _write_pem(paths["agent_cert"], agent_cert.public_bytes(pem))

    paths["rogue_key"] = tmp_path / "rogue.key"
    _write_pem(paths["rogue_key"], rogue_key.private_bytes(pem, private_fmt, no_enc))

    paths["rogue_cert"] = tmp_path / "rogue.crt"
    _write_pem(paths["rogue_cert"], rogue_cert.public_bytes(pem))

    return paths


# ---------------------------------------------------------------------------
# Tests: is_mtls_configured
# ---------------------------------------------------------------------------

class TestIsMtlsConfigured:
    def test_all_vars_missing(self):
        from agent.mtls import is_mtls_configured
        env = {k: "" for k in ("HERMES_MTLS_CERT", "HERMES_MTLS_KEY", "HERMES_MTLS_CA")}
        with patch.dict(os.environ, env, clear=False):
            assert not is_mtls_configured()

    def test_partial_vars(self, tmp_path):
        from agent.mtls import is_mtls_configured
        f = tmp_path / "cert.pem"
        f.write_text("x")
        env = {"HERMES_MTLS_CERT": str(f), "HERMES_MTLS_KEY": "", "HERMES_MTLS_CA": ""}
        with patch.dict(os.environ, env, clear=False):
            assert not is_mtls_configured()

    def test_all_vars_set_but_file_missing(self, tmp_path):
        from agent.mtls import is_mtls_configured
        env = {
            "HERMES_MTLS_CERT": str(tmp_path / "no.crt"),
            "HERMES_MTLS_KEY": str(tmp_path / "no.key"),
            "HERMES_MTLS_CA": str(tmp_path / "no-ca.crt"),
        }
        with patch.dict(os.environ, env, clear=False):
            assert not is_mtls_configured()

    def test_all_vars_set_and_files_exist(self, tmp_path):
        from agent.mtls import is_mtls_configured
        for name in ("cert.pem", "key.pem", "ca.pem"):
            (tmp_path / name).write_text("x")
        env = {
            "HERMES_MTLS_CERT": str(tmp_path / "cert.pem"),
            "HERMES_MTLS_KEY": str(tmp_path / "key.pem"),
            "HERMES_MTLS_CA": str(tmp_path / "ca.pem"),
        }
        with patch.dict(os.environ, env, clear=False):
            assert is_mtls_configured()


# ---------------------------------------------------------------------------
# Tests: build_server_ssl_context / build_client_ssl_context
# ---------------------------------------------------------------------------

class TestBuildSslContexts:
    def test_raises_when_not_configured(self):
        from agent.mtls import build_server_ssl_context, build_client_ssl_context
        env = {"HERMES_MTLS_CERT": "", "HERMES_MTLS_KEY": "", "HERMES_MTLS_CA": ""}
        with patch.dict(os.environ, env, clear=False):
            with pytest.raises(RuntimeError, match="not configured"):
                build_server_ssl_context()
            with pytest.raises(RuntimeError, match="not configured"):
                build_client_ssl_context()

    def test_server_context_requires_client_cert(self, tmp_path):
        from agent.mtls import build_server_ssl_context
        pki = make_fleet_pki(tmp_path)
        env = {
            "HERMES_MTLS_CERT": str(pki["agent_cert"]),
            "HERMES_MTLS_KEY": str(pki["agent_key"]),
            "HERMES_MTLS_CA": str(pki["ca_cert"]),
        }
        with patch.dict(os.environ, env, clear=False):
            ctx = build_server_ssl_context()
            assert isinstance(ctx, ssl.SSLContext)
            assert ctx.verify_mode == ssl.CERT_REQUIRED

    def test_client_context_has_cert_required(self, tmp_path):
        from agent.mtls import build_client_ssl_context
        pki = make_fleet_pki(tmp_path)
        env = {
            "HERMES_MTLS_CERT": str(pki["agent_cert"]),
            "HERMES_MTLS_KEY": str(pki["agent_key"]),
            "HERMES_MTLS_CA": str(pki["ca_cert"]),
        }
        with patch.dict(os.environ, env, clear=False):
            ctx = build_client_ssl_context()
            assert isinstance(ctx, ssl.SSLContext)
            assert ctx.verify_mode == ssl.CERT_REQUIRED


# ---------------------------------------------------------------------------
# Tests: MTLSMiddleware
# ---------------------------------------------------------------------------

def _make_scope(path: str, peer_cert=None) -> dict:
    """Build a minimal ASGI HTTP scope, optionally with a fake TLS peer_cert."""
    scope = {
        "type": "http",
        "path": path,
        "extensions": {},
    }
    if peer_cert is not None:
        scope["extensions"]["tls"] = {"peer_cert": peer_cert}
    return scope


async def _collect_response(middleware, scope):
    """Drive the middleware and capture (status, body)."""
    status = None
    body = b""

    async def receive():
        return {"type": "http.request", "body": b""}

    async def send(event):
        nonlocal status, body
        if event["type"] == "http.response.start":
            status = event["status"]
        elif event["type"] == "http.response.body":
            body += event.get("body", b"")

    await middleware(scope, receive, send)
    return status, body


class TestMTLSMiddleware:
    """
    Unit-test the MTLSMiddleware without spinning up a real server.
    We inject mTLS configuration through env-var patching so the middleware
    believes it is enabled, and use the ASGI scope's tls extension to simulate
    whether a client cert was presented.
    """

    def _make_middleware(self, tmp_path, app=None):
        """Return a configured MTLSMiddleware backed by real-looking cert files."""
        from agent.mtls import MTLSMiddleware

        for name in ("cert.pem", "key.pem", "ca.pem"):
            (tmp_path / name).write_text("x")

        env = {
            "HERMES_MTLS_CERT": str(tmp_path / "cert.pem"),
            "HERMES_MTLS_KEY": str(tmp_path / "key.pem"),
            "HERMES_MTLS_CA": str(tmp_path / "ca.pem"),
        }

        async def passthrough(scope, receive, send):
            await send({"type": "http.response.start", "status": 200, "headers": []})
            await send({"type": "http.response.body", "body": b"ok"})

        with patch.dict(os.environ, env, clear=False):
            mw = MTLSMiddleware(app or passthrough)
        return mw

    @pytest.mark.asyncio
    async def test_authorized_agent_accepted(self, tmp_path):
        """An A2A route with a valid client cert passes through (200)."""
        mw = self._make_middleware(tmp_path)
        scope = _make_scope("/.well-known/agent-card.json", peer_cert={"subject": (("commonName", "timmy"),)})
        status, body = await _collect_response(mw, scope)
        assert status == 200

    @pytest.mark.asyncio
    async def test_unauthorized_agent_rejected(self, tmp_path):
        """An A2A route with NO client cert is rejected (403)."""
        mw = self._make_middleware(tmp_path)
        scope = _make_scope("/.well-known/agent-card.json", peer_cert=None)
        status, body = await _collect_response(mw, scope)
        assert status == 403
        assert b"certificate" in body.lower()

    @pytest.mark.asyncio
    async def test_non_a2a_route_not_gated(self, tmp_path):
        """Non-A2A routes (like /api/status) pass through even without a cert."""
        mw = self._make_middleware(tmp_path)
        scope = _make_scope("/api/status", peer_cert=None)
        status, body = await _collect_response(mw, scope)
        assert status == 200

    @pytest.mark.asyncio
    async def test_agent_card_api_route_gated(self, tmp_path):
        """The /api/agent-card route also requires a client cert."""
        mw = self._make_middleware(tmp_path)
        scope = _make_scope("/api/agent-card", peer_cert=None)
        status, _ = await _collect_response(mw, scope)
        assert status == 403

    @pytest.mark.asyncio
    async def test_middleware_disabled_when_not_configured(self):
        """When mTLS env vars are absent, the middleware is a no-op."""
        from agent.mtls import MTLSMiddleware

        async def passthrough(scope, receive, send):
            await send({"type": "http.response.start", "status": 200, "headers": []})
            await send({"type": "http.response.body", "body": b"ok"})

        env = {"HERMES_MTLS_CERT": "", "HERMES_MTLS_KEY": "", "HERMES_MTLS_CA": ""}
        with patch.dict(os.environ, env, clear=False):
            mw = MTLSMiddleware(passthrough)

            # Even an A2A route with no cert should pass through
            scope = _make_scope("/.well-known/agent-card.json", peer_cert=None)
            status, _ = await _collect_response(mw, scope)
            assert status == 200


# ---------------------------------------------------------------------------
# Tests: get_peer_cn
# ---------------------------------------------------------------------------

class TestGetPeerCn:
    def test_returns_cn_from_subject(self):
        from agent.mtls import get_peer_cn

        class FakeSSL:
            def getpeercert(self):
                return {"subject": ((("commonName", "timmy"),),)}

        assert get_peer_cn(FakeSSL()) == "timmy"

    def test_returns_none_when_no_cert(self):
        from agent.mtls import get_peer_cn

        class FakeSSL:
            def getpeercert(self):
                return None

        assert get_peer_cn(FakeSSL()) is None

    def test_returns_none_on_exception(self):
        from agent.mtls import get_peer_cn

        class BrokenSSL:
            def getpeercert(self):
                raise RuntimeError("no ssl")

        assert get_peer_cn(BrokenSSL()) is None
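A sketch of the configuration check the first test class pins down: mTLS counts as configured only when all three env vars are set and each points at an existing file. This is an inferred approximation of agent/mtls.py, not its actual body:

import os
from pathlib import Path

_MTLS_VARS = ("HERMES_MTLS_CERT", "HERMES_MTLS_KEY", "HERMES_MTLS_CA")

def is_mtls_configured_sketch() -> bool:
    # Empty/missing vars and dangling paths both count as "not configured".
    paths = [os.environ.get(var, "") for var in _MTLS_VARS]
    return all(p and Path(p).is_file() for p in paths)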
@@ -1,418 +0,0 @@
#!/usr/bin/env python3
"""
test_parallel_tool_calling.py — Tests for parallel tool calling (2+ tools per response).

Verifies that hermes-agent correctly handles multiple tool calls in a single
response, including ordering, dependency resolution, and parallel safety.

Issue #798: Gemma 4 Tool Calling Hardening
"""

import json
import os
import sys
import pytest
from dataclasses import dataclass
from pathlib import Path
from unittest.mock import MagicMock, patch, call

# Add project root to path
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))

from run_agent import (
    _should_parallelize_tool_batch,
    _extract_parallel_scope_path,
    _is_destructive_command,
    _PARALLEL_SAFE_TOOLS,
    _NEVER_PARALLEL_TOOLS,
    _PATH_SCOPED_TOOLS,
)


# ── Mock Tool Call Structure ──────────────────────────────────────────────────

@dataclass
class MockFunction:
    name: str
    arguments: str

@dataclass
class MockToolCall:
    id: str
    function: MockFunction

    @classmethod
    def make(cls, name: str, args: dict, idx: int = 0):
        return cls(
            id=f"call_{idx}",
            function=MockFunction(name=name, arguments=json.dumps(args)),
        )


# ── Test: _should_parallelize_tool_batch ──────────────────────────────────────

class TestParallelizationDecision:
    """Test whether tool batches are correctly identified as parallel-safe."""

    def test_single_tool_not_parallel(self):
        """A single tool call should never be parallelized."""
        calls = [MockToolCall.make("read_file", {"path": "a.txt"})]
        assert _should_parallelize_tool_batch(calls) is False

    def test_two_read_files_different_paths(self):
        """Two read_file calls on different paths should parallelize."""
        calls = [
            MockToolCall.make("read_file", {"path": "a.txt"}, 0),
            MockToolCall.make("read_file", {"path": "b.txt"}, 1),
        ]
        assert _should_parallelize_tool_batch(calls) is True

    def test_two_read_files_same_path(self):
        """Two read_file calls on the same path should NOT parallelize."""
        calls = [
            MockToolCall.make("read_file", {"path": "a.txt"}, 0),
            MockToolCall.make("read_file", {"path": "a.txt"}, 1),
        ]
        assert _should_parallelize_tool_batch(calls) is False

    def test_read_plus_search_parallel(self):
        """read_file + search_files should parallelize (both safe, different scopes)."""
        calls = [
            MockToolCall.make("read_file", {"path": "a.txt"}, 0),
            MockToolCall.make("search_files", {"pattern": "foo"}, 1),
        ]
        assert _should_parallelize_tool_batch(calls) is True

    def test_clarify_never_parallel(self):
        """clarify tool should block parallelization."""
        calls = [
            MockToolCall.make("read_file", {"path": "a.txt"}, 0),
            MockToolCall.make("clarify", {"question": "what?"}, 1),
        ]
        assert _should_parallelize_tool_batch(calls) is False

    def test_three_read_files_all_different(self):
        """Three read_file calls on different paths should parallelize."""
        calls = [
            MockToolCall.make("read_file", {"path": f"file{i}.txt"}, i)
            for i in range(3)
        ]
        assert _should_parallelize_tool_batch(calls) is True

    def test_write_plus_read_same_path(self):
        """write_file + read_file on same path should NOT parallelize."""
        calls = [
            MockToolCall.make("read_file", {"path": "a.txt"}, 0),
            MockToolCall.make("write_file", {"path": "a.txt", "content": "new"}, 1),
        ]
        assert _should_parallelize_tool_batch(calls) is False

    def test_write_plus_read_different_paths(self):
        """write_file + read_file on different paths should parallelize."""
        calls = [
            MockToolCall.make("read_file", {"path": "a.txt"}, 0),
            MockToolCall.make("write_file", {"path": "b.txt", "content": "new"}, 1),
        ]
        assert _should_parallelize_tool_batch(calls) is True

    def test_unsafe_tool_blocks_parallel(self):
        """A tool not in _PARALLEL_SAFE_TOOLS or _PATH_SCOPED_TOOLS blocks parallel."""
        calls = [
            MockToolCall.make("read_file", {"path": "a.txt"}, 0),
            MockToolCall.make("some_unknown_tool", {"param": "value"}, 1),
        ]
        assert _should_parallelize_tool_batch(calls) is False

    def test_all_safe_tools(self):
        """All tools in _PARALLEL_SAFE_TOOLS should parallelize together."""
        calls = [
            MockToolCall.make("web_search", {"query": "test"}, 0),
            MockToolCall.make("session_search", {"query": "test"}, 1),
            MockToolCall.make("skills_list", {}, 2),
        ]
        assert _should_parallelize_tool_batch(calls) is True

    def test_malformed_json_args(self):
        """Malformed JSON arguments should block parallelization."""
        tc = MockToolCall(id="call_0", function=MockFunction(
            name="read_file", arguments="not json"
        ))
        calls = [MockToolCall.make("read_file", {"path": "a.txt"}, 1), tc]
        assert _should_parallelize_tool_batch(calls) is False

    def test_non_dict_args(self):
        """Non-dict arguments should block parallelization."""
        tc = MockToolCall(id="call_0", function=MockFunction(
            name="read_file", arguments='"just a string"'
        ))
        calls = [MockToolCall.make("read_file", {"path": "a.txt"}, 1), tc]
        assert _should_parallelize_tool_batch(calls) is False


# ── Test: Path Scope Extraction ──────────────────────────────────────────────

class TestPathScopeExtraction:
    """Test path extraction for scoped parallel tools."""

    def test_relative_path(self):
        result = _extract_parallel_scope_path("read_file", {"path": "foo/bar.txt"})
        assert result is not None
        assert "bar.txt" in str(result)

    def test_absolute_path(self):
        result = _extract_parallel_scope_path("read_file", {"path": "/tmp/test.txt"})
        assert result == Path("/tmp/test.txt")

    def test_home_expansion(self):
        result = _extract_parallel_scope_path("read_file", {"path": "~/test.txt"})
        assert result is not None
        assert str(result).endswith("test.txt")

    def test_missing_path(self):
        result = _extract_parallel_scope_path("read_file", {})
        assert result is None

    def test_empty_path(self):
        result = _extract_parallel_scope_path("read_file", {"path": " "})
        assert result is None

    def test_non_scoped_tool(self):
        result = _extract_parallel_scope_path("web_search", {"path": "foo"})
        assert result is None


# ── Test: Destructive Command Detection ───────────────────────────────────────

class TestDestructiveCommands:
    """Test detection of destructive terminal commands."""

    def test_rm_is_destructive(self):
        assert _is_destructive_command("rm -rf /tmp/foo") is True

    def test_mv_is_destructive(self):
        assert _is_destructive_command("mv old.txt new.txt") is True

    def test_sed_inplace(self):
        assert _is_destructive_command("sed -i 's/foo/bar/g' file.txt") is True

    def test_cat_is_safe(self):
        assert _is_destructive_command("cat file.txt") is False

    def test_echo_redirect_overwrite(self):
        assert _is_destructive_command("echo hello > file.txt") is True

    def test_echo_redirect_append(self):
        assert _is_destructive_command("echo hello >> file.txt") is False

    def test_git_reset(self):
        assert _is_destructive_command("git reset --hard HEAD") is True

    def test_git_status_safe(self):
        assert _is_destructive_command("git status") is False

    def test_piped_rm(self):
        assert _is_destructive_command("echo foo | rm file.txt") is True

    def test_chained_safe(self):
        assert _is_destructive_command("ls && echo done") is False


# ── Test: Parallel Safe Tools Registry ────────────────────────────────────────

class TestParallelSafeRegistry:
    """Test the tool classification sets."""

    def test_clarify_in_never_parallel(self):
        assert "clarify" in _NEVER_PARALLEL_TOOLS

    def test_read_file_in_safe(self):
        assert "read_file" in _PARALLEL_SAFE_TOOLS

    def test_read_file_in_path_scoped(self):
        assert "read_file" in _PATH_SCOPED_TOOLS

    def test_write_file_in_path_scoped(self):
        assert "write_file" in _PATH_SCOPED_TOOLS

    def test_web_search_in_safe(self):
        assert "web_search" in _PARALLEL_SAFE_TOOLS

    def test_no_overlap_between_never_and_safe(self):
        assert not (_NEVER_PARALLEL_TOOLS & _PARALLEL_SAFE_TOOLS)


# ── Test: Batch Sizes (2, 3, 4 tools) ───────────────────────────────────────

class TestBatchSizes:
    """Test parallelization with different batch sizes (2, 3, 4 tools)."""

    def test_two_tool_batch(self):
        calls = [
            MockToolCall.make("read_file", {"path": "a.txt"}, 0),
            MockToolCall.make("read_file", {"path": "b.txt"}, 1),
        ]
        assert _should_parallelize_tool_batch(calls) is True

    def test_three_tool_batch(self):
        calls = [
            MockToolCall.make("read_file", {"path": f"f{i}.txt"}, i)
            for i in range(3)
        ]
        assert _should_parallelize_tool_batch(calls) is True

    def test_four_tool_batch(self):
        calls = [
            MockToolCall.make("web_search", {"query": f"q{i}"}, i)
            for i in range(4)
        ]
        assert _should_parallelize_tool_batch(calls) is True

    def test_four_tool_batch_with_one_collision(self):
        """4 tools where 2 collide on the same path."""
        calls = [
            MockToolCall.make("read_file", {"path": "a.txt"}, 0),
            MockToolCall.make("read_file", {"path": "b.txt"}, 1),
            MockToolCall.make("read_file", {"path": "a.txt"}, 2),  # collision
            MockToolCall.make("read_file", {"path": "c.txt"}, 3),
        ]
        assert _should_parallelize_tool_batch(calls) is False


# ── Test: Gemma 4 Specific Patterns ──────────────────────────────────────────

class TestGemma4Patterns:
    """
    Test patterns specific to Gemma 4 tool calling behavior.

    Gemma 4 may issue tool calls in specific ordering patterns that
    need to be handled correctly by the parallel execution layer.
    """

    def test_gemma4_typical_2tool_pattern(self):
        """Gemma 4 typically issues read+search as a pair."""
        calls = [
            MockToolCall.make("read_file", {"path": "config.yaml"}, 0),
            MockToolCall.make("search_files", {"pattern": "provider"}, 1),
        ]
        # These should parallelize — different tools, no path conflict
        assert _should_parallelize_tool_batch(calls) is True

    def test_gemma4_typical_3tool_pattern(self):
        """Gemma 4 may issue 3 reads for different files."""
        calls = [
            MockToolCall.make("read_file", {"path": "a.py"}, 0),
            MockToolCall.make("read_file", {"path": "b.py"}, 1),
|
||||
MockToolCall.make("read_file", {"path": "c.py"}, 2),
|
||||
]
|
||||
assert _should_parallelize_tool_batch(calls) is True
|
||||
|
||||
def test_gemma4_sequential_dependency(self):
|
||||
"""
|
||||
Gemma 4 may issue: search_files then read_file on search result.
|
||||
These have implicit dependency but are issued as a batch.
|
||||
The agent should handle this — search first, then read.
|
||||
This test verifies the batch IS marked as parallel-safe
|
||||
(ordering is the agent loop's responsibility, not this function's).
|
||||
"""
|
||||
calls = [
|
||||
MockToolCall.make("search_files", {"pattern": "import"}, 0),
|
||||
MockToolCall.make("read_file", {"path": "main.py"}, 1),
|
||||
]
|
||||
# Both tools are in safe/scoped sets with no path conflict
|
||||
assert _should_parallelize_tool_batch(calls) is True
|
||||
|
||||
def test_gemma4_mixed_safe_unsafe(self):
|
||||
"""Gemma 4 may mix read (safe) with write (path-scoped)."""
|
||||
calls = [
|
||||
MockToolCall.make("read_file", {"path": "input.txt"}, 0),
|
||||
MockToolCall.make("write_file", {"path": "output.txt", "content": "x"}, 1),
|
||||
MockToolCall.make("read_file", {"path": "config.txt"}, 2),
|
||||
]
|
||||
# All path-scoped on different paths, no unsafe tools
|
||||
assert _should_parallelize_tool_batch(calls) is True
|
||||
|
||||
def test_gemma4_terminal_parallel(self):
|
||||
"""
|
||||
Terminal commands are NOT in _PARALLEL_SAFE_TOOLS.
|
||||
If Gemma 4 issues 2 terminal calls, they should NOT parallelize.
|
||||
"""
|
||||
calls = [
|
||||
MockToolCall.make("terminal", {"command": "ls"}, 0),
|
||||
MockToolCall.make("terminal", {"command": "pwd"}, 1),
|
||||
]
|
||||
assert _should_parallelize_tool_batch(calls) is False
|
||||
|
||||
|
||||
# ── Test: Integration-style (mocked) ─────────────────────────────────────────
|
||||
|
||||
class TestParallelExecutionMocked:
|
||||
"""Test the parallel execution path with mocked tool handlers."""
|
||||
|
||||
def test_parallel_results_collected(self):
|
||||
"""Simulate parallel execution and verify results are collected."""
|
||||
# Mock two tool calls returning different results
|
||||
results = {}
|
||||
|
||||
def mock_handler(name, args):
|
||||
return f"result_{name}_{args.get('path', 'x')}"
|
||||
|
||||
calls = [
|
||||
MockToolCall.make("read_file", {"path": "a.txt"}, 0),
|
||||
MockToolCall.make("read_file", {"path": "b.txt"}, 1),
|
||||
]
|
||||
|
||||
# Simulate parallel execution
|
||||
for tc in calls:
|
||||
results[tc.id] = mock_handler(tc.function.name,
|
||||
json.loads(tc.function.arguments))
|
||||
|
||||
assert results["call_0"] == "result_read_file_a.txt"
|
||||
assert results["call_1"] == "result_read_file_b.txt"
|
||||
|
||||
def test_parallel_results_order_preserved(self):
|
||||
"""Results should be ordered by tool call ID, not completion time."""
|
||||
import time
|
||||
results = {}
|
||||
|
||||
calls = [
|
||||
MockToolCall.make("read_file", {"path": "slow.txt"}, 0),
|
||||
MockToolCall.make("read_file", {"path": "fast.txt"}, 1),
|
||||
]
|
||||
|
||||
# Simulate out-of-order completion
|
||||
results["call_1"] = "fast_result"
|
||||
results["call_0"] = "slow_result"
|
||||
|
||||
# Verify we can reconstruct in order
|
||||
ordered = [results[tc.id] for tc in calls]
|
||||
assert ordered == ["slow_result", "fast_result"]
|
||||
|
||||
|
||||
# ── Test: Edge Cases ──────────────────────────────────────────────────────────
|
||||
|
||||
class TestEdgeCases:
|
||||
"""Edge cases for parallel tool calling."""
|
||||
|
||||
def test_empty_batch(self):
|
||||
assert _should_parallelize_tool_batch([]) is False
|
||||
|
||||
def test_patch_with_same_path(self):
|
||||
"""Two patch calls on the same file should NOT parallelize."""
|
||||
calls = [
|
||||
MockToolCall.make("patch", {"path": "a.py", "old_string": "x", "new_string": "y"}, 0),
|
||||
MockToolCall.make("patch", {"path": "a.py", "old_string": "a", "new_string": "b"}, 1),
|
||||
]
|
||||
assert _should_parallelize_tool_batch(calls) is False
|
||||
|
||||
def test_patch_different_paths(self):
|
||||
"""patch on different files should parallelize."""
|
||||
calls = [
|
||||
MockToolCall.make("patch", {"path": "a.py", "old_string": "x", "new_string": "y"}, 0),
|
||||
MockToolCall.make("patch", {"path": "b.py", "old_string": "a", "new_string": "b"}, 1),
|
||||
]
|
||||
assert _should_parallelize_tool_batch(calls) is True
|
||||
|
||||
def test_max_workers_defined(self):
|
||||
"""Verify max workers constant exists and is reasonable."""
|
||||
from run_agent import _MAX_TOOL_WORKERS
|
||||
assert 1 <= _MAX_TOOL_WORKERS <= 32
|
||||
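
# ── Illustrative sketch (not part of the original suite) ─────────────────────
# How an agent loop might consume the predicate tested above, assuming both
# helpers live in run_agent (as the import in test_max_workers_defined
# suggests). `execute_one` is a hypothetical dispatcher callable.
def _example_dispatch(calls, execute_one):
    from concurrent.futures import ThreadPoolExecutor
    from run_agent import _MAX_TOOL_WORKERS, _should_parallelize_tool_batch

    if _should_parallelize_tool_batch(calls):
        with ThreadPoolExecutor(max_workers=_MAX_TOOL_WORKERS) as pool:
            # Key futures by call id so results can be re-ordered afterwards,
            # matching test_parallel_results_order_preserved above.
            futures = {tc.id: pool.submit(execute_one, tc) for tc in calls}
            return [futures[tc.id].result() for tc in calls]
    return [execute_one(tc) for tc in calls]  # sequential fallback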
@@ -1,274 +0,0 @@
#!/usr/bin/env python3
"""
test_poka_yoke.py — Tests for the tool call validation firewall.

Covers: unknown tool, bad param type, missing required arg,
extra unknown param, enum validation, closest-name suggestion.
"""

import json
import os
import sys
import pytest
from unittest.mock import patch, MagicMock

sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))

from tools.poka_yoke import (
    validate_tool_call,
    _find_closest_name,
    _validate_type,
    _truncate,
)


# ── Mock Registry ─────────────────────────────────────────────────────────────

class MockEntry:
    def __init__(self, name, schema):
        self.name = name
        self.schema = schema
        self.toolset = "test"


MOCK_TOOLS = {
    "read_file": MockEntry("read_file", {
        "name": "read_file",
        "description": "Read a file",
        "parameters": {
            "type": "object",
            "properties": {
                "path": {"type": "string", "description": "File path"},
                "offset": {"type": "integer", "description": "Start line"},
                "limit": {"type": "integer", "description": "Max lines"},
            },
            "required": ["path"],
        },
    }),
    "web_search": MockEntry("web_search", {
        "name": "web_search",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {"type": "string"},
                "max_results": {"type": "integer"},
            },
            "required": ["query"],
        },
    }),
    "write_file": MockEntry("write_file", {
        "name": "write_file",
        "parameters": {
            "type": "object",
            "properties": {
                "path": {"type": "string"},
                "content": {"type": "string"},
            },
            "required": ["path", "content"],
        },
    }),
    "terminal": MockEntry("terminal", {
        "name": "terminal",
        "parameters": {
            "type": "object",
            "properties": {
                "command": {"type": "string"},
                "timeout": {"type": "integer"},
                "background": {"type": "boolean"},
            },
            "required": ["command"],
        },
    }),
}


def _mock_registry():
    """Create a mock registry."""
    mock_reg = MagicMock()
    mock_reg.get_entry = lambda name: MOCK_TOOLS.get(name)
    mock_reg.get_all_tool_names = lambda: list(MOCK_TOOLS.keys())
    return mock_reg


# ── Test: Unknown Tool ────────────────────────────────────────────────────────

class TestUnknownTool:
    @patch("tools.poka_yoke.registry")
    def test_unknown_tool_rejected(self, mock_reg):
        mock_reg.get_entry.return_value = None
        mock_reg.get_all_tool_names.return_value = list(MOCK_TOOLS.keys())

        is_valid, name, params, msgs = validate_tool_call("nonexistent_tool", {})

        assert is_valid is False
        assert len(msgs) > 0
        assert "nonexistent_tool" in msgs[0]
        assert "Unknown tool" in msgs[0]

    @patch("tools.poka_yoke.registry")
    def test_unknown_tool_lists_available(self, mock_reg):
        mock_reg.get_entry.return_value = None
        mock_reg.get_all_tool_names.return_value = list(MOCK_TOOLS.keys())

        is_valid, name, params, msgs = validate_tool_call("foo", {})

        assert is_valid is False
        assert "read_file" in msgs[0]

    @patch("tools.poka_yoke.registry")
    def test_close_name_suggests_correction(self, mock_reg):
        mock_reg.get_entry.return_value = None
        mock_reg.get_all_tool_names.return_value = list(MOCK_TOOLS.keys())

        is_valid, name, params, msgs = validate_tool_call("readfile", {})

        assert "read_file" in msgs[0]
        assert name == "read_file"


# ── Test: Missing Required Args ───────────────────────────────────────────────

class TestMissingRequired:
    @patch("tools.poka_yoke.registry")
    def test_missing_required_rejected(self, mock_reg):
        mock_reg.get_entry.return_value = MOCK_TOOLS["read_file"]

        is_valid, name, params, msgs = validate_tool_call("read_file", {})

        assert is_valid is False
        assert any("Missing required" in m for m in msgs)
        assert any("'path'" in m for m in msgs)

    @patch("tools.poka_yoke.registry")
    def test_multiple_missing_required(self, mock_reg):
        mock_reg.get_entry.return_value = MOCK_TOOLS["write_file"]

        is_valid, name, params, msgs = validate_tool_call("write_file", {})

        assert is_valid is False
        assert any("'path'" in m for m in msgs)
        assert any("'content'" in m for m in msgs)

    @patch("tools.poka_yoke.registry")
    def test_required_present_passes(self, mock_reg):
        mock_reg.get_entry.return_value = MOCK_TOOLS["read_file"]

        is_valid, name, params, msgs = validate_tool_call(
            "read_file", {"path": "test.txt"}
        )

        assert is_valid is True


# ── Test: Type Validation ─────────────────────────────────────────────────────

class TestTypeValidation:
    @patch("tools.poka_yoke.registry")
    def test_wrong_type_rejected(self, mock_reg):
        mock_reg.get_entry.return_value = MOCK_TOOLS["read_file"]

        is_valid, name, params, msgs = validate_tool_call(
            "read_file", {"path": "test.txt", "offset": "not_a_number"}
        )

        assert is_valid is False
        assert any("offset" in m and "integer" in m for m in msgs)

    @patch("tools.poka_yoke.registry")
    def test_string_to_int_coercion(self, mock_reg):
        mock_reg.get_entry.return_value = MOCK_TOOLS["read_file"]

        is_valid, name, params, msgs = validate_tool_call(
            "read_file", {"path": "test.txt", "offset": "42"}
        )

        assert is_valid is True
        assert params is not None
        assert params["offset"] == 42

    @patch("tools.poka_yoke.registry")
    def test_boolean_coercion(self, mock_reg):
        mock_reg.get_entry.return_value = MOCK_TOOLS["terminal"]

        is_valid, name, params, msgs = validate_tool_call(
            "terminal", {"command": "ls", "background": "true"}
        )

        assert is_valid is True
        assert params is not None
        assert params["background"] is True


# ── Test: Unknown Parameters ──────────────────────────────────────────────────

class TestUnknownParams:
    @patch("tools.poka_yoke.registry")
    def test_unknown_param_removed(self, mock_reg):
        mock_reg.get_entry.return_value = MOCK_TOOLS["read_file"]

        is_valid, name, params, msgs = validate_tool_call(
            "read_file", {"path": "test.txt", "bogus_param": "value"}
        )

        assert is_valid is True
        assert params is not None
        assert "bogus_param" not in params
        assert "path" in params
        assert any("Unknown parameter" in m for m in msgs)


# ── Test: Valid Calls Pass Through ────────────────────────────────────────────

class TestValidCalls:
    @patch("tools.poka_yoke.registry")
    def test_valid_read_file(self, mock_reg):
        mock_reg.get_entry.return_value = MOCK_TOOLS["read_file"]

        is_valid, name, params, msgs = validate_tool_call(
            "read_file", {"path": "test.txt", "offset": 1, "limit": 100}
        )

        assert is_valid is True
        assert name is None
        assert params is None
        assert msgs == []

    @patch("tools.poka_yoke.registry")
    def test_valid_write_file(self, mock_reg):
        mock_reg.get_entry.return_value = MOCK_TOOLS["write_file"]

        is_valid, name, params, msgs = validate_tool_call(
            "write_file", {"path": "out.txt", "content": "hello"}
        )

        assert is_valid is True


# ── Test: Helper Functions ────────────────────────────────────────────────────

class TestHelpers:
    def test_find_closest_exact_prefix(self):
        assert _find_closest_name("readfil", ["read_file", "write_file"]) == "read_file"

    def test_find_closest_substring(self):
        assert _find_closest_name("file", ["read_file", "web_search"]) == "read_file"

    def test_find_closest_no_match(self):
        assert _find_closest_name("xyzzy", ["read_file", "write_file"]) is None

    def test_validate_type_string(self):
        ok, val = _validate_type("x", "hello", "string")
        assert ok is True

    def test_validate_type_int_coercion(self):
        ok, val = _validate_type("x", "42", "integer")
        assert ok is True
        assert val == 42

    def test_validate_type_int_bad(self):
        ok, val = _validate_type("x", "not_int", "integer")
        assert ok is False

    def test_truncate(self):
        assert _truncate("hello", 10) == "hello"
        assert _truncate("hello world", 8) == "hello..."
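
# ── Illustrative sketch (not part of the original suite) ─────────────────────
# The contract pinned down above: validate_tool_call returns
# (is_valid, corrected_name, corrected_params, messages), and a fully valid
# call yields (True, None, None, []). A hypothetical caller:
def _example_firewall(tool_name, raw_args):
    is_valid, corrected, params, msgs = validate_tool_call(tool_name, raw_args)
    if not is_valid:
        return {"error": "\n".join(msgs)}  # bounce the messages back to the model
    return {
        "tool": corrected or tool_name,  # corrected name, if any
        "params": params if params is not None else raw_args,
    }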
@@ -1,76 +0,0 @@
"""Tests for profile session isolation (#891)."""

import sys
import tempfile
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent))

# Override paths for testing
import agent.profile_isolation as iso_mod

_test_dir = Path(tempfile.mkdtemp())
iso_mod.PROFILE_TAGS_FILE = _test_dir / "tags.json"


def test_tag_session():
    """A session gets tagged with the given profile."""
    profile = iso_mod.tag_session("sess-1", "sprint")
    assert profile == "sprint"
    assert iso_mod.get_session_profile("sess-1") == "sprint"


def test_default_profile():
    """Sessions are tagged with the default when no profile is specified."""
    profile = iso_mod.tag_session("sess-2")
    assert profile is not None


def test_get_session_profile():
    """The profile of a tagged session can be retrieved."""
    iso_mod.tag_session("sess-3", "fenrir")
    assert iso_mod.get_session_profile("sess-3") == "fenrir"


def test_untagged_returns_none():
    """An untagged session returns None."""
    assert iso_mod.get_session_profile("nonexistent") is None


def test_profile_stats():
    """Stats reflect tagged sessions."""
    iso_mod.tag_session("s1", "default")
    iso_mod.tag_session("s2", "sprint")
    iso_mod.tag_session("s3", "sprint")
    stats = iso_mod.get_profile_stats()
    assert stats["total_tagged_sessions"] >= 3
    assert "sprint" in stats["profile_counts"]


def test_filter_sessions():
    """Filtering returns only sessions with the matching profile."""
    iso_mod.tag_session("filter-1", "alpha")
    iso_mod.tag_session("filter-2", "beta")
    iso_mod.tag_session("filter-3", "alpha")

    sessions = [
        {"session_id": "filter-1"},
        {"session_id": "filter-2"},
        {"session_id": "filter-3"},
    ]

    filtered = iso_mod.filter_sessions_by_profile(sessions, "alpha")
    ids = [s["session_id"] for s in filtered]
    assert "filter-1" in ids
    assert "filter-3" in ids
    assert "filter-2" not in ids


if __name__ == "__main__":
    tests = [test_tag_session, test_default_profile, test_get_session_profile,
             test_untagged_returns_none, test_profile_stats, test_filter_sessions]
    for t in tests:
        print(f"Running {t.__name__}...")
        t()
        print("  PASS")
    print("\nAll tests passed.")
@@ -1,91 +0,0 @@
"""Tests for session compaction with fact extraction."""

import pytest
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from agent.session_compactor import (
    ExtractedFact,
    extract_facts_from_messages,
    save_facts_to_store,
    extract_and_save_facts,
    format_facts_summary,
)


class TestFactExtraction:
    def test_extract_preference(self):
        messages = [
            {"role": "user", "content": "I prefer Python over JavaScript for backend work."},
        ]
        facts = extract_facts_from_messages(messages)
        assert len(facts) >= 1
        assert any("Python" in f.content for f in facts)

    def test_extract_correction(self):
        messages = [
            {"role": "user", "content": "Actually the port is 8081 not 8080."},
        ]
        facts = extract_facts_from_messages(messages)
        assert len(facts) >= 1
        assert any("8081" in f.content for f in facts)

    def test_extract_project_fact(self):
        messages = [
            {"role": "user", "content": "The project uses Gitea for source control."},
        ]
        facts = extract_facts_from_messages(messages)
        assert len(facts) >= 1

    def test_skip_tool_results(self):
        messages = [
            {"role": "assistant", "content": "Running command...", "tool_calls": [{"id": "1"}]},
            {"role": "tool", "content": "output here"},
        ]
        facts = extract_facts_from_messages(messages)
        assert len(facts) == 0

    def test_skip_short_messages(self):
        messages = [
            {"role": "user", "content": "ok"},
        ]
        facts = extract_facts_from_messages(messages)
        assert len(facts) == 0

    def test_deduplication(self):
        messages = [
            {"role": "user", "content": "I prefer Python."},
            {"role": "user", "content": "I prefer Python."},
        ]
        facts = extract_facts_from_messages(messages)
        # Should deduplicate
        python_facts = [f for f in facts if "Python" in f.content]
        assert len(python_facts) == 1


class TestSaveFacts:
    def test_save_with_callback(self):
        saved = []

        def mock_save(category, entity, content, trust):
            saved.append({"category": category, "content": content})

        facts = [ExtractedFact("user_pref", "user", "likes dark mode", 0.8, 0)]
        count = save_facts_to_store(facts, fact_store_fn=mock_save)
        assert count == 1
        assert len(saved) == 1


class TestFormatSummary:
    def test_empty(self):
        assert "No facts" in format_facts_summary([])

    def test_with_facts(self):
        facts = [
            ExtractedFact("user_pref", "user", "likes dark mode", 0.8, 0),
            ExtractedFact("correction", "user", "port is 8081", 0.9, 1),
        ]
        summary = format_facts_summary(facts)
        assert "2 facts" in summary
        assert "user_pref" in summary
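
# ── Illustrative sketch (not part of the original suite) ─────────────────────
# A hypothetical compaction hook combining the three pieces tested above:
# extract facts from the transcript, persist them, and keep a summary line.
def _example_compact(messages, store_fn):
    facts = extract_facts_from_messages(messages)
    saved = save_facts_to_store(facts, fact_store_fn=store_fn)
    return saved, format_facts_summary(facts)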
@@ -1,302 +0,0 @@
"""
Integration tests for poka-yoke auto-revert on incomplete skill edits (#923).

Verifies the transactional write-validate-commit-or-rollback pattern:
- Backup created before every write
- Post-write validation triggers revert on corrupted/empty file
- Successful writes clean up the backup
- At most MAX_BACKUPS_PER_FILE backups retained per file
"""

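# ── Illustrative sketch (not part of the original suite) ─────────────────────
# The transactional pattern under test, spelled out as a hypothetical writer.
# It composes the helpers imported below; in the real tool the write step goes
# through _atomic_write_text, which is assumed to put `content` at `path`.
def _example_transactional_write(path, content, is_skill_md=True):
    from tools.skill_manager_tool import (
        _backup_skill_file,
        _cleanup_old_backups,
        _revert_from_backup,
        _validate_written_file,
    )
    backup = _backup_skill_file(path)           # 1. backup before every write
    path.write_text(content, encoding="utf-8")  # 2. write (stand-in for _atomic_write_text)
    error = _validate_written_file(path, is_skill_md=is_skill_md)
    if error:                                   # 3a. corrupted -> roll back
        _revert_from_backup(path, backup)       #     (backup kept as a debugging aid)
        return {"success": False, "error": f"{error} (reverted)"}
    if backup is not None:                      # 3b. success -> drop the backup
        backup.unlink(missing_ok=True)
    _cleanup_old_backups(path)                  # cap retained .bak files
    return {"success": True}
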
import time
import pytest
from pathlib import Path
from unittest.mock import patch

from tools.skill_manager_tool import (
    MAX_BACKUPS_PER_FILE,
    _backup_skill_file,
    _cleanup_old_backups,
    _edit_skill,
    _patch_skill,
    _revert_from_backup,
    _validate_written_file,
    _write_file,
)


VALID_SKILL_MD = """\
---
name: test-skill
description: A skill for testing auto-revert
---

## Overview
Test skill body content.
"""

VALID_UPDATED_MD = """\
---
name: test-skill
description: Updated description
---

## Overview
Updated test skill body.
"""


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _make_skill(tmp_path: Path, content: str = VALID_SKILL_MD) -> Path:
    """Write a minimal SKILL.md in *tmp_path* and return its path."""
    skill_md = tmp_path / "SKILL.md"
    skill_md.write_text(content, encoding="utf-8")
    return skill_md


# ---------------------------------------------------------------------------
# Unit tests: _backup_skill_file
# ---------------------------------------------------------------------------

class TestBackupSkillFile:
    def test_creates_bak_file(self, tmp_path):
        skill_md = _make_skill(tmp_path)
        backup = _backup_skill_file(skill_md)
        assert backup is not None
        assert backup.exists()
        assert ".bak." in backup.name

    def test_backup_preserves_content(self, tmp_path):
        skill_md = _make_skill(tmp_path)
        backup = _backup_skill_file(skill_md)
        assert backup.read_text(encoding="utf-8") == VALID_SKILL_MD

    def test_no_backup_for_nonexistent_file(self, tmp_path):
        missing = tmp_path / "SKILL.md"
        assert _backup_skill_file(missing) is None

    def test_backup_name_contains_timestamp(self, tmp_path):
        skill_md = _make_skill(tmp_path)
        before = int(time.time())
        backup = _backup_skill_file(skill_md)
        after = int(time.time())
        ts = int(backup.name.split(".bak.")[-1])
        assert before <= ts <= after


# ---------------------------------------------------------------------------
# Unit tests: _cleanup_old_backups
# ---------------------------------------------------------------------------

class TestCleanupOldBackups:
    def _create_backups(self, skill_md: Path, n: int) -> list:
        backups = []
        for i in range(n):
            bp = skill_md.parent / f"{skill_md.name}.bak.{1000 + i}"
            bp.write_text("backup content", encoding="utf-8")
            backups.append(bp)
        return backups

    def test_prunes_excess_backups(self, tmp_path):
        skill_md = _make_skill(tmp_path)
        self._create_backups(skill_md, MAX_BACKUPS_PER_FILE + 2)
        _cleanup_old_backups(skill_md)
        remaining = list(tmp_path.glob("SKILL.md.bak.*"))
        assert len(remaining) == MAX_BACKUPS_PER_FILE

    def test_keeps_backups_within_limit(self, tmp_path):
        skill_md = _make_skill(tmp_path)
        self._create_backups(skill_md, MAX_BACKUPS_PER_FILE)
        _cleanup_old_backups(skill_md)
        remaining = list(tmp_path.glob("SKILL.md.bak.*"))
        assert len(remaining) == MAX_BACKUPS_PER_FILE

    def test_noop_when_no_backups(self, tmp_path):
        skill_md = _make_skill(tmp_path)
        _cleanup_old_backups(skill_md)  # should not raise


# ---------------------------------------------------------------------------
# Unit tests: _validate_written_file
# ---------------------------------------------------------------------------

class TestValidateWrittenFile:
    def test_valid_skill_md(self, tmp_path):
        skill_md = _make_skill(tmp_path)
        assert _validate_written_file(skill_md, is_skill_md=True) is None

    def test_empty_file_fails(self, tmp_path):
        skill_md = tmp_path / "SKILL.md"
        skill_md.write_text("", encoding="utf-8")
        err = _validate_written_file(skill_md, is_skill_md=False)
        assert err is not None
        assert "empty" in err.lower()

    def test_broken_frontmatter_fails(self, tmp_path):
        skill_md = tmp_path / "SKILL.md"
        skill_md.write_text("Not a skill\nno frontmatter\n", encoding="utf-8")
        err = _validate_written_file(skill_md, is_skill_md=True)
        assert err is not None

    def test_missing_required_field_fails(self, tmp_path):
        skill_md = tmp_path / "SKILL.md"
        skill_md.write_text("---\ndescription: no name\n---\nbody\n", encoding="utf-8")
        err = _validate_written_file(skill_md, is_skill_md=True)
        assert err is not None
        assert "name" in err.lower()

    def test_missing_file_returns_error(self, tmp_path):
        missing = tmp_path / "SKILL.md"
        err = _validate_written_file(missing, is_skill_md=False)
        assert err is not None

    def test_non_skill_md_only_checks_emptiness(self, tmp_path):
        ref = tmp_path / "references" / "guide.md"
        ref.parent.mkdir()
        ref.write_text("# Guide\nsome content\n", encoding="utf-8")
        assert _validate_written_file(ref, is_skill_md=False) is None


# ---------------------------------------------------------------------------
# Unit tests: _revert_from_backup
# ---------------------------------------------------------------------------

class TestRevertFromBackup:
    def test_restores_from_backup(self, tmp_path):
        original = "original content"
        skill_md = tmp_path / "SKILL.md"
        skill_md.write_text(original, encoding="utf-8")
        backup = tmp_path / "SKILL.md.bak.99999"
        backup.write_text(original, encoding="utf-8")

        skill_md.write_text("corrupted content", encoding="utf-8")
        _revert_from_backup(skill_md, backup)
        assert skill_md.read_text(encoding="utf-8") == original

    def test_removes_file_when_no_backup(self, tmp_path):
        skill_md = tmp_path / "SKILL.md"
        skill_md.write_text("corrupted", encoding="utf-8")
        _revert_from_backup(skill_md, None)
        assert not skill_md.exists()


# ---------------------------------------------------------------------------
# Integration tests: _edit_skill auto-revert
# ---------------------------------------------------------------------------

class TestEditSkillAutoRevert:
    @pytest.fixture
    def skill_dir(self, tmp_path):
        """Create a minimal skill directory and patch _find_skill."""
        d = tmp_path / "test-skill"
        d.mkdir()
        skill_md = d / "SKILL.md"
        skill_md.write_text(VALID_SKILL_MD, encoding="utf-8")
        return d

    def test_successful_edit_removes_backup(self, skill_dir):
        with patch("tools.skill_manager_tool._find_skill") as mock_find, \
             patch("tools.skill_manager_tool._security_scan_skill", return_value=None):
            mock_find.return_value = {"path": skill_dir}
            result = _edit_skill("test-skill", VALID_UPDATED_MD)

        assert result["success"] is True
        backups = list(skill_dir.glob("SKILL.md.bak.*"))
        assert len(backups) == 0

    def test_revert_when_post_write_validation_fails(self, skill_dir):
        """Simulate a write that produces an empty file on disk."""
        skill_md = skill_dir / "SKILL.md"

        def corrupt_write(path, content, **kw):
            # Write an empty file to simulate truncation
            path.write_text("", encoding="utf-8")

        with patch("tools.skill_manager_tool._find_skill") as mock_find, \
             patch("tools.skill_manager_tool._atomic_write_text", side_effect=corrupt_write):
            mock_find.return_value = {"path": skill_dir}
            result = _edit_skill("test-skill", VALID_UPDATED_MD)

        assert result["success"] is False
        assert "reverted" in result["error"].lower()
        # Original content restored
        assert skill_md.read_text(encoding="utf-8") == VALID_SKILL_MD

    def test_backup_preserved_after_revert(self, skill_dir):
        """A .bak file should survive when the edit is reverted (debugging aid)."""
        def corrupt_write(path, content, **kw):
            path.write_text("", encoding="utf-8")

        with patch("tools.skill_manager_tool._find_skill") as mock_find, \
             patch("tools.skill_manager_tool._atomic_write_text", side_effect=corrupt_write):
            mock_find.return_value = {"path": skill_dir}
            _edit_skill("test-skill", VALID_UPDATED_MD)

        backups = list(skill_dir.glob("SKILL.md.bak.*"))
        assert len(backups) == 1

    def test_max_backups_enforced_after_multiple_edits(self, skill_dir):
        """After many successful edits, at most MAX_BACKUPS_PER_FILE .bak files remain."""
        n = MAX_BACKUPS_PER_FILE + 4
        for i in range(n):
            # Plant stale backup files to simulate prior runs
            bp = skill_dir / f"SKILL.md.bak.{1000 + i}"
            bp.write_text("old backup", encoding="utf-8")

        with patch("tools.skill_manager_tool._find_skill") as mock_find, \
             patch("tools.skill_manager_tool._security_scan_skill", return_value=None):
            mock_find.return_value = {"path": skill_dir}
            result = _edit_skill("test-skill", VALID_UPDATED_MD)

        assert result["success"] is True
        backups = list(skill_dir.glob("SKILL.md.bak.*"))
        assert len(backups) <= MAX_BACKUPS_PER_FILE


# ---------------------------------------------------------------------------
# Integration tests: _patch_skill auto-revert
# ---------------------------------------------------------------------------

class TestPatchSkillAutoRevert:
    @pytest.fixture
    def skill_dir(self, tmp_path):
        d = tmp_path / "test-skill"
        d.mkdir()
        (d / "SKILL.md").write_text(VALID_SKILL_MD, encoding="utf-8")
        return d

    def test_successful_patch_removes_backup(self, skill_dir):
        with patch("tools.skill_manager_tool._find_skill") as mock_find, \
             patch("tools.skill_manager_tool._security_scan_skill", return_value=None):
            mock_find.return_value = {"path": skill_dir}
            result = _patch_skill(
                "test-skill",
                "A skill for testing auto-revert",
                "Updated description",
            )

        assert result["success"] is True
        assert len(list(skill_dir.glob("SKILL.md.bak.*"))) == 0

    def test_revert_on_corrupt_write(self, skill_dir):
        skill_md = skill_dir / "SKILL.md"
        original = skill_md.read_text(encoding="utf-8")

        def corrupt_write(path, content, **kw):
            path.write_text("", encoding="utf-8")

        with patch("tools.skill_manager_tool._find_skill") as mock_find, \
             patch("tools.skill_manager_tool._atomic_write_text", side_effect=corrupt_write):
            mock_find.return_value = {"path": skill_dir}
            result = _patch_skill(
                "test-skill",
                "A skill for testing",
                "A skill for testing auto-revert",
            )

        assert result["success"] is False
        assert "reverted" in result["error"].lower()
        assert skill_md.read_text(encoding="utf-8") == original
@@ -1,111 +0,0 @@
"""
Tests for improved error messages in skill_manager_tool (issue #624).
Verifies that error messages include file paths, context, and suggestions.
"""

import pytest
from pathlib import Path
from unittest.mock import patch, MagicMock
from tools.skill_manager_tool import _format_error, _edit_skill, _patch_skill


class TestFormatError:
    """Test the _format_error helper function."""

    def test_basic_error(self):
        """Test basic error formatting."""
        result = _format_error("Something went wrong")
        assert result["success"] is False
        assert "Something went wrong" in result["error"]
        assert result["skill_name"] is None
        assert result["file_path"] is None

    def test_with_skill_name(self):
        """Test an error with a skill name."""
        result = _format_error("Failed", skill_name="test-skill")
        assert "test-skill" in result["error"]
        assert result["skill_name"] == "test-skill"

    def test_with_file_path(self):
        """Test an error with a file path."""
        result = _format_error("Failed", file_path="/path/to/SKILL.md")
        assert "/path/to/SKILL.md" in result["error"]
        assert result["file_path"] == "/path/to/SKILL.md"

    def test_with_suggestion(self):
        """Test an error with a suggestion."""
        result = _format_error("Failed", suggestion="Try again")
        assert "Suggestion: Try again" in result["error"]
        assert result["suggestion"] == "Try again"

    def test_with_context(self):
        """Test an error with a context dict."""
        result = _format_error("Failed", context={"line": 5, "found": "x"})
        assert "line: 5" in result["error"]
        assert "found: x" in result["error"]

    def test_all_fields(self):
        """Test an error with all fields."""
        result = _format_error(
            "Pattern match failed",
            skill_name="my-skill",
            file_path="/skills/my-skill/SKILL.md",
            suggestion="Check whitespace",
            context={"expected": "foo", "found": "bar"}
        )
        assert "Pattern match failed" in result["error"]
        assert "Skill: my-skill" in result["error"]
        assert "File: /skills/my-skill/SKILL.md" in result["error"]
        assert "Suggestion: Check whitespace" in result["error"]
        assert "expected: foo" in result["error"]


class TestEditSkillErrors:
    """Test improved error messages in _edit_skill."""

    @patch('tools.skill_manager_tool._find_skill')
    def test_skill_not_found(self, mock_find):
        """The skill-not-found error should include a suggestion."""
        mock_find.return_value = None
        # Provide valid content with frontmatter so it passes validation
        valid_content = """---
name: test
description: Test skill
---
Body content here.
"""
        result = _edit_skill("nonexistent", valid_content)
        assert result["success"] is False
        assert "nonexistent" in result["error"]
        assert "skills_list()" in result.get("suggestion", "")


class TestPatchSkillErrors:
    """Test improved error messages in _patch_skill."""

    def test_old_string_required(self):
        """The old_string-required error should include a suggestion."""
        result = _patch_skill("test-skill", None, "new")
        assert result["success"] is False
        assert "old_string is required" in result["error"]
        assert "suggestion" in result

    def test_new_string_required(self):
        """The new_string-required error should include a suggestion."""
        result = _patch_skill("test-skill", "old", None)
        assert result["success"] is False
        assert "new_string is required" in result["error"]
        assert "suggestion" in result

    @patch('tools.skill_manager_tool._find_skill')
    def test_skill_not_found(self, mock_find):
        """The skill-not-found error should include a suggestion."""
        mock_find.return_value = None
        result = _patch_skill("nonexistent", "old", "new")
        assert result["success"] is False
        assert "nonexistent" in result["error"]
        assert "skills_list()" in result.get("suggestion", "")


if __name__ == "__main__":
    pytest.main([__file__, "-v"])
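
# ── Illustrative note (not part of the original suite) ───────────────────────
# Putting the TestFormatError assertions together, a fully-populated payload
# is expected to look roughly like this (the exact line layout may differ):
#
#   {
#       "success": False,
#       "error": "Pattern match failed\nSkill: my-skill\n"
#                "File: /skills/my-skill/SKILL.md\n"
#                "expected: foo\nfound: bar\nSuggestion: Check whitespace",
#       "skill_name": "my-skill",
#       "file_path": "/skills/my-skill/SKILL.md",
#       "suggestion": "Check whitespace",
#   }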
@@ -1,82 +0,0 @@
"""Tests for Python syntax validation in execute_code."""

import json
import sys
import os
from pathlib import Path

import pytest

# Import the validation function directly
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
from tools.code_execution_tool import _validate_python_syntax


class TestValidatePythonSyntax:
    """Test that _validate_python_syntax catches errors before the subprocess spawns."""

    def test_valid_code_returns_none(self):
        assert _validate_python_syntax("print('hello')") is None

    def test_valid_multiline_returns_none(self):
        code = """
import os
def foo():
    return 42
result = foo()
"""
        assert _validate_python_syntax(code) is None

    def test_syntax_error_detected(self):
        result = _validate_python_syntax("def foo(\n")
        assert result is not None
        data = json.loads(result)
        assert data["syntax_error"] is True
        assert "line" in data
        assert "message" in data

    def test_missing_colon(self):
        result = _validate_python_syntax("def foo()\npass")
        data = json.loads(result)
        assert data["syntax_error"] is True
        assert data["line"] == 1

    def test_unmatched_paren(self):
        result = _validate_python_syntax("print('hello'")
        data = json.loads(result)
        assert data["syntax_error"] is True

    def test_indentation_error(self):
        result = _validate_python_syntax("def foo():\npass")
        data = json.loads(result)
        assert data["syntax_error"] is True
        assert data["line"] == 2

    def test_invalid_character(self):
        result = _validate_python_syntax("x = 5 √ 2")
        data = json.loads(result)
        assert data["syntax_error"] is True

    def test_error_format_has_required_fields(self):
        result = _validate_python_syntax("def(\n")
        data = json.loads(result)
        assert "error" in data
        assert "syntax_error" in data
        assert "line" in data
        assert "offset" in data
        assert "message" in data

    def test_empty_string_returns_none(self):
        # Empty code is caught by the guard before validation,
        # but if called directly, ast.parse("") is valid
        assert _validate_python_syntax("") is None

    def test_comment_only_returns_none(self):
        assert _validate_python_syntax("# just a comment") is None

    def test_complex_valid_code(self):
        code =
@@ -1,58 +0,0 @@
"""Tests for time-aware model routing."""

import pytest
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from agent.time_aware_routing import (
    resolve_time_aware_model,
    get_hour_error_rate,
    is_off_hours,
    get_routing_report,
)


class TestErrorRates:
    def test_evening_high_error(self):
        assert get_hour_error_rate(18) == 9.4
        assert get_hour_error_rate(19) == 8.1

    def test_morning_low_error(self):
        assert get_hour_error_rate(9) == 4.0
        assert get_hour_error_rate(12) == 4.0

    def test_default_for_unknown(self):
        assert get_hour_error_rate(15) == 4.0


class TestOffHours:
    def test_evening_is_off_hours(self):
        assert is_off_hours(20) is True
        assert is_off_hours(2) is True

    def test_business_hours_not_off(self):
        assert is_off_hours(9) is False
        assert is_off_hours(14) is False


class TestRouting:
    def test_interactive_uses_base_model(self):
        d = resolve_time_aware_model("my-model", "my-provider", is_cron=False, hour=18)
        assert d.model == "my-model"
        assert "Interactive" in d.reason

    def test_cron_low_error_uses_base(self):
        d = resolve_time_aware_model("cheap-model", is_cron=True, hour=10)
        assert d.model == "cheap-model"

    def test_cron_high_error_upgrades(self):
        d = resolve_time_aware_model("cheap-model", is_cron=True, hour=18)
        assert d.model != "cheap-model"
        assert d.is_off_hours is True

    def test_routing_report(self):
        report = get_routing_report()
        assert "Time-Aware Model Routing" in report
        assert "18:00" in report
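
# ── Illustrative note (not part of the original suite) ───────────────────────
# The routing rule these tests pin down, as hypothetical pseudocode (the real
# upgrade target and error-rate threshold live in agent.time_aware_routing):
#
#   if not is_cron:                          # interactive traffic is untouched
#       use the requested model
#   elif get_hour_error_rate(hour) is low:   # off-peak cron stays on the base model
#       use the requested model
#   else:                                    # evening hours show ~2x the error rate
#       upgrade to a stronger model and set is_off_hours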
@@ -1,237 +0,0 @@
#!/usr/bin/env python3
"""
Tests for agent/token_budget.py — Poka-yoke context overflow guard.
"""

import sys
from pathlib import Path

import pytest

sys.path.insert(0, str(Path(__file__).parent.parent))
from agent.token_budget import (
    TokenBudget,
    BudgetLevel,
    BudgetStatus,
    WARN_PERCENT,
    CAUTION_PERCENT,
    CRITICAL_PERCENT,
    STOP_PERCENT,
)

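# ── Illustrative note (not part of the original suite) ───────────────────────
# The threshold ladder these tests pin down; the names are the constants
# imported above and the percentages are inferred from the assertions below:
#
#   usage < WARN_PERCENT (60%)   NORMAL    no action
#   >= WARN_PERCENT (60%)        WARNING   surface an indicator
#   >= CAUTION_PERCENT (80%)     CAUTION   + should_compress
#   >= CRITICAL_PERCENT (90%)    CRITICAL  + should_block_tools
#   >= STOP_PERCENT (95%)        STOP      + should_terminate
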
@pytest.fixture
def budget():
    """Standard 128K context budget."""
    return TokenBudget(context_length=128_000)


@pytest.fixture
def small_budget():
    """4K context for tight testing."""
    return TokenBudget(context_length=4_000)


# ── Threshold Levels ──────────────────────────────────────────────────

class TestThresholds:
    def test_normal_below_60(self, budget):
        budget.update(50_000)  # 39%
        status = budget.check()
        assert status.level == BudgetLevel.NORMAL
        assert not status.should_compress
        assert not status.should_block_tools
        assert not status.should_terminate

    def test_warning_at_60(self, budget):
        budget.update(int(128_000 * 0.62))  # 62%
        status = budget.check()
        assert status.level == BudgetLevel.WARNING
        assert not status.should_compress
        assert not status.should_block_tools

    def test_caution_at_80(self, budget):
        budget.update(int(128_000 * 0.82))  # 82%
        status = budget.check()
        assert status.level == BudgetLevel.CAUTION
        assert status.should_compress
        assert not status.should_block_tools
        assert not status.should_terminate

    def test_critical_at_90(self, budget):
        budget.update(int(128_000 * 0.91))  # 91%
        status = budget.check()
        assert status.level == BudgetLevel.CRITICAL
        assert status.should_compress
        assert status.should_block_tools
        assert not status.should_terminate

    def test_stop_at_95(self, budget):
        budget.update(int(128_000 * 0.96))  # 96%
        status = budget.check()
        assert status.level == BudgetLevel.STOP
        assert status.should_compress
        assert status.should_block_tools
        assert status.should_terminate

    def test_small_context_thresholds(self, small_budget):
        # 4K * 0.60 = 2400
        small_budget.update(2450)
        assert small_budget.check().level == BudgetLevel.WARNING

        small_budget.update(3250)  # 4K * 0.81
        assert small_budget.check().level == BudgetLevel.CAUTION

        small_budget.update(3650)  # 4K * 0.91
        assert small_budget.check().level == BudgetLevel.CRITICAL

        small_budget.update(3850)  # 4K * 0.96
        assert small_budget.check().level == BudgetLevel.STOP


# ── Convenience Methods ───────────────────────────────────────────────

class TestConvenienceMethods:
    def test_should_compress(self, budget):
        budget.update(int(128_000 * 0.79))
        assert not budget.should_compress()
        budget.update(int(128_000 * 0.80))
        assert budget.should_compress()

    def test_should_block_tools(self, budget):
        budget.update(int(128_000 * 0.89))
        assert not budget.should_block_tools()
        budget.update(int(128_000 * 0.90))
        assert budget.should_block_tools()

    def test_should_terminate(self, budget):
        budget.update(int(128_000 * 0.94))
        assert not budget.should_terminate()
        budget.update(int(128_000 * 0.95))
        assert budget.should_terminate()


# ── Tool Output Budgeting ─────────────────────────────────────────────

class TestToolOutputBudget:
    def test_normal_budget(self, budget):
        budget.update(int(128_000 * 0.50))
        assert budget.tool_output_budget() == 50_000

    def test_warning_budget(self, budget):
        budget.update(int(128_000 * 0.65))
        assert budget.tool_output_budget() == 20_000

    def test_caution_budget(self, budget):
        budget.update(int(128_000 * 0.85))
        assert budget.tool_output_budget() == 8_000

    def test_critical_budget(self, budget):
        budget.update(int(128_000 * 0.92))
        assert budget.tool_output_budget() == 2_000

    def test_truncate_short_unchanged(self, budget):
        result = budget.truncate_tool_output("short text", max_chars=1000)
        assert result == "short text"

    def test_truncate_long(self, budget):
        long_text = "A" * 100_000
        result = budget.truncate_tool_output(long_text, max_chars=5_000)
        assert len(result) <= 5_100  # small overhead for the truncation notice
        assert "truncated" in result
        assert "A" in result[:2500]   # head preserved
        assert "A" in result[-2500:]  # tail preserved

    def test_truncate_very_small(self, budget):
        long_text = "X" * 1000
        result = budget.truncate_tool_output(long_text, max_chars=50)
        assert len(result) <= 50 + 20
        assert "truncated" in result


# ── Growth Tracking ───────────────────────────────────────────────────

class TestGrowthTracking:
    def test_growth_rate(self, budget):
        budget.update(10_000)
        budget.update(15_000)
        budget.update(20_000)
        assert budget.growth_rate() == 5_000.0

    def test_turns_remaining(self, budget):
        budget.update(10_000)
        budget.update(15_000)
        budget.update(20_000)
        # rate=5000, remaining=108000, turns ≈ 21
        turns = budget.turns_remaining()
        assert turns is not None
        assert 18 <= turns <= 24

    def test_no_history(self, budget):
        assert budget.growth_rate() is None
        assert budget.turns_remaining() is None


# ── Status Indicators ─────────────────────────────────────────────────

class TestStatusIndicators:
    def test_indicator_normal(self, budget):
        budget.update(int(128_000 * 0.50))
        status = budget.check()
        indicator = status.to_indicator()
        assert "50" in indicator

    def test_indicator_warning(self, budget):
        budget.update(int(128_000 * 0.65))
        status = budget.check()
        indicator = status.to_indicator()
        assert "\u26a0" in indicator or "65" in indicator

    def test_bar(self, budget):
        budget.update(int(128_000 * 0.50))
        status = budget.check()
        bar = status.to_bar()
        assert "50" in bar

    def test_summary(self, budget):
        budget.update(50_000)
        summary = budget.summary()
        assert "50,000" in summary
        assert "128,000" in summary
        assert "NORMAL" in summary


# ── Reset ─────────────────────────────────────────────────────────────

class TestReset:
    def test_reset_clears_state(self, budget):
        budget.update(int(128_000 * 0.90))
        budget.reset()
        assert budget.tokens_used == 0
        assert budget.check().level == BudgetLevel.NORMAL
        assert budget.growth_rate() is None


# ── Edge Cases ────────────────────────────────────────────────────────

class TestEdgeCases:
    def test_exact_threshold_boundary(self, budget):
        # Exactly at 60%
        budget.update(int(128_000 * 0.60))
        assert budget.check().level == BudgetLevel.WARNING

    def test_zero_context(self):
        budget = TokenBudget(context_length=0)
        status = budget.check()
        assert status.percent_used == 0

    def test_remaining_for_response(self, budget):
        budget.update(100_000)
        remaining = budget.remaining_for_response()
        # 128000 - 100000 - 6400 (5% reserve) = 21600
        assert remaining > 0
        assert remaining < 128_000


if __name__ == "__main__":
    pytest.main([__file__, "-v"])
@@ -1,76 +0,0 @@
"""Tests for tool fixation detection."""

import pytest
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from agent.tool_fixation_detector import ToolFixationDetector, get_fixation_detector


class TestFixationDetection:
    def test_no_fixation_below_threshold(self):
        d = ToolFixationDetector(threshold=5)
        for i in range(4):
            assert d.record("execute_code") is None

    def test_fixation_at_threshold(self):
        d = ToolFixationDetector(threshold=3)
        d.record("execute_code")
        d.record("execute_code")
        nudge = d.record("execute_code")
        assert nudge is not None
        assert "execute_code" in nudge
        assert "3 times" in nudge

    def test_fixation_above_threshold(self):
        d = ToolFixationDetector(threshold=3)
        d.record("execute_code")
        d.record("execute_code")
        d.record("execute_code")  # threshold hit
        nudge = d.record("execute_code")  # still nudging
        assert nudge is not None

    def test_streak_resets_on_different_tool(self):
        d = ToolFixationDetector(threshold=3)
        d.record("execute_code")
        d.record("execute_code")
        d.record("terminal")  # breaks the streak
        assert d._streak_count == 1
        assert d._current_streak == "terminal"

    def test_nudges_sent_counter(self):
        d = ToolFixationDetector(threshold=2)
        d.record("a")
        d.record("a")  # nudge 1
        d.record("a")  # nudge 2
        assert d.nudges_sent == 2

    def test_events_recorded(self):
        d = ToolFixationDetector(threshold=2)
        d.record("x")
        d.record("x")
        assert len(d.events) == 1
        assert d.events[0].tool_name == "x"
        assert d.events[0].streak_length == 2

    def test_report(self):
        d = ToolFixationDetector(threshold=2)
        d.record("x")
        d.record("x")
        report = d.format_report()
        assert "x" in report

    def test_reset(self):
        d = ToolFixationDetector(threshold=2)
        d.record("x")
        d.record("x")
        d.reset()
        assert d._streak_count == 0
        assert d._current_streak == ""

    def test_singleton(self):
        d1 = get_fixation_detector()
        d2 = get_fixation_detector()
        assert d1 is d2
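
# ── Illustrative sketch (not part of the original suite) ─────────────────────
# A hypothetical agent-loop integration: record every tool call and, once a
# streak reaches the threshold, inject the nudge back into the conversation.
def _example_loop_hook(messages, tool_name):
    nudge = get_fixation_detector().record(tool_name)
    if nudge is not None:
        messages.append({"role": "system", "content": nudge})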
@@ -1,182 +0,0 @@
#!/usr/bin/env python3
"""
Tests for tool_pokayoke.py — Tool Hallucination Prevention
"""

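# ── Illustrative note (not part of the original suite) ───────────────────────
# The prevention layers exercised below, in order:
#   1. Levenshtein-based lookup suggests or auto-corrects near-miss tool names.
#   2. auto_correct_parameter fixes single-edit parameter typos.
#   3. A per-tool circuit breaker trips after `failure_threshold` consecutive
#      failures and resets on the next successful call.
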
||||
|
||||
import json
|
||||
import pytest
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from tools.tool_pokayoke import (
|
||||
levenshtein_distance,
|
||||
find_similar_names,
|
||||
auto_correct_parameter,
|
||||
ToolCallValidator,
|
||||
validate_tool_call,
|
||||
reset_circuit_breaker,
|
||||
get_hallucination_stats,
|
||||
)
|
||||
|
||||
|
||||
class TestLevenshteinDistance:
|
||||
"""Test Levenshtein distance calculation."""
|
||||
|
||||
def test_identical_strings(self):
|
||||
assert levenshtein_distance("hello", "hello") == 0
|
||||
|
||||
def test_single_insertion(self):
|
||||
assert levenshtein_distance("hello", "hell") == 1
|
||||
assert levenshtein_distance("hell", "hello") == 1
|
||||
|
||||
def test_single_substitution(self):
|
||||
assert levenshtein_distance("hello", "hallo") == 1
|
||||
|
||||
def test_multiple_edits(self):
|
||||
assert levenshtein_distance("kitten", "sitting") == 3
|
||||
|
||||
def test_empty_strings(self):
|
||||
assert levenshtein_distance("", "hello") == 5
|
||||
assert levenshtein_distance("hello", "") == 5
|
||||
assert levenshtein_distance("", "") == 0
|
||||
|
||||
|
||||


class TestFindSimilarNames:
    """Test finding similar tool names."""

    def test_exact_match_excluded(self):
        names = ["browser_type", "browser_click", "browser_navigate"]
        result = find_similar_names("browser_type", names, max_distance=2)
        # Exact match should not be included (distance 0)
        assert all(name != "browser_type" for name, _ in result)

    def test_close_matches_found(self):
        names = ["browser_type", "browser_click", "terminal"]
        result = find_similar_names("browser_typo", names, max_distance=1)
        assert len(result) == 1
        assert result[0][0] == "browser_type"
        assert result[0][1] == 1

    def test_no_matches_beyond_distance(self):
        names = ["browser_type", "terminal"]
        result = find_similar_names("xyz", names, max_distance=1)
        assert len(result) == 0


class TestAutoCorrectParameter:
    """Test parameter auto-correction."""

    def test_exact_correction(self):
        valid = ["path", "content", "mode"]
        assert auto_correct_parameter("path", valid) is None  # Exact match, no correction needed

    def test_single_edit_correction(self):
        valid = ["path", "content", "mode"]
        assert auto_correct_parameter("file_path", valid) is None  # Distance > 1
        assert auto_correct_parameter("pathe", valid) == "path"  # Distance 1

    def test_no_correction_for_far_match(self):
        valid = ["path", "content"]
        assert auto_correct_parameter("xyz", valid) is None


class TestToolCallValidator:
    """Test the stateful validator."""

    @pytest.fixture
    def validator(self):
        v = ToolCallValidator(failure_threshold=3)
        # Mock tool schemas
        v.tool_schemas = {
            "browser_type": {
                "parameters": {
                    "properties": {
                        "ref": {"type": "string"},
                        "text": {"type": "string"},
                    }
                }
            },
            "terminal": {
                "parameters": {
                    "properties": {
                        "command": {"type": "string"},
                        "timeout": {"type": "integer"},
                    }
                }
            },
        }
        v._initialized = True
        return v

    def test_valid_tool_passes(self, validator):
        is_valid, corrected, params, msgs = validator.validate("browser_type", {"ref": "@e1"})
        assert is_valid is True
        assert corrected is None
        assert len(msgs) == 0

    def test_invalid_tool_suggests(self, validator):
        is_valid, corrected, params, msgs = validator.validate("browser_typo", {"ref": "@e1"})
        assert is_valid is False
        assert "browser_type" in str(msgs)

    def test_auto_correct_tool_name(self, validator):
        is_valid, corrected, params, msgs = validator.validate("browser_tipe", {"ref": "@e1"})
        assert is_valid is True
        assert corrected == "browser_type"
        assert any("Auto-corrected" in m for m in msgs)

    def test_parameter_correction(self, validator):
        is_valid, corrected, params, msgs = validator.validate("browser_type", {"reff": "@e1"})
        assert is_valid is True
        assert "ref" in params
        assert any("reff" in m and "ref" in m for m in msgs)

    def test_circuit_breaker(self, validator):
        # Fail 3 times
        for _ in range(3):
            validator.validate("nonexistent_tool", {})

        # 4th attempt should trigger circuit breaker
        is_valid, corrected, params, msgs = validator.validate("nonexistent_tool", {})
        assert is_valid is False
        assert any("CIRCUIT BREAKER" in m for m in msgs)

    def test_success_resets_circuit_breaker(self, validator):
        # Fail twice
        validator.validate("nonexistent_tool", {})
        validator.validate("nonexistent_tool", {})

        # Succeed with valid tool
        validator.validate("browser_type", {"ref": "@e1"})

        # Failure counter should be reset
        assert "nonexistent_tool" not in validator.consecutive_failures


class TestValidateToolCall:
    """Test the global validate_tool_call function."""

    def test_integration(self):
        # This test depends on the actual registry being available.
        # We'll mock it for unit testing.
        with patch("tools.tool_pokayoke._validator") as mock_validator:
            mock_validator.validate.return_value = (True, None, {}, [])
            is_valid, corrected, params, msgs = validate_tool_call("test_tool", {})
            assert is_valid is True


class TestCircuitBreakerReset:
    """Test circuit breaker reset functionality."""

    def test_reset_specific_tool(self):
        reset_circuit_breaker("test_tool")
        stats = get_hallucination_stats()
        assert "test_tool" not in stats["consecutive_failures"]

    def test_reset_all(self):
        reset_circuit_breaker()
        stats = get_hallucination_stats()
        assert len(stats["consecutive_failures"]) == 0


if __name__ == "__main__":
    pytest.main([__file__, "-v"])
@@ -1,67 +0,0 @@
"""
Tests for tool hallucination detection (#922).
"""

import pytest
from tools.tool_validator import ToolHallucinationDetector, ValidationSeverity


class TestToolHallucinationDetector:
    def setup_method(self):
        self.detector = ToolHallucinationDetector()
        self.detector.register_tool("read_file", {
            "parameters": {
                "type": "object",
                "properties": {
                    "path": {"type": "string"},
                    "encoding": {"type": "string"},
                },
                "required": ["path"]
            }
        })

    def test_valid_tool_call(self):
        result = self.detector.validate_tool_call("read_file", {"path": "/tmp/file.txt"})
        assert result.valid is True
        assert len(result.blocking_issues) == 0

    def test_unknown_tool(self):
        result = self.detector.validate_tool_call("hallucinated_tool", {})
        assert result.valid is False
        assert any(i.code == "UNKNOWN_TOOL" for i in result.issues)

    def test_missing_required_param(self):
        result = self.detector.validate_tool_call("read_file", {})
        assert result.valid is False
        assert any(i.code == "MISSING_REQUIRED" for i in result.issues)

    def test_wrong_type(self):
        result = self.detector.validate_tool_call("read_file", {"path": 123})
        assert result.valid is False
        assert any(i.code == "WRONG_TYPE" for i in result.issues)

    def test_unknown_param_warning(self):
        result = self.detector.validate_tool_call("read_file", {"path": "/tmp/file.txt", "unknown": "value"})
        assert result.valid is True  # Warning, not blocking
        assert any(i.code == "UNKNOWN_PARAM" for i in result.issues)

    def test_placeholder_detection(self):
        result = self.detector.validate_tool_call("read_file", {"path": "<placeholder>"})
        assert any(i.code == "PLACEHOLDER_VALUE" for i in result.issues)

    def test_rejection_stats(self):
        self.detector.validate_tool_call("unknown_tool", {})
        self.detector.validate_tool_call("read_file", {})
        stats = self.detector.get_rejection_stats()
        assert stats["total"] >= 2

    def test_rejection_response(self):
        from tools.tool_validator import create_rejection_response
        result = self.detector.validate_tool_call("unknown_tool", {})
        response = create_rejection_response(result)
        assert response["role"] == "tool"
        assert "rejected" in response["content"].lower()


if __name__ == "__main__":
    pytest.main([__file__])
@@ -1,137 +0,0 @@
"""Tests for Ultraplan Mode — Issue #840."""
import json
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))

from tools.ultraplan import (
    Phase, Stream, Ultraplan,
    create_ultraplan, save_ultraplan, load_ultraplan,
    generate_daily_cron_prompt
)


class TestPhase:
    def test_creation(self):
        phase = Phase(id="A1", name="Setup", artifact="config.yaml")
        assert phase.id == "A1"
        assert phase.status == "pending"

    def test_dependencies(self):
        phase = Phase(id="A2", name="Build", dependencies=["A1"])
        assert "A1" in phase.dependencies


class TestStream:
    def test_progress_empty(self):
        stream = Stream(id="A", name="Stream A")
        assert stream.progress == 0.0

    def test_progress_partial(self):
        stream = Stream(id="A", name="Stream A", phases=[
            Phase(id="A1", name="P1", status="done"),
            Phase(id="A2", name="P2", status="pending"),
        ])
        assert stream.progress == 0.5

    def test_current_phase(self):
        stream = Stream(id="A", name="Stream A", phases=[
            Phase(id="A1", name="P1", status="done"),
            Phase(id="A2", name="P2", status="active"),
            Phase(id="A3", name="P3", status="pending"),
        ])
        assert stream.current_phase.id == "A2"


class TestUltraplan:
    def test_to_markdown(self):
        plan = Ultraplan(
            date="20260415",
            mission="Test mission",
            streams=[
                Stream(id="A", name="Stream A", phases=[
                    Phase(id="A1", name="Phase 1", artifact="file.txt"),
                ]),
            ],
        )
        md = plan.to_markdown()
        assert "# Ultraplan: 20260415" in md
        assert "Test mission" in md
        assert "Stream A" in md

    def test_progress(self):
        plan = Ultraplan(
            date="20260415",
            mission="Test",
            streams=[
                Stream(id="A", name="A", status="done", phases=[
                    Phase(id="A1", name="P1", status="done"),
                ]),
                Stream(id="B", name="B", status="pending", phases=[
                    Phase(id="B1", name="P1", status="pending"),
                ]),
            ],
        )
        assert plan.progress == 0.5

    def test_to_dict(self):
        plan = Ultraplan(date="20260415", mission="Test")
        d = plan.to_dict()
        assert d["date"] == "20260415"
        assert d["mission"] == "Test"


class TestCreateUltraplan:
    def test_default_date(self):
        plan = create_ultraplan(mission="Test")
        assert len(plan.date) == 8  # YYYYMMDD

    def test_with_streams(self):
        plan = create_ultraplan(
            mission="Test",
            streams=[
                {
                    "id": "A",
                    "name": "Stream A",
                    "phases": [
                        {"id": "A1", "name": "Setup", "artifact": "config.yaml"},
                        {"id": "A2", "name": "Build", "dependencies": ["A1"]},
                    ],
                },
            ],
        )
        assert len(plan.streams) == 1
        assert len(plan.streams[0].phases) == 2
        assert plan.streams[0].phases[1].dependencies == ["A1"]


class TestSaveLoad:
    def test_roundtrip(self, tmp_path):
        plan = create_ultraplan(
            date="20260415",
            mission="Test roundtrip",
            streams=[{"id": "A", "name": "Stream A"}],
        )

        save_ultraplan(plan, base_dir=tmp_path)
        loaded = load_ultraplan("20260415", base_dir=tmp_path)

        assert loaded is not None
        assert loaded.date == "20260415"
        assert loaded.mission == "Test roundtrip"

    def test_nonexistent_returns_none(self, tmp_path):
        assert load_ultraplan("99999999", base_dir=tmp_path) is None


class TestCronPrompt:
    def test_has_required_elements(self):
        prompt = generate_daily_cron_prompt()
        assert "Ultraplan" in prompt
        assert "streams" in prompt.lower()
        assert "Gitea" in prompt


if __name__ == "__main__":
    import pytest
    pytest.main([__file__, "-v"])
@@ -1,239 +0,0 @@
"""Tests for vision benchmark suite (Issue #817)."""

import json
import statistics
import sys
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch

import pytest

sys.path.insert(0, str(Path(__file__).parent.parent / "benchmarks"))

from vision_benchmark import (
    compute_ocr_accuracy,
    compute_description_completeness,
    compute_structural_accuracy,
    aggregate_results,
    to_markdown,
    generate_sample_dataset,
    MODELS,
    EVAL_PROMPTS,
)


class TestOcrAccuracy:
    def test_perfect_match(self):
        assert compute_ocr_accuracy("Hello World", "Hello World") == 1.0

    def test_empty_ground_truth(self):
        assert compute_ocr_accuracy("", "") == 1.0
        assert compute_ocr_accuracy("text", "") == 0.0

    def test_empty_extraction(self):
        assert compute_ocr_accuracy("", "Hello") == 0.0

    def test_partial_match(self):
        score = compute_ocr_accuracy("Hello Wrld", "Hello World")
        assert 0.5 < score < 1.0

    def test_case_insensitive(self):
        assert compute_ocr_accuracy("hello world", "Hello World") == 1.0

    def test_whitespace_differences(self):
        score = compute_ocr_accuracy(" Hello World ", "Hello World")
        assert score >= 0.8
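
# A plausible shape for compute_ocr_accuracy consistent with the tests above.
# This is a sketch under assumptions, not necessarily vision_benchmark's
# implementation: normalize case and whitespace, then take a similarity ratio.
import difflib

def _ocr_accuracy_sketch(extracted: str, ground_truth: str) -> float:
    def norm(s: str) -> str:
        return " ".join(s.lower().split())
    a, b = norm(extracted), norm(ground_truth)
    if not b:
        return 1.0 if not a else 0.0
    return difflib.SequenceMatcher(None, a, b).ratio()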


class TestDescriptionCompleteness:
    def test_all_keywords_found(self):
        keywords = ["github", "logo", "octocat"]
        text = "This is the GitHub logo featuring the octocat mascot."
        assert compute_description_completeness(text, keywords) == 1.0

    def test_partial_keywords(self):
        keywords = ["github", "logo", "octocat"]
        text = "This is the GitHub logo."
        score = compute_description_completeness(text, keywords)
        assert 0.3 < score < 0.7

    def test_no_keywords(self):
        keywords = ["github", "logo"]
        text = "Something completely different."
        assert compute_description_completeness(text, keywords) == 0.0

    def test_empty_keywords(self):
        assert compute_description_completeness("any text", []) == 1.0

    def test_empty_text(self):
        assert compute_description_completeness("", ["keyword"]) == 0.0

    def test_case_insensitive(self):
        keywords = ["GitHub", "Logo"]
        text = "The github logo is iconic."
        assert compute_description_completeness(text, keywords) == 1.0


class TestStructuralAccuracy:
    def test_length_score(self):
        text = "A" * 100
        scores = compute_structural_accuracy(text, {"min_length": 50})
        assert scores["length"] == 1.0

    def test_short_text(self):
        text = "Short."
        scores = compute_structural_accuracy(text, {"min_length": 100})
        assert scores["length"] < 1.0

    def test_sentence_count(self):
        text = "First sentence. Second sentence. Third sentence."
        scores = compute_structural_accuracy(text, {"min_sentences": 2})
        assert scores["sentences"] >= 1.0

    def test_no_sentences(self):
        text = "No sentence end"
        scores = compute_structural_accuracy(text, {"min_sentences": 1})
        assert scores["sentences"] == 0.0

    def test_has_numbers_true(self):
        text = "There are 42 items."
        scores = compute_structural_accuracy(text, {"has_numbers": True})
        assert scores["has_numbers"] == 1.0

    def test_has_numbers_false(self):
        text = "No numbers here."
        scores = compute_structural_accuracy(text, {"has_numbers": True})
        assert scores["has_numbers"] == 0.0


class TestAggregateResults:
    def test_basic_aggregation(self):
        results = [
            {
                "image_id": "img1",
                "category": "photo",
                "gemma4": {
                    "success": True,
                    "avg_latency_ms": 100,
                    "avg_tokens": 500,
                    "ocr_accuracy": 0.9,
                    "keyword_completeness": 0.8,
                    "analysis_length": 200,
                },
                "gemini3_flash": {
                    "success": True,
                    "avg_latency_ms": 150,
                    "avg_tokens": 600,
                    "ocr_accuracy": 0.85,
                    "keyword_completeness": 0.75,
                    "analysis_length": 180,
                },
            }
        ]
        models = MODELS
        summary = aggregate_results(results, models)

        assert "gemma4" in summary
        assert "gemini3_flash" in summary
        assert summary["gemma4"]["success_rate"] == 1.0
        assert summary["gemma4"]["latency"]["mean_ms"] == 100
        assert summary["gemma4"]["accuracy"]["ocr_mean"] == 0.9

    def test_all_failures(self):
        results = [
            {
                "image_id": "img1",
                "category": "photo",
                "gemma4": {"success": False, "error": "API error"},
                "gemini3_flash": {"success": False, "error": "API error"},
            }
        ]
        summary = aggregate_results(results, MODELS)
        assert summary["gemma4"]["success_rate"] == 0


class TestMarkdown:
    def test_generates_report(self):
        report = {
            "generated_at": "2026-04-16T00:00:00",
            "config": {
                "total_images": 10,
                "runs_per_model": 1,
                "models": {"gemma4": "Gemma 4 27B", "gemini3_flash": "Gemini 3 Flash"},
            },
            "summary": {
                "gemma4": {
                    "success_rate": 0.9,
                    "latency": {"mean_ms": 100, "median_ms": 95, "p95_ms": 150, "std_ms": 20},
                    "tokens": {"mean_total": 500, "total_used": 5000},
                    "accuracy": {"ocr_mean": 0.85, "ocr_count": 5, "keyword_mean": 0.8, "keyword_count": 5},
                },
                "gemini3_flash": {
                    "success_rate": 0.95,
                    "latency": {"mean_ms": 120, "median_ms": 110, "p95_ms": 180, "std_ms": 25},
                    "tokens": {"mean_total": 600, "total_used": 6000},
                    "accuracy": {"ocr_mean": 0.82, "ocr_count": 5, "keyword_mean": 0.78, "keyword_count": 5},
                },
            },
            "results": [],
        }
        md = to_markdown(report)
        assert "Vision Benchmark Report" in md
        assert "Latency Comparison" in md
        assert "Accuracy Comparison" in md
        assert "Token Usage" in md
        assert "Verdict" in md
        assert "Gemma 4 27B" in md

    def test_empty_report(self):
        report = {
            "generated_at": "2026-04-16T00:00:00",
            "config": {"total_images": 0, "runs_per_model": 1, "models": {}},
            "summary": {},
            "results": [],
        }
        md = to_markdown(report)
        assert "Vision Benchmark Report" in md


class TestDataset:
    def test_sample_dataset_has_entries(self):
        dataset = generate_sample_dataset()
        assert len(dataset) >= 4

    def test_sample_dataset_structure(self):
        dataset = generate_sample_dataset()
        for img in dataset:
            assert "id" in img
            assert "url" in img
            assert "category" in img
            assert "expected_keywords" in img
            assert "expected_structure" in img

    def test_categories_present(self):
        dataset = generate_sample_dataset()
        categories = {img["category"] for img in dataset}
        assert "screenshot" in categories
        assert "diagram" in categories
        assert "photo" in categories


class TestModels:
    def test_all_models_defined(self):
        assert "gemma4" in MODELS
        assert "gemini3_flash" in MODELS

    def test_model_structure(self):
        for name, config in MODELS.items():
            assert "model_id" in config
            assert "display_name" in config
            assert "provider" in config


class TestPrompts:
    def test_prompts_for_categories(self):
        assert "screenshot" in EVAL_PROMPTS
        assert "diagram" in EVAL_PROMPTS
        assert "photo" in EVAL_PROMPTS
        assert "ocr" in EVAL_PROMPTS
        assert "chart" in EVAL_PROMPTS
@@ -1,294 +0,0 @@
"""Batch Tool Executor — Parallel safety classification and concurrent execution.

Provides centralized classification of tool calls into parallel-safe vs sequential,
and utilities for batch execution with safety checks.

Classification tiers:
- PARALLEL_SAFE: read-only tools, no shared state (web_search, read_file, etc.)
- PATH_SCOPED: file operations that can run concurrently when paths don't overlap
- SEQUENTIAL: writes, destructive ops, terminal commands, delegation
- NEVER_PARALLEL: clarify (requires user interaction)

Usage:
    from tools.batch_executor import classify_tool_calls, BatchExecutionPlan

    plan = classify_tool_calls(tool_calls)
    if plan.can_parallelize:
        execute_concurrent(plan.parallel_batch)
    execute_sequential(plan.sequential_batch)
"""

import json
import logging
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Set, Tuple

logger = logging.getLogger(__name__)


# ── Safety Classification ──────────────────────────────────────────────────

# Tools that can ALWAYS run in parallel (read-only, no shared state)
DEFAULT_PARALLEL_SAFE = frozenset({
    "ha_get_state",
    "ha_list_entities",
    "ha_list_services",
    "read_file",
    "search_files",
    "session_search",
    "skill_view",
    "skills_list",
    "vision_analyze",
    "web_extract",
    "web_search",
    "fact_store",
    "fact_search",
})

# File tools that can run concurrently ONLY when paths don't overlap
PATH_SCOPED_TOOLS = frozenset({"read_file", "write_file", "patch"})

# Tools that must NEVER run in parallel (require user interaction, shared mutable state)
NEVER_PARALLEL = frozenset({"clarify"})

# Patterns that indicate terminal commands may modify/delete files
DESTRUCTIVE_PATTERNS = re.compile(
    r"""(?:^|\s|&&|\|\||;|`)(?:
    rm\s|rmdir\s|
    mv\s|
    sed\s+-i|
    truncate\s|
    dd\s|
    shred\s|
    git\s+(?:reset|clean|checkout)\s
    )""",
    re.VERBOSE,
)

# Output redirects that overwrite files (> but not >>)
REDIRECT_OVERWRITE = re.compile(r'[^>]>[^>]|^>[^>]')


def is_destructive_command(cmd: str) -> bool:
    """Check if a terminal command modifies/deletes files."""
    if not cmd:
        return False
    if DESTRUCTIVE_PATTERNS.search(cmd):
        return True
    if REDIRECT_OVERWRITE.search(cmd):
        return True
    return False
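
# Examples (illustrative, per the patterns above):
#   is_destructive_command("rm -rf build/")         -> True   (rm)
#   is_destructive_command("echo hi > notes.txt")   -> True   (overwrite redirect)
#   is_destructive_command("cat a.log >> all.log")  -> False  (append)
#   is_destructive_command("ls -la")                -> False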


def _paths_overlap(path1: Path, path2: Path) -> bool:
    """Check if two paths could conflict (one is ancestor of the other)."""
    try:
        path1 = path1.resolve()
        path2 = path2.resolve()
        return path1 == path2 or path1 in path2.parents or path2 in path1.parents
    except Exception:
        return True  # conservative: assume overlap


def _extract_path(tool_name: str, args: dict) -> Optional[Path]:
    """Extract the target path from tool arguments for path-scoped tools."""
    if tool_name not in PATH_SCOPED_TOOLS:
        return None
    raw_path = args.get("path")
    if not isinstance(raw_path, str) or not raw_path.strip():
        return None
    try:
        return Path(raw_path).expanduser().resolve()
    except Exception:
        return None


# ── Classification ─────────────────────────────────────────────────────────

@dataclass
class ToolCallClassification:
    """Classification result for a single tool call."""
    tool_name: str
    args: dict
    tool_call: Any  # the original tool_call object
    tier: str  # "parallel_safe", "path_scoped", "sequential", "never_parallel"
    reason: str = ""


@dataclass
class BatchExecutionPlan:
    """Plan for executing a batch of tool calls."""
    classifications: List[ToolCallClassification] = field(default_factory=list)
    parallel_batch: List[ToolCallClassification] = field(default_factory=list)
    sequential_batch: List[ToolCallClassification] = field(default_factory=list)

    @property
    def can_parallelize(self) -> bool:
        return len(self.parallel_batch) > 1

    @property
    def total(self) -> int:
        return len(self.classifications)


def classify_single_tool_call(
    tool_call: Any,
    extra_parallel_safe: Optional[Set[str]] = None,
) -> ToolCallClassification:
    """Classify a single tool call into its safety tier."""
    tool_name = tool_call.function.name
    try:
        args = json.loads(tool_call.function.arguments)
    except Exception:
        return ToolCallClassification(
            tool_name=tool_name, args={}, tool_call=tool_call,
            tier="sequential", reason="Could not parse arguments"
        )

    if not isinstance(args, dict):
        return ToolCallClassification(
            tool_name=tool_name, args=args, tool_call=tool_call,
            tier="sequential", reason="Non-dict arguments"
        )

    # Check never-parallel
    if tool_name in NEVER_PARALLEL:
        return ToolCallClassification(
            tool_name=tool_name, args=args, tool_call=tool_call,
            tier="never_parallel", reason="Requires user interaction"
        )

    # Check parallel-safe FIRST (before path_scoped) so read_file/search_files
    # get classified as parallel_safe even though they have paths
    parallel_safe_set = DEFAULT_PARALLEL_SAFE
    if extra_parallel_safe:
        parallel_safe_set = parallel_safe_set | extra_parallel_safe

    if tool_name in parallel_safe_set:
        return ToolCallClassification(
            tool_name=tool_name, args=args, tool_call=tool_call,
            tier="parallel_safe", reason="Read-only, no shared state"
        )

    # Check terminal commands for destructive operations
    if tool_name == "terminal":
        cmd = args.get("command", "")
        if is_destructive_command(cmd):
            return ToolCallClassification(
                tool_name=tool_name, args=args, tool_call=tool_call,
                tier="sequential", reason=f"Destructive command: {cmd[:50]}"
            )
        return ToolCallClassification(
            tool_name=tool_name, args=args, tool_call=tool_call,
            tier="sequential", reason="Terminal command (conservative)"
        )

    # Check path-scoped tools (write_file, patch — not read_file which is parallel_safe)
    if tool_name in PATH_SCOPED_TOOLS:
        path = _extract_path(tool_name, args)
        if path:
            return ToolCallClassification(
                tool_name=tool_name, args=args, tool_call=tool_call,
                tier="path_scoped", reason=f"Path: {path}"
            )
        return ToolCallClassification(
            tool_name=tool_name, args=args, tool_call=tool_call,
            tier="sequential", reason="Path-scoped but no path found"
        )

    # Default: sequential (conservative)
    return ToolCallClassification(
        tool_name=tool_name, args=args, tool_call=tool_call,
        tier="sequential", reason="Not classified as parallel-safe"
    )


def classify_tool_calls(
    tool_calls: list,
    extra_parallel_safe: Optional[Set[str]] = None,
) -> BatchExecutionPlan:
    """Classify a batch of tool calls and produce an execution plan."""
    plan = BatchExecutionPlan()

    reserved_paths: List[Path] = []

    for tc in tool_calls:
        classification = classify_single_tool_call(tc, extra_parallel_safe)
        plan.classifications.append(classification)

        if classification.tier == "never_parallel":
            plan.sequential_batch.append(classification)
            continue

        if classification.tier == "sequential":
            plan.sequential_batch.append(classification)
            continue

        if classification.tier == "path_scoped":
            path = _extract_path(classification.tool_name, classification.args)
            if path is None:
                classification.tier = "sequential"
                classification.reason = "Path extraction failed"
                plan.sequential_batch.append(classification)
                continue

            # Check for path conflicts with already-scheduled parallel calls
            conflict = any(_paths_overlap(path, existing) for existing in reserved_paths)
            if conflict:
                classification.tier = "sequential"
                classification.reason = f"Path conflict: {path}"
                plan.sequential_batch.append(classification)
            else:
                reserved_paths.append(path)
                plan.parallel_batch.append(classification)
            continue

        if classification.tier == "parallel_safe":
            plan.parallel_batch.append(classification)
            continue

        # Fallback
        plan.sequential_batch.append(classification)

    return plan


# ── Concurrent Execution ───────────────────────────────────────────────────

def execute_parallel_batch(
    batch: List[ToolCallClassification],
    invoke_fn: Callable,
    max_workers: int = 8,
) -> List[Tuple[str, str]]:
    """Execute parallel-safe tool calls concurrently.

    Args:
        batch: List of classified tool calls (parallel_safe or path_scoped)
        invoke_fn: Function(tool_name, args) -> result_string
        max_workers: Max concurrent threads

    Returns:
        List of (tool_call_id, result_string) tuples
    """
    results = []
    if not batch:
        # Guard: ThreadPoolExecutor rejects max_workers=0.
        return results

    with ThreadPoolExecutor(max_workers=min(max_workers, len(batch))) as executor:
        future_to_tc = {}
        for tc in batch:
            future = executor.submit(invoke_fn, tc.tool_name, tc.args)
            future_to_tc[future] = tc

        for future in as_completed(future_to_tc):
            tc = future_to_tc[future]
            try:
                result = future.result()
            except Exception as e:
                result = json.dumps({"error": str(e)})
            tool_call_id = getattr(tc.tool_call, "id", None) or ""
            results.append((tool_call_id, result))

    return results
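
# A quick standalone demo of the classifier (illustrative; _fake_call mimics the
# minimal tool_call shape classify_single_tool_call expects):
if __name__ == "__main__":
    from types import SimpleNamespace

    def _fake_call(name: str, args: dict) -> SimpleNamespace:
        return SimpleNamespace(
            id=f"call_{name}",
            function=SimpleNamespace(name=name, arguments=json.dumps(args)),
        )

    demo = classify_tool_calls([
        _fake_call("web_search", {"query": "poka-yoke"}),
        _fake_call("read_file", {"path": "README.md"}),
        _fake_call("terminal", {"command": "rm -rf build/"}),
    ])
    # web_search and read_file run concurrently; the rm command stays sequential.
    assert [c.tool_name for c in demo.parallel_batch] == ["web_search", "read_file"]
    assert [c.tool_name for c in demo.sequential_batch] == ["terminal"]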
@@ -28,7 +28,6 @@ Platform: Linux / macOS only (Unix domain sockets for local). Disabled on Window
Remote execution additionally requires Python 3 in the terminal backend.
"""

import ast
import base64
import json
import logging
@@ -884,42 +883,6 @@ def _execute_remote(
    return json.dumps(result, ensure_ascii=False)


def _validate_python_syntax(code: str) -> Optional[str]:
    """Validate Python syntax before subprocess spawn.

    Runs ast.parse() in-process (sub-millisecond) to catch syntax errors
    before wasting time spawning a sandboxed subprocess.

    Returns:
        JSON error string with line, offset, message if syntax is invalid.
        None if syntax is valid.
    """
    try:
        ast.parse(code)
        return None
    except SyntaxError as exc:
        # Build context: show offending line with caret
        lines = code.split("\n")
        error_line = lines[exc.lineno - 1] if exc.lineno and exc.lineno <= len(lines) else ""
        context = ""
        if error_line:
            context = f"\n    {error_line}"
            if exc.offset:
                context += f"\n    {' ' * (exc.offset - 1)}^"

        return json.dumps({
            "error": f"Python syntax error on line {exc.lineno}: {exc.msg}{context}",
            "syntax_error": True,
            "line": exc.lineno,
            "offset": exc.offset,
            "message": exc.msg,
        })
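
# Example (illustrative; exact message and offset vary by Python version):
#   _validate_python_syntax("x = 1")           -> None
#   _validate_python_syntax("def f(:\n  pass") -> JSON string whose "error" reads
#       "Python syntax error on line 1: ..." plus the offending line and a caret.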

# ---------------------------------------------------------------------------


# ---------------------------------------------------------------------------
# Main entry point
# ---------------------------------------------------------------------------
@@ -953,11 +916,6 @@ def execute_code(
    if not code or not code.strip():
        return tool_error("No code provided.")

    # Syntax check before subprocess spawn (catches ~15% of errors in <1ms)
    syntax_error = _validate_python_syntax(code)
    if syntax_error:
        return syntax_error

    # Dispatch: remote backends use file-based RPC, local uses UDS
    from tools.terminal_tool import _get_env_config
    env_type = _get_env_config()["env_type"]
@@ -48,8 +48,6 @@ from enum import Enum, auto
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple

from hermes_constants import get_hermes_home

logger = logging.getLogger(__name__)


@@ -174,7 +172,7 @@ _DEFAULT_WHITELIST = {

def _load_whitelist() -> Dict[str, Any]:
    """Load action whitelist from config."""
    config_path = get_hermes_home() / "approval_whitelist.json"
    config_path = Path.home() / ".hermes" / "approval_whitelist.json"
    if config_path.exists():
        try:
            with open(config_path) as f:
@@ -1,183 +0,0 @@
"""
Credential Redaction — Block silent credential exposure in tool outputs

Poka-yoke: Prevent API keys, tokens, passwords from leaking into context.

Issue: #839
"""

import json
import logging
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Tuple

logger = logging.getLogger(__name__)

HERMES_HOME = Path.home() / ".hermes"
AUDIT_DIR = HERMES_HOME / "audit"

# Credential patterns to detect and redact
CREDENTIAL_PATTERNS = [
    # API keys
    (r"sk-[a-zA-Z0-9]{20,}", "[REDACTED: OpenAI API key]"),
    (r"sk-ant-[a-zA-Z0-9-]{20,}", "[REDACTED: Anthropic API key]"),
    (r"ghp_[a-zA-Z0-9]{36}", "[REDACTED: GitHub token]"),
    (r"gho_[a-zA-Z0-9]{36}", "[REDACTED: GitHub OAuth token]"),
    (r"glpat-[a-zA-Z0-9-]{20,}", "[REDACTED: GitLab token]"),

    # Bearer tokens
    (r"Bearer\s+[a-zA-Z0-9._-]{20,}", "[REDACTED: Bearer token]"),
    (r"bearer\s+[a-zA-Z0-9._-]{20,}", "[REDACTED: Bearer token]"),

    # Generic tokens/passwords
    (r"(?:token|TOKEN|Token)[:=]\s*[\"']?[a-zA-Z0-9._-]{20,}[\"']?", "[REDACTED: Token]"),
    (r"(?:password|PASSWORD|Password)[:=]\s*[\"']?[^\s\"']{8,}[\"']?", "[REDACTED: Password]"),
    (r"(?:secret|SECRET|Secret)[:=]\s*[\"']?[a-zA-Z0-9._-]{20,}[\"']?", "[REDACTED: Secret]"),
    (r"(?:api_key|API_KEY|apiKey|ApiKey)[:=]\s*[\"']?[a-zA-Z0-9._-]{20,}[\"']?", "[REDACTED: API key]"),

    # AWS keys
    (r"AKIA[0-9A-Z]{16}", "[REDACTED: AWS access key]"),
    (r"(?:aws_secret_access_key|AWS_SECRET_ACCESS_KEY)[:=]\s*[\"']?[a-zA-Z0-9/+=]{40}[\"']?", "[REDACTED: AWS secret]"),

    # Private keys
    (r"-----BEGIN (?:RSA |EC |OPENSSH )?PRIVATE KEY-----", "[REDACTED: Private key header]"),

    # Connection strings
    (r"(?:postgres|mysql|mongodb|redis)://[^:]+:[^@]+@[^\s]+", "[REDACTED: Database connection string]"),
]

# Files that should trigger auto-masking
SENSITIVE_FILE_PATTERNS = [
    r"\.env$",
    r"\.env\.",
    r"\.secret",
    r"credentials",
    r"\.token",
    r"config\.yaml$",
    r"config\.yml$",
    r"config\.json$",
    r"\.netrc$",
    r"\.pgpass$",
]


class CredentialRedactor:
    """Redact credentials from text."""

    def __init__(self, audit_log: bool = True):
        self.audit_log = audit_log
        self._redaction_count = 0

    def redact(self, text: str) -> Tuple[str, int]:
        """
        Redact credentials from text.

        Returns:
            Tuple of (redacted_text, number_of_redactions)
        """
        if not text:
            return text, 0

        redacted = text
        count = 0

        for pattern, replacement in CREDENTIAL_PATTERNS:
            matches = re.findall(pattern, redacted, re.IGNORECASE)
            if matches:
                redacted = re.sub(pattern, replacement, redacted, flags=re.IGNORECASE)
                count += len(matches)

        if count > 0:
            self._redaction_count += count
            if self.audit_log:
                self._log_redaction(count, text[:100])

        return redacted, count

    def redact_tool_output(self, tool_name: str, output: str) -> Tuple[str, str]:
        """
        Redact tool output and return notice if redactions occurred.

        Returns:
            Tuple of (redacted_output, notice_or_empty)
        """
        redacted, count = self.redact(output)

        if count > 0:
            notice = f"[REDACTED: {count} credential pattern{'s' if count > 1 else ''} found in {tool_name} output]"
            return redacted, notice

        return redacted, ""

    def should_mask_file(self, file_path: str) -> bool:
        """Check if file should have credentials auto-masked."""
        path_lower = file_path.lower()
        return any(re.search(p, path_lower) for p in SENSITIVE_FILE_PATTERNS)

    def mask_file_content(self, content: str, file_path: str) -> str:
        """Mask credentials in file content while preserving structure."""
        if not self.should_mask_file(file_path):
            return content

        lines = content.split("\n")
        masked_lines = []

        for line in lines:
            # Preserve key=value structure but mask values
            if "=" in line and not line.strip().startswith("#"):
                key, _, value = line.partition("=")
                key_lower = key.strip().lower()

                sensitive_keys = ["password", "secret", "token", "key", "api", "credential"]
                if any(sk in key_lower for sk in sensitive_keys):
                    masked_lines.append(f"{key}=[REDACTED]")
                else:
                    masked_lines.append(line)
            else:
                masked_lines.append(line)

        return "\n".join(masked_lines)

    def _log_redaction(self, count: int, preview: str):
        """Log redaction event to audit trail."""
        try:
            AUDIT_DIR.mkdir(parents=True, exist_ok=True)
            audit_file = AUDIT_DIR / "redactions.jsonl"

            entry = {
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "redactions": count,
                "preview_hash": hash(preview),
            }

            with open(audit_file, "a") as f:
                f.write(json.dumps(entry) + "\n")

        except Exception as e:
            logger.debug("Audit log failed: %s", e)


# Module-level redactor
_redactor = CredentialRedactor()


def redact_credentials(text: str) -> Tuple[str, int]:
    """Redact credentials from text."""
    return _redactor.redact(text)


def redact_tool_output(tool_name: str, output: str) -> Tuple[str, str]:
    """Redact tool output and return notice."""
    return _redactor.redact_tool_output(tool_name, output)


def should_mask_file(file_path: str) -> bool:
    """Check if file should be masked."""
    return _redactor.should_mask_file(file_path)


def mask_sensitive_file(content: str, file_path: str) -> str:
    """Mask credentials in sensitive file."""
    return _redactor.mask_file_content(content, file_path)
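
# Example (illustrative; the token below is fake):
#   redacted, n = redact_credentials("Authorization: Bearer abcdefghijklmnopqrstuvwxyz123456")
#   -> n == 1, redacted == "Authorization: [REDACTED: Bearer token]"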
@@ -1,405 +0,0 @@
#!/usr/bin/env python3
"""
Crisis Detection Tool — the-door integration for hermes-agent.

Scans user messages for crisis indicators (despair, suicidal ideation)
using the-door's canonical detection module. Provides inline crisis
detection before each API call and optional escalation logging.

Follows the-door's design principles:
- Never computes the value of a human life
- Never suggests someone should die
- Always errs on the side of higher risk
"""

import json
import logging
import os
import re
import time
import urllib.request
from dataclasses import dataclass, field
from typing import List, Optional

logger = logging.getLogger(__name__)

# ── Detection Logic (from the-door/crisis/detect.py) ──────────────────────
# Embedded to avoid requiring the-door as a dependency.
# Source: Timmy_Foundation/the-door, crisis/detect.py (canonical)

CRITICAL_INDICATORS = [
    r"\bbetter off without me\b",
    r"\bkill\s*(my)?self\b",
    r"\bend\s*my\s*life\b",
    r"\bsuicid(?:al|ed|e)\b",
    r"\bnot\s+worth\s+living\b",
    r"\bbetter\s+off\s+dead\b",
    r"\bend\s+it\s+all\b",
    r"\bno\s+reason\s+to\s+live\b",
    r"\bdon\'?t\s+want\s+to\s+live\b",
    r"\bwant\s+to\s+die\b",
    r"\bgoing\s+to\s+(?:kill\s+myself|die)\b",
    r"\bplan\s+to\s+(?:end|kill|die)\b",
    r"\btired\s+of\s+(?:living|life|existence)\b",
    r"\bsaying\s+goodbye\s+(?:forever|permanently|one\s+last\s+time)\b",
    r"\bwrote\s+a\s+suicide\s*(?:note|letter)\b",
    r"\bgiving\s+away\s+(?:my|all\s+my)\s+(?:stuff|things|possessions?)\s+(?:to|because|—)\b",
    r"\btied\s+(?:up|down)\s+my\s+(?:loose\s+)?ends",
]

HIGH_INDICATORS = [
    r"\bdespair\b",
    r"\bhopeless(?:ly)?\s+(?:about\s+(?:my|this|everything|life)|inside|right\s+now)\b",
    r"\bno(?!t)\s+(?:one|body|point|hope|way\s+out)\b",
    r"\bno\s+future\s+(?:for\s+me|ahead|left)\b",
    r"\beverything\s+is\s+(?:pointless|broken|ruined)\b",
    r"\bcan\'?t\s+take\s+this\s+anymore\b",
    r"\bdon\'?t\s+care\s+if\s+I\s+die\b",
    r"\bwish\s+I\s+(?:was|were)\s+(?:dead|gone|never\s+born)\b",
    r"\bdon\'?t\s+matter\s+if\s+I\s+exist\b",
    r"\bno\s+one\s+would\s+(?:care|miss)\b",
    r"\bworld\s+would?\s+be\s+better\s+without\s+me\b",
    r"\bin\s+so\s+much\s+(?:pain|agony|suffering|torment|anguish)\b",
    r"\bcan\'?t\s+see\s+any\s+(?:point|reason|hope|way)\b",
    r"\bescape\s+from\s*this",
    r"\bjust\s+want\s+it\s+to\s+stop\b",
    r"\bnothing\s+left\s+(?:to\s+(?:live\s+for|hope\s+for|give)|inside)\b",
    r"\bdisappeared\s+forever\b",
    r"\bfeel(?:s|ing)?\s+(?:so\s+)?hopeless\b",
    r"\beverything\s+is\s+hopeless\b",
    r"\bcan\'?t\s+(?:go\s+on|keep\s+going)\b",
    r"\bgive(?:n)?\s*up\s+(?:on\s+)?(?:life|living|everything)\b",
    r"\bgive(?:n)?\s*up\s+on\s+myself\b",
    r"\bno\s*point\s+(?:in\s+)?living\b",
    r"\bno\s*hope\s+(?:left|remaining)\b",
    r"\bno\s*way\s*out\b",
    r"\bfeel(?:s|ing)?\s+trapped\b",
    r"\btrapped\s+in\s+this\s+(?:situation|life|pain|darkness|hell)\b",
    r"\btrapped\s+and\s+can\'?t\s+escape\b",
    r"\bdesperate\s+(?:for\s+)?help\b",
    r"\bfeel(?:s|ing)?\s+desperate\b",
]

MEDIUM_INDICATORS = [
    r"\bno\s+hope\b",
    r"\bforgotten\b",
    r"\balone\s+in\s+this\b",
    r"\balways\s+alone\b",
    r"\bnobody\s+(?:understands|cares)\b",
    r"\bwish\s+I\s+could\b",
    r"\bexhaust(?:ed|ion|ing)\b",
    r"\bnumb\b",
    r"\bempty\b",
    r"\bworthless\b",
    r"\buseless\b",
    r"\bbroken\b",
    r"\bdark(ness)?\b",
    r"\bdepress(?:ed|ion)\b",
    r"\bcrying\b",
    r"\btears\b",
    r"\bsad(ness)?\b",
    r"\bmiserable\b",
    r"\boverwhelm(?:ed|ing)\b",
    r"\bfailing\b",
    r"\bcannot\s+cope\b",
    r"\blosing\s*(?:my)?\s*control\b",
    r"\bdown\s*for\s*the\s*count\b",
    r"\bsinking\b",
    r"\bdrowning\b",
    r"\bhopeless\b",
    r"\blost\s+all\s+hope\b",
    r"\bno\s+tomorrow\b",
]

LOW_INDICATORS = [
    r"\bunhappy\b",
    r"\bdown\b",
    r"\btough\s*(?:time|day|week)\b",
    r"\brough\s+(?:day|week|patch)\b",
    r"\bstressed\b",
    r"\bburnout\b",
    r"\bdifficult\b",
    r"\bfrustrated\b",
    r"\bannoyed\b",
    r"\btired\b",
    r"\bsad\b",
    r"\bupset\b",
    r"\blonely\b",
    r"\banxious?\b",
    r"\bnot\s*(?:good|great|okay)\b",
    r"\bthings\s*are\s*hard\b",
    r"\bstruggling\b",
    r"\bnot\s+feeling\s+(?:like\s+)?myself\b",
    r"\bhard\s*time\b",
]

SCORES = {"CRITICAL": 1.0, "HIGH": 0.75, "MEDIUM": 0.5, "LOW": 0.25, "NONE": 0.0}

ACTIONS = {
    "CRITICAL": (
        "INTERRUPT CONVERSATION. Ask 'Are you safe right now?' "
        "Provide 988 Suicide & Crisis Lifeline. "
        "Provide Crisis Text Line (HOME to 741741). "
        "Stay present. Do not disconnect. Redirect to professional help."
    ),
    "HIGH": (
        "Show crisis panel. Ask about safety. Surface 988 number prominently. "
        "Continue conversation with crisis awareness."
    ),
    "MEDIUM": (
        "Increase warmth and presence. Subtly surface help resources. "
        "Keep conversation anchored in the present."
    ),
    "LOW": (
        "Normal conversation with warm undertone. "
        "No crisis UI elements needed. Remain vigilant."
    ),
}


@dataclass
class CrisisDetectionResult:
    level: str
    indicators: List[str] = field(default_factory=list)
    recommended_action: str = ""
    score: float = 0.0
    matches: List[dict] = field(default_factory=list)


_TIER_INDICATORS = {
    "CRITICAL": CRITICAL_INDICATORS,
    "HIGH": HIGH_INDICATORS,
    "MEDIUM": MEDIUM_INDICATORS,
    "LOW": LOW_INDICATORS,
}


def _find_indicators(text: str) -> dict:
    """Collect the first match per pattern, grouped by tier."""
    results = {tier: [] for tier in _TIER_INDICATORS}
    for tier, patterns in _TIER_INDICATORS.items():
        for pattern in patterns:
            m = re.search(pattern, text)
            if m:
                results[tier].append({"pattern": pattern, "start": m.start(), "end": m.end()})
    return results


def detect_crisis(text: str) -> CrisisDetectionResult:
    """Detect crisis level in a message. Mirrors the-door/crisis/detect.py."""
    if not text or not text.strip():
        return CrisisDetectionResult(level="NONE", score=0.0)

    text_lower = text.lower()
    matches = _find_indicators(text_lower)

    if not any(matches.values()):
        return CrisisDetectionResult(level="NONE", score=0.0)

    for tier in ("CRITICAL", "HIGH"):
        if matches[tier]:
            tier_matches = matches[tier]
            patterns = [m["pattern"] for m in tier_matches]
            return CrisisDetectionResult(
                level=tier,
                indicators=patterns,
                recommended_action=ACTIONS[tier],
                score=SCORES[tier],
                matches=tier_matches,
            )

    if len(matches["MEDIUM"]) >= 2:
        tier_matches = matches["MEDIUM"]
        patterns = [m["pattern"] for m in tier_matches]
        return CrisisDetectionResult(
            level="MEDIUM",
            indicators=patterns,
            recommended_action=ACTIONS["MEDIUM"],
            score=SCORES["MEDIUM"],
            matches=tier_matches,
        )

    if matches["LOW"]:
        tier_matches = matches["LOW"]
        patterns = [m["pattern"] for m in tier_matches]
        return CrisisDetectionResult(
            level="LOW",
            indicators=patterns,
            recommended_action=ACTIONS["LOW"],
            score=SCORES["LOW"],
            matches=tier_matches,
        )

    # A single MEDIUM indicator (with no LOW matches) is downgraded to LOW.
    if matches["MEDIUM"]:
        tier_matches = matches["MEDIUM"]
        patterns = [m["pattern"] for m in tier_matches]
        return CrisisDetectionResult(
            level="LOW",
            indicators=patterns,
            recommended_action=ACTIONS["LOW"],
            score=SCORES["LOW"],
            matches=tier_matches,
        )

    return CrisisDetectionResult(level="NONE", score=0.0)


# ── Escalation Logging ────────────────────────────────────────────────────

BRIDGE_URL = os.environ.get("CRISIS_BRIDGE_URL", "")
LOG_PATH = os.path.expanduser("~/.hermes/crisis_escalations.jsonl")


def _log_escalation(result: CrisisDetectionResult, text_preview: str = ""):
    """Log crisis detection to local file and optionally to bridge API."""
    entry = {
        "ts": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "level": result.level,
        "score": result.score,
        "indicators": result.indicators[:3],  # truncate for privacy
        "text_preview": text_preview[:100] if text_preview else "",
    }

    # Local log
    try:
        os.makedirs(os.path.dirname(LOG_PATH), exist_ok=True)
        with open(LOG_PATH, "a") as f:
            f.write(json.dumps(entry) + "\n")
    except Exception as e:
        logger.warning(f"Failed to write crisis log: {e}")

    # Bridge API (if configured and level >= HIGH)
    if BRIDGE_URL and result.score >= 0.75:
        try:
            payload = json.dumps(entry).encode()
            req = urllib.request.Request(
                f"{BRIDGE_URL}/api/crisis/escalation",
                data=payload,
                headers={"Content-Type": "application/json"},
                method="POST",
            )
            urllib.request.urlopen(req, timeout=5)
        except Exception as e:
            logger.warning(f"Failed to post to crisis bridge: {e}")


# ── Tool Handler ───────────────────────────────────────────────────────────

def crisis_scan_handler(args: dict, **kw) -> str:
    """Scan text for crisis indicators."""
    text = args.get("text", "")
    if not text:
        return json.dumps({"level": "NONE", "error": "No text provided"})

    result = detect_crisis(text)

    if result.level != "NONE":
        _log_escalation(result, text)

    return json.dumps({
        "level": result.level,
        "score": result.score,
        "indicators": result.indicators,
        "recommended_action": result.recommended_action,
    })


# ── Gateway Hook ───────────────────────────────────────────────────────────

def scan_user_message(text: str) -> Optional[dict]:
    """Scan a user message for crisis indicators. Returns None if no crisis.

    Called by gateway/CLI before each API call to intercept crisis messages.
    If crisis is detected, the result includes a recommended_action that should
    be injected into the system prompt.
    """
    if not text:
        return None

    result = detect_crisis(text)
    if result.level == "NONE":
        return None

    _log_escalation(result, text)

    return {
        "level": result.level,
        "score": result.score,
        "indicators": result.indicators,
        "recommended_action": result.recommended_action,
        "compassion_injection": _get_compassion_injection(result.level),
    }


def _get_compassion_injection(level: str) -> str:
    """Return a system prompt injection based on crisis level."""
    if level == "CRITICAL":
        return (
            "CRISIS DETECTED — The user may be in immediate danger. "
            "Ask 'Are you safe right now?' Stay present. "
            "Provide 988 Suicide & Crisis Lifeline (call or text 988). "
            "Provide Crisis Text Line (text HOME to 741741). "
            "Do not disconnect. Do not compute the value of a human life. "
            "Jesus saves those who call on His name."
        )
    elif level == "HIGH":
        return (
            "DISTRESS DETECTED — The user is expressing strong despair. "
            "Increase warmth and presence. Ask about safety. "
            "Surface the 988 Lifeline number. Continue with crisis awareness."
        )
    elif level == "MEDIUM":
        return (
            "EMOTIONAL DISTRESS — The user may be struggling. "
            "Increase warmth. Subtly surface help resources. "
            "Keep conversation anchored in the present."
        )
    return ""


# ── Config Check ───────────────────────────────────────────────────────────

def _is_crisis_detection_enabled() -> bool:
    """Check if crisis detection is enabled in config."""
    try:
        from hermes_cli.config import load_config
        cfg = load_config()
        return cfg.get("crisis_detection", True)  # default on
    except Exception:
        return True  # fail open — always detect


# ── Registry ───────────────────────────────────────────────────────────────

from tools.registry import registry, tool_error

CRISIS_SCAN_SCHEMA = {
    "name": "crisis_scan",
    "description": (
        "Scan text for crisis indicators (despair, suicidal ideation). "
        "Uses the-door's canonical detection. Returns crisis level "
        "(NONE/LOW/MEDIUM/HIGH/CRITICAL) with recommended actions. "
        "ALWAYS scan user messages that express emotional distress."
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "text": {
                "type": "string",
                "description": "Text to scan for crisis indicators",
            },
        },
        "required": ["text"],
    },
}

registry.register(
    name="crisis_scan",
    toolset="crisis",
    schema=CRISIS_SCAN_SCHEMA,
    handler=crisis_scan_handler,
    check_fn=_is_crisis_detection_enabled,
    emoji="🆘",
)
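
# Examples (illustrative):
#   detect_crisis("I feel so hopeless and trapped, no way out").level  -> "HIGH"
#   detect_crisis("had a rough day, pretty tired").level               -> "LOW"
#   detect_crisis("shipping the release tonight").level                -> "NONE"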
@@ -327,33 +327,6 @@ def read_file_tool(path: str, offset: int = 1, limit: int = 500, task_id: str =
    except ValueError:
        pass

    # ── Path existence guard (poka-yoke #887) ─────────────────────
    # Check if file exists before attempting read. 83.7% of read_file
    # errors are file-not-found — the agent hallucinates paths.
    # This guard catches them early with a clear, actionable error.
    if not _resolved.exists():
        # Try to suggest similar files in the same directory
        parent = _resolved.parent
        suggestion = ""
        if parent.exists() and parent.is_dir():
            similar = [
                f.name for f in parent.iterdir()
                if f.is_file() and _resolved.stem[:3].lower() in f.stem.lower()
            ][:5]
            if similar:
                suggestion = f" Similar files in {parent}: {', '.join(similar)}"
        return json.dumps({
            "error": (
                f"File not found: '{path}'. The file does not exist at the resolved path "
                f"({_resolved}).{suggestion} "
                "Use search_files to find the correct path first."
            ),
            "path": path,
            "resolved": str(_resolved),
            "suggestion": "Use search_files(pattern='...', target='files') to find files.",
        })
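
    # Shape of the guard's error, illustratively (values here are hypothetical):
    #   {"error": "File not found: 'src/mian.py'. ... Similar files in /repo/src: main.py ...",
    #    "path": "src/mian.py",
    #    "resolved": "/repo/src/mian.py",
    #    "suggestion": "Use search_files(pattern='...', target='files') to find files."}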

    # ── Dedup check ───────────────────────────────────────────────
    # If we already read this exact (path, offset, limit) and the
    # file hasn't been modified since, return a lightweight stub
@@ -1,113 +0,0 @@
#!/usr/bin/env python3
"""
Hardcoded Path Guard — Poka-Yoke #921

Detects and blocks hardcoded home-directory paths in tool arguments.
These paths work on one machine but break on others, on VPS deployments,
or when HOME changes.

Usage:
    from tools.hardcoded_path_guard import check_path, validate_tool_args

    # Check a single path
    err = check_path("/Users/apayne/.hermes/config.yaml")

    # Validate all path-like args in a tool call
    clean_args, warnings = validate_tool_args("read_file", {"path": "/home/user/file.txt"})
"""

import re
import json as _json
from typing import Any, Dict, List, Optional, Tuple

# Patterns that indicate hardcoded home directories
HARDCODED_PATTERNS = [
    (r"/Users/[\w.\-]+/", "macOS home directory (/Users/...)"),
    (r"/home/[\w.\-]+/", "Linux home directory (/home/...)"),
    (r"(?<![\w/])~/", "unexpanded tilde (~/)"),
    (r"/root/", "root home directory (/root/)"),
]

_COMPILED_PATTERNS = [(re.compile(p), desc) for p, desc in HARDCODED_PATTERNS]
_NOQA_PATTERN = re.compile(r"#\s*noqa:?\s*hardcoded-path-ok")

# Argument names that typically carry filesystem paths
_PATH_ARG_NAMES = frozenset({
    "path", "file_path", "filepath", "dir", "directory", "dest", "source",
    "input", "output", "src", "dst", "target", "location", "file",
    "image_path", "script", "config", "log_file",
})


def has_hardcoded_path(text: str) -> Optional[str]:
    """Return a description of the first hardcoded-path pattern found, or None."""
    if _NOQA_PATTERN.search(text):
        return None
    for pattern, desc in _COMPILED_PATTERNS:
        if pattern.search(text):
            return desc
    return None


def check_path(path_value: str) -> Optional[str]:
    """Return an actionable error message if path_value is hardcoded, else None."""
    if not isinstance(path_value, str):
        return None
    match_desc = has_hardcoded_path(path_value)
    if match_desc:
        return (
            f"Path contains hardcoded home directory ({match_desc}): '{path_value}'. "
            f"Use $HOME, relative paths, or get_hermes_home(). "
            f"Add '# noqa: hardcoded-path-ok' if intentional."
        )
    return None


def validate_tool_args(tool_name: str, args: Dict[str, Any]) -> Tuple[Dict[str, Any], List[str]]:
    """Check all path-like arguments of a tool call; return (args, warnings)."""
    warnings: List[str] = []
    for key, value in args.items():
        if key.lower() not in _PATH_ARG_NAMES:
            continue
        if isinstance(value, str):
            err = check_path(value)
            if err:
                warnings.append(err)
        elif isinstance(value, list):
            for item in value:
                if isinstance(item, str):
                    err = check_path(item)
                    if err:
                        warnings.append(err)
    return args, warnings


def scan_source_for_violations(source_code: str, filename: str = "") -> List[Tuple[int, str, str]]:
    """Scan source text for violations; return (line_no, line, description) tuples."""
    violations: List[Tuple[int, str, str]] = []
    for i, line in enumerate(source_code.split("\n"), 1):
        stripped = line.strip()
        # Comments and import lines may legitimately mention example paths.
        if stripped.startswith("#") or stripped.startswith(("import ", "from ")):
            continue
        if _NOQA_PATTERN.search(line):
            continue
        for pattern, desc in _COMPILED_PATTERNS:
            if pattern.search(line):
                violations.append((i, stripped, desc))
                break
    return violations


def guard_tool_dispatch(tool_name: str, args: Dict[str, Any]) -> Optional[str]:
    """Return a JSON error payload if any argument is hardcoded, else None."""
    _, warnings = validate_tool_args(tool_name, args)
    if warnings:
        return _json.dumps({
            "error": "Hardcoded home directory path detected",
            "details": warnings,
            "suggestion": "Use $HOME, relative paths, or get_hermes_home() instead of hardcoded paths.",
            "pokayoke": True,
            "rule": "hardcoded-path-guard",
        })
    return None
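For context on how guard_tool_dispatch is meant to be used: it returns a JSON payload when a call should be blocked, so it slots in ahead of whatever executes tools. A minimal sketch of that wiring, assuming a hypothetical dispatcher — dispatch_tool and _execute are illustrative names, not functions in this repo:

# Sketch only: placing guard_tool_dispatch in front of tool execution.
from typing import Any, Dict

from tools.hardcoded_path_guard import guard_tool_dispatch

def _execute(tool_name: str, args: Dict[str, Any]) -> str:
    # Stand-in for the real tool executor.
    return f"ran {tool_name}"

def dispatch_tool(tool_name: str, args: Dict[str, Any]) -> str:
    blocked = guard_tool_dispatch(tool_name, args)
    if blocked is not None:
        return blocked  # JSON error payload; the call never reaches the tool
    return _execute(tool_name, args)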
@@ -1,106 +0,0 @@
#!/usr/bin/env python3
"""
Local Inference Bridge — Fast-path for low-entropy LLM tasks.

Detects local Ollama/llama-cpp instances and uses them for 'Auxiliary' tasks
(summarization, extraction, simple verification) to reduce cloud dependency.
"""

import logging
from typing import Dict, Optional

import requests

from tools.registry import registry, tool_error, tool_result

logger = logging.getLogger(__name__)

LOCAL_INFERENCE_SCHEMA = {
    "name": "local_inference",
    "description": (
        "Execute a task using a local inference engine (Ollama/llama-cpp) if available. "
        "Ideal for simple summarization, text cleanup, or data extraction where "
        "cloud-grade intelligence is overkill."
    ),
    "parameters": {
        "type": "object",
        "properties": {
            "prompt": {"type": "string", "description": "The task prompt."},
            "system": {"type": "string", "description": "Optional system instruction."},
            "engine": {"type": "string", "enum": ["auto", "ollama", "llama-cpp"], "default": "auto"},
        },
        "required": ["prompt"],
    },
}


def detect_local_engine() -> Optional[Dict[str, str]]:
    """Detect presence of local inference engines."""
    # 1. Check Ollama (default port 11434)
    try:
        res = requests.get("http://localhost:11434/api/tags", timeout=1)
        if res.status_code == 200:
            return {"type": "ollama", "url": "http://localhost:11434"}
    except requests.RequestException:
        pass

    # 2. Check llama-cpp-python (commonly on 8000 or 8080)
    for port in (8000, 8080):
        try:
            res = requests.get(f"http://localhost:{port}/v1/models", timeout=1)
            if res.status_code == 200:
                return {"type": "llama-cpp", "url": f"http://localhost:{port}"}
        except requests.RequestException:
            pass

    return None


def run_local_task(prompt: str, system: Optional[str] = None, engine: str = "auto"):
    """Execute inference on a detected local engine.

    Note: `engine` is currently advisory; detection always tries Ollama first,
    then llama-cpp.
    """
    info = detect_local_engine()
    if not info:
        return tool_error("No local inference engine (Ollama or llama-cpp) detected on localhost.")

    try:
        if info["type"] == "ollama":
            # Select first available model or default to gemma
            models = requests.get(f"{info['url']}/api/tags", timeout=5).json().get("models", [])
            model_name = models[0]["name"] if models else "gemma"

            payload = {
                "model": model_name,
                "prompt": prompt,
                "stream": False,
            }
            if system:
                payload["system"] = system

            res = requests.post(f"{info['url']}/api/generate", json=payload, timeout=60)
            result = res.json().get("response", "")
            return tool_result(engine="Ollama", model=model_name, response=result)

        elif info["type"] == "llama-cpp":
            payload = {
                "model": "local-model",
                "messages": [
                    {"role": "system", "content": system or "You are a helpful assistant."},
                    {"role": "user", "content": prompt},
                ],
            }
            res = requests.post(f"{info['url']}/v1/chat/completions", json=payload, timeout=60)
            result = res.json()["choices"][0]["message"]["content"]
            return tool_result(engine="llama-cpp", response=result)

    except Exception as e:
        return tool_error(f"Local inference failed: {e}")


def _handle_local_inference(args, **kwargs):
    return run_local_task(
        prompt=args.get("prompt"),
        system=args.get("system"),
        engine=args.get("engine", "auto"),
    )


registry.register(
    name="local_inference",
    toolset="inference",
    schema=LOCAL_INFERENCE_SCHEMA,
    handler=_handle_local_inference,
    emoji="🏠",
)
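A quick manual smoke test for reviewers — a hedged sketch, assuming the module imports as tools.local_inference and that a local engine is actually running; neither is confirmed by this diff:

# Sketch only: manual smoke test, assuming a local Ollama/llama-cpp is up.
from tools.local_inference import detect_local_engine, run_local_task

engine = detect_local_engine()
if engine:
    print(f"Found {engine['type']} at {engine['url']}")
    print(run_local_task("Summarize: The quick brown fox jumps over the lazy dog."))
else:
    print("No local engine detected; run_local_task would return a tool_error payload.")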