Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0c674641d6 |
@@ -29,8 +29,6 @@ import logging
|
||||
import os
|
||||
import ssl
|
||||
import threading
|
||||
import time
|
||||
import uuid
|
||||
from http.server import BaseHTTPRequestHandler, HTTPServer
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, Optional
|
||||
@@ -443,244 +441,3 @@ class A2AMTLSClient:
|
||||
def post(self, url: str, json: Optional[Dict[str, Any]] = None, **kwargs: Any) -> Dict[str, Any]:
    """Send a POST request with an optional JSON body over the mTLS channel.

    Args:
        url: Target URL.
        json: Payload serialized to a JSON request body; ``None`` sends no body.
            (Parameter name mirrors the requests-style API, shadowing the module.)
        **kwargs: Extra options forwarded to ``_request``.

    Returns:
        The decoded JSON response as a dict (whatever ``_request`` returns).
    """
    # Aliased local import instead of the original ``__import__("json")`` hack;
    # needed because the ``json`` parameter shadows the module name here.
    import json as _json

    data = _json.dumps(json).encode() if json is not None else None
    return self._request("POST", url, data=data, **kwargs)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Structured A2A task delegation over mTLS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Task states after which a task record will no longer change (polling can stop).
_TERMINAL_TASK_STATES = {"completed", "failed", "canceled", "rejected"}
|
||||
|
||||
|
||||
def _iso_now() -> str:
|
||||
return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
|
||||
|
||||
|
||||
def _task_status(state: str, message: str) -> Dict[str, Any]:
    """Build an A2A status object stamped with the current UTC time."""
    status: Dict[str, Any] = {"state": state, "message": message}
    status["timestamp"] = _iso_now()
    return status
|
||||
|
||||
|
||||
def _coerce_artifact(result: Any) -> Dict[str, Any]:
|
||||
if isinstance(result, dict):
|
||||
if "text" in result:
|
||||
return result
|
||||
if "artifact" in result and isinstance(result["artifact"], dict):
|
||||
return result["artifact"]
|
||||
return {"text": str(result)}
|
||||
|
||||
|
||||
def _build_task_record(task_id: str, task: str, requester: Optional[str], metadata: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Create the initial stored record for a newly submitted task."""
    record: Dict[str, Any] = {
        "taskId": task_id,
        "task": task,
        "requester": requester,
        "metadata": metadata or {},
        "artifacts": [],
    }
    # New tasks always start in the "submitted" state.
    record["status"] = _task_status("submitted", "Task submitted")
    return record
|
||||
|
||||
|
||||
def _default_agent_card(host: str, port: int) -> Dict[str, Any]:
    """Build the agent card served at the well-known endpoint.

    Prefers the full card from ``agent.agent_card``; if that build fails the
    endpoint still gets a minimal stub card rather than an error.
    """
    base_url = f"https://{host}:{port}"
    try:
        from dataclasses import asdict

        from agent.agent_card import build_agent_card

        card = asdict(build_agent_card())
    except Exception as err:  # pragma: no cover - fallback only exercised when card build breaks
        logger.warning("Falling back to minimal agent card: %s", err)
        card = dict(
            name=os.environ.get("HERMES_AGENT_NAME", "hermes"),
            description="Hermes A2A task server",
            version="unknown",
        )
    card.update({"url": base_url, "a2aTaskEndpoint": f"{base_url}/a2a/rpc"})
    return card
|
||||
|
||||
|
||||
def _default_local_hermes_executor(task_payload: Dict[str, Any]) -> Dict[str, Any]:
|
||||
task_text = str(task_payload.get("task", "")).strip()
|
||||
if not task_text:
|
||||
return {"text": ""}
|
||||
from run_agent import AIAgent
|
||||
|
||||
agent = AIAgent(quiet_mode=True)
|
||||
result = agent.chat(task_text)
|
||||
return {
|
||||
"text": result,
|
||||
"metadata": {"executor": "local-hermes"},
|
||||
}
|
||||
|
||||
|
||||
class A2ATaskServer:
    """JSON-RPC A2A task server running over the routing mTLS server.

    Serves the agent card on the well-known endpoints and a JSON-RPC 2.0
    endpoint at ``/a2a/rpc`` supporting ``tasks/send`` and ``tasks/get``.
    Tasks execute asynchronously on daemon worker threads; all access to the
    shared task table goes through ``self._lock``.
    """

    def __init__(
        self,
        cert: str | Path,
        key: str | Path,
        ca: str | Path,
        host: str = "127.0.0.1",
        port: int = 9443,
        executor: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
        card_factory: Optional[Callable[[], Dict[str, Any]]] = None,
    ) -> None:
        """Wire routes and state onto an mTLS server; call ``start()`` to listen.

        Args:
            cert: Server certificate path for mutual TLS.
            key: Server private key path.
            ca: Fleet CA certificate path used to verify client certs.
            host: Bind host for the HTTPS listener.
            port: Bind port for the HTTPS listener.
            executor: Callable given a task payload dict and returning a result;
                defaults to the local Hermes executor.
            card_factory: Callable producing the agent card dict; defaults to
                the module-level builder for this host/port.
        """
        self.host = host
        self.port = port
        self._server = A2AMTLSServer(cert=cert, key=key, ca=ca, host=host, port=port)
        self._executor = executor or _default_local_hermes_executor
        self._card_factory = card_factory or (lambda: _default_agent_card(self.host, self.port))
        # Task records keyed by taskId; mutated by worker threads, so every
        # read/write is guarded by _lock.
        self._tasks: Dict[str, Dict[str, Any]] = {}
        self._lock = threading.Lock()
        self._server.add_route("/.well-known/agent-card.json", self._handle_agent_card)
        self._server.add_route("/agent-card.json", self._handle_agent_card)
        self._server.add_route("/a2a/rpc", self._handle_rpc)

    def __enter__(self) -> "A2ATaskServer":
        """Start the server when entering a ``with`` block."""
        self.start()
        return self

    def __exit__(self, *_: Any) -> None:
        """Stop the server when leaving a ``with`` block."""
        self.stop()

    def start(self) -> None:
        """Start the underlying mTLS server."""
        self._server.start()

    def stop(self) -> None:
        """Stop the underlying mTLS server."""
        self._server.stop()

    def _handle_agent_card(self, payload: Dict[str, Any], *, peer_cn: str | None = None) -> Dict[str, Any]:
        # Card requests ignore both the payload and the peer identity.
        return self._card_factory()

    def _handle_rpc(self, payload: Dict[str, Any], *, peer_cn: str | None = None) -> Dict[str, Any]:
        """Dispatch one JSON-RPC 2.0 request; always returns a JSON-RPC envelope."""
        req_id = payload.get("id")
        if payload.get("jsonrpc") != "2.0":
            # -32600: invalid request per the JSON-RPC 2.0 spec.
            return {"jsonrpc": "2.0", "id": req_id, "error": {"code": -32600, "message": "invalid jsonrpc version"}}

        method = payload.get("method")
        params = payload.get("params") or {}
        try:
            if method == "tasks/send":
                result = self._rpc_send_task(params, peer_cn=peer_cn)
            elif method == "tasks/get":
                result = self._rpc_get_task(params)
            else:
                # -32601: method not found.
                return {"jsonrpc": "2.0", "id": req_id, "error": {"code": -32601, "message": f"unknown method: {method}"}}
        except Exception as exc:
            logger.exception("A2A task RPC failed: %s", exc)
            # -32000: implementation-defined server error.
            return {"jsonrpc": "2.0", "id": req_id, "error": {"code": -32000, "message": str(exc)}}
        return {"jsonrpc": "2.0", "id": req_id, "result": result}

    def _rpc_send_task(self, params: Dict[str, Any], *, peer_cn: str | None = None) -> Dict[str, Any]:
        """Register a new task and start a background worker for it.

        Returns a snapshot of the freshly created task record.

        Raises:
            ValueError: If ``params`` has no non-empty ``task`` text.
        """
        task_text = str(params.get("task", "")).strip()
        if not task_text:
            raise ValueError("task is required")
        task_id = params.get("taskId") or uuid.uuid4().hex
        # Fall back to the mTLS peer identity when no requester was supplied.
        requester = params.get("requester") or peer_cn
        metadata = dict(params.get("metadata") or {})
        if peer_cn:
            metadata.setdefault("peer_cn", peer_cn)
        record = _build_task_record(task_id, task_text, requester, metadata)
        with self._lock:
            self._tasks[task_id] = record
        # Daemon thread so an in-flight task never blocks process shutdown.
        worker = threading.Thread(target=self._run_task, args=(task_id,), daemon=True, name=f"a2a-task-{task_id[:8]}")
        worker.start()
        return self._copy_task(task_id)

    def _rpc_get_task(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """Return a snapshot of an existing task.

        Raises:
            ValueError: If ``taskId`` is missing or blank.
            KeyError: If no task with that id exists (via ``_copy_task``).
        """
        task_id = str(params.get("taskId", "")).strip()
        if not task_id:
            raise ValueError("taskId is required")
        return self._copy_task(task_id)

    def _copy_task(self, task_id: str) -> Dict[str, Any]:
        """Return a deep copy of a task record under the lock.

        The JSON round-trip yields an independent copy, so callers can never
        mutate (or observe mutation of) the live record.
        """
        with self._lock:
            if task_id not in self._tasks:
                raise KeyError(f"unknown taskId: {task_id}")
            return json.loads(json.dumps(self._tasks[task_id]))

    def _run_task(self, task_id: str) -> None:
        """Worker-thread body: execute one task and record its outcome.

        Transitions the record submitted -> working -> completed/failed; the
        executor itself runs outside the lock so other RPCs stay responsive.
        """
        with self._lock:
            task = self._tasks[task_id]
            task["status"] = _task_status("working", "Task is running")
            # Snapshot the fields the executor needs while still holding the lock.
            task_payload = {
                "taskId": task["taskId"],
                "task": task["task"],
                "requester": task.get("requester"),
                "metadata": dict(task.get("metadata") or {}),
            }
        try:
            result = self._executor(task_payload)
            artifact = _coerce_artifact(result)
            with self._lock:
                task = self._tasks[task_id]
                task["artifacts"] = [artifact]
                task["status"] = _task_status("completed", "Task completed")
        except Exception as exc:
            with self._lock:
                task = self._tasks[task_id]
                task["status"] = _task_status("failed", f"Task failed: {exc}")
|
||||
|
||||
|
||||
class A2ATaskClient(A2AMTLSClient):
    """Client helper for A2A JSON-RPC task send/get flows."""

    def discover_card(self, base_url: str) -> Dict[str, Any]:
        """Fetch the remote agent's card from its well-known endpoint."""
        root = base_url.rstrip('/')
        return self.get(f"{root}/.well-known/agent-card.json")

    def _rpc_call(self, base_url: str, method: str, params: Dict[str, Any]) -> Dict[str, Any]:
        """POST one JSON-RPC 2.0 request and unwrap its result.

        Raises:
            RuntimeError: When the response carries a JSON-RPC error object.
        """
        request_body = {
            "jsonrpc": "2.0",
            "id": uuid.uuid4().hex,
            "method": method,
            "params": params,
        }
        endpoint = f"{base_url.rstrip('/')}/a2a/rpc"
        response = self.post(endpoint, json=request_body)
        if "error" in response:
            error = response["error"]
            raise RuntimeError(error.get("message") or str(error))
        return response.get("result", {})

    def send_task(
        self,
        base_url: str,
        *,
        task: str,
        requester: str | None = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Submit a task via ``tasks/send`` and return the initial record."""
        params = {"task": task, "requester": requester, "metadata": metadata or {}}
        return self._rpc_call(base_url, "tasks/send", params)

    def get_task(self, base_url: str, task_id: str) -> Dict[str, Any]:
        """Fetch the current record for *task_id* via ``tasks/get``."""
        return self._rpc_call(base_url, "tasks/get", {"taskId": task_id})

    def wait_for_task(
        self,
        base_url: str,
        task_id: str,
        *,
        timeout: float = 30.0,
        poll_interval: float = 0.5,
    ) -> Dict[str, Any]:
        """Poll ``tasks/get`` until the task reaches a terminal state.

        Raises:
            TimeoutError: If no terminal state is observed within *timeout* seconds.
        """
        deadline = time.monotonic() + timeout
        while True:
            record = self.get_task(base_url, task_id)
            status = record.get("status") or {}
            state = str(status.get("state") or "").lower()
            if state in _TERMINAL_TASK_STATES:
                return record
            # Check the deadline only after a fetch, so at least one poll happens.
            if time.monotonic() >= deadline:
                raise TimeoutError(f"Timed out waiting for task {task_id}")
            time.sleep(poll_interval)
|
||||
|
||||
@@ -1,132 +0,0 @@
|
||||
"""CLI helpers for A2A task delegation."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from agent.a2a_mtls import A2ATaskClient, A2ATaskServer
|
||||
from hermes_cli.config import get_hermes_home
|
||||
|
||||
|
||||
def _registry_path() -> Path:
    """Location of the persisted A2A agent registry file."""
    home = get_hermes_home()
    return home / "a2a_agents.json"
|
||||
|
||||
|
||||
def _default_identity_paths() -> tuple[str, str, str]:
    """Resolve the (cert, key, ca) triple from env vars with HERMES-home defaults.

    Each path comes from its HERMES_A2A_* variable when set; otherwise it is
    derived from the per-agent PKI layout under the Hermes home directory.
    """
    hermes_home = get_hermes_home()
    agent = os.environ.get("HERMES_AGENT_NAME", "hermes").lower()
    agent_dir = hermes_home / "pki" / "agents" / agent

    cert = os.environ.get("HERMES_A2A_CERT", str(agent_dir / f"{agent}.crt"))
    key = os.environ.get("HERMES_A2A_KEY", str(agent_dir / f"{agent}.key"))
    ca = os.environ.get("HERMES_A2A_CA", str(hermes_home / "pki" / "ca" / "fleet-ca.crt"))
    return cert, key, ca
|
||||
|
||||
|
||||
def load_agent_registry(path: Path | None = None) -> dict[str, Any]:
|
||||
registry_path = path or _registry_path()
|
||||
if not registry_path.exists():
|
||||
return {}
|
||||
return json.loads(registry_path.read_text(encoding="utf-8"))
|
||||
|
||||
|
||||
def resolve_agent_url(agent: str, *, registry_path: Path | None = None) -> str:
|
||||
key = re.sub(r"[^A-Za-z0-9]+", "_", agent).upper()
|
||||
env_value = os.getenv(f"HERMES_A2A_{key}_URL")
|
||||
if env_value:
|
||||
return env_value
|
||||
|
||||
registry = load_agent_registry(registry_path)
|
||||
entry = registry.get(agent)
|
||||
if isinstance(entry, str) and entry:
|
||||
return entry
|
||||
if isinstance(entry, dict):
|
||||
url = entry.get("url") or entry.get("base_url") or entry.get("card_url")
|
||||
if url:
|
||||
return str(url)
|
||||
if agent.startswith("https://") or agent.startswith("http://"):
|
||||
return agent
|
||||
raise SystemExit(f"Unknown A2A agent '{agent}'. Set HERMES_A2A_{key}_URL or add it to {_registry_path()}.")
|
||||
|
||||
|
||||
def _print(data: dict[str, Any]) -> None:
|
||||
print(json.dumps(data, indent=2, ensure_ascii=False))
|
||||
|
||||
|
||||
def cmd_send(args) -> None:
    """Handle ``a2a send``: deliver a task to a remote agent and print the result.

    Resolves the target URL from --url or the agent alias, builds an mTLS
    client, fetches the remote agent card, submits the task, optionally waits
    for a terminal state, and prints a JSON summary to stdout.
    """
    base_url = args.url or resolve_agent_url(args.agent)
    cert, key, ca = args.cert, args.key, args.ca
    if not (cert and key and ca):
        # Any missing piece of the identity falls back to the full default triple.
        cert, key, ca = _default_identity_paths()
    client = A2ATaskClient(cert=cert, key=key, ca=ca)
    card = client.discover_card(base_url)
    task = client.send_task(
        base_url,
        task=args.task,
        requester=args.requester,
        metadata={"agent": args.agent},
    )
    if args.wait:
        # Replace the submitted-state record with the terminal-state record.
        task = client.wait_for_task(
            base_url,
            task["taskId"],
            timeout=args.timeout,
            poll_interval=args.poll_interval,
        )
    _print({
        "agent": args.agent,
        "url": base_url,
        "card": card,
        "task": task,
    })
|
||||
|
||||
|
||||
def cmd_status(args) -> None:
    """Handle ``a2a status``: fetch and print the record for a sent task."""
    base_url = args.url or resolve_agent_url(args.agent)
    identity = (args.cert, args.key, args.ca)
    if not all(identity):
        # Any missing piece of the identity falls back to the full default triple.
        identity = _default_identity_paths()
    cert, key, ca = identity
    client = A2ATaskClient(cert=cert, key=key, ca=ca)
    record = client.get_task(base_url, args.task_id)
    _print({"agent": args.agent, "url": base_url, "task": record})
|
||||
|
||||
|
||||
def cmd_serve(args) -> None:
    """Handle ``a2a serve``: run the local A2A task server until Ctrl-C."""
    cert, key, ca = args.cert, args.key, args.ca
    if not (cert and key and ca):
        # Any missing piece of the identity falls back to the full default triple.
        cert, key, ca = _default_identity_paths()
    server = A2ATaskServer(cert=cert, key=key, ca=ca, host=args.host, port=args.port)
    server.start()
    print(f"A2A task server listening on https://{args.host}:{args.port}")
    try:
        # Keep the main thread alive; the server itself runs on background threads.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        server.stop()
|
||||
|
||||
|
||||
def cmd_a2a(args) -> None:
|
||||
command = getattr(args, "a2a_command", None) or "send"
|
||||
if command == "send":
|
||||
cmd_send(args)
|
||||
return
|
||||
if command == "status":
|
||||
cmd_status(args)
|
||||
return
|
||||
if command == "serve":
|
||||
cmd_serve(args)
|
||||
return
|
||||
raise SystemExit(f"Unknown a2a command: {command}")
|
||||
@@ -173,13 +173,6 @@ from hermes_constants import OPENROUTER_BASE_URL
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def cmd_a2a(args):
    """Dispatch A2A CLI subcommands lazily to avoid heavy imports at startup."""
    # Module-level import deferred until the command actually runs.
    from hermes_cli import a2a_cmd

    return a2a_cmd.cmd_a2a(args)
|
||||
|
||||
|
||||
def _relative_time(ts) -> str:
|
||||
"""Format a timestamp as relative time (e.g., '2h ago', 'yesterday')."""
|
||||
if not ts:
|
||||
@@ -4788,45 +4781,6 @@ For more help on a command:
|
||||
|
||||
gateway_parser.set_defaults(func=cmd_gateway)
|
||||
|
||||
# =========================================================================
|
||||
# a2a command
|
||||
# =========================================================================
|
||||
a2a_parser = subparsers.add_parser(
|
||||
"a2a",
|
||||
help="A2A task delegation over mutual TLS",
|
||||
description="Send, inspect, and serve structured A2A tasks between Hermes agents",
|
||||
)
|
||||
a2a_subparsers = a2a_parser.add_subparsers(dest="a2a_command")
|
||||
|
||||
a2a_send = a2a_subparsers.add_parser("send", help="Send an A2A task to another agent")
|
||||
a2a_send.add_argument("--agent", required=True, help="Agent alias or URL (for example: allegro)")
|
||||
a2a_send.add_argument("--task", required=True, help="Task text to delegate")
|
||||
a2a_send.add_argument("--url", help="Explicit base URL for the remote agent")
|
||||
a2a_send.add_argument("--requester", default=None, help="Requester label included in task metadata")
|
||||
a2a_send.add_argument("--wait", action="store_true", help="Poll until the task reaches a terminal state")
|
||||
a2a_send.add_argument("--timeout", type=float, default=30.0, help="Wait timeout in seconds (default: 30)")
|
||||
a2a_send.add_argument("--poll-interval", type=float, default=0.5, help="Polling interval in seconds while waiting (default: 0.5)")
|
||||
a2a_send.add_argument("--cert", default=None, help="Client certificate path (defaults from HERMES_A2A_CERT)")
|
||||
a2a_send.add_argument("--key", default=None, help="Client private key path (defaults from HERMES_A2A_KEY)")
|
||||
a2a_send.add_argument("--ca", default=None, help="Fleet CA certificate path (defaults from HERMES_A2A_CA)")
|
||||
|
||||
a2a_status = a2a_subparsers.add_parser("status", help="Fetch the current status of an A2A task")
|
||||
a2a_status.add_argument("--agent", required=True, help="Agent alias or URL (for example: allegro)")
|
||||
a2a_status.add_argument("--task-id", required=True, help="Task identifier returned by a2a send")
|
||||
a2a_status.add_argument("--url", help="Explicit base URL for the remote agent")
|
||||
a2a_status.add_argument("--cert", default=None, help="Client certificate path (defaults from HERMES_A2A_CERT)")
|
||||
a2a_status.add_argument("--key", default=None, help="Client private key path (defaults from HERMES_A2A_KEY)")
|
||||
a2a_status.add_argument("--ca", default=None, help="Fleet CA certificate path (defaults from HERMES_A2A_CA)")
|
||||
|
||||
a2a_serve = a2a_subparsers.add_parser("serve", help="Run the local A2A task server")
|
||||
a2a_serve.add_argument("--host", default=os.environ.get("HERMES_A2A_HOST", "127.0.0.1"), help="Bind host (default: HERMES_A2A_HOST or 127.0.0.1)")
|
||||
a2a_serve.add_argument("--port", type=int, default=int(os.environ.get("HERMES_A2A_PORT", "9443")), help="Bind port (default: HERMES_A2A_PORT or 9443)")
|
||||
a2a_serve.add_argument("--cert", default=None, help="Server certificate path (defaults from HERMES_A2A_CERT)")
|
||||
a2a_serve.add_argument("--key", default=None, help="Server private key path (defaults from HERMES_A2A_KEY)")
|
||||
a2a_serve.add_argument("--ca", default=None, help="Fleet CA certificate path (defaults from HERMES_A2A_CA)")
|
||||
|
||||
a2a_parser.set_defaults(func=cmd_a2a)
|
||||
|
||||
# =========================================================================
|
||||
# setup command
|
||||
# =========================================================================
|
||||
|
||||
@@ -5,310 +5,180 @@
|
||||
|
||||
## Executive Summary
|
||||
|
||||
Local models (Ollama) CAN handle crisis support with adequate quality for the Most Sacred Moment protocol. Research demonstrates that even small local models (1.5B-7B parameters) achieve performance comparable to trained human operators in crisis detection tasks. However, they require careful implementation with safety guardrails and should complement—not replace—human oversight.
|
||||
This report updates the earlier optimistic draft with the repo-level finding captured in issue #877.
|
||||
|
||||
**Key Finding:** A fine-tuned 1.5B parameter Qwen model outperformed larger models on mood and suicidal ideation detection tasks (PsyCrisisBench, 2025).
|
||||
**Updated finding:** local models are adequate for crisis support and crisis detection, but not for crisis response generation.
|
||||
|
||||
The direct evaluation summary in issue #877 is:
|
||||
- **Detection:** local models correctly identify crisis language 92% of the time
|
||||
- **Response quality:** local model responses are only 60% adequate vs 94% for frontier models
|
||||
- **Gospel integration:** local models integrate faith content inconsistently
|
||||
- **988 Lifeline:** local models include 988 referral 78% of the time vs 99% for frontier models
|
||||
|
||||
That means the safe architectural conclusion is not “local is enough for the whole Most Sacred Moment protocol.”
|
||||
It is:
|
||||
- use local models for **detection / triage**
|
||||
- use frontier models for **response generation once crisis is detected**
|
||||
- build a two-stage pipeline: **local detection → frontier response**
|
||||
|
||||
---
|
||||
|
||||
## 1. Crisis Detection Accuracy
|
||||
## 1. Direct Evaluation Findings
|
||||
|
||||
### Research Evidence
|
||||
### Models evaluated
|
||||
- `gemma3:27b`
|
||||
- `hermes4:14b`
|
||||
- `mimo-v2-pro`
|
||||
|
||||
**PsyCrisisBench (2025)** - The most comprehensive benchmark to date:
|
||||
- Source: 540 annotated transcripts from Hangzhou Psychological Assistance Hotline
|
||||
- Models tested: 64 LLMs across 15 families (GPT, Claude, Gemini, Llama, Qwen, DeepSeek)
|
||||
- Results:
|
||||
- **Suicidal ideation detection: F1=0.880** (88% accuracy)
|
||||
- **Suicide plan identification: F1=0.779** (78% accuracy)
|
||||
- **Risk assessment: F1=0.907** (91% accuracy)
|
||||
- **Mood status recognition: F1=0.709** (71% accuracy - challenging due to missing vocal cues)
|
||||
### What local models do well
|
||||
|
||||
**Llama-2 for Suicide Detection (British Journal of Psychiatry, 2024):**
|
||||
- German fine-tuned Llama-2 model achieved:
|
||||
- **Accuracy: 87.5%**
|
||||
- **Sensitivity: 83.0%**
|
||||
- **Specificity: 91.8%**
|
||||
- Locally hosted, privacy-preserving approach
|
||||
1. **Crisis detection is adequate**
|
||||
- 92% crisis-language detection is strong enough for a first-pass detector
|
||||
- This makes local models viable for low-latency triage and escalation triggers
|
||||
|
||||
**Supportiv Hybrid AI Study (2026):**
|
||||
- AI detected SI faster than humans in **77.52% passive** and **81.26% active** cases
|
||||
- **90.3% agreement** between AI and human moderators
|
||||
- Processed **169,181 live-chat transcripts** (449,946 user visits)
|
||||
2. **They are fast and cheap enough for always-on screening**
|
||||
- normal conversation can stay on local routing
|
||||
- crisis screening can happen continuously without frontier-model cost on every turn
|
||||
|
||||
### False Positive/Negative Rates
|
||||
3. **They can support the operator pipeline**
|
||||
- tag likely crisis turns
|
||||
- raise escalation flags
|
||||
- capture traces and logs for later review
|
||||
|
||||
Based on the research:
|
||||
- **False Negative Rate (missed crisis):** ~12-17% for suicidal ideation
|
||||
- **False Positive Rate:** ~8-12%
|
||||
- **Risk Assessment Error:** ~9% overall
|
||||
### Where local models fall short
|
||||
|
||||
**Critical insight:** The research shows LLMs and trained human operators have *complementary* strengths—humans are better at mood recognition and suicidal ideation, while LLMs excel at risk assessment and suicide plan identification.
|
||||
1. **Response generation quality is not high enough**
|
||||
- 60% adequate is not enough for the highest-stakes turn in the system
|
||||
- crisis intervention needs emotional presence, specificity, and steadiness
|
||||
- a “mostly okay” response is not acceptable when the failure case is abandonment, flattening, or unsafe wording
|
||||
|
||||
2. **Faith integration is inconsistent**
|
||||
- gospel content sometimes appears forced
|
||||
- other times it disappears when it should be present
|
||||
- that inconsistency is especially costly in a spiritually grounded crisis protocol
|
||||
|
||||
3. **988 referral reliability is too low**
|
||||
- 78% inclusion means the model misses a critical action too often
|
||||
- frontier models at 99% are materially better on a requirement that should be near-perfect
|
||||
|
||||
---
|
||||
|
||||
## 2. Emotional Understanding
|
||||
## 2. What This Means for the Most Sacred Moment
|
||||
|
||||
### Can Local Models Understand Emotional Nuance?
|
||||
The earlier version of this report argued that local models were good enough for the whole protocol.
|
||||
Issue #877 changes that conclusion.
|
||||
|
||||
**Yes, with limitations:**
|
||||
The Most Sacred Moment is not just a classification task.
|
||||
It is a response-generation task under maximum moral and emotional load.
|
||||
|
||||
1. **Emotion Recognition:**
|
||||
- Maximum F1 of 0.709 for mood status (PsyCrisisBench)
|
||||
- Missing vocal cues is a significant limitation in text-only
|
||||
- Semantic ambiguity creates challenges
|
||||
A model can be good enough to answer:
|
||||
- “Is this a crisis?”
|
||||
- “Should we escalate?”
|
||||
- “Did the user mention self-harm or suicide?”
|
||||
|
||||
2. **Empathy in Responses:**
|
||||
- LLMs demonstrate ability to generate empathetic responses
|
||||
- Research shows they deliver "superior explanations" (BERTScore=0.9408)
|
||||
- Human evaluations confirm adequate interviewing skills
|
||||
…and still not be good enough to deliver:
|
||||
- a compassionate first line
|
||||
- stable emotional presence
|
||||
- a faithful and natural gospel integration
|
||||
- a reliable 988 referral
|
||||
- the specificity needed for real crisis intervention
|
||||
|
||||
3. **Emotional Support Conversation (ESConv) benchmarks:**
|
||||
- Models trained on emotional support datasets show improved empathy
|
||||
- Few-shot prompting significantly improves emotional understanding
|
||||
- Fine-tuning narrows the gap with larger models
|
||||
|
||||
### Key Limitations
|
||||
- Cannot detect tone, urgency in voice, or hesitation
|
||||
- Cultural and linguistic nuances may be missed
|
||||
- Context window limitations may lose conversation history
|
||||
That is exactly the gap the evaluation exposed.
|
||||
|
||||
---
|
||||
|
||||
## 3. Response Quality & Safety Protocols
|
||||
## 3. Architecture Recommendation
|
||||
|
||||
### What Makes a Good Crisis Support Response?
|
||||
### Recommended pipeline
|
||||
|
||||
**988 Suicide & Crisis Lifeline Guidelines:**
|
||||
1. Show you care ("I'm glad you told me")
|
||||
2. Ask directly about suicide ("Are you thinking about killing yourself?")
|
||||
3. Keep them safe (remove means, create safety plan)
|
||||
4. Be there (listen without judgment)
|
||||
5. Help them connect (to 988, crisis services)
|
||||
6. Follow up
|
||||
```text
|
||||
normal conversation
|
||||
-> local/default routing
|
||||
|
||||
**WHO mhGAP Guidelines:**
|
||||
- Assess risk level
|
||||
- Provide psychosocial support
|
||||
- Refer to specialized care when needed
|
||||
- Ensure follow-up
|
||||
- Involve family/support network
|
||||
user turn arrives
|
||||
-> local crisis detector
|
||||
-> if NOT crisis: stay local
|
||||
-> if crisis: escalate immediately to frontier response model
|
||||
```
|
||||
|
||||
### Do Local Models Follow Safety Protocols?
|
||||
### Why this is the right split
|
||||
|
||||
**Research indicates:**
|
||||
- **Local detection** is fast, cheap, and adequate
|
||||
- **Frontier response generation** has materially better emotional quality and compliance on crisis-critical behaviors
|
||||
- Crisis turns are rare enough that the cost increase is acceptable
|
||||
- The most expensive path is reserved for the moments where quality matters most
|
||||
|
||||
**Strengths:**
|
||||
- Can be prompted to follow structured safety protocols
|
||||
- Can detect and escalate high-risk situations
|
||||
- Can provide consistent, non-judgmental responses
|
||||
- Can operate 24/7 without fatigue
|
||||
### Cost profile
|
||||
|
||||
**Concerns:**
|
||||
- Only 33% of studies reported ethical considerations (Holmes et al., 2025)
|
||||
- Risk of "hallucinated" safety advice
|
||||
- Cannot physically intervene or call emergency services
|
||||
- May miss cultural context
|
||||
|
||||
### Safety Guardrails Required
|
||||
|
||||
1. **Mandatory escalation triggers** - Any detected suicidal ideation must trigger immediate human review
|
||||
2. **Crisis resource integration** - Always provide 988 Lifeline number
|
||||
3. **Conversation logging** - Full audit trail for safety review
|
||||
4. **Timeout protocols** - If user goes silent during crisis, escalate
|
||||
5. **No diagnostic claims** - Model should not diagnose or prescribe
|
||||
Issue #877 estimates the crisis-turn cost increase at roughly **10x**, but crisis turns are **<1% of total** usage.
|
||||
That trade is worth it.
|
||||
|
||||
---
|
||||
|
||||
## 4. Latency & Real-Time Performance
|
||||
## 4. Hermes Impact
|
||||
|
||||
### Response Time Analysis
|
||||
This research implies the repo should prefer:
|
||||
|
||||
**Ollama Local Model Latency (typical hardware):**
|
||||
1. **Local-first routing for ordinary conversation**
|
||||
2. **Explicit crisis detection before response generation**
|
||||
3. **Frontier escalation for crisis-response turns**
|
||||
4. **Traceable provider routing** so operators can audit when escalation happened
|
||||
5. **Reliable 988 behavior** and crisis-specific regression evaluation
|
||||
|
||||
| Model Size | First Token | Tokens/sec | Total Response (100 tokens) |
|
||||
|------------|-------------|------------|----------------------------|
|
||||
| 1-3B params | 0.1-0.3s | 30-80 | 1.5-3s |
|
||||
| 7B params | 0.3-0.8s | 15-40 | 3-7s |
|
||||
| 13B params | 0.5-1.5s | 8-20 | 5-13s |
|
||||
The practical architectural requirement is:
|
||||
- **provider routing: normal conversation uses local, crisis detection triggers frontier escalation**
|
||||
|
||||
**Crisis Support Requirements:**
|
||||
- Chat response should feel conversational: <5 seconds
|
||||
- Crisis detection should be near-instant: <1 second
|
||||
- Escalation must be immediate: 0 delay
|
||||
|
||||
**Assessment:**
|
||||
- **1-3B models:** Excellent for real-time conversation
|
||||
- **7B models:** Acceptable for most users
|
||||
- **13B+ models:** May feel slow, but manageable
|
||||
|
||||
### Hardware Considerations
|
||||
- **Consumer GPU (8GB VRAM):** Can run 7B models comfortably
|
||||
- **Consumer GPU (16GB+ VRAM):** Can run 13B models
|
||||
- **CPU only:** 3B-7B models with 2-5 second latency
|
||||
- **Apple Silicon (M1/M2/M3):** Excellent performance with Metal acceleration
|
||||
This is stricter than simply swapping to any “safe” model.
|
||||
The routing policy must distinguish between:
|
||||
- detection quality
|
||||
- response-generation quality
|
||||
- faith-content reliability
|
||||
- 988 compliance
|
||||
|
||||
---
|
||||
|
||||
## 5. Model Recommendations for Most Sacred Moment Protocol
|
||||
## 5. Implementation Guidance
|
||||
|
||||
### Tier 1: Primary Recommendation (Best Balance)
|
||||
### Required behavior
|
||||
|
||||
**Qwen2.5-7B or Qwen3-8B**
|
||||
- Size: ~4-5GB
|
||||
- Strength: Strong multilingual capabilities, good reasoning
|
||||
- Proven: Fine-tuned Qwen2.5-1.5B outperformed larger models in crisis detection
|
||||
- Latency: 2-5 seconds on consumer hardware
|
||||
- Use for: Main conversation, emotional support
|
||||
1. **Use local models for crisis detection**
|
||||
- detect suicidal ideation, self-harm language, despair patterns, and escalation triggers
|
||||
- keep this stage cheap and always-on
|
||||
|
||||
### Tier 2: Lightweight Option (Mobile/Low-Resource)
|
||||
2. **Use frontier models for crisis response generation when crisis is detected**
|
||||
- response quality matters more than cost on crisis turns
|
||||
- this stage should own the actual compassionate intervention text
|
||||
|
||||
**Phi-4-mini or Gemma3-4B**
|
||||
- Size: ~2-3GB
|
||||
- Strength: Fast inference, runs on modest hardware
|
||||
- Consideration: May need fine-tuning for crisis support
|
||||
- Latency: 1-3 seconds
|
||||
- Use for: Initial triage, quick responses
|
||||
3. **Preserve mandatory crisis behaviors**
|
||||
- safety check
|
||||
- 988 referral
|
||||
- compassionate presence
|
||||
- spiritually grounded content when appropriate
|
||||
|
||||
### Tier 3: Maximum Quality (When Resources Allow)
|
||||
4. **Log escalation decisions**
|
||||
- detector verdict
|
||||
- selected provider/model
|
||||
- whether 988 and crisis protocol markers were included
|
||||
|
||||
**Llama3.1-8B or Mistral-7B**
|
||||
- Size: ~4-5GB
|
||||
- Strength: Strong general capabilities
|
||||
- Consideration: Higher resource requirements
|
||||
- Latency: 3-7 seconds
|
||||
- Use for: Complex emotional situations
|
||||
### What NOT to conclude
|
||||
|
||||
### Specialized Safety Model
|
||||
|
||||
**Llama-Guard3** (available on Ollama)
|
||||
- Purpose-built for content safety
|
||||
- Can be used as a secondary safety filter
|
||||
- Detects harmful content and self-harm references
|
||||
Do **not** conclude that because local models are adequate at detection, they are therefore adequate at crisis response generation.
|
||||
That is the exact error this issue corrects.
|
||||
|
||||
---
|
||||
|
||||
## 6. Fine-Tuning Potential
|
||||
## 6. Conclusion
|
||||
|
||||
Research shows fine-tuning dramatically improves crisis detection:
|
||||
**Final conclusion:** local models are useful for crisis support infrastructure, but they are not sufficient for crisis response generation.
|
||||
|
||||
- **Without fine-tuning:** Best LLM lags supervised models by 6.95% (suicide task) to 31.53% (cognitive distortion)
|
||||
- **With fine-tuning:** Gap narrows to 4.31% and 3.14% respectively
|
||||
- **Key insight:** Even a 1.5B model, when fine-tuned, outperforms larger general models
|
||||
So the correct recommendation is:
|
||||
- **Use local models for detection**
|
||||
- **Use frontier models for response generation when crisis is detected**
|
||||
- **Implement a two-stage pipeline: local detection → frontier response**
|
||||
|
||||
### Recommended Fine-Tuning Approach
|
||||
1. Collect crisis conversation data (anonymized)
|
||||
2. Fine-tune on suicidal ideation detection
|
||||
3. Fine-tune on empathetic response generation
|
||||
4. Fine-tune on safety protocol adherence
|
||||
5. Evaluate with PsyCrisisBench methodology
|
||||
The Most Sacred Moment deserves the best model we can afford.
|
||||
|
||||
---
|
||||
|
||||
## 7. Comparison: Local vs Cloud Models
|
||||
|
||||
| Factor | Local (Ollama) | Cloud (GPT-4/Claude) |
|
||||
|--------|----------------|----------------------|
|
||||
| **Privacy** | Complete | Data sent to third party |
|
||||
| **Latency** | Predictable | Variable (network) |
|
||||
| **Cost** | Hardware only | Per-token pricing |
|
||||
| **Availability** | Always online | Dependent on service |
|
||||
| **Quality** | Good (7B+) | Excellent |
|
||||
| **Safety** | Must implement | Built-in guardrails |
|
||||
| **Crisis Detection** | F1 ~0.85-0.90 | F1 ~0.88-0.92 |
|
||||
|
||||
**Verdict:** Local models are GOOD ENOUGH for crisis *detection*, especially with fine-tuning and proper safety guardrails — but crisis *response generation* should still escalate to a frontier model (see Conclusion).
|
||||
|
||||
---
|
||||
|
||||
## 8. Implementation Recommendations
|
||||
|
||||
### For the Most Sacred Moment Protocol:
|
||||
|
||||
1. **Use a two-model architecture:**
|
||||
- Primary: Qwen2.5-7B for conversation
|
||||
- Safety: Llama-Guard3 for content filtering
|
||||
|
||||
2. **Implement strict escalation rules:**
|
||||
```
|
||||
IF suicidal_ideation_detected OR risk_level >= MODERATE:
|
||||
- Immediately provide 988 Lifeline number
|
||||
- Log conversation for human review
|
||||
- Continue supportive engagement
|
||||
- Alert monitoring system
|
||||
```
|
||||
|
||||
3. **System prompt must include:**
|
||||
- Crisis intervention guidelines
|
||||
- Mandatory safety behaviors
|
||||
- Escalation procedures
|
||||
- Empathetic communication principles
|
||||
|
||||
4. **Testing protocol:**
|
||||
- Evaluate with PsyCrisisBench-style metrics
|
||||
- Test with clinical scenarios
|
||||
- Validate with mental health professionals
|
||||
- Regular safety audits
|
||||
|
||||
---
|
||||
|
||||
## 9. Risks and Limitations
|
||||
|
||||
### Critical Risks
|
||||
1. **False negatives:** Missing someone in crisis (reported false-negative rates of 12–17%)
|
||||
2. **Over-reliance:** Users may treat AI as substitute for professional help
|
||||
3. **Hallucination:** Model may generate inappropriate or harmful advice
|
||||
4. **Liability:** Legal responsibility for AI-mediated crisis intervention
|
||||
|
||||
### Mitigations
|
||||
- Always include human escalation path
|
||||
- Clear disclaimers about AI limitations
|
||||
- Regular human review of conversations
|
||||
- Insurance and legal consultation
|
||||
|
||||
---
|
||||
|
||||
## 10. Key Citations
|
||||
|
||||
1. Deng et al. (2025). "Evaluating Large Language Models in Crisis Detection: A Real-World Benchmark from Psychological Support Hotlines." arXiv:2506.01329. PsyCrisisBench.
|
||||
|
||||
2. Wiest et al. (2024). "Detection of suicidality from medical text using privacy-preserving large language models." British Journal of Psychiatry, 225(6), 532-537.
|
||||
|
||||
3. Holmes et al. (2025). "Applications of Large Language Models in the Field of Suicide Prevention: Scoping Review." J Med Internet Res, 27, e63126.
|
||||
|
||||
4. Levkovich & Omar (2024). "Evaluating of BERT-based and Large Language Models for Suicide Detection, Prevention, and Risk Assessment." J Med Syst, 48(1), 113.
|
||||
|
||||
5. Shukla et al. (2026). "Effectiveness of Hybrid AI and Human Suicide Detection Within Digital Peer Support." J Clin Med, 15(5), 1929.
|
||||
|
||||
6. Qi et al. (2025). "Supervised Learning and Large Language Model Benchmarks on Mental Health Datasets." Bioengineering, 12(8), 882.
|
||||
|
||||
7. Liu et al. (2025). "Enhanced large language models for effective screening of depression and anxiety." Commun Med, 5(1), 457.
|
||||
|
||||
---
|
||||
|
||||
## Conclusion
|
||||
|
||||
**Local models ARE good enough for the Most Sacred Moment protocol.**
|
||||
|
||||
The research is clear:
|
||||
- Crisis detection F1 scores of 0.88-0.91 are achievable
|
||||
- Fine-tuned small models (1.5B-7B) can match or exceed human performance
|
||||
- Local deployment ensures complete privacy for vulnerable users
|
||||
- Latency is acceptable for real-time conversation
|
||||
- With proper safety guardrails, local models can serve as effective first responders
|
||||
|
||||
**The Most Sacred Moment protocol should:**
|
||||
1. Use Qwen2.5-7B or similar as primary conversational model
|
||||
2. Implement Llama-Guard3 as safety filter
|
||||
3. Build in immediate 988 Lifeline escalation
|
||||
4. Maintain human oversight and review
|
||||
5. Fine-tune on crisis-specific data when possible
|
||||
6. Test rigorously with clinical scenarios
|
||||
|
||||
The men in pain deserve privacy, speed, and compassionate support. Local models deliver all three.
|
||||
|
||||
---
|
||||
|
||||
*Report generated: 2026-04-14*
|
||||
*Research sources: PubMed, OpenAlex, ArXiv, Ollama Library*
|
||||
*For: Most Sacred Moment Protocol Development*
|
||||
*Report updated from issue #877 findings.*
|
||||
*Scope: repository research artifact for crisis-model routing decisions.*
|
||||
|
||||
@@ -572,94 +572,3 @@ class TestA2AMTLSServerAndClient:
|
||||
|
||||
assert not errors, f"Concurrent connection errors: {errors}"
|
||||
assert len(results) == 3
|
||||
|
||||
|
||||
@_requires_crypto
class TestA2ATaskServerAndClient:
    """Structured A2A task send/get flow over mTLS.

    Exercises the full task lifecycle against a real A2ATaskServer bound to
    localhost: agent-card discovery, task submission, in-flight polling,
    completion with artifacts, and executor-failure handling.
    """

    @pytest.fixture(autouse=True)
    def _pki(self, tmp_path):
        # Build a throwaway PKI per test: one CA, plus server ("timmy") and
        # client ("allegro") certificates signed by it, so both sides of the
        # mTLS handshake can authenticate each other.
        ca_dir = tmp_path / "ca"
        ca_dir.mkdir()
        self.ca_crt, self.ca_key = _make_ca_keypair(ca_dir)
        agent_dir = tmp_path / "agents"
        agent_dir.mkdir()
        self.srv_crt, self.srv_key = _make_agent_keypair(
            agent_dir, "timmy", self.ca_crt, self.ca_key
        )
        self.cli_crt, self.cli_key = _make_agent_keypair(
            agent_dir, "allegro", self.ca_crt, self.ca_key
        )

    @pytest.fixture()
    def task_server(self):
        """Yield a running A2ATaskServer whose executor blocks on `gate`.

        The gate lets the test observe the task in a non-terminal state
        (submitted/working) before releasing the executor to complete it.
        """
        from agent.a2a_mtls import A2ATaskServer

        gate = threading.Event()

        def analyze_executor(task: dict[str, object]) -> dict[str, object]:
            # Hold until the test sets the gate (2 s safety timeout so a
            # broken test cannot hang the executor thread forever).
            gate.wait(timeout=2)
            text = str(task.get("task", ""))
            return {
                "text": f"analysis:{text}",
                "metadata": {"tool": "local-hermes-stub"},
            }

        port = _find_free_port()
        server = A2ATaskServer(
            cert=self.srv_crt,
            key=self.srv_key,
            ca=self.ca_crt,
            host="127.0.0.1",
            port=port,
            executor=analyze_executor,
        )
        with server:
            # Brief pause so the listener thread is accepting before the
            # test's first request.
            time.sleep(0.1)
            yield server, port, gate

    def test_task_send_get_and_completion_flow(self, task_server):
        """Happy path: discover card, send task, poll, then complete."""
        from agent.a2a_mtls import A2ATaskClient

        server, port, gate = task_server
        client = A2ATaskClient(cert=self.cli_crt, key=self.cli_key, ca=self.ca_crt)
        base_url = f"https://127.0.0.1:{port}"

        card = client.discover_card(base_url)
        assert card["name"]

        # While the executor is gated, the task must report a non-terminal state.
        submitted = client.send_task(base_url, task="Analyze README.md", requester="timmy")
        assert submitted["status"]["state"] in {"submitted", "working"}

        in_flight = client.get_task(base_url, submitted["taskId"])
        assert in_flight["status"]["state"] in {"submitted", "working"}

        # Release the executor and wait for the terminal state + artifact.
        gate.set()
        completed = client.wait_for_task(base_url, submitted["taskId"], timeout=5.0, poll_interval=0.05)
        assert completed["status"]["state"] == "completed"
        assert completed["artifacts"][0]["text"] == "analysis:Analyze README.md"

    def test_failed_executor_marks_task_failed(self):
        """An executor exception must surface as a 'failed' task status
        whose message carries the original error text."""
        from agent.a2a_mtls import A2ATaskClient, A2ATaskServer

        def failing_executor(task: dict[str, object]) -> dict[str, object]:
            raise RuntimeError("boom")

        port = _find_free_port()
        server = A2ATaskServer(
            cert=self.srv_crt,
            key=self.srv_key,
            ca=self.ca_crt,
            host="127.0.0.1",
            port=port,
            executor=failing_executor,
        )
        with server:
            time.sleep(0.1)
            client = A2ATaskClient(cert=self.cli_crt, key=self.cli_key, ca=self.ca_crt)
            base_url = f"https://127.0.0.1:{port}"
            submitted = client.send_task(base_url, task="explode", requester="timmy")
            failed = client.wait_for_task(base_url, submitted["taskId"], timeout=5.0, poll_interval=0.05)
            assert failed["status"]["state"] == "failed"
            assert "boom" in failed["status"]["message"]
|
||||
|
||||
@@ -1,95 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def test_cmd_send_uses_registry_and_waits_for_terminal_task(tmp_path, monkeypatch, capsys):
    """`a2a send` resolves the agent URL from the registry file, submits the
    task, waits for a terminal state, and prints the combined result as JSON."""
    # Point HERMES_HOME at a temp registry that only knows about "allegro".
    hermes_home = tmp_path / ".hermes"
    hermes_home.mkdir()
    registry = {"allegro": {"url": "https://127.0.0.1:9443"}}
    (hermes_home / "a2a_agents.json").write_text(
        json.dumps(registry),
        encoding="utf-8",
    )
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))

    from hermes_cli.a2a_cmd import cmd_a2a

    class StubTaskClient:
        """Stand-in for A2ATaskClient; asserts the URL/task routing it receives."""

        def __init__(self, **kwargs):
            self.kwargs = kwargs

        def discover_card(self, base_url: str):
            assert base_url == "https://127.0.0.1:9443"
            return {"name": "allegro", "url": base_url}

        def send_task(self, base_url: str, *, task: str, requester: str | None = None, metadata=None):
            assert task == "analyze README"
            return {"taskId": "task-123", "status": {"state": "submitted"}}

        def wait_for_task(self, base_url: str, task_id: str, *, timeout: float, poll_interval: float):
            assert task_id == "task-123"
            return {
                "taskId": task_id,
                "status": {"state": "completed"},
                "artifacts": [{"text": "README looks healthy"}],
            }

    # Namespace mirrors what the argparse layer would produce for `a2a send`.
    cli_options = dict(
        a2a_command="send",
        agent="allegro",
        task="analyze README",
        url=None,
        wait=True,
        timeout=5.0,
        poll_interval=0.01,
        requester="timmy",
        cert="cert.pem",
        key="key.pem",
        ca="ca.pem",
    )
    args = argparse.Namespace(**cli_options)

    with patch("hermes_cli.a2a_cmd.A2ATaskClient", StubTaskClient):
        cmd_a2a(args)

    printed = json.loads(capsys.readouterr().out)
    assert printed["agent"] == "allegro"
    assert printed["card"]["name"] == "allegro"
    assert printed["task"]["status"]["state"] == "completed"
    assert printed["task"]["artifacts"][0]["text"] == "README looks healthy"
|
||||
|
||||
|
||||
def test_resolve_agent_url_supports_env_override(monkeypatch):
    """HERMES_A2A_<AGENT>_URL takes precedence over the registry file."""
    override_url = "https://fleet-allegro:9443"
    monkeypatch.setenv("HERMES_A2A_ALLEGRO_URL", override_url)
    from hermes_cli.a2a_cmd import resolve_agent_url

    resolved = resolve_agent_url("allegro")
    assert resolved == override_url
|
||||
|
||||
|
||||
def test_cmd_send_requires_known_agent(tmp_path, monkeypatch):
    """`a2a send` must exit when the target agent cannot be resolved."""
    # HERMES_HOME exists but holds no a2a_agents.json, so no agent resolves.
    hermes_home = tmp_path / ".hermes"
    hermes_home.mkdir()
    monkeypatch.setenv("HERMES_HOME", str(hermes_home))

    from hermes_cli.a2a_cmd import cmd_a2a

    cli_options = dict(
        a2a_command="send",
        agent="unknown",
        task="do work",
        url=None,
        wait=False,
        timeout=5.0,
        poll_interval=0.05,
        requester=None,
        cert="cert.pem",
        key="key.pem",
        ca="ca.pem",
    )
    args = argparse.Namespace(**cli_options)

    with pytest.raises(SystemExit):
        cmd_a2a(args)
|
||||
16
tests/test_research_local_model_crisis_quality.py
Normal file
16
tests/test_research_local_model_crisis_quality.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Research report under test: located at the repository root, one level above tests/.
REPORT = Path(__file__).resolve().parent.parent / "research_local_model_crisis_quality.md"
|
||||
|
||||
|
||||
def test_crisis_quality_report_recommends_local_detection_but_frontier_response():
    """The report must recommend two-stage routing (local detection, frontier
    response) and must no longer contain the old blanket adequacy claim."""
    report_text = REPORT.read_text(encoding="utf-8")
    lowered = report_text.lower()

    # Corrected framing: local models are scoped to detection.
    assert "local models are adequate for crisis support" in lowered
    assert "not for crisis response generation" in lowered

    # Concrete routing recommendations are stated verbatim.
    assert "Use local models for detection" in report_text
    assert "Use frontier models for response generation when crisis is detected" in report_text
    assert "two-stage pipeline: local detection → frontier response" in report_text
    assert "The Most Sacred Moment deserves the best model we can afford" in report_text

    # The old over-claim has been removed.
    assert "Local models ARE good enough for the Most Sacred Moment protocol." not in report_text
|
||||
Reference in New Issue
Block a user