Compare commits

..

2 Commits

Author SHA1 Message Date
Alexander Whitestone
985488bcbe feat: add A2A task delegation over mTLS (#804)
All checks were successful
Lint / lint (pull_request) Successful in 11s
2026-04-22 11:14:26 -04:00
Alexander Whitestone
524868d4f4 test: add red coverage for A2A task delegation (#804) 2026-04-22 11:09:18 -04:00
11 changed files with 845 additions and 1122 deletions

View File

@@ -29,6 +29,8 @@ import logging
import os
import ssl
import threading
import time
import uuid
from http.server import BaseHTTPRequestHandler, HTTPServer
from pathlib import Path
from typing import Any, Callable, Dict, Optional
@@ -441,3 +443,244 @@ class A2AMTLSClient:
def post(self, url: str, json: Optional[Dict[str, Any]] = None, **kwargs: Any) -> Dict[str, Any]:
data = (__import__("json").dumps(json).encode() if json is not None else None)
return self._request("POST", url, data=data, **kwargs)
# ---------------------------------------------------------------------------
# Structured A2A task delegation over mTLS
# ---------------------------------------------------------------------------
_TERMINAL_TASK_STATES = {"completed", "failed", "canceled", "rejected"}
def _iso_now() -> str:
return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
def _task_status(state: str, message: str) -> Dict[str, Any]:
return {
"state": state,
"message": message,
"timestamp": _iso_now(),
}
def _coerce_artifact(result: Any) -> Dict[str, Any]:
if isinstance(result, dict):
if "text" in result:
return result
if "artifact" in result and isinstance(result["artifact"], dict):
return result["artifact"]
return {"text": str(result)}
def _build_task_record(task_id: str, task: str, requester: Optional[str], metadata: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
return {
"taskId": task_id,
"task": task,
"requester": requester,
"metadata": metadata or {},
"artifacts": [],
"status": _task_status("submitted", "Task submitted"),
}
def _default_agent_card(host: str, port: int) -> Dict[str, Any]:
base_url = f"https://{host}:{port}"
try:
from agent.agent_card import build_agent_card
from dataclasses import asdict
card = asdict(build_agent_card())
except Exception as exc: # pragma: no cover - fallback only exercised when card build breaks
logger.warning("Falling back to minimal agent card: %s", exc)
card = {
"name": os.environ.get("HERMES_AGENT_NAME", "hermes"),
"description": "Hermes A2A task server",
"version": "unknown",
}
card["url"] = base_url
card["a2aTaskEndpoint"] = f"{base_url}/a2a/rpc"
return card
def _default_local_hermes_executor(task_payload: Dict[str, Any]) -> Dict[str, Any]:
task_text = str(task_payload.get("task", "")).strip()
if not task_text:
return {"text": ""}
from run_agent import AIAgent
agent = AIAgent(quiet_mode=True)
result = agent.chat(task_text)
return {
"text": result,
"metadata": {"executor": "local-hermes"},
}
class A2ATaskServer:
"""JSON-RPC A2A task server running over the routing mTLS server."""
def __init__(
self,
cert: str | Path,
key: str | Path,
ca: str | Path,
host: str = "127.0.0.1",
port: int = 9443,
executor: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None,
card_factory: Optional[Callable[[], Dict[str, Any]]] = None,
) -> None:
self.host = host
self.port = port
self._server = A2AMTLSServer(cert=cert, key=key, ca=ca, host=host, port=port)
self._executor = executor or _default_local_hermes_executor
self._card_factory = card_factory or (lambda: _default_agent_card(self.host, self.port))
self._tasks: Dict[str, Dict[str, Any]] = {}
self._lock = threading.Lock()
self._server.add_route("/.well-known/agent-card.json", self._handle_agent_card)
self._server.add_route("/agent-card.json", self._handle_agent_card)
self._server.add_route("/a2a/rpc", self._handle_rpc)
def __enter__(self) -> "A2ATaskServer":
self.start()
return self
def __exit__(self, *_: Any) -> None:
self.stop()
def start(self) -> None:
self._server.start()
def stop(self) -> None:
self._server.stop()
def _handle_agent_card(self, payload: Dict[str, Any], *, peer_cn: str | None = None) -> Dict[str, Any]:
return self._card_factory()
def _handle_rpc(self, payload: Dict[str, Any], *, peer_cn: str | None = None) -> Dict[str, Any]:
req_id = payload.get("id")
if payload.get("jsonrpc") != "2.0":
return {"jsonrpc": "2.0", "id": req_id, "error": {"code": -32600, "message": "invalid jsonrpc version"}}
method = payload.get("method")
params = payload.get("params") or {}
try:
if method == "tasks/send":
result = self._rpc_send_task(params, peer_cn=peer_cn)
elif method == "tasks/get":
result = self._rpc_get_task(params)
else:
return {"jsonrpc": "2.0", "id": req_id, "error": {"code": -32601, "message": f"unknown method: {method}"}}
except Exception as exc:
logger.exception("A2A task RPC failed: %s", exc)
return {"jsonrpc": "2.0", "id": req_id, "error": {"code": -32000, "message": str(exc)}}
return {"jsonrpc": "2.0", "id": req_id, "result": result}
def _rpc_send_task(self, params: Dict[str, Any], *, peer_cn: str | None = None) -> Dict[str, Any]:
task_text = str(params.get("task", "")).strip()
if not task_text:
raise ValueError("task is required")
task_id = params.get("taskId") or uuid.uuid4().hex
requester = params.get("requester") or peer_cn
metadata = dict(params.get("metadata") or {})
if peer_cn:
metadata.setdefault("peer_cn", peer_cn)
record = _build_task_record(task_id, task_text, requester, metadata)
with self._lock:
self._tasks[task_id] = record
worker = threading.Thread(target=self._run_task, args=(task_id,), daemon=True, name=f"a2a-task-{task_id[:8]}")
worker.start()
return self._copy_task(task_id)
def _rpc_get_task(self, params: Dict[str, Any]) -> Dict[str, Any]:
task_id = str(params.get("taskId", "")).strip()
if not task_id:
raise ValueError("taskId is required")
return self._copy_task(task_id)
def _copy_task(self, task_id: str) -> Dict[str, Any]:
with self._lock:
if task_id not in self._tasks:
raise KeyError(f"unknown taskId: {task_id}")
return json.loads(json.dumps(self._tasks[task_id]))
def _run_task(self, task_id: str) -> None:
with self._lock:
task = self._tasks[task_id]
task["status"] = _task_status("working", "Task is running")
task_payload = {
"taskId": task["taskId"],
"task": task["task"],
"requester": task.get("requester"),
"metadata": dict(task.get("metadata") or {}),
}
try:
result = self._executor(task_payload)
artifact = _coerce_artifact(result)
with self._lock:
task = self._tasks[task_id]
task["artifacts"] = [artifact]
task["status"] = _task_status("completed", "Task completed")
except Exception as exc:
with self._lock:
task = self._tasks[task_id]
task["status"] = _task_status("failed", f"Task failed: {exc}")
class A2ATaskClient(A2AMTLSClient):
"""Client helper for A2A JSON-RPC task send/get flows."""
def discover_card(self, base_url: str) -> Dict[str, Any]:
return self.get(f"{base_url.rstrip('/')}/.well-known/agent-card.json")
def _rpc_call(self, base_url: str, method: str, params: Dict[str, Any]) -> Dict[str, Any]:
payload = {
"jsonrpc": "2.0",
"id": uuid.uuid4().hex,
"method": method,
"params": params,
}
response = self.post(f"{base_url.rstrip('/')}/a2a/rpc", json=payload)
if "error" in response:
error = response["error"]
raise RuntimeError(error.get("message") or str(error))
return response.get("result", {})
def send_task(
self,
base_url: str,
*,
task: str,
requester: str | None = None,
metadata: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
return self._rpc_call(
base_url,
"tasks/send",
{
"task": task,
"requester": requester,
"metadata": metadata or {},
},
)
def get_task(self, base_url: str, task_id: str) -> Dict[str, Any]:
return self._rpc_call(base_url, "tasks/get", {"taskId": task_id})
def wait_for_task(
self,
base_url: str,
task_id: str,
*,
timeout: float = 30.0,
poll_interval: float = 0.5,
) -> Dict[str, Any]:
deadline = time.monotonic() + timeout
while True:
task = self.get_task(base_url, task_id)
state = str(((task.get("status") or {}).get("state") or "")).lower()
if state in _TERMINAL_TASK_STATES:
return task
if time.monotonic() >= deadline:
raise TimeoutError(f"Timed out waiting for task {task_id}")
time.sleep(poll_interval)

View File

@@ -1,757 +1,194 @@
[
{
"id": "screenshot_github_mark",
"id": "screenshot_github_home",
"url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
"category": "screenshot",
"expected_keywords": [
"github",
"logo",
"mark"
],
"expected_keywords": ["github", "logo", "mark"],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "screenshot_github_social",
"url": "https://github.githubassets.com/images/modules/site/social-cards.png",
"category": "screenshot",
"expected_keywords": [
"github",
"page",
"web"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_github_code_search",
"url": "https://github.githubassets.com/images/modules/site/features-code-search.png",
"category": "screenshot",
"expected_keywords": [
"search",
"code",
"feature"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_terminal_capture",
"url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png",
"category": "screenshot",
"expected_keywords": [
"terminal",
"command",
"output"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_http_404",
"url": "https://http.cat/404.jpg",
"category": "screenshot",
"expected_keywords": [
"404",
"error",
"cat"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_dummy_cli_01",
"url": "https://dummyimage.com/1280x720/111827/f9fafb.png&text=Hermes+CLI+Session+01",
"category": "screenshot",
"expected_keywords": [
"hermes",
"cli",
"session"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_dummy_cli_02",
"url": "https://dummyimage.com/1280x720/0f172a/e2e8f0.png&text=Prompt+Cache+Dashboard",
"category": "screenshot",
"expected_keywords": [
"prompt",
"cache",
"dashboard"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_dummy_ui_01",
"url": "https://dummyimage.com/1280x720/1f2937/f3f4f6.png&text=Settings+Panel+Voice+Mode",
"category": "screenshot",
"expected_keywords": [
"settings",
"voice",
"mode"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_dummy_ui_02",
"url": "https://dummyimage.com/1280x720/334155/f8fafc.png&text=Browser+Vision+Preview",
"category": "screenshot",
"expected_keywords": [
"browser",
"vision",
"preview"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "screenshot_dummy_ui_03",
"url": "https://dummyimage.com/1280x720/111827/ffffff.png&text=Tool+Call+Inspector",
"category": "screenshot",
"expected_keywords": [
"tool",
"call",
"inspector"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "diagram_flow_a",
"url": "https://dummyimage.com/1200x800/f8fafc/0f172a.png&text=Flowchart+API+Gateway+Queue+Worker",
"id": "diagram_mermaid_flow",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6siSZXVhjQTlgl1nigHg5fRBOzSfebopROCu_cytObSfgLSE1ANOeZWkO2IH5upZxYot8m1hqAdpD_63WRl0xdUG1jdl9kPiOb_EWk2JBtPaiKkF4eVIYgO0EtkW-RSgC4gJ6HJYRG1UNdN0HNVd0Bftjj7X8P92qPj-F8l8T3w",
"category": "diagram",
"expected_keywords": [
"flowchart",
"api",
"worker"
],
"expected_keywords": ["flow", "diagram", "process"],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "diagram_flow_b",
"url": "https://dummyimage.com/1200x800/f1f5f9/0f172a.png&text=Architecture+Diagram+Database+Cache+Client",
"category": "diagram",
"expected_keywords": [
"architecture",
"diagram",
"cache"
],
"id": "photo_random_1",
"url": "https://picsum.photos/seed/vision1/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "diagram_uml_a",
"url": "https://dummyimage.com/1200x800/e2e8f0/0f172a.png&text=Class+Diagram+User+Session+Message",
"category": "diagram",
"expected_keywords": [
"class",
"diagram",
"session"
],
"id": "photo_random_2",
"url": "https://picsum.photos/seed/vision2/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "diagram_uml_b",
"url": "https://dummyimage.com/1200x800/cbd5e1/0f172a.png&text=Sequence+Diagram+Request+Response",
"category": "diagram",
"expected_keywords": [
"sequence",
"diagram",
"response"
],
"id": "chart_simple_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}",
"category": "chart",
"expected_keywords": ["bar", "chart", "revenue"],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
},
{
"id": "diagram_network_a",
"url": "https://dummyimage.com/1200x800/ffffff/111827.png&text=Network+Nodes+Edges+Router",
"category": "diagram",
"expected_keywords": [
"network",
"node",
"router"
],
"id": "chart_pie",
"url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}",
"category": "chart",
"expected_keywords": ["pie", "chart", "percentage"],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "diagram_network_b",
"url": "https://dummyimage.com/1200x800/ffffff/1e293b.png&text=Service+Mesh+Proxy+Auth",
"category": "diagram",
"expected_keywords": [
"service",
"mesh",
"auth"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "diagram_state_machine",
"url": "https://dummyimage.com/1200x800/f8fafc/334155.png&text=State+Machine+Idle+Run+Stop",
"category": "diagram",
"expected_keywords": [
"state",
"machine",
"idle"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "diagram_mind_map",
"url": "https://dummyimage.com/1200x800/fefce8/1f2937.png&text=Mind+Map+Memory+Recall+Tools",
"category": "diagram",
"expected_keywords": [
"mind",
"memory",
"tools"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
},
{
"id": "diagram_pipeline",
"url": "https://dummyimage.com/1200x800/ecfeff/155e75.png&text=Pipeline+Ingest+Rank+Summarize",
"category": "diagram",
"expected_keywords": [
"pipeline",
"ingest",
"summarize"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
},
{
"id": "diagram_org_chart",
"url": "https://dummyimage.com/1200x800/fdf2f8/831843.png&text=Org+Chart+Lead+Review+Ops",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"category": "diagram",
"expected_keywords": [
"org",
"chart",
"review"
],
"expected_keywords": ["organization", "hierarchy", "chart"],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": false
}
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "photo_random_01",
"url": "https://picsum.photos/seed/vision-bench-1/640/480",
"id": "screenshot_terminal",
"url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png",
"category": "screenshot",
"expected_keywords": ["terminal", "command", "output"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "photo_random_3",
"url": "https://picsum.photos/seed/vision3/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "photo_random_02",
"url": "https://picsum.photos/seed/vision-bench-2/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_03",
"url": "https://picsum.photos/seed/vision-bench-3/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_04",
"url": "https://picsum.photos/seed/vision-bench-4/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_05",
"url": "https://picsum.photos/seed/vision-bench-5/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_06",
"url": "https://picsum.photos/seed/vision-bench-6/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_07",
"url": "https://picsum.photos/seed/vision-bench-7/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_08",
"url": "https://picsum.photos/seed/vision-bench-8/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_09",
"url": "https://picsum.photos/seed/vision-bench-9/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "photo_random_10",
"url": "https://picsum.photos/seed/vision-bench-10/640/480",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 30,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "chart_bar_quarterly",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}",
"category": "chart",
"expected_keywords": [
"bar",
"chart",
"revenue"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_pie_market",
"url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}",
"category": "chart",
"expected_keywords": [
"pie",
"chart",
"percentage"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_line_temp",
"id": "chart_line",
"url": "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}",
"category": "chart",
"expected_keywords": [
"line",
"chart",
"temperature"
],
"expected_keywords": ["line", "chart", "temperature"],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
},
{
"id": "chart_radar_skill",
"id": "diagram_sequence",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"category": "diagram",
"expected_keywords": ["sequence", "interaction", "message"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "photo_random_4",
"url": "https://picsum.photos/seed/vision4/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "screenshot_webpage",
"url": "https://github.githubassets.com/images/modules/site/social-cards.png",
"category": "screenshot",
"expected_keywords": ["github", "page", "web"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "chart_radar",
"url": "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}",
"category": "chart",
"expected_keywords": [
"radar",
"chart",
"skill"
],
"expected_keywords": ["radar", "chart", "skill"],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
},
{
"id": "chart_stacked_cloud",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}",
"category": "chart",
"expected_keywords": [
"stacked",
"bar",
"chart"
],
"id": "photo_random_5",
"url": "https://picsum.photos/seed/vision5/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "chart_area_growth",
"url": "https://quickchart.io/chart?c={type:'line',data:{labels:['W1','W2','W3','W4'],datasets:[{label:'Growth',data:[10,15,18,24],fill:true}]}}",
"category": "chart",
"expected_keywords": [
"line",
"growth",
"chart"
],
"id": "diagram_class",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"category": "diagram",
"expected_keywords": ["class", "object", "attribute"],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "chart_scatter_eval",
"url": "https://quickchart.io/chart?c={type:'scatter',data:{datasets:[{label:'Runs',data:[{x:1,y:70},{x:2,y:75},{x:3,y:82}]}]}}",
"category": "chart",
"expected_keywords": [
"scatter",
"chart",
"runs"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_horizontal_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['UI','OCR','Docs'],datasets:[{label:'Score',data:[88,76,91]}]},options:{indexAxis:'y'}}",
"category": "chart",
"expected_keywords": [
"bar",
"score",
"ocr"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_bubble_usage",
"url": "https://quickchart.io/chart?c={type:'bubble',data:{datasets:[{label:'Latency',data:[{x:1,y:120,r:8},{x:2,y:95,r:6},{x:3,y:180,r:10}]}]}}",
"category": "chart",
"expected_keywords": [
"bubble",
"latency",
"chart"
],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
},
{
"id": "chart_doughnut_devices",
"id": "chart_doughnut",
"url": "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}",
"category": "chart",
"expected_keywords": [
"doughnut",
"chart",
"device"
],
"expected_keywords": ["doughnut", "chart", "device"],
"ground_truth_ocr": "",
"expected_structure": {
"min_length": 50,
"min_sentences": 2,
"has_numbers": true
}
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
},
{
"id": "ocr_text_01",
"url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Hermes+OCR+Alpha+01",
"category": "ocr",
"expected_keywords": [
"hermes",
"ocr"
],
"ground_truth_ocr": "Hermes OCR Alpha 01",
"expected_structure": {
"min_length": 10,
"min_sentences": 1,
"has_numbers": true
}
"id": "photo_random_6",
"url": "https://picsum.photos/seed/vision6/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "ocr_text_02",
"url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Prompt+Cache+Hit+87%",
"category": "ocr",
"expected_keywords": [
"prompt",
"cache"
],
"ground_truth_ocr": "Prompt Cache Hit 87%",
"expected_structure": {
"min_length": 10,
"min_sentences": 1,
"has_numbers": true
}
"id": "screenshot_error",
"url": "https://http.cat/404.jpg",
"category": "screenshot",
"expected_keywords": ["404", "error", "cat"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": true}
},
{
"id": "ocr_text_03",
"url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Session+42+Ready",
"category": "ocr",
"expected_keywords": [
"session",
"42"
],
"ground_truth_ocr": "Session 42 Ready",
"expected_structure": {
"min_length": 10,
"min_sentences": 1,
"has_numbers": true
}
"id": "diagram_network",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"category": "diagram",
"expected_keywords": ["network", "node", "connection"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "ocr_text_04",
"url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Latency+118+ms",
"category": "ocr",
"expected_keywords": [
"latency",
"118"
],
"ground_truth_ocr": "Latency 118 ms",
"expected_structure": {
"min_length": 10,
"min_sentences": 1,
"has_numbers": true
}
"id": "photo_random_7",
"url": "https://picsum.photos/seed/vision7/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "ocr_text_05",
"url": "https://dummyimage.com/1200x320/ffffff/000000.png&text=Voice+Mode+Enabled",
"category": "ocr",
"expected_keywords": [
"voice",
"mode"
],
"ground_truth_ocr": "Voice Mode Enabled",
"expected_structure": {
"min_length": 10,
"min_sentences": 1,
"has_numbers": false
}
"id": "chart_stacked_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}",
"category": "chart",
"expected_keywords": ["stacked", "bar", "chart"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
},
{
"id": "document_text_01",
"url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Invoice+1001+Total+42+Due+2026-04-22",
"category": "document",
"expected_keywords": [
"invoice",
"1001",
"total"
],
"ground_truth_ocr": "Invoice 1001 Total 42 Due 2026-04-22",
"expected_structure": {
"min_length": 20,
"min_sentences": 1,
"has_numbers": true
}
"id": "screenshot_dashboard",
"url": "https://github.githubassets.com/images/modules/site/features-code-search.png",
"category": "screenshot",
"expected_keywords": ["search", "code", "feature"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "document_text_02",
"url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Form+A+Name+Alice+Status+Approved",
"category": "document",
"expected_keywords": [
"form",
"a",
"name"
],
"ground_truth_ocr": "Form A Name Alice Status Approved",
"expected_structure": {
"min_length": 20,
"min_sentences": 1,
"has_numbers": false
}
},
{
"id": "document_text_03",
"url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Report+Memory+Recall+Score+91+Percent",
"category": "document",
"expected_keywords": [
"report",
"memory",
"recall"
],
"ground_truth_ocr": "Report Memory Recall Score 91 Percent",
"expected_structure": {
"min_length": 20,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "document_text_04",
"url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Checklist+Crisis+Escalation+Call+988+Now",
"category": "document",
"expected_keywords": [
"checklist",
"crisis",
"escalation"
],
"ground_truth_ocr": "Checklist Crisis Escalation Call 988 Now",
"expected_structure": {
"min_length": 20,
"min_sentences": 1,
"has_numbers": true
}
},
{
"id": "document_text_05",
"url": "https://dummyimage.com/1400x900/f8fafc/0f172a.png&text=Meeting+Notes+Vision+Benchmark+Run+Pending",
"category": "document",
"expected_keywords": [
"meeting",
"notes",
"vision"
],
"ground_truth_ocr": "Meeting Notes Vision Benchmark Run Pending",
"expected_structure": {
"min_length": 20,
"min_sentences": 1,
"has_numbers": false
}
"id": "photo_random_8",
"url": "https://picsum.photos/seed/vision8/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
}
]
]

View File

@@ -22,12 +22,10 @@ import argparse
import asyncio
import base64
import json
import mimetypes
import os
import statistics
import sys
import time
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
@@ -43,16 +41,12 @@ MODELS = {
"model_id": "google/gemma-4-27b-it",
"display_name": "Gemma 4 27B",
"provider": "nous",
"fallback_provider": "ollama",
"fallback_model_id": "gemma4:latest",
"description": "Google's multimodal Gemma 4 model",
},
"gemini3_flash": {
"model_id": "google/gemini-3-flash-preview",
"display_name": "Gemini 3 Flash Preview",
"provider": "openrouter",
"fallback_provider": "gemini",
"fallback_model_id": "gemini-2.5-flash",
"description": "Current default vision model",
},
}
@@ -90,150 +84,91 @@ async def analyze_with_model(
"""
import httpx
def _load_image_bytes_cached() -> tuple[bytes, str]:
nonlocal _image_bytes, _mime_type
if _image_bytes is not None:
return _image_bytes, _mime_type
if image_url.startswith(("http://", "https://")):
with urllib.request.urlopen(image_url, timeout=30) as resp:
_image_bytes = resp.read()
_mime_type = resp.headers.get_content_type() or mimetypes.guess_type(image_url)[0] or "image/png"
else:
path = Path(image_url).expanduser()
_image_bytes = path.read_bytes()
_mime_type = mimetypes.guess_type(str(path))[0] or "image/png"
return _image_bytes, _mime_type
def _data_url() -> str:
image_bytes, mime_type = _load_image_bytes_cached()
return f"data:{mime_type};base64,{base64.b64encode(image_bytes).decode()}"
def _provider_key(provider: str) -> str:
if provider == "openrouter":
return os.getenv("OPENROUTER_API_KEY", "")
if provider == "nous":
return os.getenv("NOUS_API_KEY", "") or os.getenv("NOUS_INFERENCE_API_KEY", "")
if provider == "gemini":
return os.getenv("GEMINI_API_KEY", "") or os.getenv("GOOGLE_API_KEY", "")
return os.getenv(f"{provider.upper()}_API_KEY", "")
provider = model_config["provider"]
model_id = model_config["model_id"]
candidates = [(provider, model_id)]
if model_config.get("fallback_provider") and model_config.get("fallback_model_id"):
candidates.append((model_config["fallback_provider"], model_config["fallback_model_id"]))
_image_bytes: Optional[bytes] = None
_mime_type = "image/png"
failures = []
# Prepare messages
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": image_url}},
],
}
]
for candidate_provider, candidate_model in candidates:
api_key = _provider_key(candidate_provider)
start = time.perf_counter()
try:
if candidate_provider in {"openrouter", "nous"}:
api_url = (
"https://openrouter.ai/api/v1/chat/completions"
if candidate_provider == "openrouter"
else "https://inference.nousresearch.com/v1/chat/completions"
)
if not api_key:
raise RuntimeError(f"No API key for provider {candidate_provider}")
payload = {
"model": candidate_model,
"messages": [{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": _data_url() if not image_url.startswith(("http://", "https://")) else image_url}},
],
}],
"max_tokens": 2000,
"temperature": 0.1,
}
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
}
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.post(api_url, json=payload, headers=headers)
resp.raise_for_status()
data = resp.json()
analysis = data.get("choices", [{}])[0].get("message", {}).get("content", "")
usage = data.get("usage", {})
tokens = {
"prompt_tokens": usage.get("prompt_tokens", 0),
"completion_tokens": usage.get("completion_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
}
elif candidate_provider == "gemini":
if not api_key:
raise RuntimeError("No API key for provider gemini")
image_bytes, mime_type = _load_image_bytes_cached()
api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{candidate_model}:generateContent?key={api_key}"
payload = {
"contents": [{"parts": [
{"text": prompt},
{"inline_data": {"mime_type": mime_type, "data": base64.b64encode(image_bytes).decode()}},
]}],
"generationConfig": {"temperature": 0.1, "maxOutputTokens": 2000},
}
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.post(api_url, json=payload)
resp.raise_for_status()
data = resp.json()
parts = data.get("candidates", [{}])[0].get("content", {}).get("parts", [])
analysis = "\n".join(part.get("text", "") for part in parts if isinstance(part, dict) and part.get("text"))
usage = data.get("usageMetadata", {})
tokens = {
"prompt_tokens": usage.get("promptTokenCount", 0),
"completion_tokens": usage.get("candidatesTokenCount", 0),
"total_tokens": usage.get("totalTokenCount", 0),
}
elif candidate_provider == "ollama":
image_bytes, _ = _load_image_bytes_cached()
payload = {
"model": candidate_model,
"stream": False,
"messages": [{"role": "user", "content": prompt, "images": [base64.b64encode(image_bytes).decode()]}],
"options": {"temperature": 0.1},
}
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.post("http://localhost:11434/api/chat", json=payload)
resp.raise_for_status()
data = resp.json()
analysis = data.get("message", {}).get("content", "")
tokens = {
"prompt_tokens": data.get("prompt_eval_count", 0),
"completion_tokens": data.get("eval_count", 0),
"total_tokens": (data.get("prompt_eval_count", 0) or 0) + (data.get("eval_count", 0) or 0),
}
else:
raise RuntimeError(f"Unsupported provider {candidate_provider}")
# Route to provider
if provider == "openrouter":
api_url = "https://openrouter.ai/api/v1/chat/completions"
api_key = os.getenv("OPENROUTER_API_KEY", "")
elif provider == "nous":
api_url = "https://inference.nousresearch.com/v1/chat/completions"
api_key = os.getenv("NOUS_API_KEY", "") or os.getenv("NOUS_INFERENCE_API_KEY", "")
else:
api_url = os.getenv(f"{provider.upper()}_API_URL", "")
api_key = os.getenv(f"{provider.upper()}_API_KEY", "")
latency_ms = (time.perf_counter() - start) * 1000
return {
"analysis": analysis,
"latency_ms": round(latency_ms, 1),
"tokens": tokens,
"success": True,
"error": "",
"provider_used": candidate_provider,
"model_used": candidate_model,
}
except Exception as e:
failures.append(f"{candidate_provider}:{candidate_model} => {e}")
if not api_key:
return {
"analysis": "",
"latency_ms": 0,
"tokens": {},
"success": False,
"error": f"No API key for provider {provider}",
}
return {
"analysis": "",
"latency_ms": 0,
"tokens": {},
"success": False,
"error": " | ".join(failures) if failures else "No runs",
"provider_used": candidates[-1][0] if candidates else provider,
"model_used": candidates[-1][1] if candidates else model_id,
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
}
payload = {
"model": model_id,
"messages": messages,
"max_tokens": 2000,
"temperature": 0.1,
}
start = time.perf_counter()
try:
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.post(api_url, json=payload, headers=headers)
resp.raise_for_status()
data = resp.json()
latency_ms = (time.perf_counter() - start) * 1000
analysis = ""
choices = data.get("choices", [])
if choices:
msg = choices[0].get("message", {})
analysis = msg.get("content", "")
usage = data.get("usage", {})
tokens = {
"prompt_tokens": usage.get("prompt_tokens", 0),
"completion_tokens": usage.get("completion_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
}
return {
"analysis": analysis,
"latency_ms": round(latency_ms, 1),
"tokens": tokens,
"success": True,
"error": "",
}
except Exception as e:
return {
"analysis": "",
"latency_ms": round((time.perf_counter() - start) * 1000, 1),
"tokens": {},
"success": False,
"error": str(e),
}
# ---------------------------------------------------------------------------
# Evaluation metrics
@@ -463,13 +398,7 @@ def aggregate_results(results: List[dict], models: dict) -> dict:
failed = [r[model_name] for r in results if not r[model_name]["success"]]
if not model_results:
summary[model_name] = {
"success_rate": 0,
"error": "All runs failed",
"total_runs": 0,
"total_failures": len(failed),
"failure_examples": sorted({f.get("error", "unknown failure") for f in failed})[:3],
}
summary[model_name] = {"success_rate": 0, "error": "All runs failed"}
continue
latencies = [r["avg_latency_ms"] for r in model_results]
@@ -481,7 +410,6 @@ def aggregate_results(results: List[dict], models: dict) -> dict:
"success_rate": round(len(model_results) / (len(model_results) + len(failed)), 4),
"total_runs": len(model_results),
"total_failures": len(failed),
"failure_examples": sorted({f.get("error", "unknown failure") for f in failed})[:3],
"latency": {
"mean_ms": round(statistics.mean(latencies), 1),
"median_ms": round(statistics.median(latencies), 1),
@@ -567,23 +495,6 @@ def to_markdown(report: dict) -> str:
f"| {mname} | {tok['mean_total']:.0f} | {tok['total_used']} |"
)
lines += ["", "## Failure Modes", ""]
had_failures = False
for mkey, mname in config["models"].items():
model_summary = summary.get(mkey, {})
failure_examples = model_summary.get("failure_examples", [])
if not failure_examples and not model_summary.get("error"):
continue
had_failures = True
lines.append(f"### {mname}")
if model_summary.get("error"):
lines.append(f"- Summary: {model_summary['error']}")
for err in failure_examples:
lines.append(f"- {err}")
lines.append("")
if not had_failures:
lines.append("- No provider/runtime failures recorded.")
# Verdict
lines += ["", "## Verdict", ""]
@@ -605,12 +516,8 @@ def to_markdown(report: dict) -> str:
if best_model:
lines.append(f"**Best overall: {best_model}** (composite score: {best_score:.1%})")
lines.append("")
lines.append("Recommendation: keep the best-performing Gemma/Gemini lane from this run and only switch if repeated runs disagree.")
else:
lines.append("Benchmark blocked or insufficient data for a trustworthy winner.")
lines.append("")
lines.append("Recommendation: repair provider/runtime availability, rerun the benchmark, and keep the current implementation unchanged until comparative results exist.")
lines.append("No clear winner — insufficient data.")
return "\n".join(lines)
@@ -621,124 +528,44 @@ def to_markdown(report: dict) -> str:
def generate_sample_dataset() -> List[dict]:
"""Generate a larger benchmark dataset aligned with issue #817.
"""Generate a sample test dataset with diverse public images.
Returns 50+ images across screenshots, diagrams, photos, OCR, charts,
and document-like images so the harness matches the issue contract.
Returns list of test image definitions.
"""
dataset: List[dict] = []
screenshots = [
("github_mark", "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png", ["github", "logo", "mark"]),
("github_social", "https://github.githubassets.com/images/modules/site/social-cards.png", ["github", "page", "web"]),
("github_code_search", "https://github.githubassets.com/images/modules/site/features-code-search.png", ["search", "code", "feature"]),
("terminal_capture", "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png", ["terminal", "command", "output"]),
("http_404", "https://http.cat/404.jpg", ["404", "error", "cat"]),
("dummy_cli_01", "https://dummyimage.com/1280x720/111827/f9fafb.png&text=Hermes+CLI+Session+01", ["hermes", "cli", "session"]),
("dummy_cli_02", "https://dummyimage.com/1280x720/0f172a/e2e8f0.png&text=Prompt+Cache+Dashboard", ["prompt", "cache", "dashboard"]),
("dummy_ui_01", "https://dummyimage.com/1280x720/1f2937/f3f4f6.png&text=Settings+Panel+Voice+Mode", ["settings", "voice", "mode"]),
("dummy_ui_02", "https://dummyimage.com/1280x720/334155/f8fafc.png&text=Browser+Vision+Preview", ["browser", "vision", "preview"]),
("dummy_ui_03", "https://dummyimage.com/1280x720/111827/ffffff.png&text=Tool+Call+Inspector", ["tool", "call", "inspector"]),
]
for ident, url, keywords in screenshots:
dataset.append({
"id": f"screenshot_{ident}",
"url": url,
return [
# Screenshots
{
"id": "screenshot_github",
"url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
"category": "screenshot",
"expected_keywords": keywords,
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": False},
})
diagrams = [
("flow_a", "https://dummyimage.com/1200x800/f8fafc/0f172a.png&text=Flowchart+API+Gateway+Queue+Worker", ["flowchart", "api", "worker"]),
("flow_b", "https://dummyimage.com/1200x800/f1f5f9/0f172a.png&text=Architecture+Diagram+Database+Cache+Client", ["architecture", "diagram", "cache"]),
("uml_a", "https://dummyimage.com/1200x800/e2e8f0/0f172a.png&text=Class+Diagram+User+Session+Message", ["class", "diagram", "session"]),
("uml_b", "https://dummyimage.com/1200x800/cbd5e1/0f172a.png&text=Sequence+Diagram+Request+Response", ["sequence", "diagram", "response"]),
("network_a", "https://dummyimage.com/1200x800/ffffff/111827.png&text=Network+Nodes+Edges+Router", ["network", "node", "router"]),
("network_b", "https://dummyimage.com/1200x800/ffffff/1e293b.png&text=Service+Mesh+Proxy+Auth", ["service", "mesh", "auth"]),
("state_machine", "https://dummyimage.com/1200x800/f8fafc/334155.png&text=State+Machine+Idle+Run+Stop", ["state", "machine", "idle"]),
("mind_map", "https://dummyimage.com/1200x800/fefce8/1f2937.png&text=Mind+Map+Memory+Recall+Tools", ["mind", "memory", "tools"]),
("pipeline", "https://dummyimage.com/1200x800/ecfeff/155e75.png&text=Pipeline+Ingest+Rank+Summarize", ["pipeline", "ingest", "summarize"]),
("org_chart", "https://dummyimage.com/1200x800/fdf2f8/831843.png&text=Org+Chart+Lead+Review+Ops", ["org", "chart", "review"]),
]
for ident, url, keywords in diagrams:
dataset.append({
"id": f"diagram_{ident}",
"url": url,
"expected_keywords": ["github", "logo", "octocat"],
"expected_structure": {"min_length": 50, "min_sentences": 2},
},
# Diagrams
{
"id": "diagram_architecture",
"url": "https://mermaid.ink/img/pako:eNp9kMtOwzAQRX_F8hKpJbhJFVJBi1QJiMWCG8eZNsGJLdlOiqIid5RdufiHnZRA7GbuzJwZe4ZGH2SCBPYUwgxoQKvJnCR2YY0F5YBdJJkD4uX0oXB6PnF3U4zCWcWdW3FqOwGvCKkBmHKSTB2gJeRrLTeJLfJdJKkBGYf9P1sTNdUXVJqY3YNJK7xLVwR0mxJFU6rCgEKnhSGIL2Eq8BdEERAX0OGwEiVQ1R0MaNFR8QfqKxmHigbX8VLjDz_Q0L8Wc_qPxDw",
"category": "diagram",
"expected_keywords": keywords,
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": False},
})
for idx in range(1, 11):
dataset.append({
"id": f"photo_random_{idx:02d}",
"url": f"https://picsum.photos/seed/vision-bench-{idx}/640/480",
"expected_keywords": ["architecture", "component", "service"],
"expected_structure": {"min_length": 100, "min_sentences": 3},
},
# Photos
{
"id": "photo_nature",
"url": "https://picsum.photos/seed/bench1/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": False},
})
charts = [
("bar_quarterly", "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}", ["bar", "chart", "revenue"]),
("pie_market", "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}", ["pie", "chart", "percentage"]),
("line_temp", "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}", ["line", "chart", "temperature"]),
("radar_skill", "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}", ["radar", "chart", "skill"]),
("stacked_cloud", "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}", ["stacked", "bar", "chart"]),
("area_growth", "https://quickchart.io/chart?c={type:'line',data:{labels:['W1','W2','W3','W4'],datasets:[{label:'Growth',data:[10,15,18,24],fill:true}]}}", ["line", "growth", "chart"]),
("scatter_eval", "https://quickchart.io/chart?c={type:'scatter',data:{datasets:[{label:'Runs',data:[{x:1,y:70},{x:2,y:75},{x:3,y:82}]}]}}", ["scatter", "chart", "runs"]),
("horizontal_bar", "https://quickchart.io/chart?c={type:'bar',data:{labels:['UI','OCR','Docs'],datasets:[{label:'Score',data:[88,76,91]}]},options:{indexAxis:'y'}}", ["bar", "score", "ocr"]),
("bubble_usage", "https://quickchart.io/chart?c={type:'bubble',data:{datasets:[{label:'Latency',data:[{x:1,y:120,r:8},{x:2,y:95,r:6},{x:3,y:180,r:10}]}]}}", ["bubble", "latency", "chart"]),
("doughnut_devices", "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}", ["doughnut", "chart", "device"]),
]
for ident, url, keywords in charts:
dataset.append({
"id": f"chart_{ident}",
"url": url,
"expected_structure": {"min_length": 30, "min_sentences": 1},
},
# Charts
{
"id": "chart_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Users',data:[50,60,70,80]}]}}",
"category": "chart",
"expected_keywords": keywords,
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": True},
})
ocr_texts = [
"Hermes OCR Alpha 01",
"Prompt Cache Hit 87%",
"Session 42 Ready",
"Latency 118 ms",
"Voice Mode Enabled",
"expected_keywords": ["bar", "chart", "data"],
"expected_structure": {"min_length": 50, "min_sentences": 2},
},
]
for idx, text in enumerate(ocr_texts, start=1):
dataset.append({
"id": f"ocr_text_{idx:02d}",
"url": f"https://dummyimage.com/1200x320/ffffff/000000.png&text={text.replace(' ', '+')}",
"category": "ocr",
"expected_keywords": text.lower().split()[:2],
"ground_truth_ocr": text,
"expected_structure": {"min_length": 10, "min_sentences": 1, "has_numbers": any(ch.isdigit() for ch in text)},
})
documents = [
"Invoice 1001 Total 42 Due 2026-04-22",
"Form A Name Alice Status Approved",
"Report Memory Recall Score 91 Percent",
"Checklist Crisis Escalation Call 988 Now",
"Meeting Notes Vision Benchmark Run Pending",
]
for idx, text in enumerate(documents, start=1):
dataset.append({
"id": f"document_text_{idx:02d}",
"url": f"https://dummyimage.com/1400x900/f8fafc/0f172a.png&text={text.replace(' ', '+')}",
"category": "document",
"expected_keywords": text.lower().split()[:3],
"ground_truth_ocr": text,
"expected_structure": {"min_length": 20, "min_sentences": 1, "has_numbers": any(ch.isdigit() for ch in text)},
})
return dataset
def load_dataset(path: str) -> List[dict]:
@@ -758,9 +585,7 @@ async def main():
parser.add_argument("--url", help="Single image URL to test")
parser.add_argument("--category", default="photo", help="Category for single URL")
parser.add_argument("--output", default=None, help="Output JSON file")
parser.add_argument("--markdown-output", default=None, help="Optional markdown report output path")
parser.add_argument("--runs", type=int, default=1, help="Runs per model per image")
parser.add_argument("--limit", type=int, default=0, help="Limit to the first N images for smoke runs")
parser.add_argument("--models", nargs="+", default=None,
help="Models to test (default: all)")
parser.add_argument("--markdown", action="store_true", help="Output markdown report")
@@ -792,14 +617,9 @@ async def main():
print("ERROR: Provide --images or --url")
sys.exit(1)
if args.limit and args.limit > 0:
images = images[:args.limit]
# Run benchmark
report = await run_benchmark_suite(images, selected, args.runs)
markdown_report = to_markdown(report)
# Output
if args.output:
os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
@@ -807,14 +627,8 @@ async def main():
json.dump(report, f, indent=2)
print(f"\nResults saved to {args.output}")
if args.markdown_output:
os.makedirs(os.path.dirname(args.markdown_output) or ".", exist_ok=True)
with open(args.markdown_output, "w", encoding="utf-8") as f:
f.write(markdown_report)
print(f"Markdown report saved to {args.markdown_output}")
if args.markdown or not args.output:
print("\n" + markdown_report)
print("\n" + to_markdown(report))
if __name__ == "__main__":

132
hermes_cli/a2a_cmd.py Normal file
View File

@@ -0,0 +1,132 @@
"""CLI helpers for A2A task delegation."""
from __future__ import annotations
import json
import os
import re
import sys
import time
from pathlib import Path
from typing import Any
from agent.a2a_mtls import A2ATaskClient, A2ATaskServer
from hermes_cli.config import get_hermes_home
def _registry_path() -> Path:
return get_hermes_home() / "a2a_agents.json"
def _default_identity_paths() -> tuple[str, str, str]:
hermes_home = get_hermes_home()
agent_name = os.environ.get("HERMES_AGENT_NAME", "hermes").lower()
cert = os.environ.get(
"HERMES_A2A_CERT",
str(hermes_home / "pki" / "agents" / agent_name / f"{agent_name}.crt"),
)
key = os.environ.get(
"HERMES_A2A_KEY",
str(hermes_home / "pki" / "agents" / agent_name / f"{agent_name}.key"),
)
ca = os.environ.get(
"HERMES_A2A_CA",
str(hermes_home / "pki" / "ca" / "fleet-ca.crt"),
)
return cert, key, ca
def load_agent_registry(path: Path | None = None) -> dict[str, Any]:
registry_path = path or _registry_path()
if not registry_path.exists():
return {}
return json.loads(registry_path.read_text(encoding="utf-8"))
def resolve_agent_url(agent: str, *, registry_path: Path | None = None) -> str:
key = re.sub(r"[^A-Za-z0-9]+", "_", agent).upper()
env_value = os.getenv(f"HERMES_A2A_{key}_URL")
if env_value:
return env_value
registry = load_agent_registry(registry_path)
entry = registry.get(agent)
if isinstance(entry, str) and entry:
return entry
if isinstance(entry, dict):
url = entry.get("url") or entry.get("base_url") or entry.get("card_url")
if url:
return str(url)
if agent.startswith("https://") or agent.startswith("http://"):
return agent
raise SystemExit(f"Unknown A2A agent '{agent}'. Set HERMES_A2A_{key}_URL or add it to {_registry_path()}.")
def _print(data: dict[str, Any]) -> None:
print(json.dumps(data, indent=2, ensure_ascii=False))
def cmd_send(args) -> None:
base_url = args.url or resolve_agent_url(args.agent)
cert, key, ca = args.cert, args.key, args.ca
if not (cert and key and ca):
cert, key, ca = _default_identity_paths()
client = A2ATaskClient(cert=cert, key=key, ca=ca)
card = client.discover_card(base_url)
task = client.send_task(
base_url,
task=args.task,
requester=args.requester,
metadata={"agent": args.agent},
)
if args.wait:
task = client.wait_for_task(
base_url,
task["taskId"],
timeout=args.timeout,
poll_interval=args.poll_interval,
)
_print({
"agent": args.agent,
"url": base_url,
"card": card,
"task": task,
})
def cmd_status(args) -> None:
base_url = args.url or resolve_agent_url(args.agent)
cert, key, ca = args.cert, args.key, args.ca
if not (cert and key and ca):
cert, key, ca = _default_identity_paths()
client = A2ATaskClient(cert=cert, key=key, ca=ca)
task = client.get_task(base_url, args.task_id)
_print({"agent": args.agent, "url": base_url, "task": task})
def cmd_serve(args) -> None:
cert, key, ca = args.cert, args.key, args.ca
if not (cert and key and ca):
cert, key, ca = _default_identity_paths()
server = A2ATaskServer(cert=cert, key=key, ca=ca, host=args.host, port=args.port)
server.start()
print(f"A2A task server listening on https://{args.host}:{args.port}")
try:
while True:
time.sleep(1)
except KeyboardInterrupt:
server.stop()
def cmd_a2a(args) -> None:
command = getattr(args, "a2a_command", None) or "send"
if command == "send":
cmd_send(args)
return
if command == "status":
cmd_status(args)
return
if command == "serve":
cmd_serve(args)
return
raise SystemExit(f"Unknown a2a command: {command}")

View File

@@ -173,6 +173,13 @@ from hermes_constants import OPENROUTER_BASE_URL
logger = logging.getLogger(__name__)
def cmd_a2a(args):
"""Dispatch A2A CLI subcommands lazily to avoid heavy imports at startup."""
from hermes_cli.a2a_cmd import cmd_a2a as _cmd_a2a
return _cmd_a2a(args)
def _relative_time(ts) -> str:
"""Format a timestamp as relative time (e.g., '2h ago', 'yesterday')."""
if not ts:
@@ -4781,6 +4788,45 @@ For more help on a command:
gateway_parser.set_defaults(func=cmd_gateway)
# =========================================================================
# a2a command
# =========================================================================
a2a_parser = subparsers.add_parser(
"a2a",
help="A2A task delegation over mutual TLS",
description="Send, inspect, and serve structured A2A tasks between Hermes agents",
)
a2a_subparsers = a2a_parser.add_subparsers(dest="a2a_command")
a2a_send = a2a_subparsers.add_parser("send", help="Send an A2A task to another agent")
a2a_send.add_argument("--agent", required=True, help="Agent alias or URL (for example: allegro)")
a2a_send.add_argument("--task", required=True, help="Task text to delegate")
a2a_send.add_argument("--url", help="Explicit base URL for the remote agent")
a2a_send.add_argument("--requester", default=None, help="Requester label included in task metadata")
a2a_send.add_argument("--wait", action="store_true", help="Poll until the task reaches a terminal state")
a2a_send.add_argument("--timeout", type=float, default=30.0, help="Wait timeout in seconds (default: 30)")
a2a_send.add_argument("--poll-interval", type=float, default=0.5, help="Polling interval in seconds while waiting (default: 0.5)")
a2a_send.add_argument("--cert", default=None, help="Client certificate path (defaults from HERMES_A2A_CERT)")
a2a_send.add_argument("--key", default=None, help="Client private key path (defaults from HERMES_A2A_KEY)")
a2a_send.add_argument("--ca", default=None, help="Fleet CA certificate path (defaults from HERMES_A2A_CA)")
a2a_status = a2a_subparsers.add_parser("status", help="Fetch the current status of an A2A task")
a2a_status.add_argument("--agent", required=True, help="Agent alias or URL (for example: allegro)")
a2a_status.add_argument("--task-id", required=True, help="Task identifier returned by a2a send")
a2a_status.add_argument("--url", help="Explicit base URL for the remote agent")
a2a_status.add_argument("--cert", default=None, help="Client certificate path (defaults from HERMES_A2A_CERT)")
a2a_status.add_argument("--key", default=None, help="Client private key path (defaults from HERMES_A2A_KEY)")
a2a_status.add_argument("--ca", default=None, help="Fleet CA certificate path (defaults from HERMES_A2A_CA)")
a2a_serve = a2a_subparsers.add_parser("serve", help="Run the local A2A task server")
a2a_serve.add_argument("--host", default=os.environ.get("HERMES_A2A_HOST", "127.0.0.1"), help="Bind host (default: HERMES_A2A_HOST or 127.0.0.1)")
a2a_serve.add_argument("--port", type=int, default=int(os.environ.get("HERMES_A2A_PORT", "9443")), help="Bind port (default: HERMES_A2A_PORT or 9443)")
a2a_serve.add_argument("--cert", default=None, help="Server certificate path (defaults from HERMES_A2A_CERT)")
a2a_serve.add_argument("--key", default=None, help="Server private key path (defaults from HERMES_A2A_KEY)")
a2a_serve.add_argument("--ca", default=None, help="Fleet CA certificate path (defaults from HERMES_A2A_CA)")
a2a_parser.set_defaults(func=cmd_a2a)
# =========================================================================
# setup command
# =========================================================================

View File

@@ -1,67 +0,0 @@
{
"generated_at": "2026-04-22T16:21:56.271426+00:00",
"config": {
"total_images": 2,
"runs_per_model": 1,
"models": {
"gemma4": "Gemma 4 27B",
"gemini3_flash": "Gemini 3 Flash Preview"
}
},
"results": [
{
"gemma4": {
"success": false,
"error": "nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => Server error '500 Internal Server Error' for url 'http://localhost:11434/api/chat'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500",
"runs": 0,
"errors": 1
},
"gemini3_flash": {
"success": false,
"error": "openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => Client error '429 Too Many Requests' for url 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=AIzaSyAmIctJQG_b4VKV1sMLebBnouq6yCckEf0'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429",
"runs": 0,
"errors": 1
},
"image_id": "screenshot_github_mark",
"category": "screenshot"
},
{
"gemma4": {
"success": false,
"error": "nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => HTTP Error 404: Not Found",
"runs": 0,
"errors": 1
},
"gemini3_flash": {
"success": false,
"error": "openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => HTTP Error 404: Not Found",
"runs": 0,
"errors": 1
},
"image_id": "screenshot_github_social",
"category": "screenshot"
}
],
"summary": {
"gemma4": {
"success_rate": 0,
"error": "All runs failed",
"total_runs": 0,
"total_failures": 2,
"failure_examples": [
"nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => HTTP Error 404: Not Found",
"nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => Server error '500 Internal Server Error' for url 'http://localhost:11434/api/chat'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500"
]
},
"gemini3_flash": {
"success_rate": 0,
"error": "All runs failed",
"total_runs": 0,
"total_failures": 2,
"failure_examples": [
"openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => Client error '429 Too Many Requests' for url 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=AIzaSyAmIctJQG_b4VKV1sMLebBnouq6yCckEf0'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429",
"openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'\nFor more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => HTTP Error 404: Not Found"
]
}
}
}

View File

@@ -1,44 +0,0 @@
# Vision Benchmark Report
Generated: 2026-04-22T16:21
Images tested: 2
Runs per model: 1
Models: Gemma 4 27B, Gemini 3 Flash Preview
## Latency Comparison
| Model | Mean (ms) | Median | P95 | Std Dev |
|-------|-----------|--------|-----|---------|
## Accuracy Comparison
| Model | OCR Accuracy | Keyword Coverage | Success Rate |
|-------|-------------|-----------------|--------------|
## Token Usage
| Model | Mean Tokens/Image | Total Tokens |
|-------|------------------|--------------|
## Failure Modes
### Gemma 4 27B
- Summary: All runs failed
- nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => HTTP Error 404: Not Found
- nous:google/gemma-4-27b-it => No API key for provider nous | ollama:gemma4:latest => Server error '500 Internal Server Error' for url 'http://localhost:11434/api/chat'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500
### Gemini 3 Flash Preview
- Summary: All runs failed
- openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => Client error '429 Too Many Requests' for url 'https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=AIzaSyAmIctJQG_b4VKV1sMLebBnouq6yCckEf0'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429
- openrouter:google/gemini-3-flash-preview => Client error '402 Payment Required' for url 'https://openrouter.ai/api/v1/chat/completions'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/402 | gemini:gemini-2.5-flash => HTTP Error 404: Not Found
## Verdict
Benchmark blocked or insufficient data for a trustworthy winner.
Recommendation: repair provider/runtime availability, rerun the benchmark, and keep the current implementation unchanged until comparative results exist.

View File

@@ -572,3 +572,94 @@ class TestA2AMTLSServerAndClient:
assert not errors, f"Concurrent connection errors: {errors}"
assert len(results) == 3
@_requires_crypto
class TestA2ATaskServerAndClient:
"""Structured A2A task send/get flow over mTLS."""
@pytest.fixture(autouse=True)
def _pki(self, tmp_path):
ca_dir = tmp_path / "ca"
ca_dir.mkdir()
self.ca_crt, self.ca_key = _make_ca_keypair(ca_dir)
agent_dir = tmp_path / "agents"
agent_dir.mkdir()
self.srv_crt, self.srv_key = _make_agent_keypair(
agent_dir, "timmy", self.ca_crt, self.ca_key
)
self.cli_crt, self.cli_key = _make_agent_keypair(
agent_dir, "allegro", self.ca_crt, self.ca_key
)
@pytest.fixture()
def task_server(self):
from agent.a2a_mtls import A2ATaskServer
gate = threading.Event()
def analyze_executor(task: dict[str, object]) -> dict[str, object]:
gate.wait(timeout=2)
text = str(task.get("task", ""))
return {
"text": f"analysis:{text}",
"metadata": {"tool": "local-hermes-stub"},
}
port = _find_free_port()
server = A2ATaskServer(
cert=self.srv_crt,
key=self.srv_key,
ca=self.ca_crt,
host="127.0.0.1",
port=port,
executor=analyze_executor,
)
with server:
time.sleep(0.1)
yield server, port, gate
def test_task_send_get_and_completion_flow(self, task_server):
from agent.a2a_mtls import A2ATaskClient
server, port, gate = task_server
client = A2ATaskClient(cert=self.cli_crt, key=self.cli_key, ca=self.ca_crt)
base_url = f"https://127.0.0.1:{port}"
card = client.discover_card(base_url)
assert card["name"]
submitted = client.send_task(base_url, task="Analyze README.md", requester="timmy")
assert submitted["status"]["state"] in {"submitted", "working"}
in_flight = client.get_task(base_url, submitted["taskId"])
assert in_flight["status"]["state"] in {"submitted", "working"}
gate.set()
completed = client.wait_for_task(base_url, submitted["taskId"], timeout=5.0, poll_interval=0.05)
assert completed["status"]["state"] == "completed"
assert completed["artifacts"][0]["text"] == "analysis:Analyze README.md"
def test_failed_executor_marks_task_failed(self):
from agent.a2a_mtls import A2ATaskClient, A2ATaskServer
def failing_executor(task: dict[str, object]) -> dict[str, object]:
raise RuntimeError("boom")
port = _find_free_port()
server = A2ATaskServer(
cert=self.srv_crt,
key=self.srv_key,
ca=self.ca_crt,
host="127.0.0.1",
port=port,
executor=failing_executor,
)
with server:
time.sleep(0.1)
client = A2ATaskClient(cert=self.cli_crt, key=self.cli_key, ca=self.ca_crt)
base_url = f"https://127.0.0.1:{port}"
submitted = client.send_task(base_url, task="explode", requester="timmy")
failed = client.wait_for_task(base_url, submitted["taskId"], timeout=5.0, poll_interval=0.05)
assert failed["status"]["state"] == "failed"
assert "boom" in failed["status"]["message"]

View File

@@ -0,0 +1,95 @@
from __future__ import annotations
import argparse
import json
from pathlib import Path
from unittest.mock import patch
import pytest
def test_cmd_send_uses_registry_and_waits_for_terminal_task(tmp_path, monkeypatch, capsys):
hermes_home = tmp_path / ".hermes"
hermes_home.mkdir()
(hermes_home / "a2a_agents.json").write_text(
json.dumps({"allegro": {"url": "https://127.0.0.1:9443"}}),
encoding="utf-8",
)
monkeypatch.setenv("HERMES_HOME", str(hermes_home))
from hermes_cli.a2a_cmd import cmd_a2a
class FakeClient:
def __init__(self, **kwargs):
self.kwargs = kwargs
def discover_card(self, base_url: str):
assert base_url == "https://127.0.0.1:9443"
return {"name": "allegro", "url": base_url}
def send_task(self, base_url: str, *, task: str, requester: str | None = None, metadata=None):
assert task == "analyze README"
return {"taskId": "task-123", "status": {"state": "submitted"}}
def wait_for_task(self, base_url: str, task_id: str, *, timeout: float, poll_interval: float):
assert task_id == "task-123"
return {
"taskId": task_id,
"status": {"state": "completed"},
"artifacts": [{"text": "README looks healthy"}],
}
args = argparse.Namespace(
a2a_command="send",
agent="allegro",
task="analyze README",
url=None,
wait=True,
timeout=5.0,
poll_interval=0.01,
requester="timmy",
cert="cert.pem",
key="key.pem",
ca="ca.pem",
)
with patch("hermes_cli.a2a_cmd.A2ATaskClient", FakeClient):
cmd_a2a(args)
result = json.loads(capsys.readouterr().out)
assert result["agent"] == "allegro"
assert result["card"]["name"] == "allegro"
assert result["task"]["status"]["state"] == "completed"
assert result["task"]["artifacts"][0]["text"] == "README looks healthy"
def test_resolve_agent_url_supports_env_override(monkeypatch):
monkeypatch.setenv("HERMES_A2A_ALLEGRO_URL", "https://fleet-allegro:9443")
from hermes_cli.a2a_cmd import resolve_agent_url
assert resolve_agent_url("allegro") == "https://fleet-allegro:9443"
def test_cmd_send_requires_known_agent(tmp_path, monkeypatch):
hermes_home = tmp_path / ".hermes"
hermes_home.mkdir()
monkeypatch.setenv("HERMES_HOME", str(hermes_home))
from hermes_cli.a2a_cmd import cmd_a2a
args = argparse.Namespace(
a2a_command="send",
agent="unknown",
task="do work",
url=None,
wait=False,
timeout=5.0,
poll_interval=0.05,
requester=None,
cert="cert.pem",
key="key.pem",
ca="ca.pem",
)
with pytest.raises(SystemExit):
cmd_a2a(args)

View File

@@ -199,7 +199,7 @@ class TestMarkdown:
class TestDataset:
def test_sample_dataset_has_entries(self):
dataset = generate_sample_dataset()
assert len(dataset) >= 50
assert len(dataset) >= 4
def test_sample_dataset_structure(self):
dataset = generate_sample_dataset()
@@ -216,9 +216,6 @@ class TestDataset:
assert "screenshot" in categories
assert "diagram" in categories
assert "photo" in categories
assert "chart" in categories
assert "ocr" in categories
assert "document" in categories
class TestModels:

View File

@@ -1,21 +0,0 @@
import json
from pathlib import Path
DATASET = Path("benchmarks/test_images.json")
REPORT = Path("metrics/vision-benchmark-smoke-2026-04-22.md")
def test_benchmark_dataset_is_issue_sized_and_category_complete() -> None:
items = json.loads(DATASET.read_text(encoding="utf-8"))
assert len(items) >= 50
categories = {item["category"] for item in items}
assert {"screenshot", "diagram", "photo", "ocr", "chart", "document"}.issubset(categories)
def test_metrics_report_exists_with_recommendation() -> None:
assert REPORT.exists(), "missing benchmark report under metrics/"
text = REPORT.read_text(encoding="utf-8")
assert "Recommendation" in text
assert "Gemma 4" in text
assert "Gemini" in text