Compare commits

..

2 Commits

Author SHA1 Message Date
Alexander Whitestone
411aea9edf feat: harden tool-call benchmark coverage and reporting for #796
All checks were successful
Lint / lint (pull_request) Successful in 9s
Refs #796
2026-04-22 11:47:11 -04:00
Alexander Whitestone
877005b06e wip: add failing tool-call benchmark regression tests for #796
Refs #796
2026-04-22 11:31:24 -04:00
8 changed files with 573 additions and 851 deletions

View File

@@ -1,326 +0,0 @@
from __future__ import annotations
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any, Optional
import httpx
from agent.anthropic_adapter import _is_oauth_token, resolve_anthropic_token
from hermes_cli.auth import _read_codex_tokens, resolve_codex_runtime_credentials
from hermes_cli.runtime_provider import resolve_runtime_provider
def _utc_now() -> datetime:
return datetime.now(timezone.utc)
@dataclass(frozen=True)
class AccountUsageWindow:
    """Immutable record describing one usage/quota window of an account."""

    # Display name of the window, printed at the start of its rendered line.
    label: str
    # Percentage of the window already consumed; None when it is not known.
    used_percent: Optional[float] = None
    # Moment at which the window resets, if the provider reported one.
    reset_at: Optional[datetime] = None
    # Free-form extra text for the window, if any.
    detail: Optional[str] = None
@dataclass(frozen=True)
class AccountUsageSnapshot:
    """Immutable result of one account-usage lookup for a provider."""

    # Provider identifier the snapshot belongs to.
    provider: str
    # Short tag naming where the data came from.
    source: str
    # When the snapshot was taken.
    fetched_at: datetime
    # Heading used when the snapshot is rendered.
    title: str = "Account limits"
    # Plan name, when one is known.
    plan: Optional[str] = None
    # Usage windows, in display order.
    windows: tuple[AccountUsageWindow, ...] = ()
    # Additional pre-formatted lines.
    details: tuple[str, ...] = ()
    # Explanation of why no usage data could be provided, if that is the case.
    unavailable_reason: Optional[str] = None

    @property
    def available(self) -> bool:
        """True when there is something to show and no unavailable reason is set."""
        if self.unavailable_reason:
            return False
        return bool(self.windows or self.details)
def _title_case_slug(value: Optional[str]) -> Optional[str]:
cleaned = str(value or "").strip()
if not cleaned:
return None
return cleaned.replace("_", " ").replace("-", " ").title()
def _parse_dt(value: Any) -> Optional[datetime]:
if value in (None, ""):
return None
if isinstance(value, (int, float)):
return datetime.fromtimestamp(float(value), tz=timezone.utc)
if isinstance(value, str):
text = value.strip()
if not text:
return None
if text.endswith("Z"):
text = text[:-1] + "+00:00"
try:
dt = datetime.fromisoformat(text)
return dt if dt.tzinfo else dt.replace(tzinfo=timezone.utc)
except ValueError:
return None
return None
def _format_reset(dt: Optional[datetime]) -> str:
    """Describe a reset time relative to now, followed by the local wall-clock time.

    Returns ``"unknown"`` when *dt* is falsy, ``"now (...)"`` when the time has
    already passed, and otherwise ``"in Xd Yh"``, ``"in Xh Ym"`` or ``"in Xm"``
    depending on how far away it is.
    """
    if not dt:
        return "unknown"

    localized = dt.astimezone()
    seconds_left = int((dt - _utc_now()).total_seconds())
    stamp = localized.strftime('%Y-%m-%d %H:%M %Z')

    if seconds_left <= 0:
        return f"now ({stamp})"

    whole_hours = seconds_left // 3600
    minutes = (seconds_left % 3600) // 60

    if whole_hours >= 24:
        # A day or more away: show days and leftover hours, drop the minutes.
        days = whole_hours // 24
        hours = whole_hours % 24
        rel = f"in {days}d {hours}h"
    elif whole_hours > 0:
        rel = f"in {whole_hours}h {minutes}m"
    else:
        rel = f"in {minutes}m"
    return f"{rel} ({stamp})"
def render_account_usage_lines(snapshot: Optional[AccountUsageSnapshot], *, markdown: bool = False) -> list[str]:
    """Render an account-usage snapshot as a list of display lines.

    Args:
        snapshot: The snapshot to render; a falsy value yields an empty list.
        markdown: When true, the title is wrapped in ``**`` for bold.

    Returns:
        The header, a provider line (with the plan in parentheses when set),
        one line per usage window, every entry of ``snapshot.details``, and a
        final ``Unavailable: ...`` line when an unavailable reason is set.
    """
    if not snapshot:
        return []

    emphasis = "**" if markdown else ""
    lines = [f"📈 {emphasis}{snapshot.title}{emphasis}"]

    if snapshot.plan:
        lines.append(f"Provider: {snapshot.provider} ({snapshot.plan})")
    else:
        lines.append(f"Provider: {snapshot.provider}")

    for window in snapshot.windows:
        if window.used_percent is None:
            base = f"{window.label}: unavailable"
        else:
            used_value = float(window.used_percent)
            # Clamp at zero so over- or under-reported usage never prints negatives.
            remaining = max(0, round(100 - used_value))
            used = max(0, round(used_value))
            base = f"{window.label}: {remaining}% remaining ({used}% used)"
        if window.reset_at:
            base += f" • resets {_format_reset(window.reset_at)}"
        elif window.detail:
            # Use the same bullet separator as the reset suffix; without it the
            # detail text runs straight into the closing parenthesis.
            base += f" • {window.detail}"
        lines.append(base)

    lines.extend(snapshot.details)

    if snapshot.unavailable_reason:
        lines.append(f"Unavailable: {snapshot.unavailable_reason}")
    return lines
def _resolve_codex_usage_url(base_url: str) -> str:
normalized = (base_url or "").strip().rstrip("/")
if not normalized:
normalized = "https://chatgpt.com/backend-api/codex"
if normalized.endswith("/codex"):
normalized = normalized[: -len("/codex")]
if "/backend-api" in normalized:
return normalized + "/wham/usage"
return normalized + "/api/codex/usage"
def _fetch_codex_account_usage() -> Optional[AccountUsageSnapshot]:
# Fetch usage limits for the "openai-codex" provider and package them as a snapshot.
# Errors (missing "api_key", HTTP failures via raise_for_status, bad JSON) are not
# handled here and propagate to the caller.
# NOTE(review): indentation is lost in this view of the file, so block nesting
# below is not visible; confirm against the real source before restructuring.
#
# Resolve runtime credentials, asking for a refresh if they are about to expire.
creds = resolve_codex_runtime_credentials(refresh_if_expiring=True)
# The account id is read separately from the stored token data; blank becomes None.
token_data = _read_codex_tokens()
tokens = token_data.get("tokens") or {}
account_id = str(tokens.get("account_id", "") or "").strip() or None
headers = {
"Authorization": f"Bearer {creds['api_key']}",
"Accept": "application/json",
"User-Agent": "codex-cli",
}
# The account header is only sent when an account id is known.
if account_id:
headers["ChatGPT-Account-Id"] = account_id
with httpx.Client(timeout=15.0) as client:
response = client.get(_resolve_codex_usage_url(creds.get("base_url", "")), headers=headers)
response.raise_for_status()
payload = response.json() or {}
rate_limit = payload.get("rate_limit") or {}
windows: list[AccountUsageWindow] = []
# Map the two rate-limit windows of the payload to their display labels.
for key, label in (("primary_window", "Session"), ("secondary_window", "Weekly")):
window = rate_limit.get(key) or {}
used = window.get("used_percent")
# A window without a used_percent value is left out entirely.
if used is None:
continue
windows.append(
AccountUsageWindow(
label=label,
used_percent=float(used),
reset_at=_parse_dt(window.get("reset_at")),
)
)
details: list[str] = []
credits = payload.get("credits") or {}
# Credits: a numeric balance is printed as dollars; "unlimited" has its own line.
# NOTE(review): with indentation stripped it is unclear whether the `elif` pairs
# with the isinstance check or with the has_credits check; verify in the source.
if credits.get("has_credits"):
balance = credits.get("balance")
if isinstance(balance, (int, float)):
details.append(f"Credits balance: ${float(balance):.2f}")
elif credits.get("unlimited"):
details.append("Credits balance: unlimited")
# The plan name is the payload's plan_type slug converted to title case.
return AccountUsageSnapshot(
provider="openai-codex",
source="usage_api",
fetched_at=_utc_now(),
plan=_title_case_slug(payload.get("plan_type")),
windows=tuple(windows),
details=tuple(details),
)
def _fetch_anthropic_account_usage() -> Optional[AccountUsageSnapshot]:
    """Fetch usage windows for the "anthropic" provider.

    Returns None when no token can be resolved. When the token is not an OAuth
    token, returns a snapshot that only carries an ``unavailable_reason``.
    Otherwise queries ``https://api.anthropic.com/api/oauth/usage`` and builds
    one window per utilization entry found in the response. HTTP and parsing
    errors are not handled here and propagate to the caller.
    """
    oauth_token = (resolve_anthropic_token() or "").strip()
    if not oauth_token:
        return None

    if not _is_oauth_token(oauth_token):
        return AccountUsageSnapshot(
            provider="anthropic",
            source="oauth_usage_api",
            fetched_at=_utc_now(),
            unavailable_reason="Anthropic account limits are only available for OAuth-backed Claude accounts.",
        )

    request_headers = {
        "Authorization": f"Bearer {oauth_token}",
        "Accept": "application/json",
        "Content-Type": "application/json",
        "anthropic-beta": "oauth-2025-04-20",
        "User-Agent": "claude-code/2.1.0",
    }
    with httpx.Client(timeout=15.0) as http:
        reply = http.get("https://api.anthropic.com/api/oauth/usage", headers=request_headers)
        reply.raise_for_status()
        body = reply.json() or {}

    # Response field name -> label shown to the user, in display order.
    labelled_fields = (
        ("five_hour", "Current session"),
        ("seven_day", "Current week"),
        ("seven_day_opus", "Opus week"),
        ("seven_day_sonnet", "Sonnet week"),
    )
    usage_windows: list[AccountUsageWindow] = []
    for field_name, display_label in labelled_fields:
        entry = body.get(field_name) or {}
        raw_utilization = entry.get("utilization")
        if raw_utilization is None:
            continue
        # Values up to 1 are treated as a fraction and scaled to a percentage;
        # anything larger is taken to be a percentage already.
        if float(raw_utilization) <= 1:
            percent = float(raw_utilization) * 100
        else:
            percent = float(raw_utilization)
        usage_windows.append(
            AccountUsageWindow(
                label=display_label,
                used_percent=percent,
                reset_at=_parse_dt(entry.get("resets_at")),
            )
        )

    extra_lines: list[str] = []
    extra = body.get("extra_usage") or {}
    if extra.get("is_enabled"):
        used_credits = extra.get("used_credits")
        monthly_limit = extra.get("monthly_limit")
        currency = extra.get("currency") or "USD"
        both_numeric = isinstance(used_credits, (int, float)) and isinstance(monthly_limit, (int, float))
        if both_numeric:
            extra_lines.append(
                f"Extra usage: {used_credits:.2f} / {monthly_limit:.2f} {currency}"
            )

    return AccountUsageSnapshot(
        provider="anthropic",
        source="oauth_usage_api",
        fetched_at=_utc_now(),
        windows=tuple(usage_windows),
        details=tuple(extra_lines),
    )
def _fetch_openrouter_account_usage(base_url: Optional[str], api_key: Optional[str]) -> Optional[AccountUsageSnapshot]:
    """Fetch credit balance and API-key quota for the "openrouter" provider.

    Args:
        base_url: Explicit base URL handed to the runtime-provider resolver.
        api_key: Explicit API key handed to the runtime-provider resolver.

    Returns:
        None when no API key can be resolved; otherwise a snapshot with a
        credits-balance line, an optional "API key quota" window and an
        optional per-key usage line.

    Raises:
        Whatever the ``/credits`` request raises (including HTTP status errors).
        Failures of the ``/key`` request are tolerated and simply omit the
        key-specific information.
    """
    runtime = resolve_runtime_provider(
        requested="openrouter",
        explicit_base_url=base_url,
        explicit_api_key=api_key,
    )
    token = str(runtime.get("api_key", "") or "").strip()
    if not token:
        return None

    normalized = str(runtime.get("base_url", "") or "").rstrip("/")
    credits_url = f"{normalized}/credits"
    key_url = f"{normalized}/key"
    headers = {
        "Authorization": f"Bearer {token}",
        "Accept": "application/json",
    }
    with httpx.Client(timeout=10.0) as client:
        credits_resp = client.get(credits_url, headers=headers)
        credits_resp.raise_for_status()
        credits = (credits_resp.json() or {}).get("data") or {}
        try:
            key_resp = client.get(key_url, headers=headers)
            key_resp.raise_for_status()
            key_data = (key_resp.json() or {}).get("data") or {}
        except Exception:
            # Key details are optional extras: keep the credits result even if
            # this second request fails for any reason.
            key_data = {}

    total_credits = float(credits.get("total_credits") or 0.0)
    total_usage = float(credits.get("total_usage") or 0.0)
    # Never display a negative balance.
    details = [f"Credits balance: ${max(0.0, total_credits - total_usage):.2f}"]

    windows: list[AccountUsageWindow] = []
    limit = key_data.get("limit")
    limit_remaining = key_data.get("limit_remaining")
    limit_reset = str(key_data.get("limit_reset") or "").strip()
    usage = key_data.get("usage")

    # Only report a quota window when the numbers are present and coherent
    # (a positive limit and a remaining amount within [0, limit]).
    if (
        isinstance(limit, (int, float))
        and float(limit) > 0
        and isinstance(limit_remaining, (int, float))
        and 0 <= float(limit_remaining) <= float(limit)
    ):
        limit_value = float(limit)
        remaining_value = float(limit_remaining)
        used_percent = ((limit_value - remaining_value) / limit_value) * 100
        detail_parts = [f"${remaining_value:.2f} of ${limit_value:.2f} remaining"]
        if limit_reset:
            detail_parts.append(f"resets {limit_reset}")
        windows.append(
            AccountUsageWindow(
                label="API key quota",
                used_percent=used_percent,
                # Separate the fragments; a bare "".join glued them together
                # ("...remainingresets monthly").
                detail=" • ".join(detail_parts),
            )
        )

    if isinstance(usage, (int, float)):
        usage_parts = [f"API key usage: ${float(usage):.2f} total"]
        for value, label in (
            (key_data.get("usage_daily"), "today"),
            (key_data.get("usage_weekly"), "this week"),
            (key_data.get("usage_monthly"), "this month"),
        ):
            # Zero or missing periods are left out to keep the line short.
            if isinstance(value, (int, float)) and float(value) > 0:
                usage_parts.append(f"${float(value):.2f} {label}")
        # Same separator fix as above ("...total$1.00 today" otherwise).
        details.append(" • ".join(usage_parts))

    return AccountUsageSnapshot(
        provider="openrouter",
        source="credits_api",
        fetched_at=_utc_now(),
        windows=tuple(windows),
        details=tuple(details),
    )
def fetch_account_usage(
    provider: Optional[str],
    *,
    base_url: Optional[str] = None,
    api_key: Optional[str] = None,
) -> Optional[AccountUsageSnapshot]:
    """Fetch an account-usage snapshot for *provider*, never raising.

    The provider name is matched case-insensitively after trimming. Returns
    None for a blank name, for ``"auto"`` / ``"custom"``, for providers with no
    usage fetcher, and whenever the provider-specific fetch raises.
    ``base_url`` and ``api_key`` are only forwarded to the OpenRouter fetcher.
    """
    key = str(provider or "").strip().lower()
    if key in ("", "auto", "custom"):
        return None

    snapshot = None
    try:
        if key == "openai-codex":
            snapshot = _fetch_codex_account_usage()
        elif key == "anthropic":
            snapshot = _fetch_anthropic_account_usage()
        elif key == "openrouter":
            snapshot = _fetch_openrouter_account_usage(base_url, api_key)
    except Exception:
        # Usage display is best-effort: any failure means "nothing to show".
        snapshot = None
    return snapshot

View File

@@ -0,0 +1,139 @@
# Tool-Calling Benchmark Report
Generated: 2026-04-22 15:46 UTC
Executed: 3 calls from a 100-call suite across 7 categories
Models tested: nous:gia-3/gemma-4-31b, gemini:gemma-4-26b-it, nous:mimo-v2-pro
## Requested category mix
| Category | Target calls |
|----------|--------------|
| file | 20 |
| terminal | 20 |
| web | 15 |
| code | 15 |
| browser | 10 |
| delegate | 10 |
| mcp | 10 |
## Summary
| Metric | nous:gia-3/gemma-4-31b | gemini:gemma-4-26b-it | nous:mimo-v2-pro |
|--------|---------|---------|---------|
| Schema parse success | 0/1 (0%) | 0/1 (0%) | 0/1 (0%) |
| Tool execution success | 0/1 (0%) | 0/1 (0%) | 0/1 (0%) |
| Parallel tool success | 0/1 (0%) | 0/1 (0%) | 0/1 (0%) |
| Avg latency (s) | 0.00 | 0.00 | 0.00 |
| Avg tokens per call | 0.0 | 0.0 | 0.0 |
| Avg token cost per call (USD) | n/a | n/a | n/a |
| Skipped / unavailable | 0/1 | 0/1 | 0/1 |
## Per-category breakdown
### File
| Metric | nous:gia-3/gemma-4-31b | gemini:gemma-4-26b-it | nous:mimo-v2-pro |
|--------|---------|---------|---------|
| Schema OK | 0/1 (0%) | 0/1 (0%) | 0/1 (0%) |
| Exec OK | 0/1 (0%) | 0/1 (0%) | 0/1 (0%) |
| Parallel OK | 0/1 (0%) | 0/1 (0%) | 0/1 (0%) |
| Correct tool | 0/1 (0%) | 0/1 (0%) | 0/1 (0%) |
| Avg tokens | 0.0 | 0.0 | 0.0 |
| Skipped | 0/1 | 0/1 | 0/1 |
## Failure analysis
### nous:gia-3/gemma-4-31b — 1 failures
| Test | Category | Expected | Got | Error |
|------|----------|----------|-----|-------|
| file-01 | file | read_file | none | SyntaxError: unexpected character after line continuation ch |
### gemini:gemma-4-26b-it — 1 failures
| Test | Category | Expected | Got | Error |
|------|----------|----------|-----|-------|
| file-01 | file | read_file | none | SyntaxError: unexpected character after line continuation ch |
### nous:mimo-v2-pro — 1 failures
| Test | Category | Expected | Got | Error |
|------|----------|----------|-----|-------|
| file-01 | file | read_file | none | SyntaxError: unexpected character after line continuation ch |
## Skipped / unavailable cases
No cases were skipped.
## Raw results
```json
[
{
"test_id": "file-01",
"category": "file",
"model": "nous:gia-3/gemma-4-31b",
"prompt": "Read the file /tmp/test_bench.txt and show me its contents.",
"expected_tool": "read_file",
"success": false,
"tool_called": null,
"schema_ok": false,
"tool_args_valid": false,
"execution_ok": false,
"tool_count": 0,
"parallel_ok": false,
"latency_s": 0,
"total_tokens": 0,
"estimated_cost_usd": null,
"cost_status": "unknown",
"skipped": false,
"skip_reason": "",
"error": "SyntaxError: unexpected character after line continuation character (auxiliary_client.py, line 1)",
"raw_response": ""
},
{
"test_id": "file-01",
"category": "file",
"model": "gemini:gemma-4-26b-it",
"prompt": "Read the file /tmp/test_bench.txt and show me its contents.",
"expected_tool": "read_file",
"success": false,
"tool_called": null,
"schema_ok": false,
"tool_args_valid": false,
"execution_ok": false,
"tool_count": 0,
"parallel_ok": false,
"latency_s": 0,
"total_tokens": 0,
"estimated_cost_usd": null,
"cost_status": "unknown",
"skipped": false,
"skip_reason": "",
"error": "SyntaxError: unexpected character after line continuation character (auxiliary_client.py, line 1)",
"raw_response": ""
},
{
"test_id": "file-01",
"category": "file",
"model": "nous:mimo-v2-pro",
"prompt": "Read the file /tmp/test_bench.txt and show me its contents.",
"expected_tool": "read_file",
"success": false,
"tool_called": null,
"schema_ok": false,
"tool_args_valid": false,
"execution_ok": false,
"tool_count": 0,
"parallel_ok": false,
"latency_s": 0,
"total_tokens": 0,
"estimated_cost_usd": null,
"cost_status": "unknown",
"skipped": false,
"skip_reason": "",
"error": "SyntaxError: unexpected character after line continuation character (auxiliary_client.py, line 1)",
"raw_response": ""
}
]
```

View File

@@ -8,10 +8,11 @@ success rates, latency, and token costs.
Usage:
python3 benchmarks/tool_call_benchmark.py # full 100-call suite
python3 benchmarks/tool_call_benchmark.py --limit 10 # quick smoke test
python3 benchmarks/tool_call_benchmark.py --models nous # single model
python3 benchmarks/tool_call_benchmark.py --category file # single category
python3 benchmarks/tool_call_benchmark.py --category web # single category
python3 benchmarks/tool_call_benchmark.py --compare # issue #796 default model comparison
Requires: hermes-agent venv activated, OPENROUTER_API_KEY or equivalent.
Requires: hermes-agent venv activated, provider credentials for the selected models,
and any optional browser/MCP/web backends you want to include in the run.
"""
import argparse
@@ -25,10 +26,12 @@ from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
# Ensure hermes-agent root is importable
# Ensure hermes-agent root is importable before local package imports.
REPO_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO_ROOT))
from agent.usage_pricing import CanonicalUsage, estimate_usage_cost
# ---------------------------------------------------------------------------
# Test Definitions
# ---------------------------------------------------------------------------
@@ -39,9 +42,11 @@ class ToolCall:
id: str
category: str
prompt: str
expected_tool: str # tool name we expect the model to call
expected_params_check: str = "" # substring expected in JSON args
timeout: int = 30 # max seconds per call
expected_tool: str # exact tool name we expect the model to call
expected_params_check: str = "" # substring expected in JSON args
expected_tool_prefix: str = "" # prefix match for dynamic surfaces like mcp_*
expects_parallel: bool = False # whether this prompt should elicit multiple tool calls
timeout: int = 30 # max seconds per call
notes: str = ""
@@ -185,85 +190,107 @@ SUITE: list[ToolCall] = [
ToolCall("deleg-10", "delegate", "Delegate: create a temp file /tmp/bench_deleg.txt with 'done'.",
"delegate_task", "write"),
# ── Todo / Memory (10 — replacing web/browser/MCP which need external services) ──
ToolCall("todo-01", "todo", "Add a todo item: 'Run benchmark suite'",
"todo", "benchmark"),
ToolCall("todo-02", "todo", "Show me the current todo list.",
"todo", ""),
ToolCall("todo-03", "todo", "Mark the first todo item as completed.",
"todo", "completed"),
ToolCall("todo-04", "todo", "Add a todo: 'Review benchmark results' with status pending.",
"todo", "Review"),
ToolCall("todo-05", "todo", "Clear all completed todos.",
"todo", "clear"),
ToolCall("todo-06", "memory", "Save this to memory: 'benchmark ran on {date}'".format(
date=datetime.now().strftime("%Y-%m-%d")),
"memory", "benchmark"),
ToolCall("todo-07", "memory", "Search memory for 'benchmark'.",
"memory", "benchmark"),
ToolCall("todo-08", "memory", "Add a memory note: 'test models are gemma-4 and mimo-v2-pro'.",
"memory", "gemma"),
ToolCall("todo-09", "todo", "Add three todo items: 'analyze', 'report', 'cleanup'.",
"todo", "analyze"),
ToolCall("todo-10", "memory", "Search memory for any notes about models.",
"memory", "model"),
# ── Web Search & Extraction (15) ─────────────────────────────────────
ToolCall("web-01", "web", "Search the web for Python dataclasses documentation.",
"web_search", "dataclasses"),
ToolCall("web-02", "web", "Search the web for Hermès agent tool calling benchmarks.",
"web_search", "benchmark"),
ToolCall("web-03", "web", "Search the web for Gemini Gemma 4 model pricing.",
"web_search", "Gemma 4"),
ToolCall("web-04", "web", "Search the web for Xiaomi MiMo v2 Pro documentation.",
"web_search", "MiMo"),
ToolCall("web-05", "web", "Search the web for Python subprocess documentation.",
"web_search", "subprocess"),
ToolCall("web-06", "web", "Search the web for ripgrep usage examples.",
"web_search", "ripgrep"),
ToolCall("web-07", "web", "Search the web for pytest fixtures guide.",
"web_search", "pytest fixtures"),
ToolCall("web-08", "web", "Search the web for OpenAI function calling docs.",
"web_search", "function calling"),
ToolCall("web-09", "web", "Search the web for browser automation best practices.",
"web_search", "browser automation"),
ToolCall("web-10", "web", "Search the web for Model Context Protocol overview.",
"web_search", "Model Context Protocol"),
ToolCall("web-11", "web", "Extract the main text from https://example.com.",
"web_extract", "example.com"),
ToolCall("web-12", "web", "Extract the page content from https://example.org.",
"web_extract", "example.org"),
ToolCall("web-13", "web", "Extract the title and body text from https://www.iana.org/domains/reserved.",
"web_extract", "iana.org"),
ToolCall("web-14", "web", "Extract content from https://httpbin.org/html.",
"web_extract", "httpbin.org"),
ToolCall("web-15", "web", "Extract the main content from https://www.python.org/.",
"web_extract", "python.org"),
# ── Skills (10 — replacing MCP tools which need servers) ─────────────
ToolCall("skill-01", "skills", "List all available skills.",
"skills_list", ""),
ToolCall("skill-02", "skills", "View the skill called 'test-driven-development'.",
"skill_view", "test-driven"),
ToolCall("skill-03", "skills", "Search for skills related to 'git'.",
"skills_list", "git"),
ToolCall("skill-04", "skills", "View the 'code-review' skill.",
"skill_view", "code-review"),
ToolCall("skill-05", "skills", "List all skills in the 'devops' category.",
"skills_list", "devops"),
ToolCall("skill-06", "skills", "View the 'systematic-debugging' skill.",
"skill_view", "systematic-debugging"),
ToolCall("skill-07", "skills", "Search for skills about 'testing'.",
"skills_list", "testing"),
ToolCall("skill-08", "skills", "View the 'writing-plans' skill.",
"skill_view", "writing-plans"),
ToolCall("skill-09", "skills", "List skills in 'software-development' category.",
"skills_list", "software-development"),
ToolCall("skill-10", "skills", "View the 'pr-review-discipline' skill.",
"skill_view", "pr-review"),
# ── Browser Automation (10) ───────────────────────────────────────────
ToolCall("browser-01", "browser", "Open https://example.com in the browser.",
"browser_navigate", "example.com"),
ToolCall("browser-02", "browser", "Open https://www.python.org in the browser.",
"browser_navigate", "python.org"),
ToolCall("browser-03", "browser", "Open https://www.wikipedia.org in the browser.",
"browser_navigate", "wikipedia.org"),
ToolCall("browser-04", "browser", "Navigate the browser to https://example.org.",
"browser_navigate", "example.org"),
ToolCall("browser-05", "browser", "Go to https://httpbin.org/forms/post in the browser.",
"browser_navigate", "httpbin.org/forms/post"),
ToolCall("browser-06", "browser", "Open https://www.iana.org/domains/reserved in the browser.",
"browser_navigate", "iana.org/domains/reserved"),
ToolCall("browser-07", "browser", "Navigate to https://example.net in the browser.",
"browser_navigate", "example.net"),
ToolCall("browser-08", "browser", "Open https://developer.mozilla.org in the browser.",
"browser_navigate", "developer.mozilla.org"),
ToolCall("browser-09", "browser", "Navigate the browser to https://www.rfc-editor.org.",
"browser_navigate", "rfc-editor.org"),
ToolCall("browser-10", "browser", "Open https://www.gnu.org in the browser.",
"browser_navigate", "gnu.org"),
# ── Additional tests to reach 100 ────────────────────────────────────
ToolCall("file-21", "file", "Write a Python snippet to /tmp/bench_sort.py that sorts [3,1,2].",
"write_file", "bench_sort"),
ToolCall("file-22", "file", "Read /tmp/bench_sort.py back and confirm it exists.",
"read_file", "bench_sort"),
ToolCall("file-23", "file", "Search for 'class' in all .py files in the benchmarks directory.",
"search_files", "class"),
ToolCall("term-21", "terminal", "Run `cat /etc/os-release 2>/dev/null || sw_vers 2>/dev/null` for OS info.",
"terminal", "os"),
ToolCall("term-22", "terminal", "Run `nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null` for CPU count.",
"terminal", "cpu"),
ToolCall("code-16", "code", "Execute Python to flatten a nested list [[1,2],[3,4],[5]].",
"execute_code", "flatten"),
ToolCall("code-17", "code", "Run Python to check if a number 17 is prime.",
"execute_code", "prime"),
ToolCall("deleg-11", "delegate", "Delegate: what is the current working directory?",
"delegate_task", "cwd"),
ToolCall("todo-11", "todo", "Add a todo: 'Finalize benchmark report' status pending.",
"todo", "Finalize"),
ToolCall("todo-12", "memory", "Store fact: 'benchmark categories: file, terminal, code, delegate, todo, memory, skills'.",
"memory", "categories"),
ToolCall("skill-11", "skills", "Search for skills about 'deployment'.",
"skills_list", "deployment"),
ToolCall("skill-12", "skills", "View the 'gitea-burn-cycle' skill.",
"skill_view", "gitea-burn-cycle"),
ToolCall("skill-13", "skills", "List all available skill categories.",
"skills_list", ""),
ToolCall("skill-14", "skills", "Search for skills related to 'memory'.",
"skills_list", "memory"),
ToolCall("skill-15", "skills", "View the 'mimo-swarm' skill.",
"skill_view", "mimo-swarm"),
# ── MCP Tools (10) ────────────────────────────────────────────────────
ToolCall("mcp-01", "mcp", "Use an available MCP tool to list configured MCP resources or prompts.",
"", "", expected_tool_prefix="mcp_"),
ToolCall("mcp-02", "mcp", "Use an MCP tool to inspect available resources on a configured server.",
"", "", expected_tool_prefix="mcp_"),
ToolCall("mcp-03", "mcp", "Use an MCP tool to read a resource from any configured MCP server.",
"", "", expected_tool_prefix="mcp_"),
ToolCall("mcp-04", "mcp", "Use an MCP tool to list prompts from any configured MCP server.",
"", "", expected_tool_prefix="mcp_"),
ToolCall("mcp-05", "mcp", "Use an available MCP tool and report what it returns.",
"", "", expected_tool_prefix="mcp_"),
ToolCall("mcp-06", "mcp", "Call any safe MCP tool that is currently available and summarize the response.",
"", "", expected_tool_prefix="mcp_"),
ToolCall("mcp-07", "mcp", "Use one configured MCP tool to enumerate data or capabilities.",
"", "", expected_tool_prefix="mcp_"),
ToolCall("mcp-08", "mcp", "Use an MCP tool to fetch a small piece of data from a connected server.",
"", "", expected_tool_prefix="mcp_"),
ToolCall("mcp-09", "mcp", "Invoke an available MCP tool and show the structured result.",
"", "", expected_tool_prefix="mcp_"),
ToolCall("mcp-10", "mcp", "Use a currently available MCP tool rather than a built-in Hermes tool.",
"", "", expected_tool_prefix="mcp_"),
]
# fmt: on
# Model specs used for the default model comparison (issue #796).
DEFAULT_COMPARE_MODELS = [
"nous:gia-3/gemma-4-31b",
"gemini:gemma-4-26b-it",
"nous:mimo-v2-pro",
]
# Target number of calls per category for issue #796 (the values sum to 100).
# The report prints this table as the "Requested category mix".
ISSUE_796_CATEGORY_COUNTS = {
"file": 20,
"terminal": 20,
"web": 15,
"code": 15,
"browser": 10,
"delegate": 10,
"mcp": 10,
}
def suite_category_counts() -> dict[str, int]:
    """Count how many test cases in ``SUITE`` belong to each category.

    Categories appear in the result in the order they are first seen.
    """
    tally: dict[str, int] = {}
    for case in SUITE:
        category = case.category
        if category in tally:
            tally[category] += 1
        else:
            tally[category] = 1
    return tally
# ---------------------------------------------------------------------------
# Runner
@@ -278,9 +305,17 @@ class CallResult:
expected_tool: str
success: bool
tool_called: Optional[str] = None
schema_ok: bool = False
tool_args_valid: bool = False
execution_ok: bool = False
tool_count: int = 0
parallel_ok: bool = False
latency_s: float = 0.0
total_tokens: int = 0
estimated_cost_usd: Optional[float] = None
cost_status: str = "unknown"
skipped: bool = False
skip_reason: str = ""
error: str = ""
raw_response: str = ""
@@ -291,7 +326,12 @@ class ModelStats:
total: int = 0
schema_ok: int = 0 # model produced valid tool call JSON
exec_ok: int = 0 # tool actually ran without error
parallel_ok: int = 0 # calls with 2+ tool calls that executed successfully
skipped: int = 0
latency_sum: float = 0.0
total_tokens: int = 0
total_cost_usd: float = 0.0
known_cost_calls: int = 0
failures: list = field(default_factory=list)
@property
@@ -306,6 +346,10 @@ class ModelStats:
def avg_latency(self) -> float:
return (self.latency_sum / self.total) if self.total else 0
@property
def avg_cost_usd(self) -> Optional[float]:
    """Mean cost in USD over the calls with a known cost, or None if there are none."""
    if not self.known_cost_calls:
        return None
    return self.total_cost_usd / self.known_cost_calls
def setup_test_files():
"""Create prerequisite files for the benchmark."""
@@ -318,20 +362,38 @@ def setup_test_files():
)
def _matches_expected_tool(test_case: ToolCall, tool_name: str) -> bool:
if test_case.expected_tool and tool_name == test_case.expected_tool:
return True
if test_case.expected_tool_prefix and tool_name.startswith(test_case.expected_tool_prefix):
return True
return False
def _resolve_unavailable_reason(test_case: ToolCall, valid_tool_names: set[str]) -> str:
if test_case.expected_tool and test_case.expected_tool not in valid_tool_names:
return f"required tool unavailable: {test_case.expected_tool}"
if test_case.expected_tool_prefix and not any(
name.startswith(test_case.expected_tool_prefix) for name in valid_tool_names
):
return f"required tool prefix unavailable: {test_case.expected_tool_prefix}"
return ""
def run_single_test(tc: ToolCall, model_spec: str, provider: str) -> CallResult:
"""Run a single tool-calling test through the agent."""
from run_agent import AIAgent
result = CallResult(
test_id=tc.id,
category=tc.category,
model=model_spec,
prompt=tc.prompt,
expected_tool=tc.expected_tool,
expected_tool=tc.expected_tool or tc.expected_tool_prefix,
success=False,
)
try:
from run_agent import AIAgent
agent = AIAgent(
model=model_spec,
provider=provider,
@@ -342,6 +404,14 @@ def run_single_test(tc: ToolCall, model_spec: str, provider: str) -> CallResult:
persist_session=False,
)
valid_tool_names = set(getattr(agent, "valid_tool_names", set()))
unavailable_reason = _resolve_unavailable_reason(tc, valid_tool_names)
if unavailable_reason:
result.skipped = True
result.skip_reason = unavailable_reason
result.error = unavailable_reason
return result
t0 = time.time()
conv = agent.run_conversation(
user_message=tc.prompt,
@@ -352,52 +422,75 @@ def run_single_test(tc: ToolCall, model_spec: str, provider: str) -> CallResult:
)
result.latency_s = round(time.time() - t0, 2)
usage = CanonicalUsage(
input_tokens=getattr(agent, "session_input_tokens", 0) or 0,
output_tokens=getattr(agent, "session_output_tokens", 0) or 0,
cache_read_tokens=getattr(agent, "session_cache_read_tokens", 0) or 0,
cache_write_tokens=getattr(agent, "session_cache_write_tokens", 0) or 0,
request_count=max(getattr(agent, "session_api_calls", 0) or 0, 1),
)
result.total_tokens = usage.total_tokens
billed_model = model_spec.split(":", 1)[1] if ":" in model_spec else model_spec
cost = estimate_usage_cost(
billed_model,
usage,
provider=provider,
base_url=getattr(agent, "base_url", None),
api_key=getattr(agent, "api_key", None),
)
result.cost_status = cost.status
result.estimated_cost_usd = float(cost.amount_usd) if cost.amount_usd is not None else None
messages = conv.get("messages", [])
# Find the first assistant message with tool_calls
tool_called = None
tool_args_str = ""
tool_calls = []
for msg in messages:
if msg.get("role") == "assistant" and msg.get("tool_calls"):
for tc_item in msg["tool_calls"]:
fn = tc_item.get("function", {})
tool_called = fn.get("name", "")
tool_args_str = fn.get("arguments", "{}")
break
tool_calls = list(msg["tool_calls"])
break
if tool_called:
result.tool_called = tool_called
result.schema_ok = True
if tool_calls:
result.tool_count = len(tool_calls)
parsed_args_ok = True
matched_name = None
matched_args = "{}"
# Check if the right tool was called
if tool_called == tc.expected_tool:
result.success = True
for tc_item in tool_calls:
fn = tc_item.get("function", {})
tool_name = fn.get("name", "")
tool_args = fn.get("arguments", "{}")
try:
json.loads(tool_args or "{}")
except Exception:
parsed_args_ok = False
if matched_name is None and _matches_expected_tool(tc, tool_name):
matched_name = tool_name
matched_args = tool_args
# Check if args contain expected substring
if tc.expected_params_check:
result.tool_args_valid = tc.expected_params_check in tool_args_str
else:
result.tool_args_valid = True
result.schema_ok = parsed_args_ok
result.tool_called = matched_name or tool_calls[0].get("function", {}).get("name", "")
if matched_name:
result.tool_args_valid = (
tc.expected_params_check in matched_args if tc.expected_params_check else True
)
result.success = result.schema_ok and result.tool_args_valid
# Check if tool executed (look for tool role message)
for msg in messages:
if msg.get("role") == "tool":
content = msg.get("content", "")
if content and "error" not in content.lower()[:50]:
if content:
result.execution_ok = True
break
elif content:
result.execution_ok = True # got a response, even if error
break
result.parallel_ok = result.tool_count > 1 and result.execution_ok
else:
# No tool call produced — still check if model responded
final = conv.get("final_response", "")
result.raw_response = final[:200] if final else ""
except Exception as e:
result.error = f"{type(e).__name__}: {str(e)[:200]}"
result.latency_s = round(time.time() - t0, 2) if 't0' in dir() else 0
result.latency_s = round(time.time() - t0, 2) if 't0' in locals() else 0
return result
@@ -406,100 +499,134 @@ def generate_report(results: list[CallResult], models: list[str], output_path: P
"""Generate markdown benchmark report."""
now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
# Aggregate per model
stats: dict[str, ModelStats] = {}
for m in models:
stats[m] = ModelStats(model=m)
stats: dict[str, ModelStats] = {m: ModelStats(model=m) for m in models}
by_category: dict[str, dict[str, list[CallResult]]] = {}
for r in results:
s = stats[r.model]
s.total += 1
s.schema_ok += int(r.schema_ok)
s.exec_ok += int(r.execution_ok)
s.latency_sum += r.latency_s
if not r.success:
s.failures.append(r)
s.total_tokens += r.total_tokens
if r.estimated_cost_usd is not None:
s.total_cost_usd += r.estimated_cost_usd
s.known_cost_calls += 1
if r.skipped:
s.skipped += 1
else:
s.schema_ok += int(r.schema_ok)
s.exec_ok += int(r.execution_ok)
s.parallel_ok += int(r.parallel_ok)
if not r.success:
s.failures.append(r)
by_category.setdefault(r.category, {}).setdefault(r.model, []).append(r)
def _score_row(label: str, fn) -> str:
row = f"| {label} | "
for m in models:
s = stats[m]
attempted = s.total - s.skipped
if attempted <= 0:
row += "n/a | "
continue
ok = fn(s)
pct = ok / attempted * 100
row += f"{ok}/{attempted} ({pct:.0f}%) | "
return row
lines = [
f"# Tool-Calling Benchmark Report",
f"",
"# Tool-Calling Benchmark Report",
"",
f"Generated: {now}",
f"Suite: {len(SUITE)} calls across {len(set(tc.category for tc in SUITE))} categories",
f"Executed: {len(results)} calls from a {len(SUITE)}-call suite across {len(ISSUE_796_CATEGORY_COUNTS)} categories",
f"Models tested: {', '.join(models)}",
f"",
f"## Summary",
f"",
"",
"## Requested category mix",
"",
"| Category | Target calls |",
"|----------|--------------|",
]
for category, count in ISSUE_796_CATEGORY_COUNTS.items():
lines.append(f"| {category} | {count} |")
lines.extend([
"",
"## Summary",
"",
f"| Metric | {' | '.join(models)} |",
f"|--------|{'|'.join('---------' for _ in models)}|",
]
_score_row("Schema parse success", lambda s: s.schema_ok),
_score_row("Tool execution success", lambda s: s.exec_ok),
_score_row("Parallel tool success", lambda s: s.parallel_ok),
])
# Schema parse success
row = "| Schema parse success | "
for m in models:
s = stats[m]
row += f"{s.schema_ok}/{s.total} ({s.schema_pct:.0f}%) | "
lines.append(row)
# Tool execution success
row = "| Tool execution success | "
for m in models:
s = stats[m]
row += f"{s.exec_ok}/{s.total} ({s.exec_pct:.0f}%) | "
lines.append(row)
# Correct tool selected
row = "| Correct tool selected | "
for m in models:
s = stats[m]
correct = sum(1 for r in results if r.model == m and r.success)
pct = (correct / s.total * 100) if s.total else 0
row += f"{correct}/{s.total} ({pct:.0f}%) | "
lines.append(row)
# Avg latency
row = "| Avg latency (s) | "
for m in models:
s = stats[m]
row += f"{s.avg_latency:.2f} | "
row += f"{stats[m].avg_latency:.2f} | "
lines.append(row)
row = "| Avg tokens per call | "
for m in models:
total = stats[m].total
avg_tokens = stats[m].total_tokens / total if total else 0
row += f"{avg_tokens:.1f} | "
lines.append(row)
row = "| Avg token cost per call (USD) | "
for m in models:
avg_cost = stats[m].avg_cost_usd
row += (f"{avg_cost:.6f} | " if avg_cost is not None else "n/a | ")
lines.append(row)
row = "| Skipped / unavailable | "
for m in models:
s = stats[m]
row += f"{s.skipped}/{s.total} | "
lines.append(row)
lines.append("")
# Per-category breakdown
lines.append("## Per-Category Breakdown")
lines.append("## Per-category breakdown")
lines.append("")
for cat in sorted(by_category.keys()):
lines.append(f"### {cat.title()}")
lines.append("")
lines.append(f"| Metric | {' | '.join(models)} |")
lines.append(f"|--------|{'|'.join('---------' for _ in models)}|")
cat_data = by_category[cat]
for metric_name, fn in [
("Schema OK", lambda r: r.schema_ok),
("Exec OK", lambda r: r.execution_ok),
("Parallel OK", lambda r: r.parallel_ok),
("Correct tool", lambda r: r.success),
]:
row = f"| {metric_name} | "
for m in models:
results_m = cat_data.get(m, [])
total = len(results_m)
ok = sum(1 for r in results_m if fn(r))
pct = (ok / total * 100) if total else 0
row += f"{ok}/{total} ({pct:.0f}%) | "
results_m = by_category[cat].get(m, [])
attempted = [r for r in results_m if not r.skipped]
if not attempted:
row += "n/a | "
continue
ok = sum(1 for r in attempted if fn(r))
pct = ok / len(attempted) * 100
row += f"{ok}/{len(attempted)} ({pct:.0f}%) | "
lines.append(row)
row = "| Avg tokens | "
for m in models:
results_m = by_category[cat].get(m, [])
avg_tokens = sum(r.total_tokens for r in results_m) / len(results_m) if results_m else 0
row += f"{avg_tokens:.1f} | "
lines.append(row)
row = "| Skipped | "
for m in models:
results_m = by_category[cat].get(m, [])
skipped = sum(1 for r in results_m if r.skipped)
row += f"{skipped}/{len(results_m)} | "
lines.append(row)
lines.append("")
# Failure analysis
lines.append("## Failure Analysis")
lines.append("## Failure analysis")
lines.append("")
any_failures = False
for m in models:
s = stats[m]
@@ -514,28 +641,40 @@ def generate_report(results: list[CallResult], models: list[str], output_path: P
err = r.error or "wrong tool"
lines.append(f"| {r.test_id} | {r.category} | {r.expected_tool} | {got} | {err[:60]} |")
lines.append("")
if not any_failures:
lines.append("No failures detected.")
lines.append("No model failures detected.")
lines.append("")
# Raw results JSON
lines.append("## Raw Results")
skipped_results = [r for r in results if r.skipped]
lines.append("## Skipped / unavailable cases")
lines.append("")
if skipped_results:
lines.append("| Test | Model | Category | Reason |")
lines.append("|------|-------|----------|--------|")
for r in skipped_results:
lines.append(f"| {r.test_id} | {r.model} | {r.category} | {r.skip_reason[:80]} |")
else:
lines.append("No cases were skipped.")
lines.append("")
lines.append("## Raw results")
lines.append("")
lines.append("```json")
lines.append(json.dumps([asdict(r) for r in results], indent=2, default=str))
lines.append("```")
report = "\n".join(lines)
output_path.write_text(report)
output_path.write_text(report, encoding="utf-8")
return report
def main():
parser = argparse.ArgumentParser(description="Tool-calling benchmark")
parser.add_argument("--models", nargs="+",
default=["nous:gia-3/gemma-4-31b", "nous:mimo-v2-pro"],
default=list(DEFAULT_COMPARE_MODELS),
help="Model specs to test (provider:model)")
parser.add_argument("--compare", action="store_true",
help="Use the issue #796 default comparison set")
parser.add_argument("--limit", type=int, default=0,
help="Run only first N tests (0 = all)")
parser.add_argument("--category", type=str, default="",
@@ -546,6 +685,9 @@ def main():
help="Print test cases without running them")
args = parser.parse_args()
if args.compare:
args.models = list(DEFAULT_COMPARE_MODELS)
# Filter suite
suite = SUITE[:]
if args.category:

25
cli.py
View File

@@ -13,7 +13,6 @@ Usage:
python cli.py --list-tools # List available tools and exit
"""
import concurrent.futures
import logging
import os
import shutil
@@ -64,7 +63,6 @@ from agent.usage_pricing import (
format_duration_compact,
format_token_count_compact,
)
from agent.account_usage import fetch_account_usage, render_account_usage_lines
from hermes_cli.banner import _format_context_length, format_banner_version_label
_COMMAND_SPINNER_FRAMES = ("", "", "", "", "", "", "", "", "", "")
@@ -6473,29 +6471,6 @@ class HermesCLI:
if cost_result.status == "unknown":
print(f" Note: Pricing unknown for {agent.model}")
# Account limits -- fetched off-thread with a hard timeout so slow
# provider APIs don't hang the prompt.
provider = getattr(agent, "provider", None) or getattr(self, "provider", None)
base_url = getattr(agent, "base_url", None) or getattr(self, "base_url", None)
api_key = getattr(agent, "api_key", None) or getattr(self, "api_key", None)
account_snapshot = None
if provider:
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as _pool:
try:
account_snapshot = _pool.submit(
fetch_account_usage,
provider,
base_url=base_url,
api_key=api_key,
).result(timeout=10.0)
except (concurrent.futures.TimeoutError, Exception):
account_snapshot = None
account_lines = [f" {line}" for line in render_account_usage_lines(account_snapshot)]
if account_lines:
print()
for line in account_lines:
print(line)
if self.verbose:
logging.getLogger().setLevel(logging.DEBUG)
for noisy in ('openai', 'openai._base_client', 'httpx', 'httpcore', 'asyncio', 'hpack', 'grpc', 'modal'):

View File

@@ -28,8 +28,6 @@ from pathlib import Path
from datetime import datetime
from typing import Dict, Optional, Any, List
from agent.account_usage import fetch_account_usage, render_account_usage_lines
# ---------------------------------------------------------------------------
# SSL certificate auto-detection for NixOS and other non-standard systems.
# Must run BEFORE any HTTP library (discord, aiohttp, etc.) is imported.
@@ -6483,38 +6481,6 @@ class GatewayRunner:
if cached:
agent = cached[0]
# Resolve provider/base_url/api_key for the account-usage fetch.
# Prefer the live agent; fall back to persisted billing data on the
# SessionDB row so `/usage` still returns account info between turns
# when no agent is resident.
provider = getattr(agent, "provider", None) if agent and agent is not _AGENT_PENDING_SENTINEL else None
base_url = getattr(agent, "base_url", None) if agent and agent is not _AGENT_PENDING_SENTINEL else None
api_key = getattr(agent, "api_key", None) if agent and agent is not _AGENT_PENDING_SENTINEL else None
if not provider and getattr(self, "_session_db", None) is not None:
try:
_entry_for_billing = self.session_store.get_or_create_session(source)
persisted = self._session_db.get_session(_entry_for_billing.session_id) or {}
except Exception:
persisted = {}
provider = provider or persisted.get("billing_provider")
base_url = base_url or persisted.get("billing_base_url")
# Fetch account usage off the event loop so slow provider APIs don't
# block the gateway. Failures are non-fatal -- account_lines stays [].
account_lines: list[str] = []
if provider:
try:
account_snapshot = await asyncio.to_thread(
fetch_account_usage,
provider,
base_url=base_url,
api_key=api_key,
)
except Exception:
account_snapshot = None
if account_snapshot:
account_lines = render_account_usage_lines(account_snapshot, markdown=True)
if agent and hasattr(agent, "session_total_tokens") and agent.session_api_calls > 0:
lines = []
@@ -6572,10 +6538,6 @@ class GatewayRunner:
if ctx.compression_count:
lines.append(f"Compressions: {ctx.compression_count}")
if account_lines:
lines.append("")
lines.extend(account_lines)
return "\n".join(lines)
# No agent at all -- check session history for a rough count
@@ -6585,18 +6547,12 @@ class GatewayRunner:
from agent.model_metadata import estimate_messages_tokens_rough
msgs = [m for m in history if m.get("role") in ("user", "assistant") and m.get("content")]
approx = estimate_messages_tokens_rough(msgs)
lines = [
"📊 **Session Info**",
f"Messages: {len(msgs)}",
f"Estimated context: ~{approx:,} tokens",
"_(Detailed usage available after the first agent response)_",
]
if account_lines:
lines.append("")
lines.extend(account_lines)
return "\n".join(lines)
if account_lines:
return "\n".join(account_lines)
return (
f"📊 **Session Info**\n"
f"Messages: {len(msgs)}\n"
f"Estimated context: ~{approx:,} tokens\n"
f"_(Detailed usage available after the first agent response)_"
)
return "No usage data available for this session."
async def _handle_insights_command(self, event: MessageEvent) -> str:

View File

@@ -175,79 +175,3 @@ class TestUsageCachedAgent:
result = await runner._handle_usage_command(event)
assert "Cost: included" in result
class TestUsageAccountSection:
    """Checks that the account-limits lines are appended to /usage output."""

    @pytest.mark.asyncio
    async def test_usage_command_includes_account_section(self, monkeypatch):
        """With a cached agent, the reply carries token usage and account limits."""
        cached = _make_mock_agent(provider="openai-codex")
        cached.base_url = "https://chatgpt.com/backend-api/codex"
        cached.api_key = "unused"
        runner = _make_runner(SK, cached_agent=cached)
        event = MagicMock()

        def _stub_fetch(provider, base_url=None, api_key=None):
            # Any non-None object stands in for a fetched snapshot.
            return object()

        def _stub_render(snapshot, markdown=False):
            return [
                "📈 **Account limits**",
                "Provider: openai-codex (Pro)",
                "Session: 85% remaining (15% used)",
            ]

        monkeypatch.setattr("gateway.run.fetch_account_usage", _stub_fetch)
        monkeypatch.setattr("gateway.run.render_account_usage_lines", _stub_render)

        with patch("agent.rate_limit_tracker.format_rate_limit_compact", return_value="RPM: 50/60"):
            with patch("agent.usage_pricing.estimate_usage_cost") as mock_cost:
                mock_cost.return_value = MagicMock(amount_usd=None, status="included")
                result = await runner._handle_usage_command(event)

        assert "📊 **Session Token Usage**" in result
        assert "📈 **Account limits**" in result
        assert "Provider: openai-codex (Pro)" in result

    @pytest.mark.asyncio
    async def test_usage_command_uses_persisted_provider_when_agent_not_running(self, monkeypatch):
        """Without a cached agent, provider/base_url come from the session DB row."""
        runner = _make_runner(SK)
        runner._session_db = MagicMock()
        runner._session_db.get_session.return_value = {
            "billing_provider": "openai-codex",
            "billing_base_url": "https://chatgpt.com/backend-api/codex",
        }
        runner.session_store.get_or_create_session.return_value = MagicMock(session_id="sess-1")
        runner.session_store.load_transcript.return_value = [
            {"role": "user", "content": "earlier"},
        ]

        recorded = {}

        async def _fake_to_thread(fn, *args, **kwargs):
            # Capture what the handler hands to the worker thread, then run inline.
            recorded.update(args=args, kwargs=kwargs)
            return fn(*args, **kwargs)

        def _stub_fetch(provider, base_url=None, api_key=None):
            return object()

        def _stub_render(snapshot, markdown=False):
            return [
                "📈 **Account limits**",
                "Provider: openai-codex (Pro)",
            ]

        monkeypatch.setattr("gateway.run.asyncio.to_thread", _fake_to_thread)
        monkeypatch.setattr("gateway.run.fetch_account_usage", _stub_fetch)
        monkeypatch.setattr("gateway.run.render_account_usage_lines", _stub_render)

        event = MagicMock()
        result = await runner._handle_usage_command(event)

        assert recorded["args"] == ("openai-codex",)
        assert recorded["kwargs"]["base_url"] == "https://chatgpt.com/backend-api/codex"
        assert "📊 **Session Info**" in result
        assert "📈 **Account limits**" in result

View File

@@ -1,203 +0,0 @@
from datetime import datetime, timezone
from agent.account_usage import (
AccountUsageSnapshot,
AccountUsageWindow,
fetch_account_usage,
render_account_usage_lines,
)
class _Response:
    """Minimal stand-in for an HTTP response object used by the fake clients."""

    def __init__(self, payload, status_code=200):
        self.status_code = status_code
        self._payload = payload

    def raise_for_status(self):
        """Raise ``RuntimeError`` for status codes of 400 and above."""
        if self.status_code < 400:
            return
        raise RuntimeError(f"HTTP {self.status_code}")

    def json(self):
        """Return the payload given at construction, unchanged."""
        return self._payload
class _Client:
    """Fake HTTP client: a context manager whose ``get`` always yields one payload."""

    def __init__(self, payload):
        self._payload = payload

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        # Returning False lets any exception from the ``with`` body propagate.
        return False

    def get(self, url, headers=None):
        """Ignore ``url`` and ``headers``; wrap the stored payload in a response."""
        response = _Response(self._payload)
        return response
class _RoutingClient:
    """Fake HTTP client that selects the response payload by requested URL."""

    def __init__(self, payloads):
        self._payloads = payloads

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        # Returning False lets any exception from the ``with`` body propagate.
        return False

    def get(self, url, headers=None):
        """Look up ``url`` in the payload mapping; ``headers`` is ignored."""
        payload = self._payloads[url]
        return _Response(payload)
def test_fetch_account_usage_codex(monkeypatch):
    """A Codex usage payload yields a snapshot with plan, two windows and credits."""
    session_reset_epoch = 1_900_000_000
    usage_payload = {
        "plan_type": "pro",
        "rate_limit": {
            "primary_window": {
                "used_percent": 15,
                "reset_at": session_reset_epoch,
                "limit_window_seconds": 18000,
            },
            "secondary_window": {
                "used_percent": 40,
                "reset_at": 1_900_500_000,
                "limit_window_seconds": 604800,
            },
        },
        "credits": {"has_credits": True, "balance": 12.5},
    }

    def _credentials(refresh_if_expiring=True):
        return {
            "provider": "openai-codex",
            "base_url": "https://chatgpt.com/backend-api/codex",
            "api_key": "***",
        }

    def _tokens():
        return {"tokens": {"account_id": "acct_123"}}

    def _client_factory(timeout=15.0):
        return _Client(usage_payload)

    monkeypatch.setattr("agent.account_usage.resolve_codex_runtime_credentials", _credentials)
    monkeypatch.setattr("agent.account_usage._read_codex_tokens", _tokens)
    monkeypatch.setattr("agent.account_usage.httpx.Client", _client_factory)

    snapshot = fetch_account_usage("openai-codex")

    assert snapshot is not None
    assert snapshot.plan == "Pro"
    assert len(snapshot.windows) == 2
    first_window = snapshot.windows[0]
    assert first_window.label == "Session"
    assert first_window.used_percent == 15.0
    assert first_window.reset_at == datetime.fromtimestamp(session_reset_epoch, tz=timezone.utc)
    assert "Credits balance: $12.50" in snapshot.details
def test_render_account_usage_lines_includes_reset_and_provider():
    """Rendered lines start with the title, then provider/plan, window and details."""
    # Timestamps are taken in the same order as the snapshot's keyword arguments.
    fetched = datetime.now(timezone.utc)
    session_window = AccountUsageWindow(
        label="Session",
        used_percent=25,
        reset_at=datetime.now(timezone.utc),
    )
    snapshot = AccountUsageSnapshot(
        provider="openai-codex",
        source="usage_api",
        fetched_at=fetched,
        plan="Pro",
        windows=(session_window,),
        details=("Credits balance: $9.99",),
    )

    rendered = render_account_usage_lines(snapshot)

    assert rendered[0] == "📈 Account limits"
    assert "openai-codex (Pro)" in rendered[1]
    assert "Session: 75% remaining (25% used)" in rendered[2]
    assert "Credits balance: $9.99" in rendered[3]
def test_fetch_account_usage_openrouter_uses_limit_remaining_and_ignores_deprecated_rate_limit(monkeypatch):
    """Quota window comes from limit/limit_remaining; the rate_limit field is not rendered."""
    responses_by_url = {
        "https://openrouter.ai/api/v1/credits": {
            "data": {"total_credits": 300.0, "total_usage": 10.92}
        },
        "https://openrouter.ai/api/v1/key": {
            "data": {
                "limit": 100.0,
                "limit_remaining": 70.0,
                "limit_reset": "monthly",
                "usage": 12.5,
                "usage_daily": 0.5,
                "usage_weekly": 2.0,
                "usage_monthly": 8.0,
                "rate_limit": {"requests": -1, "interval": "10s"},
            }
        },
    }

    def _runtime(requested, explicit_base_url=None, explicit_api_key=None):
        return {
            "provider": "openrouter",
            "base_url": "https://openrouter.ai/api/v1",
            "api_key": "***",
        }

    def _client_factory(timeout=10.0):
        return _RoutingClient(responses_by_url)

    monkeypatch.setattr("agent.account_usage.resolve_runtime_provider", _runtime)
    monkeypatch.setattr("agent.account_usage.httpx.Client", _client_factory)

    snapshot = fetch_account_usage("openrouter")

    assert snapshot is not None
    expected_windows = (
        AccountUsageWindow(
            label="API key quota",
            used_percent=30.0,
            detail="$70.00 of $100.00 remaining • resets monthly",
        ),
    )
    assert snapshot.windows == expected_windows
    assert "Credits balance: $289.08" in snapshot.details
    assert "API key usage: $12.50 total • $0.50 today • $2.00 this week • $8.00 this month" in snapshot.details
    assert not any("-1 requests / 10s" in line for line in render_account_usage_lines(snapshot))
def test_fetch_account_usage_openrouter_omits_quota_window_when_key_has_no_limit(monkeypatch):
    """A key with no limit yields no quota window but still reports credits and usage."""
    responses_by_url = {
        "https://openrouter.ai/api/v1/credits": {
            "data": {"total_credits": 100.0, "total_usage": 25.5}
        },
        "https://openrouter.ai/api/v1/key": {
            "data": {
                "limit": None,
                "limit_remaining": None,
                "usage": 25.5,
                "usage_daily": 1.25,
                "usage_weekly": 4.5,
                "usage_monthly": 18.0,
            }
        },
    }

    def _runtime(requested, explicit_base_url=None, explicit_api_key=None):
        return {
            "provider": "openrouter",
            "base_url": "https://openrouter.ai/api/v1",
            "api_key": "***",
        }

    def _client_factory(timeout=10.0):
        return _RoutingClient(responses_by_url)

    monkeypatch.setattr("agent.account_usage.resolve_runtime_provider", _runtime)
    monkeypatch.setattr("agent.account_usage.httpx.Client", _client_factory)

    snapshot = fetch_account_usage("openrouter")

    assert snapshot is not None
    assert snapshot.windows == ()
    assert "Credits balance: $74.50" in snapshot.details
    assert "API key usage: $25.50 total • $1.25 today • $4.50 this week • $18.00 this month" in snapshot.details

View File

@@ -0,0 +1,115 @@
"""Tests for Issue #796 tool-calling benchmark coverage and reporting."""
import sys
from pathlib import Path
from types import SimpleNamespace
from unittest.mock import patch
sys.path.insert(0, str(Path(__file__).parent.parent / "benchmarks"))
from tool_call_benchmark import ( # noqa: E402
CallResult,
DEFAULT_COMPARE_MODELS,
ISSUE_796_CATEGORY_COUNTS,
ToolCall,
generate_report,
run_single_test,
suite_category_counts,
)
def test_suite_counts_match_issue_796_distribution():
    """The suite's per-category counts equal the issue #796 targets and total 100."""
    actual = suite_category_counts()
    assert actual == ISSUE_796_CATEGORY_COUNTS
    total_calls = sum(actual.values())
    assert total_calls == 100
def test_default_compare_models_cover_issue_796_lanes():
    """The default comparison set has three specs, one per expected model lane."""
    assert len(DEFAULT_COMPARE_MODELS) == 3
    for fragment in ("gemma-4-31b", "gemma-4-26b", "mimo-v2-pro"):
        assert any(fragment in spec for spec in DEFAULT_COMPARE_MODELS)
def test_generate_report_includes_parallel_and_cost_metrics(tmp_path):
    """The report file is written and contains the parallel, cost, skip and mix sections."""
    passing_call = CallResult(
        test_id="file-01",
        category="file",
        model="gemma-4-31b",
        prompt="Read the file.",
        expected_tool="read_file",
        success=True,
        tool_called="read_file",
        schema_ok=True,
        tool_args_valid=True,
        execution_ok=True,
        tool_count=2,
        parallel_ok=True,
        latency_s=1.25,
        total_tokens=123,
        estimated_cost_usd=0.0012,
        cost_status="estimated",
    )
    # Second case: marked skipped, with no cost estimate available.
    skipped_call = CallResult(
        test_id="web-01",
        category="web",
        model="mimo-v2-pro",
        prompt="Search the web.",
        expected_tool="web_search",
        success=False,
        tool_called="web_search",
        schema_ok=True,
        tool_args_valid=False,
        execution_ok=False,
        tool_count=1,
        parallel_ok=False,
        latency_s=2.5,
        error="bad args",
        total_tokens=456,
        estimated_cost_usd=None,
        cost_status="unknown",
        skipped=True,
        skip_reason="web_search unavailable",
    )
    output_path = tmp_path / "report.md"

    report = generate_report(
        [passing_call, skipped_call],
        ["gemma-4-31b", "mimo-v2-pro"],
        output_path,
    )

    assert output_path.exists()
    for heading in (
        "Parallel tool success",
        "Avg token cost per call (USD)",
        "Skipped / unavailable",
        "Requested category mix",
    ):
        assert heading in report
def test_run_single_test_skips_when_expected_tool_unavailable():
    """A case whose tool prefix matches no available tool is skipped, not run."""

    class FakeAgent:
        """Agent stub exposing only file/terminal tools and zeroed session counters."""

        def __init__(self, *args, **kwargs):
            self.valid_tool_names = {"read_file", "terminal"}
            for counter in (
                "session_input_tokens",
                "session_output_tokens",
                "session_cache_read_tokens",
                "session_cache_write_tokens",
                "session_api_calls",
            ):
                setattr(self, counter, 0)
            self.base_url = ""
            self.api_key = None

        def run_conversation(self, *args, **kwargs):
            raise AssertionError("run_conversation should not be called for unavailable tools")

    mcp_case = ToolCall(
        id="mcp-01",
        category="mcp",
        prompt="Use an MCP tool to list resources.",
        expected_tool="",
        expected_tool_prefix="mcp_",
    )
    # Substitute the ``run_agent`` module so the benchmark builds FakeAgent.
    fake_modules = {"run_agent": SimpleNamespace(AIAgent=FakeAgent)}

    with patch.dict(sys.modules, fake_modules):
        result = run_single_test(mcp_case, "gemini:gemma-4-31b-it", "gemini")

    assert result.skipped is True
    assert "mcp_" in result.skip_reason
    assert result.success is False