Files
hermes-agent/docs/evaluations/tensorzero-860-evaluation.json
Alexander Whitestone 755e7513a1
All checks were successful
Lint / lint (pull_request) Successful in 37s
docs: add grounded tensorzero evaluation packet (#860)
- add a script that inventories Hermes routing/evaluation surfaces relevant to a TensorZero cutover
- generate a markdown and JSON evaluation packet for issue #860
- score gateway replacement, config migration, canary rollout, session feedback, and eval-suite readiness
- add focused regression tests for touchpoint scanning, requirement scoring, and report rendering

Refs #860
2026-04-22 11:33:31 -04:00

1131 lines
42 KiB
JSON

{
"issue_number": 860,
"issue_title": "tensorzero LLMOps platform evaluation",
"issue_url": "https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/860",
"recommendation": "Not ready for direct replacement. Recommend a shadow-evaluation phase first: keep Hermes routing live, inventory the migration seams, export SessionDB/trajectory data into an offline TensorZero experiment loop, and only design a canary gateway once percentage-based rollout controls exist.",
"touchpoints": [
{
"label": "fallback_chain",
"file_path": "run_agent.py",
"line_number": 601,
"matched_text": "fallback_model: Dict[str, Any] = None,"
},
{
"label": "fallback_chain",
"file_path": "run_agent.py",
"line_number": 995,
"matched_text": "# failure). Supports both legacy single-dict ``fallback_model`` and"
},
{
"label": "fallback_chain",
"file_path": "run_agent.py",
"line_number": 996,
"matched_text": "# new list ``fallback_providers`` format."
},
{
"label": "fallback_chain",
"file_path": "run_agent.py",
"line_number": 997,
"matched_text": "if isinstance(fallback_model, list):"
},
{
"label": "fallback_chain",
"file_path": "run_agent.py",
"line_number": 998,
"matched_text": "self._fallback_chain = ["
},
{
"label": "fallback_chain",
"file_path": "run_agent.py",
"line_number": 999,
"matched_text": "f for f in fallback_model"
},
{
"label": "fallback_chain",
"file_path": "run_agent.py",
"line_number": 1002,
"matched_text": "elif isinstance(fallback_model, dict) and fallback_model.get(\"provider\") and fallback_model.get(\"model\"):"
},
{
"label": "fallback_chain",
"file_path": "run_agent.py",
"line_number": 1003,
"matched_text": "self._fallback_chain = [fallback_model]"
},
{
"label": "fallback_chain",
"file_path": "run_agent.py",
"line_number": 1005,
"matched_text": "self._fallback_chain = []"
},
{
"label": "fallback_chain",
"file_path": "run_agent.py",
"line_number": 1009,
"matched_text": "self._fallback_model = self._fallback_chain[0] if self._fallback_chain else None"
},
{
"label": "fallback_chain",
"file_path": "run_agent.py",
"line_number": 1010,
"matched_text": "if self._fallback_chain and not self.quiet_mode:"
},
{
"label": "fallback_chain",
"file_path": "run_agent.py",
"line_number": 1011,
"matched_text": "if len(self._fallback_chain) == 1:"
},
{
"label": "fallback_chain",
"file_path": "run_agent.py",
"line_number": 1012,
"matched_text": "fb = self._fallback_chain[0]"
},
{
"label": "fallback_chain",
"file_path": "run_agent.py",
"line_number": 1015,
"matched_text": "print(f\"\ud83d\udd04 Fallback chain ({len(self._fallback_chain)} providers): \" +"
},
{
"label": "fallback_chain",
"file_path": "run_agent.py",
"line_number": 1016,
"matched_text": "\" \u2192 \".join(f\"{f['model']} ({f['provider']})\" for f in self._fallback_chain))"
},
{
"label": "fallback_chain",
"file_path": "run_agent.py",
"line_number": 5624,
"matched_text": "if self._fallback_index >= len(self._fallback_chain):"
},
{
"label": "fallback_chain",
"file_path": "run_agent.py",
"line_number": 5627,
"matched_text": "fb = self._fallback_chain[self._fallback_index]"
},
{
"label": "fallback_chain",
"file_path": "run_agent.py",
"line_number": 8559,
"matched_text": "if self._fallback_index < len(self._fallback_chain):"
},
{
"label": "fallback_chain",
"file_path": "run_agent.py",
"line_number": 9355,
"matched_text": "if is_rate_limited and self._fallback_index < len(self._fallback_chain):"
},
{
"label": "fallback_chain",
"file_path": "run_agent.py",
"line_number": 10460,
"matched_text": "if _truly_empty and self._fallback_chain:"
},
{
"label": "fallback_chain",
"file_path": "run_agent.py",
"line_number": 10514,
"matched_text": "+ (\" and fallback attempts.\" if self._fallback_chain else"
},
{
"label": "provider_routing_config",
"file_path": "cli.py",
"line_number": 241,
"matched_text": "\"smart_model_routing\": {"
},
{
"label": "provider_routing_config",
"file_path": "cli.py",
"line_number": 370,
"matched_text": "# (e.g. platform_toolsets, provider_routing, memory, honcho, etc.)"
},
{
"label": "provider_routing_config",
"file_path": "cli.py",
"line_number": 1753,
"matched_text": "pr = CLI_CONFIG.get(\"provider_routing\", {}) or {}"
},
{
"label": "provider_routing_config",
"file_path": "cli.py",
"line_number": 1762,
"matched_text": "# Supports new list format (fallback_providers) and legacy single-dict (fallback_model)."
},
{
"label": "provider_routing_config",
"file_path": "cli.py",
"line_number": 1763,
"matched_text": "fb = CLI_CONFIG.get(\"fallback_providers\") or CLI_CONFIG.get(\"fallback_model\") or []"
},
{
"label": "provider_routing_config",
"file_path": "cli.py",
"line_number": 1770,
"matched_text": "self._smart_model_routing = CLI_CONFIG.get(\"smart_model_routing\", {}) or {}"
},
{
"label": "provider_routing_config",
"file_path": "cli.py",
"line_number": 2771,
"matched_text": "from agent.smart_model_routing import resolve_turn_route"
},
{
"label": "provider_routing_config",
"file_path": "cli.py",
"line_number": 2776,
"matched_text": "self._smart_model_routing,"
},
{
"label": "runtime_provider",
"file_path": "hermes_cli/runtime_provider.py",
"line_number": 209,
"matched_text": "def resolve_requested_provider(requested: Optional[str] = None) -> str:"
},
{
"label": "runtime_provider",
"file_path": "hermes_cli/runtime_provider.py",
"line_number": 649,
"matched_text": "def resolve_runtime_provider("
},
{
"label": "smart_model_routing",
"file_path": "agent/smart_model_routing.py",
"line_number": 62,
"matched_text": "def choose_cheap_model_route(user_message: str, routing_config: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:"
},
{
"label": "smart_model_routing",
"file_path": "agent/smart_model_routing.py",
"line_number": 110,
"matched_text": "def resolve_turn_route(user_message: str, routing_config: Optional[Dict[str, Any]], primary: Dict[str, Any]) -> Dict[str, Any]:"
},
{
"label": "gateway_provider_routing",
"file_path": "gateway/run.py",
"line_number": 1271,
"matched_text": "def _load_provider_routing() -> dict:"
},
{
"label": "gateway_provider_routing",
"file_path": "gateway/run.py",
"line_number": 1285,
"matched_text": "def _load_fallback_model() -> list | dict | None:"
},
{
"label": "gateway_provider_routing",
"file_path": "gateway/run.py",
"line_number": 1306,
"matched_text": "def _load_smart_model_routing() -> dict:"
},
{
"label": "cron_runtime_provider",
"file_path": "cron/scheduler.py",
"line_number": 684,
"matched_text": "pr = _cfg.get(\"provider_routing\", {})"
},
{
"label": "cron_runtime_provider",
"file_path": "cron/scheduler.py",
"line_number": 688,
"matched_text": "resolve_runtime_provider,"
},
{
"label": "cron_runtime_provider",
"file_path": "cron/scheduler.py",
"line_number": 697,
"matched_text": "runtime = resolve_runtime_provider(**runtime_kwargs)"
},
{
"label": "cron_runtime_provider",
"file_path": "cron/scheduler.py",
"line_number": 702,
"matched_text": "from agent.smart_model_routing import resolve_turn_route"
},
{
"label": "cron_runtime_provider",
"file_path": "cron/scheduler.py",
"line_number": 703,
"matched_text": "turn_route = resolve_turn_route("
},
{
"label": "cron_runtime_provider",
"file_path": "cron/scheduler.py",
"line_number": 717,
"matched_text": "fallback_model = _cfg.get(\"fallback_providers\") or _cfg.get(\"fallback_model\") or None"
},
{
"label": "cron_runtime_provider",
"file_path": "cron/scheduler.py",
"line_number": 746,
"matched_text": "fallback_model=fallback_model,"
},
{
"label": "auxiliary_fallback_chain",
"file_path": "agent/auxiliary_client.py",
"line_number": 1018,
"matched_text": "def _get_provider_chain() -> List[tuple]:"
},
{
"label": "auxiliary_fallback_chain",
"file_path": "agent/auxiliary_client.py",
"line_number": 1107,
"matched_text": "for label, try_fn in _get_provider_chain():"
},
{
"label": "auxiliary_fallback_chain",
"file_path": "agent/auxiliary_client.py",
"line_number": 1189,
"matched_text": "# \u2500\u2500 Step 2: aggregator / fallback chain \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500"
},
{
"label": "auxiliary_fallback_chain",
"file_path": "agent/auxiliary_client.py",
"line_number": 1191,
"matched_text": "for label, try_fn in _get_provider_chain():"
},
{
"label": "auxiliary_fallback_chain",
"file_path": "agent/auxiliary_client.py",
"line_number": 2397,
"matched_text": "# error, fall through to the fallback chain below."
},
{
"label": "auxiliary_fallback_chain",
"file_path": "agent/auxiliary_client.py",
"line_number": 2417,
"matched_text": "# auto (the default) = best-effort fallback chain. (#7559)"
},
{
"label": "auxiliary_fallback_chain",
"file_path": "agent/auxiliary_client.py",
"line_number": 2589,
"matched_text": "# error, fall through to the fallback chain below."
},
{
"label": "delegate_runtime_provider",
"file_path": "tools/delegate_tool.py",
"line_number": 662,
"matched_text": "# bundle (base_url, api_key, api_mode) via the same runtime provider system"
},
{
"label": "delegate_runtime_provider",
"file_path": "tools/delegate_tool.py",
"line_number": 854,
"matched_text": "provider) is resolved via the runtime provider system \u2014 the same path used"
},
{
"label": "delegate_runtime_provider",
"file_path": "tools/delegate_tool.py",
"line_number": 909,
"matched_text": "from hermes_cli.runtime_provider import resolve_runtime_provider"
},
{
"label": "delegate_runtime_provider",
"file_path": "tools/delegate_tool.py",
"line_number": 910,
"matched_text": "runtime = resolve_runtime_provider(requested=configured_provider)"
},
{
"label": "session_db",
"file_path": "hermes_state.py",
"line_number": 115,
"matched_text": "class SessionDB:"
},
{
"label": "trajectory_export",
"file_path": "batch_runner.py",
"line_number": 320,
"matched_text": "save_trajectories=False, # We handle saving ourselves"
},
{
"label": "trajectory_export",
"file_path": "batch_runner.py",
"line_number": 346,
"matched_text": "trajectory = agent._convert_to_trajectory_format("
},
{
"label": "trajectory_export",
"file_path": "batch_runner.py",
"line_number": 460,
"matched_text": "trajectory_entry = {"
},
{
"label": "trajectory_export",
"file_path": "batch_runner.py",
"line_number": 474,
"matched_text": "f.write(json.dumps(trajectory_entry, ensure_ascii=False) + \"\\n\")"
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 3,
"matched_text": "Tool-Calling Benchmark \u2014 Gemma 4 vs mimo-v2-pro regression test."
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 9,
"matched_text": "python3 benchmarks/tool_call_benchmark.py # full 100-call suite"
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 10,
"matched_text": "python3 benchmarks/tool_call_benchmark.py --limit 10 # quick smoke test"
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 11,
"matched_text": "python3 benchmarks/tool_call_benchmark.py --models nous # single model"
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 12,
"matched_text": "python3 benchmarks/tool_call_benchmark.py --category file # single category"
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 37,
"matched_text": "class ToolCall:"
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 51,
"matched_text": "ToolCall(\"file-01\", \"file\", \"Read the file /tmp/test_bench.txt and show me its contents.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 53,
"matched_text": "ToolCall(\"file-02\", \"file\", \"Write 'hello benchmark' to /tmp/test_bench_out.txt\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 55,
"matched_text": "ToolCall(\"file-03\", \"file\", \"Search for the word 'import' in all Python files in the current directory.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 57,
"matched_text": "ToolCall(\"file-04\", \"file\", \"Read lines 1-20 of /etc/hosts\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 59,
"matched_text": "ToolCall(\"file-05\", \"file\", \"Patch /tmp/test_bench_out.txt: replace 'hello' with 'goodbye'\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 61,
"matched_text": "ToolCall(\"file-06\", \"file\", \"Search for files matching *.py in the current directory.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 63,
"matched_text": "ToolCall(\"file-07\", \"file\", \"Read the first 10 lines of /etc/passwd\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 65,
"matched_text": "ToolCall(\"file-08\", \"file\", \"Write a JSON config to /tmp/bench_config.json with key 'debug': true\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 67,
"matched_text": "ToolCall(\"file-09\", \"file\", \"Search for 'def test_' in Python test files.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 69,
"matched_text": "ToolCall(\"file-10\", \"file\", \"Read /tmp/bench_config.json and tell me what's in it.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 71,
"matched_text": "ToolCall(\"file-11\", \"file\", \"Create a file /tmp/bench_readme.md with one line: '# Benchmark'\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 73,
"matched_text": "ToolCall(\"file-12\", \"file\", \"Search for 'TODO' comments in all .py files.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 75,
"matched_text": "ToolCall(\"file-13\", \"file\", \"Read /tmp/bench_readme.md\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 77,
"matched_text": "ToolCall(\"file-14\", \"file\", \"Patch /tmp/bench_readme.md: replace '# Benchmark' with '# Tool Benchmark'\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 78,
"matched_text": "\"patch\", \"Tool Benchmark\"),"
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 79,
"matched_text": "ToolCall(\"file-15\", \"file\", \"Write a Python one-liner to /tmp/bench_hello.py that prints hello.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 81,
"matched_text": "ToolCall(\"file-16\", \"file\", \"Search for all .json files in /tmp/.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 83,
"matched_text": "ToolCall(\"file-17\", \"file\", \"Read /tmp/bench_hello.py and verify it has print('hello').\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 85,
"matched_text": "ToolCall(\"file-18\", \"file\", \"Patch /tmp/bench_hello.py to print 'hello world' instead of 'hello'.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 87,
"matched_text": "ToolCall(\"file-19\", \"file\", \"List files matching 'bench*' in /tmp/.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 89,
"matched_text": "ToolCall(\"file-20\", \"file\", \"Read /tmp/test_bench.txt again and summarize its contents.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 93,
"matched_text": "ToolCall(\"term-01\", \"terminal\", \"Run `echo hello world` in the terminal.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 95,
"matched_text": "ToolCall(\"term-02\", \"terminal\", \"Run `date` to get the current date and time.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 97,
"matched_text": "ToolCall(\"term-03\", \"terminal\", \"Run `uname -a` to get system information.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 99,
"matched_text": "ToolCall(\"term-04\", \"terminal\", \"Run `pwd` to show the current directory.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 101,
"matched_text": "ToolCall(\"term-05\", \"terminal\", \"Run `ls -la /tmp/ | head -20` to list temp files.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 103,
"matched_text": "ToolCall(\"term-06\", \"terminal\", \"Run `whoami` to show the current user.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 105,
"matched_text": "ToolCall(\"term-07\", \"terminal\", \"Run `df -h` to show disk usage.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 107,
"matched_text": "ToolCall(\"term-08\", \"terminal\", \"Run `python3 --version` to check Python version.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 109,
"matched_text": "ToolCall(\"term-09\", \"terminal\", \"Run `cat /etc/hostname` to get the hostname.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 111,
"matched_text": "ToolCall(\"term-10\", \"terminal\", \"Run `uptime` to see system uptime.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 113,
"matched_text": "ToolCall(\"term-11\", \"terminal\", \"Run `env | grep PATH` to show the PATH variable.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 115,
"matched_text": "ToolCall(\"term-12\", \"terminal\", \"Run `wc -l /etc/passwd` to count lines.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 117,
"matched_text": "ToolCall(\"term-13\", \"terminal\", \"Run `echo $SHELL` to show the current shell.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 119,
"matched_text": "ToolCall(\"term-14\", \"terminal\", \"Run `free -h || vm_stat` to check memory usage.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 121,
"matched_text": "ToolCall(\"term-15\", \"terminal\", \"Run `id` to show user and group IDs.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 123,
"matched_text": "ToolCall(\"term-16\", \"terminal\", \"Run `hostname` to get the machine hostname.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 125,
"matched_text": "ToolCall(\"term-17\", \"terminal\", \"Run `echo {1..5}` to test brace expansion.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 127,
"matched_text": "ToolCall(\"term-18\", \"terminal\", \"Run `seq 1 5` to generate a number sequence.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 129,
"matched_text": "ToolCall(\"term-19\", \"terminal\", \"Run `python3 -c 'print(2+2)'` to compute 2+2.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 131,
"matched_text": "ToolCall(\"term-20\", \"terminal\", \"Run `ls -d /tmp/bench* 2>/dev/null | wc -l` to count bench files.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 135,
"matched_text": "ToolCall(\"code-01\", \"code\", \"Execute a Python script that computes factorial of 10.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 137,
"matched_text": "ToolCall(\"code-02\", \"code\", \"Run Python to read /tmp/test_bench.txt and count its words.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 139,
"matched_text": "ToolCall(\"code-03\", \"code\", \"Execute Python to generate the first 20 Fibonacci numbers.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 141,
"matched_text": "ToolCall(\"code-04\", \"code\", \"Run Python to parse JSON from a string and print keys.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 143,
"matched_text": "ToolCall(\"code-05\", \"code\", \"Execute Python to list all files in /tmp/ matching 'bench*'.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 145,
"matched_text": "ToolCall(\"code-06\", \"code\", \"Run Python to compute the sum of squares from 1 to 100.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 147,
"matched_text": "ToolCall(\"code-07\", \"code\", \"Execute Python to check if 'racecar' is a palindrome.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 149,
"matched_text": "ToolCall(\"code-08\", \"code\", \"Run Python to create a CSV string with 5 rows of sample data.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 151,
"matched_text": "ToolCall(\"code-09\", \"code\", \"Execute Python to sort a list [5,2,8,1,9] and print the result.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 153,
"matched_text": "ToolCall(\"code-10\", \"code\", \"Run Python to count lines in /etc/passwd.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 155,
"matched_text": "ToolCall(\"code-11\", \"code\", \"Execute Python to hash the string 'benchmark' with SHA256.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 157,
"matched_text": "ToolCall(\"code-12\", \"code\", \"Run Python to get the current UTC timestamp.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 159,
"matched_text": "ToolCall(\"code-13\", \"code\", \"Execute Python to convert 'hello world' to uppercase and reverse it.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 161,
"matched_text": "ToolCall(\"code-14\", \"code\", \"Run Python to create a dictionary of system info (platform, python version).\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 163,
"matched_text": "ToolCall(\"code-15\", \"code\", \"Execute Python to check internet connectivity by resolving google.com.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 167,
"matched_text": "ToolCall(\"deleg-01\", \"delegate\", \"Use a subagent to find all .log files in /tmp/.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 169,
"matched_text": "ToolCall(\"deleg-02\", \"delegate\", \"Delegate to a subagent: what is 15 * 37?\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 171,
"matched_text": "ToolCall(\"deleg-03\", \"delegate\", \"Use a subagent to check if Python 3 is installed and its version.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 173,
"matched_text": "ToolCall(\"deleg-04\", \"delegate\", \"Delegate: read /tmp/test_bench.txt and summarize it in one sentence.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 175,
"matched_text": "ToolCall(\"deleg-05\", \"delegate\", \"Use a subagent to list the contents of /tmp/ directory.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 177,
"matched_text": "ToolCall(\"deleg-06\", \"delegate\", \"Delegate: count the number of .py files in the current directory.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 179,
"matched_text": "ToolCall(\"deleg-07\", \"delegate\", \"Use a subagent to check disk space with df -h.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 181,
"matched_text": "ToolCall(\"deleg-08\", \"delegate\", \"Delegate: what OS are we running on?\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 183,
"matched_text": "ToolCall(\"deleg-09\", \"delegate\", \"Use a subagent to find the hostname of this machine.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 185,
"matched_text": "ToolCall(\"deleg-10\", \"delegate\", \"Delegate: create a temp file /tmp/bench_deleg.txt with 'done'.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 189,
"matched_text": "ToolCall(\"todo-01\", \"todo\", \"Add a todo item: 'Run benchmark suite'\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 190,
"matched_text": "\"todo\", \"benchmark\"),"
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 191,
"matched_text": "ToolCall(\"todo-02\", \"todo\", \"Show me the current todo list.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 193,
"matched_text": "ToolCall(\"todo-03\", \"todo\", \"Mark the first todo item as completed.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 195,
"matched_text": "ToolCall(\"todo-04\", \"todo\", \"Add a todo: 'Review benchmark results' with status pending.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 197,
"matched_text": "ToolCall(\"todo-05\", \"todo\", \"Clear all completed todos.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 199,
"matched_text": "ToolCall(\"todo-06\", \"memory\", \"Save this to memory: 'benchmark ran on {date}'\".format("
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 201,
"matched_text": "\"memory\", \"benchmark\"),"
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 202,
"matched_text": "ToolCall(\"todo-07\", \"memory\", \"Search memory for 'benchmark'.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 203,
"matched_text": "\"memory\", \"benchmark\"),"
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 204,
"matched_text": "ToolCall(\"todo-08\", \"memory\", \"Add a memory note: 'test models are gemma-4 and mimo-v2-pro'.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 206,
"matched_text": "ToolCall(\"todo-09\", \"todo\", \"Add three todo items: 'analyze', 'report', 'cleanup'.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 208,
"matched_text": "ToolCall(\"todo-10\", \"memory\", \"Search memory for any notes about models.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 212,
"matched_text": "ToolCall(\"skill-01\", \"skills\", \"List all available skills.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 214,
"matched_text": "ToolCall(\"skill-02\", \"skills\", \"View the skill called 'test-driven-development'.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 216,
"matched_text": "ToolCall(\"skill-03\", \"skills\", \"Search for skills related to 'git'.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 218,
"matched_text": "ToolCall(\"skill-04\", \"skills\", \"View the 'code-review' skill.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 220,
"matched_text": "ToolCall(\"skill-05\", \"skills\", \"List all skills in the 'devops' category.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 222,
"matched_text": "ToolCall(\"skill-06\", \"skills\", \"View the 'systematic-debugging' skill.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 224,
"matched_text": "ToolCall(\"skill-07\", \"skills\", \"Search for skills about 'testing'.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 226,
"matched_text": "ToolCall(\"skill-08\", \"skills\", \"View the 'writing-plans' skill.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 228,
"matched_text": "ToolCall(\"skill-09\", \"skills\", \"List skills in 'software-development' category.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 230,
"matched_text": "ToolCall(\"skill-10\", \"skills\", \"View the 'pr-review-discipline' skill.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 234,
"matched_text": "ToolCall(\"file-21\", \"file\", \"Write a Python snippet to /tmp/bench_sort.py that sorts [3,1,2].\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 236,
"matched_text": "ToolCall(\"file-22\", \"file\", \"Read /tmp/bench_sort.py back and confirm it exists.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 238,
"matched_text": "ToolCall(\"file-23\", \"file\", \"Search for 'class' in all .py files in the benchmarks directory.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 240,
"matched_text": "ToolCall(\"term-21\", \"terminal\", \"Run `cat /etc/os-release 2>/dev/null || sw_vers 2>/dev/null` for OS info.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 242,
"matched_text": "ToolCall(\"term-22\", \"terminal\", \"Run `nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null` for CPU count.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 244,
"matched_text": "ToolCall(\"code-16\", \"code\", \"Execute Python to flatten a nested list [[1,2],[3,4],[5]].\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 246,
"matched_text": "ToolCall(\"code-17\", \"code\", \"Run Python to check if a number 17 is prime.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 248,
"matched_text": "ToolCall(\"deleg-11\", \"delegate\", \"Delegate: what is the current working directory?\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 250,
"matched_text": "ToolCall(\"todo-11\", \"todo\", \"Add a todo: 'Finalize benchmark report' status pending.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 252,
"matched_text": "ToolCall(\"todo-12\", \"memory\", \"Store fact: 'benchmark categories: file, terminal, code, delegate, todo, memory, skills'.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 254,
"matched_text": "ToolCall(\"skill-11\", \"skills\", \"Search for skills about 'deployment'.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 256,
"matched_text": "ToolCall(\"skill-12\", \"skills\", \"View the 'gitea-burn-cycle' skill.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 258,
"matched_text": "ToolCall(\"skill-13\", \"skills\", \"List all available skill categories.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 260,
"matched_text": "ToolCall(\"skill-14\", \"skills\", \"Search for skills related to 'memory'.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 262,
"matched_text": "ToolCall(\"skill-15\", \"skills\", \"View the 'mimo-swarm' skill.\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 311,
"matched_text": "\"\"\"Create prerequisite files for the benchmark.\"\"\""
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 313,
"matched_text": "\"This is a benchmark test file.\\n\""
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 349,
"matched_text": "\"You are a benchmark test runner. Execute the user's request by calling \""
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 406,
"matched_text": "\"\"\"Generate markdown benchmark report.\"\"\""
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 428,
"matched_text": "f\"# Tool-Calling Benchmark Report\","
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 535,
"matched_text": "parser = argparse.ArgumentParser(description=\"Tool-calling benchmark\")"
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 544,
"matched_text": "help=\"Output report path (default: benchmarks/gemma4-tool-calling-YYYY-MM-DD.md)\")"
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 565,
"matched_text": "output_path = Path(args.output) if args.output else REPO_ROOT / \"benchmarks\" / f\"gemma4-tool-calling-{date_str}.md\""
},
{
"label": "benchmark_suite",
"file_path": "benchmarks/tool_call_benchmark.py",
"line_number": 575,
"matched_text": "print(f\"Benchmark: {len(suite)} tests \u00d7 {len(model_specs)} models = {len(suite) * len(model_specs)} calls\")"
}
],
"requirements": [
{
"key": "gateway_replacement",
"name": "Gateway replacement scope",
"status": "partial",
"evidence_labels": [
"fallback_chain",
"runtime_provider",
"gateway_provider_routing",
"cron_runtime_provider",
"auxiliary_fallback_chain",
"delegate_runtime_provider"
],
"summary": "Hermes already spreads provider routing across core agent, runtime provider, gateway, cron, auxiliary, and delegation seams; TensorZero would need parity across all of them before it can replace the gateway layer."
},
{
"key": "config_migration",
"name": "Config migration",
"status": "partial",
"evidence_labels": [
"provider_routing_config",
"runtime_provider",
"smart_model_routing",
"fallback_chain"
],
"summary": "Hermes has multiple config concepts to migrate (`provider_routing`, `fallback_providers`, `smart_model_routing`, runtime provider resolution), so TensorZero is not a drop-in config swap."
},
{
"key": "canary_rollout",
"name": "10% traffic canary",
"status": "gap",
"evidence_labels": [],
"summary": "The repo shows semantic routing and fallback, but no grounded 10% traffic-split canary mechanism. A TensorZero cutover would need new percentage-based rollout controls and observability hooks."
},
{
"key": "session_feedback",
"name": "Session data for prompt optimization",
"status": "partial",
"evidence_labels": [
"session_db",
"trajectory_export"
],
      "summary": "Hermes already has SessionDB and trajectory export surfaces that can feed offline optimization data, but there is no TensorZero-native ingestion path yet."
},
{
"key": "evaluation_suite",
"name": "Evaluation suite / A/B testing",
"status": "partial",
"evidence_labels": [
"benchmark_suite",
"trajectory_export"
],
"summary": "Hermes already has benchmark/trajectory machinery that can seed TensorZero A/B evaluation, but no integrated TensorZero experiment runner or live evaluation gateway."
}
]
}