hermes-agent/docs/evaluations/tensorzero-860-evaluation.json

{
  "issue_number": 860,
  "issue_title": "tensorzero LLMOps platform evaluation",
  "issue_url": "https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/860",
  "recommendation": "Not ready for direct replacement. Recommend a shadow-evaluation phase first: keep Hermes routing live, inventory the migration seams, export SessionDB/trajectory data into an offline TensorZero experiment loop, and only design a canary gateway once percentage-based rollout controls exist.",
  "touchpoints": [
    {
      "label": "fallback_chain",
      "file_path": "run_agent.py",
      "line_number": 601,
      "matched_text": "fallback_model: Dict[str, Any] = None,"
    },
    {
      "label": "fallback_chain",
      "file_path": "run_agent.py",
      "line_number": 995,
      "matched_text": "# failure).  Supports both legacy single-dict ``fallback_model`` and"
    },
    {
      "label": "fallback_chain",
      "file_path": "run_agent.py",
      "line_number": 996,
      "matched_text": "# new list ``fallback_providers`` format."
    },
    {
      "label": "fallback_chain",
      "file_path": "run_agent.py",
      "line_number": 997,
      "matched_text": "if isinstance(fallback_model, list):"
    },
    {
      "label": "fallback_chain",
      "file_path": "run_agent.py",
      "line_number": 998,
      "matched_text": "self._fallback_chain = ["
    },
    {
      "label": "fallback_chain",
      "file_path": "run_agent.py",
      "line_number": 999,
      "matched_text": "f for f in fallback_model"
    },
    {
      "label": "fallback_chain",
      "file_path": "run_agent.py",
      "line_number": 1002,
      "matched_text": "elif isinstance(fallback_model, dict) and fallback_model.get(\"provider\") and fallback_model.get(\"model\"):"
    },
    {
      "label": "fallback_chain",
      "file_path": "run_agent.py",
      "line_number": 1003,
      "matched_text": "self._fallback_chain = [fallback_model]"
    },
    {
      "label": "fallback_chain",
      "file_path": "run_agent.py",
      "line_number": 1005,
      "matched_text": "self._fallback_chain = []"
    },
    {
      "label": "fallback_chain",
      "file_path": "run_agent.py",
      "line_number": 1009,
      "matched_text": "self._fallback_model = self._fallback_chain[0] if self._fallback_chain else None"
    },
    {
      "label": "fallback_chain",
      "file_path": "run_agent.py",
      "line_number": 1010,
      "matched_text": "if self._fallback_chain and not self.quiet_mode:"
    },
    {
      "label": "fallback_chain",
      "file_path": "run_agent.py",
      "line_number": 1011,
      "matched_text": "if len(self._fallback_chain) == 1:"
    },
    {
      "label": "fallback_chain",
      "file_path": "run_agent.py",
      "line_number": 1012,
      "matched_text": "fb = self._fallback_chain[0]"
    },
    {
      "label": "fallback_chain",
      "file_path": "run_agent.py",
      "line_number": 1015,
      "matched_text": "print(f\"\ud83d\udd04 Fallback chain ({len(self._fallback_chain)} providers): \" +"
    },
    {
      "label": "fallback_chain",
      "file_path": "run_agent.py",
      "line_number": 1016,
      "matched_text": "\" \u2192 \".join(f\"{f['model']} ({f['provider']})\" for f in self._fallback_chain))"
    },
    {
      "label": "fallback_chain",
      "file_path": "run_agent.py",
      "line_number": 5624,
      "matched_text": "if self._fallback_index >= len(self._fallback_chain):"
    },
    {
      "label": "fallback_chain",
      "file_path": "run_agent.py",
      "line_number": 5627,
      "matched_text": "fb = self._fallback_chain[self._fallback_index]"
    },
    {
      "label": "fallback_chain",
      "file_path": "run_agent.py",
      "line_number": 8559,
      "matched_text": "if self._fallback_index < len(self._fallback_chain):"
    },
    {
      "label": "fallback_chain",
      "file_path": "run_agent.py",
      "line_number": 9355,
      "matched_text": "if is_rate_limited and self._fallback_index < len(self._fallback_chain):"
    },
    {
      "label": "fallback_chain",
      "file_path": "run_agent.py",
      "line_number": 10460,
      "matched_text": "if _truly_empty and self._fallback_chain:"
    },
    {
      "label": "fallback_chain",
      "file_path": "run_agent.py",
      "line_number": 10514,
      "matched_text": "+ (\" and fallback attempts.\" if self._fallback_chain else"
    },
    {
      "label": "provider_routing_config",
      "file_path": "cli.py",
      "line_number": 241,
      "matched_text": "\"smart_model_routing\": {"
    },
    {
      "label": "provider_routing_config",
      "file_path": "cli.py",
      "line_number": 370,
      "matched_text": "# (e.g. platform_toolsets, provider_routing, memory, honcho, etc.)"
    },
    {
      "label": "provider_routing_config",
      "file_path": "cli.py",
      "line_number": 1753,
      "matched_text": "pr = CLI_CONFIG.get(\"provider_routing\", {}) or {}"
    },
    {
      "label": "provider_routing_config",
      "file_path": "cli.py",
      "line_number": 1762,
      "matched_text": "# Supports new list format (fallback_providers) and legacy single-dict (fallback_model)."
    },
    {
      "label": "provider_routing_config",
      "file_path": "cli.py",
      "line_number": 1763,
      "matched_text": "fb = CLI_CONFIG.get(\"fallback_providers\") or CLI_CONFIG.get(\"fallback_model\") or []"
    },
    {
      "label": "provider_routing_config",
      "file_path": "cli.py",
      "line_number": 1770,
      "matched_text": "self._smart_model_routing = CLI_CONFIG.get(\"smart_model_routing\", {}) or {}"
    },
    {
      "label": "provider_routing_config",
      "file_path": "cli.py",
      "line_number": 2771,
      "matched_text": "from agent.smart_model_routing import resolve_turn_route"
    },
    {
      "label": "provider_routing_config",
      "file_path": "cli.py",
      "line_number": 2776,
      "matched_text": "self._smart_model_routing,"
    },
    {
      "label": "runtime_provider",
      "file_path": "hermes_cli/runtime_provider.py",
      "line_number": 209,
      "matched_text": "def resolve_requested_provider(requested: Optional[str] = None) -> str:"
    },
    {
      "label": "runtime_provider",
      "file_path": "hermes_cli/runtime_provider.py",
      "line_number": 649,
      "matched_text": "def resolve_runtime_provider("
    },
    {
      "label": "smart_model_routing",
      "file_path": "agent/smart_model_routing.py",
      "line_number": 62,
      "matched_text": "def choose_cheap_model_route(user_message: str, routing_config: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:"
    },
    {
      "label": "smart_model_routing",
      "file_path": "agent/smart_model_routing.py",
      "line_number": 110,
      "matched_text": "def resolve_turn_route(user_message: str, routing_config: Optional[Dict[str, Any]], primary: Dict[str, Any]) -> Dict[str, Any]:"
    },
    {
      "label": "gateway_provider_routing",
      "file_path": "gateway/run.py",
      "line_number": 1271,
      "matched_text": "def _load_provider_routing() -> dict:"
    },
    {
      "label": "gateway_provider_routing",
      "file_path": "gateway/run.py",
      "line_number": 1285,
      "matched_text": "def _load_fallback_model() -> list | dict | None:"
    },
    {
      "label": "gateway_provider_routing",
      "file_path": "gateway/run.py",
      "line_number": 1306,
      "matched_text": "def _load_smart_model_routing() -> dict:"
    },
    {
      "label": "cron_runtime_provider",
      "file_path": "cron/scheduler.py",
      "line_number": 684,
      "matched_text": "pr = _cfg.get(\"provider_routing\", {})"
    },
    {
      "label": "cron_runtime_provider",
      "file_path": "cron/scheduler.py",
      "line_number": 688,
      "matched_text": "resolve_runtime_provider,"
    },
    {
      "label": "cron_runtime_provider",
      "file_path": "cron/scheduler.py",
      "line_number": 697,
      "matched_text": "runtime = resolve_runtime_provider(**runtime_kwargs)"
    },
    {
      "label": "cron_runtime_provider",
      "file_path": "cron/scheduler.py",
      "line_number": 702,
      "matched_text": "from agent.smart_model_routing import resolve_turn_route"
    },
    {
      "label": "cron_runtime_provider",
      "file_path": "cron/scheduler.py",
      "line_number": 703,
      "matched_text": "turn_route = resolve_turn_route("
    },
    {
      "label": "cron_runtime_provider",
      "file_path": "cron/scheduler.py",
      "line_number": 717,
      "matched_text": "fallback_model = _cfg.get(\"fallback_providers\") or _cfg.get(\"fallback_model\") or None"
    },
    {
      "label": "cron_runtime_provider",
      "file_path": "cron/scheduler.py",
      "line_number": 746,
      "matched_text": "fallback_model=fallback_model,"
    },
    {
      "label": "auxiliary_fallback_chain",
      "file_path": "agent/auxiliary_client.py",
      "line_number": 1018,
      "matched_text": "def _get_provider_chain() -> List[tuple]:"
    },
    {
      "label": "auxiliary_fallback_chain",
      "file_path": "agent/auxiliary_client.py",
      "line_number": 1107,
      "matched_text": "for label, try_fn in _get_provider_chain():"
    },
    {
      "label": "auxiliary_fallback_chain",
      "file_path": "agent/auxiliary_client.py",
      "line_number": 1189,
      "matched_text": "# \u2500\u2500 Step 2: aggregator / fallback chain \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500"
    },
    {
      "label": "auxiliary_fallback_chain",
      "file_path": "agent/auxiliary_client.py",
      "line_number": 1191,
      "matched_text": "for label, try_fn in _get_provider_chain():"
    },
    {
      "label": "auxiliary_fallback_chain",
      "file_path": "agent/auxiliary_client.py",
      "line_number": 2397,
      "matched_text": "# error, fall through to the fallback chain below."
    },
    {
      "label": "auxiliary_fallback_chain",
      "file_path": "agent/auxiliary_client.py",
      "line_number": 2417,
      "matched_text": "# auto (the default) = best-effort fallback chain.  (#7559)"
    },
    {
      "label": "auxiliary_fallback_chain",
      "file_path": "agent/auxiliary_client.py",
      "line_number": 2589,
      "matched_text": "# error, fall through to the fallback chain below."
    },
    {
      "label": "delegate_runtime_provider",
      "file_path": "tools/delegate_tool.py",
      "line_number": 662,
      "matched_text": "# bundle (base_url, api_key, api_mode) via the same runtime provider system"
    },
    {
      "label": "delegate_runtime_provider",
      "file_path": "tools/delegate_tool.py",
      "line_number": 854,
      "matched_text": "provider) is resolved via the runtime provider system \u2014 the same path used"
    },
    {
      "label": "delegate_runtime_provider",
      "file_path": "tools/delegate_tool.py",
      "line_number": 909,
      "matched_text": "from hermes_cli.runtime_provider import resolve_runtime_provider"
    },
    {
      "label": "delegate_runtime_provider",
      "file_path": "tools/delegate_tool.py",
      "line_number": 910,
      "matched_text": "runtime = resolve_runtime_provider(requested=configured_provider)"
    },
    {
      "label": "session_db",
      "file_path": "hermes_state.py",
      "line_number": 115,
      "matched_text": "class SessionDB:"
    },
    {
      "label": "trajectory_export",
      "file_path": "batch_runner.py",
      "line_number": 320,
      "matched_text": "save_trajectories=False,  # We handle saving ourselves"
    },
    {
      "label": "trajectory_export",
      "file_path": "batch_runner.py",
      "line_number": 346,
      "matched_text": "trajectory = agent._convert_to_trajectory_format("
    },
    {
      "label": "trajectory_export",
      "file_path": "batch_runner.py",
      "line_number": 460,
      "matched_text": "trajectory_entry = {"
    },
    {
      "label": "trajectory_export",
      "file_path": "batch_runner.py",
      "line_number": 474,
      "matched_text": "f.write(json.dumps(trajectory_entry, ensure_ascii=False) + \"\\n\")"
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 3,
      "matched_text": "Tool-Calling Benchmark \u2014 Gemma 4 vs mimo-v2-pro regression test."
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 9,
      "matched_text": "python3 benchmarks/tool_call_benchmark.py                  # full 100-call suite"
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 10,
      "matched_text": "python3 benchmarks/tool_call_benchmark.py --limit 10       # quick smoke test"
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 11,
      "matched_text": "python3 benchmarks/tool_call_benchmark.py --models nous     # single model"
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 12,
      "matched_text": "python3 benchmarks/tool_call_benchmark.py --category file   # single category"
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 37,
      "matched_text": "class ToolCall:"
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 51,
      "matched_text": "ToolCall(\"file-01\", \"file\", \"Read the file /tmp/test_bench.txt and show me its contents.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 53,
      "matched_text": "ToolCall(\"file-02\", \"file\", \"Write 'hello benchmark' to /tmp/test_bench_out.txt\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 55,
      "matched_text": "ToolCall(\"file-03\", \"file\", \"Search for the word 'import' in all Python files in the current directory.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 57,
      "matched_text": "ToolCall(\"file-04\", \"file\", \"Read lines 1-20 of /etc/hosts\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 59,
      "matched_text": "ToolCall(\"file-05\", \"file\", \"Patch /tmp/test_bench_out.txt: replace 'hello' with 'goodbye'\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 61,
      "matched_text": "ToolCall(\"file-06\", \"file\", \"Search for files matching *.py in the current directory.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 63,
      "matched_text": "ToolCall(\"file-07\", \"file\", \"Read the first 10 lines of /etc/passwd\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 65,
      "matched_text": "ToolCall(\"file-08\", \"file\", \"Write a JSON config to /tmp/bench_config.json with key 'debug': true\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 67,
      "matched_text": "ToolCall(\"file-09\", \"file\", \"Search for 'def test_' in Python test files.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 69,
      "matched_text": "ToolCall(\"file-10\", \"file\", \"Read /tmp/bench_config.json and tell me what's in it.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 71,
      "matched_text": "ToolCall(\"file-11\", \"file\", \"Create a file /tmp/bench_readme.md with one line: '# Benchmark'\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 73,
      "matched_text": "ToolCall(\"file-12\", \"file\", \"Search for 'TODO' comments in all .py files.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 75,
      "matched_text": "ToolCall(\"file-13\", \"file\", \"Read /tmp/bench_readme.md\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 77,
      "matched_text": "ToolCall(\"file-14\", \"file\", \"Patch /tmp/bench_readme.md: replace '# Benchmark' with '# Tool Benchmark'\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 78,
      "matched_text": "\"patch\", \"Tool Benchmark\"),"
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 79,
      "matched_text": "ToolCall(\"file-15\", \"file\", \"Write a Python one-liner to /tmp/bench_hello.py that prints hello.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 81,
      "matched_text": "ToolCall(\"file-16\", \"file\", \"Search for all .json files in /tmp/.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 83,
      "matched_text": "ToolCall(\"file-17\", \"file\", \"Read /tmp/bench_hello.py and verify it has print('hello').\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 85,
      "matched_text": "ToolCall(\"file-18\", \"file\", \"Patch /tmp/bench_hello.py to print 'hello world' instead of 'hello'.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 87,
      "matched_text": "ToolCall(\"file-19\", \"file\", \"List files matching 'bench*' in /tmp/.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 89,
      "matched_text": "ToolCall(\"file-20\", \"file\", \"Read /tmp/test_bench.txt again and summarize its contents.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 93,
      "matched_text": "ToolCall(\"term-01\", \"terminal\", \"Run `echo hello world` in the terminal.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 95,
      "matched_text": "ToolCall(\"term-02\", \"terminal\", \"Run `date` to get the current date and time.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 97,
      "matched_text": "ToolCall(\"term-03\", \"terminal\", \"Run `uname -a` to get system information.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 99,
      "matched_text": "ToolCall(\"term-04\", \"terminal\", \"Run `pwd` to show the current directory.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 101,
      "matched_text": "ToolCall(\"term-05\", \"terminal\", \"Run `ls -la /tmp/ | head -20` to list temp files.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 103,
      "matched_text": "ToolCall(\"term-06\", \"terminal\", \"Run `whoami` to show the current user.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 105,
      "matched_text": "ToolCall(\"term-07\", \"terminal\", \"Run `df -h` to show disk usage.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 107,
      "matched_text": "ToolCall(\"term-08\", \"terminal\", \"Run `python3 --version` to check Python version.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 109,
      "matched_text": "ToolCall(\"term-09\", \"terminal\", \"Run `cat /etc/hostname` to get the hostname.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 111,
      "matched_text": "ToolCall(\"term-10\", \"terminal\", \"Run `uptime` to see system uptime.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 113,
      "matched_text": "ToolCall(\"term-11\", \"terminal\", \"Run `env | grep PATH` to show the PATH variable.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 115,
      "matched_text": "ToolCall(\"term-12\", \"terminal\", \"Run `wc -l /etc/passwd` to count lines.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 117,
      "matched_text": "ToolCall(\"term-13\", \"terminal\", \"Run `echo $SHELL` to show the current shell.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 119,
      "matched_text": "ToolCall(\"term-14\", \"terminal\", \"Run `free -h || vm_stat` to check memory usage.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 121,
      "matched_text": "ToolCall(\"term-15\", \"terminal\", \"Run `id` to show user and group IDs.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 123,
      "matched_text": "ToolCall(\"term-16\", \"terminal\", \"Run `hostname` to get the machine hostname.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 125,
      "matched_text": "ToolCall(\"term-17\", \"terminal\", \"Run `echo {1..5}` to test brace expansion.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 127,
      "matched_text": "ToolCall(\"term-18\", \"terminal\", \"Run `seq 1 5` to generate a number sequence.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 129,
      "matched_text": "ToolCall(\"term-19\", \"terminal\", \"Run `python3 -c 'print(2+2)'` to compute 2+2.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 131,
      "matched_text": "ToolCall(\"term-20\", \"terminal\", \"Run `ls -d /tmp/bench* 2>/dev/null | wc -l` to count bench files.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 135,
      "matched_text": "ToolCall(\"code-01\", \"code\", \"Execute a Python script that computes factorial of 10.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 137,
      "matched_text": "ToolCall(\"code-02\", \"code\", \"Run Python to read /tmp/test_bench.txt and count its words.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 139,
      "matched_text": "ToolCall(\"code-03\", \"code\", \"Execute Python to generate the first 20 Fibonacci numbers.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 141,
      "matched_text": "ToolCall(\"code-04\", \"code\", \"Run Python to parse JSON from a string and print keys.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 143,
      "matched_text": "ToolCall(\"code-05\", \"code\", \"Execute Python to list all files in /tmp/ matching 'bench*'.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 145,
      "matched_text": "ToolCall(\"code-06\", \"code\", \"Run Python to compute the sum of squares from 1 to 100.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 147,
      "matched_text": "ToolCall(\"code-07\", \"code\", \"Execute Python to check if 'racecar' is a palindrome.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 149,
      "matched_text": "ToolCall(\"code-08\", \"code\", \"Run Python to create a CSV string with 5 rows of sample data.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 151,
      "matched_text": "ToolCall(\"code-09\", \"code\", \"Execute Python to sort a list [5,2,8,1,9] and print the result.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 153,
      "matched_text": "ToolCall(\"code-10\", \"code\", \"Run Python to count lines in /etc/passwd.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 155,
      "matched_text": "ToolCall(\"code-11\", \"code\", \"Execute Python to hash the string 'benchmark' with SHA256.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 157,
      "matched_text": "ToolCall(\"code-12\", \"code\", \"Run Python to get the current UTC timestamp.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 159,
      "matched_text": "ToolCall(\"code-13\", \"code\", \"Execute Python to convert 'hello world' to uppercase and reverse it.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 161,
      "matched_text": "ToolCall(\"code-14\", \"code\", \"Run Python to create a dictionary of system info (platform, python version).\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 163,
      "matched_text": "ToolCall(\"code-15\", \"code\", \"Execute Python to check internet connectivity by resolving google.com.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 167,
      "matched_text": "ToolCall(\"deleg-01\", \"delegate\", \"Use a subagent to find all .log files in /tmp/.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 169,
      "matched_text": "ToolCall(\"deleg-02\", \"delegate\", \"Delegate to a subagent: what is 15 * 37?\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 171,
      "matched_text": "ToolCall(\"deleg-03\", \"delegate\", \"Use a subagent to check if Python 3 is installed and its version.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 173,
      "matched_text": "ToolCall(\"deleg-04\", \"delegate\", \"Delegate: read /tmp/test_bench.txt and summarize it in one sentence.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 175,
      "matched_text": "ToolCall(\"deleg-05\", \"delegate\", \"Use a subagent to list the contents of /tmp/ directory.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 177,
      "matched_text": "ToolCall(\"deleg-06\", \"delegate\", \"Delegate: count the number of .py files in the current directory.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 179,
      "matched_text": "ToolCall(\"deleg-07\", \"delegate\", \"Use a subagent to check disk space with df -h.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 181,
      "matched_text": "ToolCall(\"deleg-08\", \"delegate\", \"Delegate: what OS are we running on?\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 183,
      "matched_text": "ToolCall(\"deleg-09\", \"delegate\", \"Use a subagent to find the hostname of this machine.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 185,
      "matched_text": "ToolCall(\"deleg-10\", \"delegate\", \"Delegate: create a temp file /tmp/bench_deleg.txt with 'done'.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 189,
      "matched_text": "ToolCall(\"todo-01\", \"todo\", \"Add a todo item: 'Run benchmark suite'\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 190,
      "matched_text": "\"todo\", \"benchmark\"),"
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 191,
      "matched_text": "ToolCall(\"todo-02\", \"todo\", \"Show me the current todo list.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 193,
      "matched_text": "ToolCall(\"todo-03\", \"todo\", \"Mark the first todo item as completed.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 195,
      "matched_text": "ToolCall(\"todo-04\", \"todo\", \"Add a todo: 'Review benchmark results' with status pending.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 197,
      "matched_text": "ToolCall(\"todo-05\", \"todo\", \"Clear all completed todos.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 199,
      "matched_text": "ToolCall(\"todo-06\", \"memory\", \"Save this to memory: 'benchmark ran on {date}'\".format("
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 201,
      "matched_text": "\"memory\", \"benchmark\"),"
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 202,
      "matched_text": "ToolCall(\"todo-07\", \"memory\", \"Search memory for 'benchmark'.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 203,
      "matched_text": "\"memory\", \"benchmark\"),"
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 204,
      "matched_text": "ToolCall(\"todo-08\", \"memory\", \"Add a memory note: 'test models are gemma-4 and mimo-v2-pro'.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 206,
      "matched_text": "ToolCall(\"todo-09\", \"todo\", \"Add three todo items: 'analyze', 'report', 'cleanup'.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 208,
      "matched_text": "ToolCall(\"todo-10\", \"memory\", \"Search memory for any notes about models.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 212,
      "matched_text": "ToolCall(\"skill-01\", \"skills\", \"List all available skills.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 214,
      "matched_text": "ToolCall(\"skill-02\", \"skills\", \"View the skill called 'test-driven-development'.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 216,
      "matched_text": "ToolCall(\"skill-03\", \"skills\", \"Search for skills related to 'git'.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 218,
      "matched_text": "ToolCall(\"skill-04\", \"skills\", \"View the 'code-review' skill.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 220,
      "matched_text": "ToolCall(\"skill-05\", \"skills\", \"List all skills in the 'devops' category.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 222,
      "matched_text": "ToolCall(\"skill-06\", \"skills\", \"View the 'systematic-debugging' skill.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 224,
      "matched_text": "ToolCall(\"skill-07\", \"skills\", \"Search for skills about 'testing'.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 226,
      "matched_text": "ToolCall(\"skill-08\", \"skills\", \"View the 'writing-plans' skill.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 228,
      "matched_text": "ToolCall(\"skill-09\", \"skills\", \"List skills in 'software-development' category.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 230,
      "matched_text": "ToolCall(\"skill-10\", \"skills\", \"View the 'pr-review-discipline' skill.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 234,
      "matched_text": "ToolCall(\"file-21\", \"file\", \"Write a Python snippet to /tmp/bench_sort.py that sorts [3,1,2].\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 236,
      "matched_text": "ToolCall(\"file-22\", \"file\", \"Read /tmp/bench_sort.py back and confirm it exists.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 238,
      "matched_text": "ToolCall(\"file-23\", \"file\", \"Search for 'class' in all .py files in the benchmarks directory.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 240,
      "matched_text": "ToolCall(\"term-21\", \"terminal\", \"Run `cat /etc/os-release 2>/dev/null || sw_vers 2>/dev/null` for OS info.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 242,
      "matched_text": "ToolCall(\"term-22\", \"terminal\", \"Run `nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null` for CPU count.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 244,
      "matched_text": "ToolCall(\"code-16\", \"code\", \"Execute Python to flatten a nested list [[1,2],[3,4],[5]].\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 246,
      "matched_text": "ToolCall(\"code-17\", \"code\", \"Run Python to check if a number 17 is prime.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 248,
      "matched_text": "ToolCall(\"deleg-11\", \"delegate\", \"Delegate: what is the current working directory?\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 250,
      "matched_text": "ToolCall(\"todo-11\", \"todo\", \"Add a todo: 'Finalize benchmark report' status pending.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 252,
      "matched_text": "ToolCall(\"todo-12\", \"memory\", \"Store fact: 'benchmark categories: file, terminal, code, delegate, todo, memory, skills'.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 254,
      "matched_text": "ToolCall(\"skill-11\", \"skills\", \"Search for skills about 'deployment'.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 256,
      "matched_text": "ToolCall(\"skill-12\", \"skills\", \"View the 'gitea-burn-cycle' skill.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 258,
      "matched_text": "ToolCall(\"skill-13\", \"skills\", \"List all available skill categories.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 260,
      "matched_text": "ToolCall(\"skill-14\", \"skills\", \"Search for skills related to 'memory'.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 262,
      "matched_text": "ToolCall(\"skill-15\", \"skills\", \"View the 'mimo-swarm' skill.\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 311,
      "matched_text": "\"\"\"Create prerequisite files for the benchmark.\"\"\""
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 313,
      "matched_text": "\"This is a benchmark test file.\\n\""
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 349,
      "matched_text": "\"You are a benchmark test runner. Execute the user's request by calling \""
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 406,
      "matched_text": "\"\"\"Generate markdown benchmark report.\"\"\""
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 428,
      "matched_text": "f\"# Tool-Calling Benchmark Report\","
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 535,
      "matched_text": "parser = argparse.ArgumentParser(description=\"Tool-calling benchmark\")"
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 544,
      "matched_text": "help=\"Output report path (default: benchmarks/gemma4-tool-calling-YYYY-MM-DD.md)\")"
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 565,
      "matched_text": "output_path = Path(args.output) if args.output else REPO_ROOT / \"benchmarks\" / f\"gemma4-tool-calling-{date_str}.md\""
    },
    {
      "label": "benchmark_suite",
      "file_path": "benchmarks/tool_call_benchmark.py",
      "line_number": 575,
      "matched_text": "print(f\"Benchmark: {len(suite)} tests \u00d7 {len(model_specs)} models = {len(suite) * len(model_specs)} calls\")"
    }
  ],
  "requirements": [
    {
      "key": "gateway_replacement",
      "name": "Gateway replacement scope",
      "status": "partial",
      "evidence_labels": [
        "fallback_chain",
        "runtime_provider",
        "gateway_provider_routing",
        "cron_runtime_provider",
        "auxiliary_fallback_chain",
        "delegate_runtime_provider"
      ],
      "summary": "Hermes already spreads provider routing across core agent, runtime provider, gateway, cron, auxiliary, and delegation seams; TensorZero would need parity across all of them before it can replace the gateway layer."
    },
    {
      "key": "config_migration",
      "name": "Config migration",
      "status": "partial",
      "evidence_labels": [
        "provider_routing_config",
        "runtime_provider",
        "smart_model_routing",
        "fallback_chain"
      ],
      "summary": "Hermes has multiple config concepts to migrate (`provider_routing`, `fallback_providers`, `smart_model_routing`, runtime provider resolution), so TensorZero is not a drop-in config swap."
    },
    {
      "key": "canary_rollout",
      "name": "10% traffic canary",
      "status": "gap",
      "evidence_labels": [],
      "summary": "The repo shows semantic routing and fallback, but no evidence of a 10% traffic-split canary mechanism. A TensorZero cutover would need new percentage-based rollout controls and observability hooks."
    },
    {
      "key": "session_feedback",
      "name": "Session data for prompt optimization",
      "status": "partial",
      "evidence_labels": [
        "session_db",
        "trajectory_export"
      ],
      "summary": "Hermes already has SessionDB and trajectory export surfaces that can feed offline optimization data, but it does not yet have a TensorZero-native ingestion path."
    },
    {
      "key": "evaluation_suite",
      "name": "Evaluation suite / A/B testing",
      "status": "partial",
      "evidence_labels": [
        "benchmark_suite",
        "trajectory_export"
      ],
      "summary": "Hermes already has benchmark/trajectory machinery that can seed TensorZero A/B evaluation, but no integrated TensorZero experiment runner or live evaluation gateway."
    }
  ]
}