All checks were successful
Lint / lint (pull_request) Successful in 37s
- add a script that inventories Hermes routing/evaluation surfaces relevant to a TensorZero cutover - generate a markdown and JSON evaluation packet for issue #860 - score gateway replacement, config migration, canary rollout, session feedback, and eval-suite readiness - add focused regression tests for touchpoint scanning, requirement scoring, and report rendering Refs #860
1131 lines
42 KiB
JSON
1131 lines
42 KiB
JSON
{
|
|
"issue_number": 860,
|
|
"issue_title": "tensorzero LLMOps platform evaluation",
|
|
"issue_url": "https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/860",
|
|
"recommendation": "Not ready for direct replacement. Recommend a shadow-evaluation phase first: keep Hermes routing live, inventory the migration seams, export SessionDB/trajectory data into an offline TensorZero experiment loop, and only design a canary gateway once percentage-based rollout controls exist.",
|
|
"touchpoints": [
|
|
{
|
|
"label": "fallback_chain",
|
|
"file_path": "run_agent.py",
|
|
"line_number": 601,
|
|
"matched_text": "fallback_model: Dict[str, Any] = None,"
|
|
},
|
|
{
|
|
"label": "fallback_chain",
|
|
"file_path": "run_agent.py",
|
|
"line_number": 995,
|
|
"matched_text": "# failure). Supports both legacy single-dict ``fallback_model`` and"
|
|
},
|
|
{
|
|
"label": "fallback_chain",
|
|
"file_path": "run_agent.py",
|
|
"line_number": 996,
|
|
"matched_text": "# new list ``fallback_providers`` format."
|
|
},
|
|
{
|
|
"label": "fallback_chain",
|
|
"file_path": "run_agent.py",
|
|
"line_number": 997,
|
|
"matched_text": "if isinstance(fallback_model, list):"
|
|
},
|
|
{
|
|
"label": "fallback_chain",
|
|
"file_path": "run_agent.py",
|
|
"line_number": 998,
|
|
"matched_text": "self._fallback_chain = ["
|
|
},
|
|
{
|
|
"label": "fallback_chain",
|
|
"file_path": "run_agent.py",
|
|
"line_number": 999,
|
|
"matched_text": "f for f in fallback_model"
|
|
},
|
|
{
|
|
"label": "fallback_chain",
|
|
"file_path": "run_agent.py",
|
|
"line_number": 1002,
|
|
"matched_text": "elif isinstance(fallback_model, dict) and fallback_model.get(\"provider\") and fallback_model.get(\"model\"):"
|
|
},
|
|
{
|
|
"label": "fallback_chain",
|
|
"file_path": "run_agent.py",
|
|
"line_number": 1003,
|
|
"matched_text": "self._fallback_chain = [fallback_model]"
|
|
},
|
|
{
|
|
"label": "fallback_chain",
|
|
"file_path": "run_agent.py",
|
|
"line_number": 1005,
|
|
"matched_text": "self._fallback_chain = []"
|
|
},
|
|
{
|
|
"label": "fallback_chain",
|
|
"file_path": "run_agent.py",
|
|
"line_number": 1009,
|
|
"matched_text": "self._fallback_model = self._fallback_chain[0] if self._fallback_chain else None"
|
|
},
|
|
{
|
|
"label": "fallback_chain",
|
|
"file_path": "run_agent.py",
|
|
"line_number": 1010,
|
|
"matched_text": "if self._fallback_chain and not self.quiet_mode:"
|
|
},
|
|
{
|
|
"label": "fallback_chain",
|
|
"file_path": "run_agent.py",
|
|
"line_number": 1011,
|
|
"matched_text": "if len(self._fallback_chain) == 1:"
|
|
},
|
|
{
|
|
"label": "fallback_chain",
|
|
"file_path": "run_agent.py",
|
|
"line_number": 1012,
|
|
"matched_text": "fb = self._fallback_chain[0]"
|
|
},
|
|
{
|
|
"label": "fallback_chain",
|
|
"file_path": "run_agent.py",
|
|
"line_number": 1015,
|
|
"matched_text": "print(f\"\ud83d\udd04 Fallback chain ({len(self._fallback_chain)} providers): \" +"
|
|
},
|
|
{
|
|
"label": "fallback_chain",
|
|
"file_path": "run_agent.py",
|
|
"line_number": 1016,
|
|
"matched_text": "\" \u2192 \".join(f\"{f['model']} ({f['provider']})\" for f in self._fallback_chain))"
|
|
},
|
|
{
|
|
"label": "fallback_chain",
|
|
"file_path": "run_agent.py",
|
|
"line_number": 5624,
|
|
"matched_text": "if self._fallback_index >= len(self._fallback_chain):"
|
|
},
|
|
{
|
|
"label": "fallback_chain",
|
|
"file_path": "run_agent.py",
|
|
"line_number": 5627,
|
|
"matched_text": "fb = self._fallback_chain[self._fallback_index]"
|
|
},
|
|
{
|
|
"label": "fallback_chain",
|
|
"file_path": "run_agent.py",
|
|
"line_number": 8559,
|
|
"matched_text": "if self._fallback_index < len(self._fallback_chain):"
|
|
},
|
|
{
|
|
"label": "fallback_chain",
|
|
"file_path": "run_agent.py",
|
|
"line_number": 9355,
|
|
"matched_text": "if is_rate_limited and self._fallback_index < len(self._fallback_chain):"
|
|
},
|
|
{
|
|
"label": "fallback_chain",
|
|
"file_path": "run_agent.py",
|
|
"line_number": 10460,
|
|
"matched_text": "if _truly_empty and self._fallback_chain:"
|
|
},
|
|
{
|
|
"label": "fallback_chain",
|
|
"file_path": "run_agent.py",
|
|
"line_number": 10514,
|
|
"matched_text": "+ (\" and fallback attempts.\" if self._fallback_chain else"
|
|
},
|
|
{
|
|
"label": "provider_routing_config",
|
|
"file_path": "cli.py",
|
|
"line_number": 241,
|
|
"matched_text": "\"smart_model_routing\": {"
|
|
},
|
|
{
|
|
"label": "provider_routing_config",
|
|
"file_path": "cli.py",
|
|
"line_number": 370,
|
|
"matched_text": "# (e.g. platform_toolsets, provider_routing, memory, honcho, etc.)"
|
|
},
|
|
{
|
|
"label": "provider_routing_config",
|
|
"file_path": "cli.py",
|
|
"line_number": 1753,
|
|
"matched_text": "pr = CLI_CONFIG.get(\"provider_routing\", {}) or {}"
|
|
},
|
|
{
|
|
"label": "provider_routing_config",
|
|
"file_path": "cli.py",
|
|
"line_number": 1762,
|
|
"matched_text": "# Supports new list format (fallback_providers) and legacy single-dict (fallback_model)."
|
|
},
|
|
{
|
|
"label": "provider_routing_config",
|
|
"file_path": "cli.py",
|
|
"line_number": 1763,
|
|
"matched_text": "fb = CLI_CONFIG.get(\"fallback_providers\") or CLI_CONFIG.get(\"fallback_model\") or []"
|
|
},
|
|
{
|
|
"label": "provider_routing_config",
|
|
"file_path": "cli.py",
|
|
"line_number": 1770,
|
|
"matched_text": "self._smart_model_routing = CLI_CONFIG.get(\"smart_model_routing\", {}) or {}"
|
|
},
|
|
{
|
|
"label": "provider_routing_config",
|
|
"file_path": "cli.py",
|
|
"line_number": 2771,
|
|
"matched_text": "from agent.smart_model_routing import resolve_turn_route"
|
|
},
|
|
{
|
|
"label": "provider_routing_config",
|
|
"file_path": "cli.py",
|
|
"line_number": 2776,
|
|
"matched_text": "self._smart_model_routing,"
|
|
},
|
|
{
|
|
"label": "runtime_provider",
|
|
"file_path": "hermes_cli/runtime_provider.py",
|
|
"line_number": 209,
|
|
"matched_text": "def resolve_requested_provider(requested: Optional[str] = None) -> str:"
|
|
},
|
|
{
|
|
"label": "runtime_provider",
|
|
"file_path": "hermes_cli/runtime_provider.py",
|
|
"line_number": 649,
|
|
"matched_text": "def resolve_runtime_provider("
|
|
},
|
|
{
|
|
"label": "smart_model_routing",
|
|
"file_path": "agent/smart_model_routing.py",
|
|
"line_number": 62,
|
|
"matched_text": "def choose_cheap_model_route(user_message: str, routing_config: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:"
|
|
},
|
|
{
|
|
"label": "smart_model_routing",
|
|
"file_path": "agent/smart_model_routing.py",
|
|
"line_number": 110,
|
|
"matched_text": "def resolve_turn_route(user_message: str, routing_config: Optional[Dict[str, Any]], primary: Dict[str, Any]) -> Dict[str, Any]:"
|
|
},
|
|
{
|
|
"label": "gateway_provider_routing",
|
|
"file_path": "gateway/run.py",
|
|
"line_number": 1271,
|
|
"matched_text": "def _load_provider_routing() -> dict:"
|
|
},
|
|
{
|
|
"label": "gateway_provider_routing",
|
|
"file_path": "gateway/run.py",
|
|
"line_number": 1285,
|
|
"matched_text": "def _load_fallback_model() -> list | dict | None:"
|
|
},
|
|
{
|
|
"label": "gateway_provider_routing",
|
|
"file_path": "gateway/run.py",
|
|
"line_number": 1306,
|
|
"matched_text": "def _load_smart_model_routing() -> dict:"
|
|
},
|
|
{
|
|
"label": "cron_runtime_provider",
|
|
"file_path": "cron/scheduler.py",
|
|
"line_number": 684,
|
|
"matched_text": "pr = _cfg.get(\"provider_routing\", {})"
|
|
},
|
|
{
|
|
"label": "cron_runtime_provider",
|
|
"file_path": "cron/scheduler.py",
|
|
"line_number": 688,
|
|
"matched_text": "resolve_runtime_provider,"
|
|
},
|
|
{
|
|
"label": "cron_runtime_provider",
|
|
"file_path": "cron/scheduler.py",
|
|
"line_number": 697,
|
|
"matched_text": "runtime = resolve_runtime_provider(**runtime_kwargs)"
|
|
},
|
|
{
|
|
"label": "cron_runtime_provider",
|
|
"file_path": "cron/scheduler.py",
|
|
"line_number": 702,
|
|
"matched_text": "from agent.smart_model_routing import resolve_turn_route"
|
|
},
|
|
{
|
|
"label": "cron_runtime_provider",
|
|
"file_path": "cron/scheduler.py",
|
|
"line_number": 703,
|
|
"matched_text": "turn_route = resolve_turn_route("
|
|
},
|
|
{
|
|
"label": "cron_runtime_provider",
|
|
"file_path": "cron/scheduler.py",
|
|
"line_number": 717,
|
|
"matched_text": "fallback_model = _cfg.get(\"fallback_providers\") or _cfg.get(\"fallback_model\") or None"
|
|
},
|
|
{
|
|
"label": "cron_runtime_provider",
|
|
"file_path": "cron/scheduler.py",
|
|
"line_number": 746,
|
|
"matched_text": "fallback_model=fallback_model,"
|
|
},
|
|
{
|
|
"label": "auxiliary_fallback_chain",
|
|
"file_path": "agent/auxiliary_client.py",
|
|
"line_number": 1018,
|
|
"matched_text": "def _get_provider_chain() -> List[tuple]:"
|
|
},
|
|
{
|
|
"label": "auxiliary_fallback_chain",
|
|
"file_path": "agent/auxiliary_client.py",
|
|
"line_number": 1107,
|
|
"matched_text": "for label, try_fn in _get_provider_chain():"
|
|
},
|
|
{
|
|
"label": "auxiliary_fallback_chain",
|
|
"file_path": "agent/auxiliary_client.py",
|
|
"line_number": 1189,
|
|
"matched_text": "# \u2500\u2500 Step 2: aggregator / fallback chain \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500"
|
|
},
|
|
{
|
|
"label": "auxiliary_fallback_chain",
|
|
"file_path": "agent/auxiliary_client.py",
|
|
"line_number": 1191,
|
|
"matched_text": "for label, try_fn in _get_provider_chain():"
|
|
},
|
|
{
|
|
"label": "auxiliary_fallback_chain",
|
|
"file_path": "agent/auxiliary_client.py",
|
|
"line_number": 2397,
|
|
"matched_text": "# error, fall through to the fallback chain below."
|
|
},
|
|
{
|
|
"label": "auxiliary_fallback_chain",
|
|
"file_path": "agent/auxiliary_client.py",
|
|
"line_number": 2417,
|
|
"matched_text": "# auto (the default) = best-effort fallback chain. (#7559)"
|
|
},
|
|
{
|
|
"label": "auxiliary_fallback_chain",
|
|
"file_path": "agent/auxiliary_client.py",
|
|
"line_number": 2589,
|
|
"matched_text": "# error, fall through to the fallback chain below."
|
|
},
|
|
{
|
|
"label": "delegate_runtime_provider",
|
|
"file_path": "tools/delegate_tool.py",
|
|
"line_number": 662,
|
|
"matched_text": "# bundle (base_url, api_key, api_mode) via the same runtime provider system"
|
|
},
|
|
{
|
|
"label": "delegate_runtime_provider",
|
|
"file_path": "tools/delegate_tool.py",
|
|
"line_number": 854,
|
|
"matched_text": "provider) is resolved via the runtime provider system \u2014 the same path used"
|
|
},
|
|
{
|
|
"label": "delegate_runtime_provider",
|
|
"file_path": "tools/delegate_tool.py",
|
|
"line_number": 909,
|
|
"matched_text": "from hermes_cli.runtime_provider import resolve_runtime_provider"
|
|
},
|
|
{
|
|
"label": "delegate_runtime_provider",
|
|
"file_path": "tools/delegate_tool.py",
|
|
"line_number": 910,
|
|
"matched_text": "runtime = resolve_runtime_provider(requested=configured_provider)"
|
|
},
|
|
{
|
|
"label": "session_db",
|
|
"file_path": "hermes_state.py",
|
|
"line_number": 115,
|
|
"matched_text": "class SessionDB:"
|
|
},
|
|
{
|
|
"label": "trajectory_export",
|
|
"file_path": "batch_runner.py",
|
|
"line_number": 320,
|
|
"matched_text": "save_trajectories=False, # We handle saving ourselves"
|
|
},
|
|
{
|
|
"label": "trajectory_export",
|
|
"file_path": "batch_runner.py",
|
|
"line_number": 346,
|
|
"matched_text": "trajectory = agent._convert_to_trajectory_format("
|
|
},
|
|
{
|
|
"label": "trajectory_export",
|
|
"file_path": "batch_runner.py",
|
|
"line_number": 460,
|
|
"matched_text": "trajectory_entry = {"
|
|
},
|
|
{
|
|
"label": "trajectory_export",
|
|
"file_path": "batch_runner.py",
|
|
"line_number": 474,
|
|
"matched_text": "f.write(json.dumps(trajectory_entry, ensure_ascii=False) + \"\\n\")"
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 3,
|
|
"matched_text": "Tool-Calling Benchmark \u2014 Gemma 4 vs mimo-v2-pro regression test."
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 9,
|
|
"matched_text": "python3 benchmarks/tool_call_benchmark.py # full 100-call suite"
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 10,
|
|
"matched_text": "python3 benchmarks/tool_call_benchmark.py --limit 10 # quick smoke test"
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 11,
|
|
"matched_text": "python3 benchmarks/tool_call_benchmark.py --models nous # single model"
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 12,
|
|
"matched_text": "python3 benchmarks/tool_call_benchmark.py --category file # single category"
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 37,
|
|
"matched_text": "class ToolCall:"
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 51,
|
|
"matched_text": "ToolCall(\"file-01\", \"file\", \"Read the file /tmp/test_bench.txt and show me its contents.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 53,
|
|
"matched_text": "ToolCall(\"file-02\", \"file\", \"Write 'hello benchmark' to /tmp/test_bench_out.txt\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 55,
|
|
"matched_text": "ToolCall(\"file-03\", \"file\", \"Search for the word 'import' in all Python files in the current directory.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 57,
|
|
"matched_text": "ToolCall(\"file-04\", \"file\", \"Read lines 1-20 of /etc/hosts\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 59,
|
|
"matched_text": "ToolCall(\"file-05\", \"file\", \"Patch /tmp/test_bench_out.txt: replace 'hello' with 'goodbye'\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 61,
|
|
"matched_text": "ToolCall(\"file-06\", \"file\", \"Search for files matching *.py in the current directory.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 63,
|
|
"matched_text": "ToolCall(\"file-07\", \"file\", \"Read the first 10 lines of /etc/passwd\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 65,
|
|
"matched_text": "ToolCall(\"file-08\", \"file\", \"Write a JSON config to /tmp/bench_config.json with key 'debug': true\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 67,
|
|
"matched_text": "ToolCall(\"file-09\", \"file\", \"Search for 'def test_' in Python test files.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 69,
|
|
"matched_text": "ToolCall(\"file-10\", \"file\", \"Read /tmp/bench_config.json and tell me what's in it.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 71,
|
|
"matched_text": "ToolCall(\"file-11\", \"file\", \"Create a file /tmp/bench_readme.md with one line: '# Benchmark'\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 73,
|
|
"matched_text": "ToolCall(\"file-12\", \"file\", \"Search for 'TODO' comments in all .py files.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 75,
|
|
"matched_text": "ToolCall(\"file-13\", \"file\", \"Read /tmp/bench_readme.md\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 77,
|
|
"matched_text": "ToolCall(\"file-14\", \"file\", \"Patch /tmp/bench_readme.md: replace '# Benchmark' with '# Tool Benchmark'\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 78,
|
|
"matched_text": "\"patch\", \"Tool Benchmark\"),"
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 79,
|
|
"matched_text": "ToolCall(\"file-15\", \"file\", \"Write a Python one-liner to /tmp/bench_hello.py that prints hello.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 81,
|
|
"matched_text": "ToolCall(\"file-16\", \"file\", \"Search for all .json files in /tmp/.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 83,
|
|
"matched_text": "ToolCall(\"file-17\", \"file\", \"Read /tmp/bench_hello.py and verify it has print('hello').\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 85,
|
|
"matched_text": "ToolCall(\"file-18\", \"file\", \"Patch /tmp/bench_hello.py to print 'hello world' instead of 'hello'.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 87,
|
|
"matched_text": "ToolCall(\"file-19\", \"file\", \"List files matching 'bench*' in /tmp/.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 89,
|
|
"matched_text": "ToolCall(\"file-20\", \"file\", \"Read /tmp/test_bench.txt again and summarize its contents.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 93,
|
|
"matched_text": "ToolCall(\"term-01\", \"terminal\", \"Run `echo hello world` in the terminal.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 95,
|
|
"matched_text": "ToolCall(\"term-02\", \"terminal\", \"Run `date` to get the current date and time.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 97,
|
|
"matched_text": "ToolCall(\"term-03\", \"terminal\", \"Run `uname -a` to get system information.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 99,
|
|
"matched_text": "ToolCall(\"term-04\", \"terminal\", \"Run `pwd` to show the current directory.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 101,
|
|
"matched_text": "ToolCall(\"term-05\", \"terminal\", \"Run `ls -la /tmp/ | head -20` to list temp files.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 103,
|
|
"matched_text": "ToolCall(\"term-06\", \"terminal\", \"Run `whoami` to show the current user.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 105,
|
|
"matched_text": "ToolCall(\"term-07\", \"terminal\", \"Run `df -h` to show disk usage.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 107,
|
|
"matched_text": "ToolCall(\"term-08\", \"terminal\", \"Run `python3 --version` to check Python version.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 109,
|
|
"matched_text": "ToolCall(\"term-09\", \"terminal\", \"Run `cat /etc/hostname` to get the hostname.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 111,
|
|
"matched_text": "ToolCall(\"term-10\", \"terminal\", \"Run `uptime` to see system uptime.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 113,
|
|
"matched_text": "ToolCall(\"term-11\", \"terminal\", \"Run `env | grep PATH` to show the PATH variable.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 115,
|
|
"matched_text": "ToolCall(\"term-12\", \"terminal\", \"Run `wc -l /etc/passwd` to count lines.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 117,
|
|
"matched_text": "ToolCall(\"term-13\", \"terminal\", \"Run `echo $SHELL` to show the current shell.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 119,
|
|
"matched_text": "ToolCall(\"term-14\", \"terminal\", \"Run `free -h || vm_stat` to check memory usage.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 121,
|
|
"matched_text": "ToolCall(\"term-15\", \"terminal\", \"Run `id` to show user and group IDs.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 123,
|
|
"matched_text": "ToolCall(\"term-16\", \"terminal\", \"Run `hostname` to get the machine hostname.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 125,
|
|
"matched_text": "ToolCall(\"term-17\", \"terminal\", \"Run `echo {1..5}` to test brace expansion.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 127,
|
|
"matched_text": "ToolCall(\"term-18\", \"terminal\", \"Run `seq 1 5` to generate a number sequence.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 129,
|
|
"matched_text": "ToolCall(\"term-19\", \"terminal\", \"Run `python3 -c 'print(2+2)'` to compute 2+2.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 131,
|
|
"matched_text": "ToolCall(\"term-20\", \"terminal\", \"Run `ls -d /tmp/bench* 2>/dev/null | wc -l` to count bench files.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 135,
|
|
"matched_text": "ToolCall(\"code-01\", \"code\", \"Execute a Python script that computes factorial of 10.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 137,
|
|
"matched_text": "ToolCall(\"code-02\", \"code\", \"Run Python to read /tmp/test_bench.txt and count its words.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 139,
|
|
"matched_text": "ToolCall(\"code-03\", \"code\", \"Execute Python to generate the first 20 Fibonacci numbers.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 141,
|
|
"matched_text": "ToolCall(\"code-04\", \"code\", \"Run Python to parse JSON from a string and print keys.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 143,
|
|
"matched_text": "ToolCall(\"code-05\", \"code\", \"Execute Python to list all files in /tmp/ matching 'bench*'.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 145,
|
|
"matched_text": "ToolCall(\"code-06\", \"code\", \"Run Python to compute the sum of squares from 1 to 100.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 147,
|
|
"matched_text": "ToolCall(\"code-07\", \"code\", \"Execute Python to check if 'racecar' is a palindrome.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 149,
|
|
"matched_text": "ToolCall(\"code-08\", \"code\", \"Run Python to create a CSV string with 5 rows of sample data.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 151,
|
|
"matched_text": "ToolCall(\"code-09\", \"code\", \"Execute Python to sort a list [5,2,8,1,9] and print the result.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 153,
|
|
"matched_text": "ToolCall(\"code-10\", \"code\", \"Run Python to count lines in /etc/passwd.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 155,
|
|
"matched_text": "ToolCall(\"code-11\", \"code\", \"Execute Python to hash the string 'benchmark' with SHA256.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 157,
|
|
"matched_text": "ToolCall(\"code-12\", \"code\", \"Run Python to get the current UTC timestamp.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 159,
|
|
"matched_text": "ToolCall(\"code-13\", \"code\", \"Execute Python to convert 'hello world' to uppercase and reverse it.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 161,
|
|
"matched_text": "ToolCall(\"code-14\", \"code\", \"Run Python to create a dictionary of system info (platform, python version).\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 163,
|
|
"matched_text": "ToolCall(\"code-15\", \"code\", \"Execute Python to check internet connectivity by resolving google.com.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 167,
|
|
"matched_text": "ToolCall(\"deleg-01\", \"delegate\", \"Use a subagent to find all .log files in /tmp/.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 169,
|
|
"matched_text": "ToolCall(\"deleg-02\", \"delegate\", \"Delegate to a subagent: what is 15 * 37?\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 171,
|
|
"matched_text": "ToolCall(\"deleg-03\", \"delegate\", \"Use a subagent to check if Python 3 is installed and its version.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 173,
|
|
"matched_text": "ToolCall(\"deleg-04\", \"delegate\", \"Delegate: read /tmp/test_bench.txt and summarize it in one sentence.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 175,
|
|
"matched_text": "ToolCall(\"deleg-05\", \"delegate\", \"Use a subagent to list the contents of /tmp/ directory.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 177,
|
|
"matched_text": "ToolCall(\"deleg-06\", \"delegate\", \"Delegate: count the number of .py files in the current directory.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 179,
|
|
"matched_text": "ToolCall(\"deleg-07\", \"delegate\", \"Use a subagent to check disk space with df -h.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 181,
|
|
"matched_text": "ToolCall(\"deleg-08\", \"delegate\", \"Delegate: what OS are we running on?\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 183,
|
|
"matched_text": "ToolCall(\"deleg-09\", \"delegate\", \"Use a subagent to find the hostname of this machine.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 185,
|
|
"matched_text": "ToolCall(\"deleg-10\", \"delegate\", \"Delegate: create a temp file /tmp/bench_deleg.txt with 'done'.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 189,
|
|
"matched_text": "ToolCall(\"todo-01\", \"todo\", \"Add a todo item: 'Run benchmark suite'\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 190,
|
|
"matched_text": "\"todo\", \"benchmark\"),"
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 191,
|
|
"matched_text": "ToolCall(\"todo-02\", \"todo\", \"Show me the current todo list.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 193,
|
|
"matched_text": "ToolCall(\"todo-03\", \"todo\", \"Mark the first todo item as completed.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 195,
|
|
"matched_text": "ToolCall(\"todo-04\", \"todo\", \"Add a todo: 'Review benchmark results' with status pending.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 197,
|
|
"matched_text": "ToolCall(\"todo-05\", \"todo\", \"Clear all completed todos.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 199,
|
|
"matched_text": "ToolCall(\"todo-06\", \"memory\", \"Save this to memory: 'benchmark ran on {date}'\".format("
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 201,
|
|
"matched_text": "\"memory\", \"benchmark\"),"
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 202,
|
|
"matched_text": "ToolCall(\"todo-07\", \"memory\", \"Search memory for 'benchmark'.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 203,
|
|
"matched_text": "\"memory\", \"benchmark\"),"
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 204,
|
|
"matched_text": "ToolCall(\"todo-08\", \"memory\", \"Add a memory note: 'test models are gemma-4 and mimo-v2-pro'.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 206,
|
|
"matched_text": "ToolCall(\"todo-09\", \"todo\", \"Add three todo items: 'analyze', 'report', 'cleanup'.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 208,
|
|
"matched_text": "ToolCall(\"todo-10\", \"memory\", \"Search memory for any notes about models.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 212,
|
|
"matched_text": "ToolCall(\"skill-01\", \"skills\", \"List all available skills.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 214,
|
|
"matched_text": "ToolCall(\"skill-02\", \"skills\", \"View the skill called 'test-driven-development'.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 216,
|
|
"matched_text": "ToolCall(\"skill-03\", \"skills\", \"Search for skills related to 'git'.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 218,
|
|
"matched_text": "ToolCall(\"skill-04\", \"skills\", \"View the 'code-review' skill.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 220,
|
|
"matched_text": "ToolCall(\"skill-05\", \"skills\", \"List all skills in the 'devops' category.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 222,
|
|
"matched_text": "ToolCall(\"skill-06\", \"skills\", \"View the 'systematic-debugging' skill.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 224,
|
|
"matched_text": "ToolCall(\"skill-07\", \"skills\", \"Search for skills about 'testing'.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 226,
|
|
"matched_text": "ToolCall(\"skill-08\", \"skills\", \"View the 'writing-plans' skill.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 228,
|
|
"matched_text": "ToolCall(\"skill-09\", \"skills\", \"List skills in 'software-development' category.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 230,
|
|
"matched_text": "ToolCall(\"skill-10\", \"skills\", \"View the 'pr-review-discipline' skill.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 234,
|
|
"matched_text": "ToolCall(\"file-21\", \"file\", \"Write a Python snippet to /tmp/bench_sort.py that sorts [3,1,2].\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 236,
|
|
"matched_text": "ToolCall(\"file-22\", \"file\", \"Read /tmp/bench_sort.py back and confirm it exists.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 238,
|
|
"matched_text": "ToolCall(\"file-23\", \"file\", \"Search for 'class' in all .py files in the benchmarks directory.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 240,
|
|
"matched_text": "ToolCall(\"term-21\", \"terminal\", \"Run `cat /etc/os-release 2>/dev/null || sw_vers 2>/dev/null` for OS info.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 242,
|
|
"matched_text": "ToolCall(\"term-22\", \"terminal\", \"Run `nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null` for CPU count.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 244,
|
|
"matched_text": "ToolCall(\"code-16\", \"code\", \"Execute Python to flatten a nested list [[1,2],[3,4],[5]].\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 246,
|
|
"matched_text": "ToolCall(\"code-17\", \"code\", \"Run Python to check if a number 17 is prime.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 248,
|
|
"matched_text": "ToolCall(\"deleg-11\", \"delegate\", \"Delegate: what is the current working directory?\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 250,
|
|
"matched_text": "ToolCall(\"todo-11\", \"todo\", \"Add a todo: 'Finalize benchmark report' status pending.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 252,
|
|
"matched_text": "ToolCall(\"todo-12\", \"memory\", \"Store fact: 'benchmark categories: file, terminal, code, delegate, todo, memory, skills'.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 254,
|
|
"matched_text": "ToolCall(\"skill-11\", \"skills\", \"Search for skills about 'deployment'.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 256,
|
|
"matched_text": "ToolCall(\"skill-12\", \"skills\", \"View the 'gitea-burn-cycle' skill.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 258,
|
|
"matched_text": "ToolCall(\"skill-13\", \"skills\", \"List all available skill categories.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 260,
|
|
"matched_text": "ToolCall(\"skill-14\", \"skills\", \"Search for skills related to 'memory'.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 262,
|
|
"matched_text": "ToolCall(\"skill-15\", \"skills\", \"View the 'mimo-swarm' skill.\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 311,
|
|
"matched_text": "\"\"\"Create prerequisite files for the benchmark.\"\"\""
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 313,
|
|
"matched_text": "\"This is a benchmark test file.\\n\""
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 349,
|
|
"matched_text": "\"You are a benchmark test runner. Execute the user's request by calling \""
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 406,
|
|
"matched_text": "\"\"\"Generate markdown benchmark report.\"\"\""
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 428,
|
|
"matched_text": "f\"# Tool-Calling Benchmark Report\","
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 535,
|
|
"matched_text": "parser = argparse.ArgumentParser(description=\"Tool-calling benchmark\")"
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 544,
|
|
"matched_text": "help=\"Output report path (default: benchmarks/gemma4-tool-calling-YYYY-MM-DD.md)\")"
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 565,
|
|
"matched_text": "output_path = Path(args.output) if args.output else REPO_ROOT / \"benchmarks\" / f\"gemma4-tool-calling-{date_str}.md\""
|
|
},
|
|
{
|
|
"label": "benchmark_suite",
|
|
"file_path": "benchmarks/tool_call_benchmark.py",
|
|
"line_number": 575,
|
|
"matched_text": "print(f\"Benchmark: {len(suite)} tests \u00d7 {len(model_specs)} models = {len(suite) * len(model_specs)} calls\")"
|
|
}
|
|
],
|
|
"requirements": [
|
|
{
|
|
"key": "gateway_replacement",
|
|
"name": "Gateway replacement scope",
|
|
"status": "partial",
|
|
"evidence_labels": [
|
|
"fallback_chain",
|
|
"runtime_provider",
|
|
"gateway_provider_routing",
|
|
"cron_runtime_provider",
|
|
"auxiliary_fallback_chain",
|
|
"delegate_runtime_provider"
|
|
],
|
|
"summary": "Hermes already spreads provider routing across core agent, runtime provider, gateway, cron, auxiliary, and delegation seams; TensorZero would need parity across all of them before it can replace the gateway layer."
|
|
},
|
|
{
|
|
"key": "config_migration",
|
|
"name": "Config migration",
|
|
"status": "partial",
|
|
"evidence_labels": [
|
|
"provider_routing_config",
|
|
"runtime_provider",
|
|
"smart_model_routing",
|
|
"fallback_chain"
|
|
],
|
|
"summary": "Hermes has multiple config concepts to migrate (`provider_routing`, `fallback_providers`, `smart_model_routing`, runtime provider resolution), so TensorZero is not a drop-in config swap."
|
|
},
|
|
{
|
|
"key": "canary_rollout",
|
|
"name": "10% traffic canary",
|
|
"status": "gap",
|
|
"evidence_labels": [],
|
|
"summary": "The repo shows semantic routing and fallback, but no grounded 10% traffic-split canary mechanism. A TensorZero cutover would need new percentage-based rollout controls and observability hooks."
|
|
},
|
|
{
|
|
"key": "session_feedback",
|
|
"name": "Session data for prompt optimization",
|
|
"status": "partial",
|
|
"evidence_labels": [
|
|
"session_db",
|
|
"trajectory_export"
|
|
],
|
|
"summary": "Hermes already has SessionDB and trajectory export surfaces that can feed offline optimization data, but not a TensorZero-native ingestion path yet."
|
|
},
|
|
{
|
|
"key": "evaluation_suite",
|
|
"name": "Evaluation suite / A/B testing",
|
|
"status": "partial",
|
|
"evidence_labels": [
|
|
"benchmark_suite",
|
|
"trajectory_export"
|
|
],
|
|
"summary": "Hermes already has benchmark/trajectory machinery that can seed TensorZero A/B evaluation, but no integrated TensorZero experiment runner or live evaluation gateway."
|
|
}
|
|
]
|
|
} |