From 13f545967010d0ddc19046eb6ef6caca095f991d Mon Sep 17 00:00:00 2001
From: dmahan93
Date: Mon, 9 Mar 2026 21:32:23 -0500
Subject: [PATCH] fix: use ManagedServer for vLLM in TBLite eval + local_vllm
 config
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The TBLite eval was bypassing ManagedServer and calling ServerManager
directly, which talks to /v1/chat/completions, an endpoint the atropos
vllm_api_server does not expose (it serves /generate only).

The eval now calls _use_managed_server() to detect vLLM/SGLang backends
and routes through ManagedServer (Phase 2) with the proper tool_parser
and the /generate endpoint. It falls back to Phase 1 (direct
ServerManager) for OpenAI endpoints.

Also adds a local_vllm.yaml config for running against a local vLLM
server with Docker sandboxes.
---
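Reviewer note (ignored by git am, not part of the commit message):
_use_managed_server() is referenced but not defined in this patch. A
minimal sketch of the detection it is described as performing, keyed
off the server_type field that local_vllm.yaml sets to "vllm", could
look like the following; the attribute names (self.config.server_type)
are assumptions for illustration, not the actual implementation:

    def _use_managed_server(self) -> bool:
        # Sketch under assumed names: route through ManagedServer when
        # the configured backend speaks /generate (vLLM, SGLang) rather
        # than /v1/chat/completions (OpenAI-style servers). In the
        # config above, server_type lives in the openai block.
        server_type = getattr(self.config, "server_type", "openai")
        return server_type in ("vllm", "sglang")
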
 .../benchmarks/tblite/local_vllm.yaml         | 39 +++++++++++++++++
 .../terminalbench_2/terminalbench2_env.py     | 42 ++++++++++++++-----
 2 files changed, 70 insertions(+), 11 deletions(-)
 create mode 100644 environments/benchmarks/tblite/local_vllm.yaml

diff --git a/environments/benchmarks/tblite/local_vllm.yaml b/environments/benchmarks/tblite/local_vllm.yaml
new file mode 100644
index 00000000..b6574a6b
--- /dev/null
+++ b/environments/benchmarks/tblite/local_vllm.yaml
@@ -0,0 +1,39 @@
+# OpenThoughts-TBLite Evaluation -- Local vLLM Backend
+#
+# Runs against a local vLLM server with Docker sandboxes.
+#
+# Start the vLLM server from the atropos directory:
+#   python -m example_trainer.vllm_api_server \
+#     --model Qwen/Qwen3-4B-Thinking-2507 \
+#     --port 9001 \
+#     --gpu-memory-utilization 0.8 \
+#     --max-model-len=32000
+#
+# Then run:
+#   python environments/benchmarks/tblite/tblite_env.py evaluate \
+#     --config environments/benchmarks/tblite/local_vllm.yaml
+
+env:
+  enabled_toolsets: ["terminal", "file"]
+  max_agent_turns: 60
+  max_token_length: 16000
+  agent_temperature: 0.6
+  terminal_backend: "docker"
+  terminal_timeout: 300
+  tool_pool_size: 16
+  dataset_name: "NousResearch/openthoughts-tblite"
+  test_timeout: 600
+  task_timeout: 1200
+  eval_concurrency: 8
+  tool_call_parser: "hermes"
+  tokenizer_name: "Qwen/Qwen3-4B-Thinking-2507"
+  use_wandb: false
+  wandb_name: "tblite-qwen3-4b-thinking"
+  ensure_scores_are_not_same: false
+  data_dir_to_save_evals: "environments/benchmarks/evals/tblite-qwen3-4b-local"
+
+openai:
+  base_url: "http://localhost:9001"
+  model_name: "Qwen/Qwen3-4B-Thinking-2507"
+  server_type: "vllm"
+  health_check: false

diff --git a/environments/benchmarks/terminalbench_2/terminalbench2_env.py b/environments/benchmarks/terminalbench_2/terminalbench2_env.py
index 59ca17e3..1b52c15f 100644
--- a/environments/benchmarks/terminalbench_2/terminalbench2_env.py
+++ b/environments/benchmarks/terminalbench_2/terminalbench2_env.py
@@ -468,17 +468,37 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
         messages.append({"role": "user", "content": self.format_prompt(eval_item)})

         # --- 4. Run agent loop ---
-        agent = HermesAgentLoop(
-            server=self.server,
-            tool_schemas=tools,
-            valid_tool_names=valid_names,
-            max_turns=self.config.max_agent_turns,
-            task_id=task_id,
-            temperature=self.config.agent_temperature,
-            max_tokens=self.config.max_token_length,
-            extra_body=self.config.extra_body,
-        )
-        result = await agent.run(messages)
+        # Use ManagedServer (Phase 2) for vLLM/SGLang backends to get
+        # token-level tracking via /generate. Falls back to direct
+        # ServerManager (Phase 1) for OpenAI endpoints.
+        if self._use_managed_server():
+            async with self.server.managed_server(
+                tokenizer=self.tokenizer,
+                preserve_think_blocks=bool(self.config.thinking_mode),
+            ) as managed:
+                agent = HermesAgentLoop(
+                    server=managed,
+                    tool_schemas=tools,
+                    valid_tool_names=valid_names,
+                    max_turns=self.config.max_agent_turns,
+                    task_id=task_id,
+                    temperature=self.config.agent_temperature,
+                    max_tokens=self.config.max_token_length,
+                    extra_body=self.config.extra_body,
+                )
+                result = await agent.run(messages)
+        else:
+            agent = HermesAgentLoop(
+                server=self.server,
+                tool_schemas=tools,
+                valid_tool_names=valid_names,
+                max_turns=self.config.max_agent_turns,
+                task_id=task_id,
+                temperature=self.config.agent_temperature,
+                max_tokens=self.config.max_token_length,
+                extra_body=self.config.extra_body,
+            )
+            result = await agent.run(messages)

         # --- 5. Verify -- run test suite in the agent's sandbox ---
         # Skip verification if the agent produced no meaningful output
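
A quick way to confirm the endpoint mismatch this patch works around,
once the server from the local_vllm.yaml header is running: probe the
chat-completions route. A 404 suggests the route is absent (the commit
message states the atropos vllm_api_server serves /generate only),
while a 405 would indicate a POST-only chat-completions route does
exist. The exact status codes depend on the server framework, so treat
this as a sketch rather than a guaranteed behavior:

    import urllib.error
    import urllib.request

    # GET the chat-completions route: 404 = route absent (only
    # /generate is served); 405 = route exists but is POST-only.
    try:
        urllib.request.urlopen(
            "http://localhost:9001/v1/chat/completions", timeout=5
        )
        print("route exists and answered a GET (unexpected)")
    except urllib.error.HTTPError as e:
        print("chat-completions probe returned HTTP", e.code)
    except urllib.error.URLError as e:
        print("server not reachable:", e.reason)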