diff --git a/environments/benchmarks/tblite/local_vllm.yaml b/environments/benchmarks/tblite/local_vllm.yaml
new file mode 100644
index 000000000..b6574a6bd
--- /dev/null
+++ b/environments/benchmarks/tblite/local_vllm.yaml
@@ -0,0 +1,39 @@
+# OpenThoughts-TBLite Evaluation -- Local vLLM Backend
+#
+# Runs against a local vLLM server with Docker sandboxes.
+#
+# Start the vLLM server from the atropos directory:
+#   python -m example_trainer.vllm_api_server \
+#       --model Qwen/Qwen3-4B-Thinking-2507 \
+#       --port 9001 \
+#       --gpu-memory-utilization 0.8 \
+#       --max-model-len=32000
+#
+# Then run:
+#   python environments/benchmarks/tblite/tblite_env.py evaluate \
+#       --config environments/benchmarks/tblite/local_vllm.yaml
+
+env:
+  enabled_toolsets: ["terminal", "file"]
+  max_agent_turns: 60
+  max_token_length: 16000
+  agent_temperature: 0.6
+  terminal_backend: "docker"
+  terminal_timeout: 300
+  tool_pool_size: 16
+  dataset_name: "NousResearch/openthoughts-tblite"
+  test_timeout: 600
+  task_timeout: 1200
+  eval_concurrency: 8
+  tool_call_parser: "hermes"
+  tokenizer_name: "Qwen/Qwen3-4B-Thinking-2507"
+  use_wandb: false
+  wandb_name: "tblite-qwen3-4b-thinking"
+  ensure_scores_are_not_same: false
+  data_dir_to_save_evals: "environments/benchmarks/evals/tblite-qwen3-4b-local"
+
+openai:
+  base_url: "http://localhost:9001"
+  model_name: "Qwen/Qwen3-4B-Thinking-2507"
+  server_type: "vllm"
+  health_check: false
diff --git a/environments/benchmarks/terminalbench_2/terminalbench2_env.py b/environments/benchmarks/terminalbench_2/terminalbench2_env.py
index 59ca17e3d..1b52c15f8 100644
--- a/environments/benchmarks/terminalbench_2/terminalbench2_env.py
+++ b/environments/benchmarks/terminalbench_2/terminalbench2_env.py
@@ -468,17 +468,37 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
             messages.append({"role": "user", "content": self.format_prompt(eval_item)})
 
             # --- 4. Run agent loop ---
-            agent = HermesAgentLoop(
-                server=self.server,
-                tool_schemas=tools,
-                valid_tool_names=valid_names,
-                max_turns=self.config.max_agent_turns,
-                task_id=task_id,
-                temperature=self.config.agent_temperature,
-                max_tokens=self.config.max_token_length,
-                extra_body=self.config.extra_body,
-            )
-            result = await agent.run(messages)
+            # Use ManagedServer (Phase 2) for vLLM/SGLang backends to get
+            # token-level tracking via /generate. Falls back to direct
+            # ServerManager (Phase 1) for OpenAI endpoints.
+            if self._use_managed_server():
+                async with self.server.managed_server(
+                    tokenizer=self.tokenizer,
+                    preserve_think_blocks=bool(self.config.thinking_mode),
+                ) as managed:
+                    agent = HermesAgentLoop(
+                        server=managed,
+                        tool_schemas=tools,
+                        valid_tool_names=valid_names,
+                        max_turns=self.config.max_agent_turns,
+                        task_id=task_id,
+                        temperature=self.config.agent_temperature,
+                        max_tokens=self.config.max_token_length,
+                        extra_body=self.config.extra_body,
+                    )
+                    result = await agent.run(messages)
+            else:
+                agent = HermesAgentLoop(
+                    server=self.server,
+                    tool_schemas=tools,
+                    valid_tool_names=valid_names,
+                    max_turns=self.config.max_agent_turns,
+                    task_id=task_id,
+                    temperature=self.config.agent_temperature,
+                    max_tokens=self.config.max_token_length,
+                    extra_body=self.config.extra_body,
+                )
+                result = await agent.run(messages)
 
             # --- 5. Verify -- run test suite in the agent's sandbox ---
             # Skip verification if the agent produced no meaningful output