Add new environments and enhance tool context functionality

- Introduced new environments: Terminal Test Environment and SWE Environment, each with default configurations for testing and software engineering tasks.
- Added the TerminalBench 2.0 evaluation environment with a comprehensive setup for agentic LLMs, including task execution and verification.
- Enhanced ToolContext with methods for uploading and downloading files, ensuring binary-safe operations.
- Updated documentation across environments to reflect the new features and usage instructions.
- Refactored existing environment configurations for consistency and clarity.
2026-02-10 19:39:05 +00:00
parent e8343f2d87
commit 35ad3146a8
18 changed files with 1428 additions and 19 deletions
--- a/evals/terminal-bench-2/evaluate_config.yaml
+++ b/evals/terminal-bench-2/evaluate_config.yaml
@@ -0,0 +1,64 @@
+env:  # settings for the terminal-bench-2 evaluation run
+  group_size: 1
+  max_num_workers: -1  # presumably -1 means "no explicit cap" — TODO confirm
+  max_eval_workers: 16
+  max_num_workers_per_node: 8
+  steps_per_eval: 1
+  max_token_length: 32000
+  eval_handling: STOP_TRAIN
+  eval_limit_ratio: 0.5
+  inference_weight: 1.0
+  batch_size: -1  # presumably -1 means "unset/auto" — TODO confirm
+  max_batches_offpolicy: 3
+  tokenizer_name: NousResearch/Hermes-3-Llama-3.1-8B
+  use_wandb: false  # wandb_name below presumably has no effect while this is false — verify
+  rollout_server_url: http://localhost:8000
+  total_steps: 1
+  wandb_name: terminal-bench-2
+  num_rollouts_to_keep: 32
+  num_rollouts_per_group_for_logging: 1
+  ensure_scores_are_not_same: false
+  data_path_to_save_groups: null
+  data_dir_to_save_evals: evals/terminal-bench-2  # same directory this config file lives in
+  min_items_sent_before_logging: 2
+  include_messages: false
+  min_batch_allocation: null
+  worker_timeout: 600.0  # presumably seconds — TODO confirm
+  thinking_mode: false
+  reasoning_effort: null
+  max_reasoning_tokens: null
+  custom_thinking_prompt: null
+  enabled_toolsets:  # the same two tool families the system_prompt mentions
+  - terminal
+  - file
+  disabled_toolsets: null
+  distribution: null
+  max_agent_turns: 60
+  system_prompt: 'You are a skilled software engineer and system administrator with
+    access to a terminal and file tools. You are working inside a Linux container
+    environment. Complete the user''s task by using the available tools. Be methodical:
+    explore the environment first, plan your approach, then execute step by step.
+    Verify your work before finishing.'
+  agent_temperature: 1.0
+  terminal_backend: modal
+  dataset_name: NousResearch/terminal-bench-2
+  dataset_split: train  # NOTE(review): an eval config reading the "train" split — confirm this is the intended split
+  prompt_field: prompt
+  tool_call_parser: hermes
+  test_timeout: 180  # presumably seconds — TODO confirm
+  force_build: false
+  task_filter: fix-git  # NOTE(review): looks like this narrows the run to one task — confirm it is intended in a committed config
+  skip_tasks: null
+openai:  # list of inference-server entries; this file defines exactly one
+- timeout: 1200  # presumably seconds — TODO confirm
+  num_max_requests_at_once: 512
+  num_requests_for_eval: 64
+  model_name: anthropic/claude-sonnet-4
+  rolling_buffer_length: 1000
+  server_type: openai
+  api_key: ''  # NOTE(review): credential removed from VCS; inject at runtime and revoke the leaked key
+  base_url: https://openrouter.ai/api/v1
+  n_kwarg_is_ignored: false
+  health_check: false
+slurm: false  # presumably toggles launching via a SLURM cluster — TODO confirm against the loader
+testing: false  # presumably toggles a test/dry-run mode — TODO confirm against the loader