Files
hermes-agent/evals/terminal-bench-2/evaluate_config.yaml
teknium 35ad3146a8 Add new environments and enhance tool context functionality
- Introduced new environments: Terminal Test Environment and SWE Environment, each with default configurations for testing and software engineering tasks.
- Added TerminalBench 2.0 evaluation environment with comprehensive setup for agentic LLMs, including task execution and verification.
- Enhanced ToolContext with methods for uploading and downloading files, ensuring binary-safe operations.
- Updated documentation across environments to reflect new features and usage instructions.
- Refactored existing environment configurations for consistency and clarity.
2026-02-10 19:39:05 +00:00

65 lines
1.9 KiB
YAML

# Environment settings for the TerminalBench 2.0 evaluation run.
# All keys below belong to the `env` mapping; nesting is expressed with
# 2-space indentation so they are not parsed as top-level siblings.
env:
  group_size: 1
  # NOTE(review): -1 here (and for batch_size below) is presumably an
  # "unset / no limit" sentinel -- confirm against the config schema.
  max_num_workers: -1
  max_eval_workers: 16
  max_num_workers_per_node: 8
  steps_per_eval: 1
  max_token_length: 32000
  eval_handling: STOP_TRAIN
  eval_limit_ratio: 0.5
  inference_weight: 1.0
  batch_size: -1
  max_batches_offpolicy: 3
  tokenizer_name: NousResearch/Hermes-3-Llama-3.1-8B
  # NOTE(review): wandb_name is set although use_wandb is false; presumably
  # the name is ignored while logging is disabled -- confirm.
  use_wandb: false
  rollout_server_url: http://localhost:8000
  total_steps: 1
  wandb_name: terminal-bench-2
  num_rollouts_to_keep: 32
  num_rollouts_per_group_for_logging: 1
  ensure_scores_are_not_same: false
  data_path_to_save_groups: null
  data_dir_to_save_evals: evals/terminal-bench-2
  min_items_sent_before_logging: 2
  include_messages: false
  min_batch_allocation: null
  worker_timeout: 600.0
  thinking_mode: false
  reasoning_effort: null
  max_reasoning_tokens: null
  custom_thinking_prompt: null
  enabled_toolsets:
    - terminal
    - file
  disabled_toolsets: null
  distribution: null
  max_agent_turns: 60
  # Folded block scalar with strip chomping: line breaks fold to single
  # spaces and no trailing newline is kept, so the resulting string is the
  # same single-line prompt as the previous single-quoted form.
  system_prompt: >-
    You are a skilled software engineer and system administrator with
    access to a terminal and file tools. You are working inside a Linux
    container environment. Complete the user's task by using the available
    tools. Be methodical: explore the environment first, plan your approach,
    then execute step by step. Verify your work before finishing.
  agent_temperature: 1.0
  terminal_backend: modal
  dataset_name: NousResearch/terminal-bench-2
  dataset_split: train
  prompt_field: prompt
  tool_call_parser: hermes
  test_timeout: 180
  force_build: false
  # NOTE(review): presumably restricts the run to the single task `fix-git`;
  # confirm this is intended for a committed config and not left over from a
  # debugging run.
  task_filter: fix-git
  skip_tasks: null
# Inference server configuration: a list containing one endpoint entry.
# Every key below belongs to that single list item.
openai:
  - timeout: 1200
    num_max_requests_at_once: 512
    num_requests_for_eval: 64
    model_name: anthropic/claude-sonnet-4
    rolling_buffer_length: 1000
    server_type: openai
    # SECURITY: a plain-text OpenRouter API key was committed in this field.
    # The value has been blanked. Revoke/rotate the exposed key and supply
    # the credential out of band (environment variable, untracked override
    # file, or CLI flag) instead of storing it in version control.
    # NOTE(review): confirm how the config loader obtains the key when this
    # field is empty.
    api_key: ''
    base_url: https://openrouter.ai/api/v1
    n_kwarg_is_ignored: false
    health_check: false
# Trailing flags that follow the `openai` section; both are disabled here.
# NOTE(review): presumably launcher-level switches (cluster scheduling and a
# test mode) rather than per-environment settings -- confirm against the
# config loader.
slurm: false
testing: false