# hermes-agent/evals/terminal-bench-2/evaluate_config.yaml

env:
  # --- Workers, batching and evaluation cadence ---
  group_size: 1
  max_num_workers: -1
  max_eval_workers: 16
  max_num_workers_per_node: 8
  steps_per_eval: 1
  max_token_length: 32000
  eval_handling: 'STOP_TRAIN'
  eval_limit_ratio: 0.5
  inference_weight: 1.0
  batch_size: -1
  max_batches_offpolicy: 3
  tokenizer_name: 'NousResearch/Hermes-3-Llama-3.1-8B'

  # --- Rollout server, logging and output locations ---
  # wandb is switched off; eval output goes to data_dir_to_save_evals.
  use_wandb: false
  rollout_server_url: 'http://localhost:8000'
  total_steps: 1
  wandb_name: 'terminal-bench-2'
  num_rollouts_to_keep: 32
  num_rollouts_per_group_for_logging: 1
  ensure_scores_are_not_same: false
  data_path_to_save_groups: null
  data_dir_to_save_evals: 'evals/terminal-bench-2'
  min_items_sent_before_logging: 2
  include_messages: false
  min_batch_allocation: null
  worker_timeout: 600.0

  # --- Thinking / reasoning options (disabled or unset in this config) ---
  thinking_mode: false
  reasoning_effort: null
  max_reasoning_tokens: null
  custom_thinking_prompt: null

  # --- Agent toolsets and turn budget ---
  enabled_toolsets:
  - 'terminal'
  - 'file'
  disabled_toolsets: null
  distribution: null
  max_agent_turns: 60

  # Folded block scalar: the line breaks below become single spaces and the
  # trailing newline is stripped, so the value is one continuous paragraph.
  system_prompt: >-
    You are a skilled software engineer and system administrator with access to
    a terminal and file tools. You are working inside a Linux container
    environment. Complete the user's task by using the available tools. Be
    methodical: explore the environment first, plan your approach, then execute
    step by step. Verify your work before finishing.
  agent_temperature: 1.0
  terminal_backend: 'modal'

  # --- Dataset and task selection ---
  dataset_name: 'NousResearch/terminal-bench-2'
  dataset_split: 'train'
  prompt_field: 'prompt'
  tool_call_parser: 'hermes'
  test_timeout: 180
  force_build: false
  # NOTE(review): task_filter is set to a single name, which presumably limits
  # the run to that one task — confirm this is intended for a full evaluation.
  task_filter: 'fix-git'
  skip_tasks: null
openai:
# One inference endpoint entry: base_url points at openrouter.ai and
# server_type is "openai".
- timeout: 1200
  num_max_requests_at_once: 512
  num_requests_for_eval: 64
  model_name: anthropic/claude-sonnet-4
  rolling_buffer_length: 1000
  server_type: openai
  # SECURITY: a plain-text OpenRouter key ("sk-or-v1-...") was previously stored
  # on this line. It has been removed; treat that key as compromised and revoke
  # it, since it remains in version-control history.
  # NOTE(review): provide the key out-of-band (environment variable or secret
  # store) — confirm how the config loader resolves a null api_key.
  api_key: null
  base_url: https://openrouter.ai/api/v1
  n_kwarg_is_ignored: false
  health_check: false
# Top-level run flags, siblings of `env` and `openai`; both are off here.
# NOTE(review): presumably these select a Slurm launch and a test mode —
# confirm against the config loader.
slurm: false
testing: false