Introduced a new evaluation environment for OpenThoughts-TBLite, including the main evaluation script, configuration YAML, and README documentation. This environment provides a faster alternative to Terminal-Bench 2.0, featuring 100 difficulty-calibrated tasks for terminal agents. The setup allows for easy evaluation and configuration, enhancing the benchmarking capabilities for terminal agents.
40 lines
1.5 KiB
YAML
40 lines
1.5 KiB
YAML
# OpenThoughts-TBLite Evaluation -- Default Configuration
#
# Eval-only environment for the TBLite benchmark (100 difficulty-calibrated
# terminal tasks, a faster proxy for Terminal-Bench 2.0).
# Uses the Modal terminal backend for per-task cloud-isolated sandboxes
# and OpenRouter for inference.
#
# Usage:
#   python environments/benchmarks/tblite/tblite_env.py evaluate \
#     --config environments/benchmarks/tblite/default.yaml
#
#   # Override model:
#   python environments/benchmarks/tblite/tblite_env.py evaluate \
#     --config environments/benchmarks/tblite/default.yaml \
#     --openai.model_name anthropic/claude-sonnet-4

# Environment settings for the TBLite evaluation run.
# Timeouts carry the units given in their inline comments.
env:
  enabled_toolsets: ["terminal", "file"]
  max_agent_turns: 60
  max_token_length: 32000
  agent_temperature: 0.8
  terminal_backend: "modal"  # per-task cloud-isolated sandboxes
  terminal_timeout: 300  # 5 min per command (builds, pip install)
  tool_pool_size: 128  # thread pool for 100 parallel tasks
  dataset_name: "NousResearch/openthoughts-tblite"
  test_timeout: 600  # NOTE(review): presumably seconds, like the other timeouts -- confirm
  task_timeout: 1200  # 20 min wall-clock per task (TBLite tasks are faster)
  tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
  use_wandb: true
  wandb_name: "openthoughts-tblite"
  ensure_scores_are_not_same: false
  data_dir_to_save_evals: "environments/benchmarks/evals/openthoughts-tblite"

# Inference settings: requests are sent to OpenRouter.
# model_name is the default; it can be overridden on the command line with
# --openai.model_name <provider/model>.
openai:
  base_url: "https://openrouter.ai/api/v1"
  model_name: "anthropic/claude-opus-4.6"
  server_type: "openai"
  health_check: false
  # api_key loaded from OPENROUTER_API_KEY in .env