Introduced a new evaluation environment for OpenThoughts-TBLite, including the main evaluation script, configuration YAML, and README documentation. This environment provides a faster alternative to Terminal-Bench 2.0, featuring 100 difficulty-calibrated tasks for terminal agents. The setup allows for easy evaluation and configuration, enhancing the benchmarking capabilities for terminal agents.
120 lines
3.6 KiB
Python
120 lines
3.6 KiB
Python
"""
|
|
OpenThoughts-TBLite Evaluation Environment

A lighter, faster alternative to Terminal-Bench 2.0 for iterating on terminal
agents. Uses the same evaluation logic as TerminalBench2EvalEnv but defaults
to the NousResearch/openthoughts-tblite dataset (100 difficulty-calibrated
tasks vs TB2's 89 harder tasks).

TBLite tasks are a curated subset of TB2 with a difficulty distribution
designed to give meaningful signal even for smaller models:

- Easy (40 tasks): >= 70% pass rate with Claude Haiku 4.5
- Medium (26 tasks): 40-69% pass rate
- Hard (26 tasks): 10-39% pass rate
- Extreme (8 tasks): < 10% pass rate

Usage:

    python environments/benchmarks/tblite/tblite_env.py evaluate

    # Filter to specific tasks:
    python environments/benchmarks/tblite/tblite_env.py evaluate \\
        --env.task_filter "broken-python,pandas-etl"
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import List, Tuple
|
|
|
|
# Resolve the directory four levels above this file and make sure it is on
# sys.path, so the absolute ``environments.…`` import further down resolves
# when this module is executed directly as a script.
_repo_root = Path(__file__).resolve().parent.parent.parent.parent
if not any(entry == str(_repo_root) for entry in sys.path):
    # Prepend (rather than append) so this location takes precedence.
    sys.path.insert(0, str(_repo_root))
|
|
|
|
from pydantic import Field
|
|
|
|
from atroposlib.envs.base import EvalHandlingEnum
|
|
from atroposlib.envs.server_handling.server_manager import APIServerConfig
|
|
|
|
from environments.benchmarks.terminalbench_2.terminalbench2_env import (
|
|
TerminalBench2EvalConfig,
|
|
TerminalBench2EvalEnv,
|
|
)
|
|
|
|
|
|
class TBLiteEvalConfig(TerminalBench2EvalConfig):
    """Settings for the OpenThoughts-TBLite evaluation environment.

    All fields come from ``TerminalBench2EvalConfig``. This subclass only
    re-declares two of them with different defaults: the dataset to load and
    the per-task timeout (TBLite tasks are calibrated to be faster).
    """

    # Default dataset for this environment.
    dataset_name: str = Field(
        default="NousResearch/openthoughts-tblite",
        description="HuggingFace dataset containing TBLite tasks.",
    )

    # 1200 seconds == 20 minutes, as stated in the field description.
    task_timeout: int = Field(
        default=1200,
        description=(
            "Maximum wall-clock seconds per task. TBLite tasks are generally "
            "faster than TB2, so 20 minutes is usually sufficient."
        ),
    )
|
|
|
|
|
|
class TBLiteEvalEnv(TerminalBench2EvalEnv):
    """Evaluation environment for the OpenThoughts-TBLite benchmark.

    This subclass defines no evaluation behavior of its own. The agent loop,
    test verification, Docker image resolution, metrics, and wandb logging
    are all inherited from ``TerminalBench2EvalEnv``; only the default
    configuration is different.
    """

    name = "openthoughts-tblite"
    env_config_cls = TBLiteEvalConfig

    @classmethod
    def config_init(cls) -> Tuple[TBLiteEvalConfig, List[APIServerConfig]]:
        """Build the default environment config and API server list.

        Returns:
            A ``(env_config, server_configs)`` tuple: a ``TBLiteEvalConfig``
            populated with this environment's defaults, and a one-element
            list holding the ``APIServerConfig`` for the OpenRouter endpoint.
        """
        tblite_defaults = TBLiteEvalConfig(
            # --- toolsets ---
            enabled_toolsets=["terminal", "file"],
            disabled_toolsets=None,
            distribution=None,
            # --- agent ---
            max_agent_turns=60,
            max_token_length=16000,
            agent_temperature=0.6,
            system_prompt=None,
            # --- terminal backend ---
            terminal_backend="modal",
            terminal_timeout=300,
            # --- test verification ---
            test_timeout=180,
            # --- concurrency: sized to run all 100 tasks in parallel ---
            tool_pool_size=128,
            # --- run schedule ---
            eval_handling=EvalHandlingEnum.STOP_TRAIN,
            group_size=1,
            steps_per_eval=1,
            total_steps=1,
            # --- tokenizer and logging ---
            tokenizer_name="NousResearch/Hermes-3-Llama-3.1-8B",
            use_wandb=True,
            wandb_name="openthoughts-tblite",
            ensure_scores_are_not_same=False,
        )

        openrouter_server = APIServerConfig(
            base_url="https://openrouter.ai/api/v1",
            model_name="anthropic/claude-sonnet-4",
            server_type="openai",
            # Read from the environment; an empty string is used when the
            # variable is not set.
            api_key=os.getenv("OPENROUTER_API_KEY", ""),
            health_check=False,
        )

        return tblite_defaults, [openrouter_server]
|
|
|
|
|
|
# Script entry point: only runs when this file is executed directly, not
# when it is imported as a module.
if __name__ == "__main__":
    TBLiteEvalEnv.cli()
|