# environments/benchmarks/tblite/tblite_env.py

"""
OpenThoughts-TBLite Evaluation Environment

A lighter, faster alternative to Terminal-Bench 2.0 for iterating on terminal
agents. Uses the same evaluation logic as TerminalBench2EvalEnv but defaults
to the NousResearch/openthoughts-tblite dataset (100 difficulty-calibrated
tasks vs TB2's 89 harder tasks).

TBLite tasks are a curated subset of TB2 with a difficulty distribution
designed to give meaningful signal even for smaller models:
  - Easy (40 tasks):   >= 70% pass rate with Claude Haiku 4.5
  - Medium (26 tasks): 40-69% pass rate
  - Hard (26 tasks):   10-39% pass rate
  - Extreme (8 tasks): < 10% pass rate

Usage:
    python environments/benchmarks/tblite/tblite_env.py evaluate

    # Filter to specific tasks:
    python environments/benchmarks/tblite/tblite_env.py evaluate \\
        --env.task_filter "broken-python,pandas-etl"
"""

import os
import sys
from pathlib import Path
from typing import List, Tuple

# Make the repository root importable when this file is executed directly as a
# script. With the layout shown in the module docstring
# (environments/benchmarks/tblite/tblite_env.py), four `.parent` hops from this
# file land on the repository root, which the `environments.*` import further
# down needs on sys.path. This must run before that import.
_repo_root = Path(__file__).resolve().parent.parent.parent.parent
if str(_repo_root) not in sys.path:
    # Prepend (index 0) so the repo root is searched ahead of existing entries.
    sys.path.insert(0, str(_repo_root))

from pydantic import Field

from atroposlib.envs.base import EvalHandlingEnum
from atroposlib.envs.server_handling.server_manager import APIServerConfig

from environments.benchmarks.terminalbench_2.terminalbench2_env import (
    TerminalBench2EvalConfig,
    TerminalBench2EvalEnv,
)


class TBLiteEvalConfig(TerminalBench2EvalConfig):
    """Settings for OpenThoughts-TBLite evaluation runs.

    All Terminal-Bench 2.0 config fields are inherited unchanged. This
    subclass only supplies TBLite-specific defaults for ``dataset_name`` and
    ``task_timeout``.
    """

    # Dataset the TBLite tasks are loaded from.
    dataset_name: str = Field(
        "NousResearch/openthoughts-tblite",
        description="HuggingFace dataset containing TBLite tasks.",
    )

    # Per-task wall-clock budget in seconds (1200 s = 20 minutes).
    task_timeout: int = Field(
        1200,
        description=(
            "Maximum wall-clock seconds per task. "
            "TBLite tasks are generally faster than TB2, "
            "so 20 minutes is usually sufficient."
        ),
    )


class TBLiteEvalEnv(TerminalBench2EvalEnv):
    """Evaluation environment for the OpenThoughts-TBLite task set.

    No evaluation logic is defined here: the agent loop, test verification,
    Docker image resolution, metrics and wandb logging all come from
    ``TerminalBench2EvalEnv``. This subclass only changes the environment
    name, the config class, and the default configuration values.
    """

    name = "openthoughts-tblite"
    env_config_cls = TBLiteEvalConfig

    @classmethod
    def config_init(cls) -> Tuple[TBLiteEvalConfig, List[APIServerConfig]]:
        """Build the default environment config and API server list.

        Returns:
            A ``(env_config, server_configs)`` pair. ``server_configs`` holds
            a single OpenRouter endpoint for ``anthropic/claude-sonnet-4``
            whose API key is read from ``OPENROUTER_API_KEY`` (empty string
            when the variable is unset).
        """
        env_config = TBLiteEvalConfig(
            # --- Toolsets ---
            enabled_toolsets=["terminal", "file"],
            disabled_toolsets=None,
            distribution=None,
            # --- Agent loop ---
            max_agent_turns=60,
            max_token_length=16000,
            agent_temperature=0.6,
            system_prompt=None,
            # --- Terminal backend ---
            terminal_backend="modal",
            terminal_timeout=300,
            # --- Test verification ---
            test_timeout=180,
            # --- Tool pool: sized for running all 100 tasks in parallel ---
            tool_pool_size=128,
            # --- Run / logging settings ---
            eval_handling=EvalHandlingEnum.STOP_TRAIN,
            group_size=1,
            steps_per_eval=1,
            total_steps=1,
            tokenizer_name="NousResearch/Hermes-3-Llama-3.1-8B",
            use_wandb=True,
            wandb_name="openthoughts-tblite",
            ensure_scores_are_not_same=False,
        )

        # Falls back to an empty key when the environment variable is not set.
        openrouter_key = os.getenv("OPENROUTER_API_KEY", "")
        openrouter_server = APIServerConfig(
            base_url="https://openrouter.ai/api/v1",
            model_name="anthropic/claude-sonnet-4",
            server_type="openai",
            api_key=openrouter_key,
            health_check=False,
        )

        return env_config, [openrouter_server]


# Script entry point: when run directly, hand control to the environment's
# CLI. `cli` is not defined in this file (presumably inherited from the base
# environment class); the module docstring shows the `evaluate` subcommand.
if __name__ == "__main__":
    TBLiteEvalEnv.cli()
feat: add OpenThoughts-TBLite evaluation environment and configuration files Introduced a new evaluation environment for OpenThoughts-TBLite, including the main evaluation script, configuration YAML, and README documentation. This environment provides a faster alternative to Terminal-Bench 2.0, featuring 100 difficulty-calibrated tasks for terminal agents. The setup allows for easy evaluation and configuration, enhancing the benchmarking capabilities for terminal agents. 2026-03-04 11:38:32 +00:00			`"""`
			`OpenThoughts-TBLite Evaluation Environment`

			`A lighter, faster alternative to Terminal-Bench 2.0 for iterating on terminal`
			`agents. Uses the same evaluation logic as TerminalBench2EvalEnv but defaults`
			`to the NousResearch/openthoughts-tblite dataset (100 difficulty-calibrated`
			`tasks vs TB2's 89 harder tasks).`

			`TBLite tasks are a curated subset of TB2 with a difficulty distribution`
			`designed to give meaningful signal even for smaller models:`
			`- Easy (40 tasks): >= 70% pass rate with Claude Haiku 4.5`
			`- Medium (26 tasks): 40-69% pass rate`
			`- Hard (26 tasks): 10-39% pass rate`
			`- Extreme (8 tasks): < 10% pass rate`

			`Usage:`
			`python environments/benchmarks/tblite/tblite_env.py evaluate`

			`# Filter to specific tasks:`
			`python environments/benchmarks/tblite/tblite_env.py evaluate \\`
			`--env.task_filter "broken-python,pandas-etl"`
			`"""`

			`import os`
			`import sys`
			`from pathlib import Path`
			`from typing import List, Tuple`

			`_repo_root = Path(__file__).resolve().parent.parent.parent.parent`
			`if str(_repo_root) not in sys.path:`
			`sys.path.insert(0, str(_repo_root))`

			`from pydantic import Field`

			`from atroposlib.envs.base import EvalHandlingEnum`
			`from atroposlib.envs.server_handling.server_manager import APIServerConfig`

			`from environments.benchmarks.terminalbench_2.terminalbench2_env import (`
			`TerminalBench2EvalConfig,`
			`TerminalBench2EvalEnv,`
			`)`


			`class TBLiteEvalConfig(TerminalBench2EvalConfig):`
			`"""Configuration for the OpenThoughts-TBLite evaluation environment.`

			`Inherits all TB2 config fields. Only the dataset default and task timeout`
			`differ -- TBLite tasks are calibrated to be faster.`
			`"""`

			`dataset_name: str = Field(`
			`default="NousResearch/openthoughts-tblite",`
			`description="HuggingFace dataset containing TBLite tasks.",`
			`)`

			`task_timeout: int = Field(`
			`default=1200,`
			`description="Maximum wall-clock seconds per task. TBLite tasks are "`
			`"generally faster than TB2, so 20 minutes is usually sufficient.",`
			`)`


			`class TBLiteEvalEnv(TerminalBench2EvalEnv):`
			`"""OpenThoughts-TBLite evaluation environment.`

			`Inherits all evaluation logic from TerminalBench2EvalEnv (agent loop,`
			`test verification, Docker image resolution, metrics, wandb logging).`
			`Only the default configuration differs.`
			`"""`

			`name = "openthoughts-tblite"`
			`env_config_cls = TBLiteEvalConfig`

			`@classmethod`
			`def config_init(cls) -> Tuple[TBLiteEvalConfig, List[APIServerConfig]]:`
			`env_config = TBLiteEvalConfig(`
			`enabled_toolsets=["terminal", "file"],`
			`disabled_toolsets=None,`
			`distribution=None,`

			`max_agent_turns=60,`
			`max_token_length=16000,`
			`agent_temperature=0.6,`
			`system_prompt=None,`

			`terminal_backend="modal",`
			`terminal_timeout=300,`

			`test_timeout=180,`

			`# 100 tasks in parallel`
			`tool_pool_size=128,`

			`eval_handling=EvalHandlingEnum.STOP_TRAIN,`
			`group_size=1,`
			`steps_per_eval=1,`
			`total_steps=1,`

			`tokenizer_name="NousResearch/Hermes-3-Llama-3.1-8B",`
			`use_wandb=True,`
			`wandb_name="openthoughts-tblite",`
			`ensure_scores_are_not_same=False,`
			`)`

			`server_configs = [`
			`APIServerConfig(`
			`base_url="https://openrouter.ai/api/v1",`
			`model_name="anthropic/claude-sonnet-4",`
			`server_type="openai",`
			`api_key=os.getenv("OPENROUTER_API_KEY", ""),`
			`health_check=False,`
			`)`
			`]`

			`return env_config, server_configs`


			`if __name__ == "__main__":`
			`TBLiteEvalEnv.cli()`