From 0ea6c343259a0925d6db25e1d11f7cc853da184b Mon Sep 17 00:00:00 2001 From: teknium1 Date: Wed, 4 Mar 2026 11:38:32 +0000 Subject: [PATCH] feat: add OpenThoughts-TBLite evaluation environment and configuration files Introduced a new evaluation environment for OpenThoughts-TBLite, including the main evaluation script, configuration YAML, and README documentation. This environment provides a faster alternative to Terminal-Bench 2.0, featuring 100 difficulty-calibrated tasks for terminal agents. The setup allows for easy evaluation and configuration, enhancing the benchmarking capabilities for terminal agents. --- environments/benchmarks/tblite/README.md | 73 ++++++++++++ environments/benchmarks/tblite/__init__.py | 0 environments/benchmarks/tblite/default.yaml | 39 ++++++ environments/benchmarks/tblite/tblite_env.py | 119 +++++++++++++++++++ 4 files changed, 231 insertions(+) create mode 100644 environments/benchmarks/tblite/README.md create mode 100644 environments/benchmarks/tblite/__init__.py create mode 100644 environments/benchmarks/tblite/default.yaml create mode 100644 environments/benchmarks/tblite/tblite_env.py diff --git a/environments/benchmarks/tblite/README.md b/environments/benchmarks/tblite/README.md new file mode 100644 index 000000000..54b3745c3 --- /dev/null +++ b/environments/benchmarks/tblite/README.md @@ -0,0 +1,73 @@ +# OpenThoughts-TBLite Evaluation Environment + +This environment evaluates terminal agents on the [OpenThoughts-TBLite](https://huggingface.co/datasets/open-thoughts/OpenThoughts-TBLite) benchmark, a difficulty-calibrated subset of [Terminal-Bench 2.0](https://www.tbench.ai/leaderboard/terminal-bench/2.0). + +## Source + +OpenThoughts-TBLite was created by the [OpenThoughts](https://www.openthoughts.ai/) Agent team in collaboration with [Snorkel AI](https://snorkel.ai/) and [Bespoke Labs](https://bespokelabs.ai/). 
The original dataset and documentation live at: 
 
- **Dataset (source):** [open-thoughts/OpenThoughts-TBLite](https://huggingface.co/datasets/open-thoughts/OpenThoughts-TBLite) 
- **GitHub:** [open-thoughts/OpenThoughts-TBLite](https://github.com/open-thoughts/OpenThoughts-TBLite) 
- **Blog post:** [openthoughts.ai/blog/openthoughts-tblite](https://www.openthoughts.ai/blog/openthoughts-tblite) 
 
## Our Dataset 
 
We converted the source into the same schema used by our Terminal-Bench 2.0 environment (pre-built Docker Hub images, base64-encoded test tarballs, etc.) and published it as: 
 
- **Dataset (ours):** [NousResearch/openthoughts-tblite](https://huggingface.co/datasets/NousResearch/openthoughts-tblite) 
- **Docker images:** `nousresearch/tblite-<task-name>:latest` on Docker Hub (one image per task, 100 images) 
 
The conversion script is at `scripts/prepare_tblite_dataset.py`. 
 
## Why TBLite? 
 
Terminal-Bench 2.0 is one of the strongest frontier evaluations for terminal agents, but when a model scores near the floor (e.g., Qwen 3 8B at <1%), many changes look identical in aggregate score. TBLite addresses this by calibrating task difficulty using Claude Haiku 4.5 as a reference: 
 
| Difficulty | Pass Rate Range | Tasks | 
|------------|----------------|-------| 
| Easy | >= 70% | 40 | 
| Medium | 40-69% | 26 | 
| Hard | 10-39% | 26 | 
| Extreme | < 10% | 8 | 
 
This gives enough solvable tasks to detect small improvements quickly, while preserving enough hard tasks to avoid saturation. The correlation between TBLite and TB2 scores is **r = 0.911**. 
 
TBLite also runs 2.6-8x faster than the full TB2, making it practical for iteration loops. 
+ 
## Usage 
+ 
```bash 
+# Run the full benchmark 
+python environments/benchmarks/tblite/tblite_env.py evaluate 
+ 
+# Filter to specific tasks 
+python environments/benchmarks/tblite/tblite_env.py evaluate \ 
+ --env.task_filter "broken-python,pandas-etl" 
+ 
+# Use a different model 
+python environments/benchmarks/tblite/tblite_env.py evaluate \ 
+ --server.model_name "qwen/qwen3-30b" 
+``` 
+ 
+## Architecture 
+ 
+`TBLiteEvalEnv` is a thin subclass of `TerminalBench2EvalEnv`. All evaluation logic (agent loop, Docker sandbox management, test verification, metrics) is inherited. Only the defaults differ: 
+ 
+| Setting | TB2 | TBLite | 
+|----------------|----------------------------------|-----------------------------------------| 
+| Dataset | `NousResearch/terminal-bench-2` | `NousResearch/openthoughts-tblite` | 
+| Tasks | 89 | 100 | 
+| Task timeout | 1800s (30 min) | 1200s (20 min) | 
+| Wandb name | `terminal-bench-2` | `openthoughts-tblite` | 
+ 
+## Citation 
+ 
+```bibtex 
+@software{OpenThoughts-TBLite, 
+ author = {{OpenThoughts-Agent team} and {Snorkel AI} and {Bespoke Labs}}, 
+ month = feb, 
+ title = {{OpenThoughts-TBLite: A High-Signal Benchmark for Iterating on Terminal Agents}}, 
+ howpublished = {https://www.openthoughts.ai/blog/openthoughts-tblite}, 
+ year = {2026} 
+} 
+``` 
diff --git a/environments/benchmarks/tblite/__init__.py b/environments/benchmarks/tblite/__init__.py 
new file mode 100644 
index 000000000..e69de29bb 
diff --git a/environments/benchmarks/tblite/default.yaml b/environments/benchmarks/tblite/default.yaml 
new file mode 100644 
index 000000000..cb5218280 
--- /dev/null 
+++ b/environments/benchmarks/tblite/default.yaml 
@@ -0,0 +1,39 @@ 
+# OpenThoughts-TBLite Evaluation -- Default Configuration 
+# 
+# Eval-only environment for the TBLite benchmark (100 difficulty-calibrated 
+# terminal tasks, a faster proxy for Terminal-Bench 2.0). 
+# Uses Modal terminal backend for per-task cloud-isolated sandboxes 
+# and OpenRouter for inference. 
+# +# Usage: +# python environments/benchmarks/tblite/tblite_env.py evaluate \ +# --config environments/benchmarks/tblite/default.yaml +# +# # Override model: +# python environments/benchmarks/tblite/tblite_env.py evaluate \ +# --config environments/benchmarks/tblite/default.yaml \ +# --openai.model_name anthropic/claude-sonnet-4 + +env: + enabled_toolsets: ["terminal", "file"] + max_agent_turns: 60 + max_token_length: 32000 + agent_temperature: 0.8 + terminal_backend: "modal" + terminal_timeout: 300 # 5 min per command (builds, pip install) + tool_pool_size: 128 # thread pool for 100 parallel tasks + dataset_name: "NousResearch/openthoughts-tblite" + test_timeout: 600 + task_timeout: 1200 # 20 min wall-clock per task (TBLite tasks are faster) + tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B" + use_wandb: true + wandb_name: "openthoughts-tblite" + ensure_scores_are_not_same: false + data_dir_to_save_evals: "environments/benchmarks/evals/openthoughts-tblite" + +openai: + base_url: "https://openrouter.ai/api/v1" + model_name: "anthropic/claude-opus-4.6" + server_type: "openai" + health_check: false + # api_key loaded from OPENROUTER_API_KEY in .env diff --git a/environments/benchmarks/tblite/tblite_env.py b/environments/benchmarks/tblite/tblite_env.py new file mode 100644 index 000000000..4b23f9cc5 --- /dev/null +++ b/environments/benchmarks/tblite/tblite_env.py @@ -0,0 +1,119 @@ +""" +OpenThoughts-TBLite Evaluation Environment + +A lighter, faster alternative to Terminal-Bench 2.0 for iterating on terminal +agents. Uses the same evaluation logic as TerminalBench2EvalEnv but defaults +to the NousResearch/openthoughts-tblite dataset (100 difficulty-calibrated +tasks vs TB2's 89 harder tasks). 
+ +TBLite tasks are a curated subset of TB2 with a difficulty distribution +designed to give meaningful signal even for smaller models: + - Easy (40 tasks): >= 70% pass rate with Claude Haiku 4.5 + - Medium (26 tasks): 40-69% pass rate + - Hard (26 tasks): 10-39% pass rate + - Extreme (8 tasks): < 10% pass rate + +Usage: + python environments/benchmarks/tblite/tblite_env.py evaluate + + # Filter to specific tasks: + python environments/benchmarks/tblite/tblite_env.py evaluate \\ + --env.task_filter "broken-python,pandas-etl" +""" + +import os +import sys +from pathlib import Path +from typing import List, Tuple + +_repo_root = Path(__file__).resolve().parent.parent.parent.parent +if str(_repo_root) not in sys.path: + sys.path.insert(0, str(_repo_root)) + +from pydantic import Field + +from atroposlib.envs.base import EvalHandlingEnum +from atroposlib.envs.server_handling.server_manager import APIServerConfig + +from environments.benchmarks.terminalbench_2.terminalbench2_env import ( + TerminalBench2EvalConfig, + TerminalBench2EvalEnv, +) + + +class TBLiteEvalConfig(TerminalBench2EvalConfig): + """Configuration for the OpenThoughts-TBLite evaluation environment. + + Inherits all TB2 config fields. Only the dataset default and task timeout + differ -- TBLite tasks are calibrated to be faster. + """ + + dataset_name: str = Field( + default="NousResearch/openthoughts-tblite", + description="HuggingFace dataset containing TBLite tasks.", + ) + + task_timeout: int = Field( + default=1200, + description="Maximum wall-clock seconds per task. TBLite tasks are " + "generally faster than TB2, so 20 minutes is usually sufficient.", + ) + + +class TBLiteEvalEnv(TerminalBench2EvalEnv): + """OpenThoughts-TBLite evaluation environment. + + Inherits all evaluation logic from TerminalBench2EvalEnv (agent loop, + test verification, Docker image resolution, metrics, wandb logging). + Only the default configuration differs. 
+ """ + + name = "openthoughts-tblite" + env_config_cls = TBLiteEvalConfig + + @classmethod + def config_init(cls) -> Tuple[TBLiteEvalConfig, List[APIServerConfig]]: + env_config = TBLiteEvalConfig( + enabled_toolsets=["terminal", "file"], + disabled_toolsets=None, + distribution=None, + + max_agent_turns=60, + max_token_length=16000, + agent_temperature=0.6, + system_prompt=None, + + terminal_backend="modal", + terminal_timeout=300, + + test_timeout=180, + + # 100 tasks in parallel + tool_pool_size=128, + + eval_handling=EvalHandlingEnum.STOP_TRAIN, + group_size=1, + steps_per_eval=1, + total_steps=1, + + tokenizer_name="NousResearch/Hermes-3-Llama-3.1-8B", + use_wandb=True, + wandb_name="openthoughts-tblite", + ensure_scores_are_not_same=False, + ) + + server_configs = [ + APIServerConfig( + base_url="https://openrouter.ai/api/v1", + model_name="anthropic/claude-sonnet-4", + server_type="openai", + api_key=os.getenv("OPENROUTER_API_KEY", ""), + health_check=False, + ) + ] + + return env_config, server_configs + + +if __name__ == "__main__": + TBLiteEvalEnv.cli()