Enhance TerminalBench2 environment with task filtering for Modal-incompatible tasks and logging improvements

- Updated task-filter descriptions for clarity, and changed skip_tasks so user-specified skips extend the default skip list instead of replacing it (see the filtering sketch below).
- Introduced a set of Modal-incompatible tasks to prevent execution errors in cloud environments.
- Implemented streaming JSONL logging for task results, preserving data even on interruptions (see the logging sketch below).
- Refactored task evaluation logic to include skipped-task reporting and improved error handling.
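
A minimal sketch of the combined filter/skip logic this commit adds, assuming a tasks list of dicts keyed by "task_name" and a config object carrying the task_filter, skip_tasks, and terminal_backend fields shown in the diff; select_tasks is an illustrative name, not the environment's actual method:

MODAL_INCOMPATIBLE_TASKS = {"qemu-startup", "qemu-alpine-ssh", "crack-7z-hash"}

def select_tasks(tasks, config):
    # Optional allow-list: run only the comma-separated task names.
    if config.task_filter:
        allowed = {name.strip() for name in config.task_filter.split(",")}
        tasks = [t for t in tasks if t["task_name"] in allowed]
    # The default skip list applies only on the Modal backend; user-supplied
    # skip_tasks are added on top of it rather than replacing it.
    skip = set(MODAL_INCOMPATIBLE_TASKS) if config.terminal_backend == "modal" else set()
    if config.skip_tasks:
        skip |= {name.strip() for name in config.skip_tasks.split(",")}
    return [t for t in tasks if t["task_name"] not in skip]

On non-Modal backends the default list is not applied, so local runs with hardware virtualization still exercise the QEMU tasks.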
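
And a self-contained sketch of the streaming JSONL pattern: each result is serialized and flushed to disk the moment it completes, so an interruption (Ctrl+C, crash) loses at most the task in flight. The class name and constructor here are hypothetical; the write-and-flush-under-a-lock behavior mirrors _save_result in the diff:

import datetime
import json
import os
import threading

class StreamingResultLog:
    def __init__(self, log_dir):
        os.makedirs(log_dir, exist_ok=True)
        # Timestamped filename so each run produces a unique file.
        run_ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        self.path = os.path.join(log_dir, f"samples_{run_ts}.jsonl")
        self._file = open(self.path, "w")
        self._lock = threading.Lock()  # results may arrive from worker threads

    def save(self, result):
        if self._file.closed:
            return
        with self._lock:
            self._file.write(json.dumps(result, ensure_ascii=False, default=str) + "\n")
            self._file.flush()  # persist now, not at interpreter exit

    def close(self):
        if not self._file.closed:
            self._file.close()

Flushing per record trades a little I/O for durability: with one JSON object per line, a partially written final line is the worst-case corruption, and every preceding record stays parseable.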
teknium
2026-02-12 05:36:45 +00:00
parent a291cc99cf
commit 1b7bc299f3


@@ -103,12 +103,12 @@ class TerminalBench2EvalConfig(HermesAgentEnvConfig):
# --- Task filtering (comma-separated from CLI) ---
task_filter: Optional[str] = Field(
default=None,
description="Comma-separated task names to run (e.g., 'fix-git,broken-pipe'). "
description="Comma-separated task names to run (e.g., 'fix-git,git-multibranch'). "
"If not set, all tasks are run.",
)
skip_tasks: Optional[str] = Field(
default=None,
description="Comma-separated task names to skip (e.g., 'heavy-task,slow-task').",
description="Comma-separated task names to skip on top of the default skip list.",
)
# --- Per-task wall-clock timeout ---
@@ -119,6 +119,14 @@ class TerminalBench2EvalConfig(HermesAgentEnvConfig):
)
# Tasks that cannot run properly on Modal and are excluded from scoring.
MODAL_INCOMPATIBLE_TASKS = {
"qemu-startup", # Needs KVM/hardware virtualization
"qemu-alpine-ssh", # Needs KVM/hardware virtualization
"crack-7z-hash", # Password brute-force -- too slow for cloud sandbox timeouts
}
# =============================================================================
# Tar extraction helper
# =============================================================================
@@ -186,13 +194,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
max_agent_turns=60,
max_token_length=16000,
agent_temperature=0.6,
system_prompt=(
"You are a skilled software engineer and system administrator with "
"access to a terminal and file tools. You are working inside a Linux "
"container environment. Complete the user's task by using the available "
"tools. Be methodical: explore the environment first, plan your approach, "
"then execute step by step. Verify your work before finishing."
),
system_prompt=None,
# Modal backend for per-task cloud-isolated sandboxes
terminal_backend="modal",
@@ -258,10 +260,18 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
allowed = {name.strip() for name in self.config.task_filter.split(",")}
tasks = [t for t in tasks if t["task_name"] in allowed]
print(f" Filtered to {len(tasks)} tasks: {sorted(allowed)}")
# Skip tasks incompatible with the current backend (e.g., QEMU on Modal)
# plus any user-specified skip_tasks
skip = set(MODAL_INCOMPATIBLE_TASKS) if self.config.terminal_backend == "modal" else set()
if self.config.skip_tasks:
skip = {name.strip() for name in self.config.skip_tasks.split(",")}
skip |= {name.strip() for name in self.config.skip_tasks.split(",")}
if skip:
before = len(tasks)
tasks = [t for t in tasks if t["task_name"] not in skip]
print(f" After skip_tasks: {len(tasks)} tasks (skipped: {sorted(skip)})")
skipped = before - len(tasks)
if skipped > 0:
print(f" Skipped {skipped} incompatible tasks: {sorted(skip & {t['task_name'] for t in ds})}")
self.all_eval_items = tasks
self.iter = 0
@@ -274,10 +284,30 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
# Reward tracking for wandb logging
self.eval_metrics: List[Tuple[str, float]] = []
# Streaming JSONL writer -- saves each task's full conversation
# immediately on completion so data is preserved even on Ctrl+C.
# Timestamped filename so each run produces a unique file.
import datetime
import threading
log_dir = os.path.join(os.path.dirname(__file__), "logs")
os.makedirs(log_dir, exist_ok=True)
run_ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
self._streaming_path = os.path.join(log_dir, f"samples_{run_ts}.jsonl")
self._streaming_file = open(self._streaming_path, "w")
self._streaming_lock = threading.Lock()
print(f" Streaming results to: {self._streaming_path}")
print(f"TB2 ready: {len(self.all_eval_items)} tasks across {len(self.category_index)} categories")
for cat, indices in sorted(self.category_index.items()):
print(f" {cat}: {len(indices)} tasks")
def _save_result(self, result: Dict[str, Any]):
"""Write a single task result to the streaming JSONL file immediately."""
if not hasattr(self, "_streaming_file") or self._streaming_file.closed:
return
with self._streaming_lock:
self._streaming_file.write(json.dumps(result, ensure_ascii=False, default=str) + "\n")
self._streaming_file.flush()
# =========================================================================
# Training pipeline stubs -- NOT used in eval-only mode
# =========================================================================
@@ -423,6 +453,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
task_id=task_id,
temperature=self.config.agent_temperature,
max_tokens=self.config.max_token_length,
extra_body=self.config.extra_body,
)
result = await agent.run(messages)
@@ -463,24 +494,29 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
task_name, reward, result.turns_used, result.finished_naturally,
)
return {
out = {
"passed": passed,
"reward": reward,
"task_name": task_name,
"category": category,
"turns_used": result.turns_used,
"finished_naturally": result.finished_naturally,
"messages": result.messages,
}
self._save_result(out)
return out
except Exception as e:
elapsed = time.time() - task_start
logger.error("Task %s: rollout failed: %s", task_name, e, exc_info=True)
tqdm.write(f" [ERROR] {task_name}: {e} ({elapsed:.0f}s)")
return {
out = {
"passed": False, "reward": 0.0,
"task_name": task_name, "category": category,
"error": str(e),
}
self._save_result(out)
return out
finally:
# --- Cleanup: clear overrides, sandbox, and temp files ---
@@ -636,11 +672,13 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
elapsed = self.config.task_timeout
tqdm.write(f" [TIMEOUT] {task_name} (exceeded {elapsed}s wall-clock limit)")
logger.error("Task %s: wall-clock timeout after %ds", task_name, elapsed)
return {
out = {
"passed": False, "reward": 0.0,
"task_name": task_name, "category": category,
"error": f"timeout ({elapsed}s)",
}
self._save_result(out)
return out
async def evaluate(self, *args, **kwargs) -> None:
"""
@@ -796,7 +834,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
print(f"{'='*60}\n")
# Build sample records for evaluate_log
# Build sample records for evaluate_log (includes full conversations)
samples = [
{
"task_name": r.get("task_name"),
@@ -805,6 +843,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
"reward": r.get("reward"),
"turns_used": r.get("turns_used"),
"error": r.get("error"),
"messages": r.get("messages"),
}
for r in valid_results
]
@@ -826,11 +865,22 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
except Exception as e:
print(f"Error logging evaluation results: {e}")
# Close streaming file
if hasattr(self, "_streaming_file") and not self._streaming_file.closed:
self._streaming_file.close()
print(f" Live results saved to: {self._streaming_path}")
# Kill all remaining sandboxes. Timed-out tasks leave orphaned thread
# pool workers still executing commands -- cleanup_all stops them.
from tools.terminal_tool import cleanup_all_environments
print("\nCleaning up all sandboxes...")
cleanup_all_environments()
# Shut down the tool thread pool so orphaned workers from timed-out
# tasks are killed immediately instead of retrying against dead
# sandboxes and spamming the console with TimeoutError warnings.
from environments.agent_loop import _tool_executor
_tool_executor.shutdown(wait=False, cancel_futures=True)
print("Done.")
# =========================================================================