From ee7fde6531499df637345dcc345252ce09269a63 Mon Sep 17 00:00:00 2001
From: teknium1
Date: Wed, 4 Mar 2026 12:55:56 +0000
Subject: [PATCH] feat: add OpenThoughts-TBLite evaluation script

Introduced a new evaluation script for the OpenThoughts-TBLite
environment, enabling users to run evaluations with customizable
options. The script includes logging capabilities and real-time
output, enhancing the evaluation process for terminal agents. Both
run_eval.sh scripts now resolve their own directory, so the documented
"run from repo root" invocation works regardless of the caller's
working directory. This addition complements the existing benchmarking
tools and improves usability for users.
---
 environments/benchmarks/tblite/run_eval.sh         | 46 ++++++++++++++++++++++
 .../benchmarks/terminalbench_2/run_eval.sh         | 24 +++++++++----
 2 files changed, 65 insertions(+), 5 deletions(-)
 create mode 100755 environments/benchmarks/tblite/run_eval.sh

diff --git a/environments/benchmarks/tblite/run_eval.sh b/environments/benchmarks/tblite/run_eval.sh
new file mode 100755
index 000000000..9d860bf5e
--- /dev/null
+++ b/environments/benchmarks/tblite/run_eval.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+# OpenThoughts-TBLite Evaluation
+#
+# Run from repo root:
+#   bash environments/benchmarks/tblite/run_eval.sh
+#
+# Override model:
+#   bash environments/benchmarks/tblite/run_eval.sh \
+#     --openai.model_name anthropic/claude-sonnet-4
+#
+# Run a subset:
+#   bash environments/benchmarks/tblite/run_eval.sh \
+#     --env.task_filter broken-python,pandas-etl
+#
+# All terminal settings (backend, timeout, lifetime, pool size) are
+# configured via env config fields -- no env vars needed.
+
+set -euo pipefail
+
+# Resolve the script's own directory so the env module and config are
+# found no matter where the script is invoked from.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+mkdir -p logs evals/openthoughts-tblite
+LOG_FILE="logs/tblite_$(date +%Y%m%d_%H%M%S).log"
+
+echo "OpenThoughts-TBLite Evaluation"
+echo "Log file: $LOG_FILE"
+echo ""
+
+# Unbuffered python output so logs are written in real-time
+export PYTHONUNBUFFERED=1
+
+# Show INFO-level agent loop timing (api/tool durations per turn)
+# These go to the log file; tqdm + [START]/[PASS]/[FAIL] go to terminal
+export LOGLEVEL=INFO
+
+python "$SCRIPT_DIR/tblite_env.py" evaluate \
+  --config "$SCRIPT_DIR/default.yaml" \
+  "$@" \
+  2>&1 | tee "$LOG_FILE"
+
+echo ""
+echo "Log saved to: $LOG_FILE"
+echo "Eval results: evals/openthoughts-tblite/"
diff --git a/environments/benchmarks/terminalbench_2/run_eval.sh b/environments/benchmarks/terminalbench_2/run_eval.sh
index d4f1dcd6a..ffbe48480 100755
--- a/environments/benchmarks/terminalbench_2/run_eval.sh
+++ b/environments/benchmarks/terminalbench_2/run_eval.sh
@@ -12,21 +12,35 @@
 # Run a subset:
 #   bash environments/benchmarks/terminalbench_2/run_eval.sh \
 #     --env.task_filter fix-git,git-multibranch
+#
+# All terminal settings (backend, timeout, lifetime, pool size) are
+# configured via env config fields -- no env vars needed.
+
+set -euo pipefail
+
+# Resolve the script's own directory so the env module and config are
+# found no matter where the script is invoked from.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 
 mkdir -p logs evals/terminal-bench-2
 LOG_FILE="logs/terminalbench2_$(date +%Y%m%d_%H%M%S).log"
 
 echo "Terminal-Bench 2.0 Evaluation"
-echo "Log: $LOG_FILE"
+echo "Log file: $LOG_FILE"
 echo ""
 
-export TERMINAL_ENV=modal
-export TERMINAL_TIMEOUT=300
+# Unbuffered python output so logs are written in real-time
+export PYTHONUNBUFFERED=1
 
-python environments/benchmarks/terminalbench_2/terminalbench2_env.py evaluate \
-  --config environments/benchmarks/terminalbench_2/default.yaml \
+# Show INFO-level agent loop timing (api/tool durations per turn)
+# These go to the log file; tqdm + [START]/[PASS]/[FAIL] go to terminal
+export LOGLEVEL=INFO
+
+python "$SCRIPT_DIR/terminalbench2_env.py" evaluate \
+  --config "$SCRIPT_DIR/default.yaml" \
   "$@" \
   2>&1 | tee "$LOG_FILE"
 
 echo ""
 echo "Log saved to: $LOG_FILE"
+echo "Eval results: evals/terminal-bench-2/"