environments/benchmarks/tblite/run_eval.sh

#!/bin/bash

# OpenThoughts-TBLite Evaluation
#
# Run from repo root:
#   bash environments/benchmarks/tblite/run_eval.sh
#
# Override model:
#   bash environments/benchmarks/tblite/run_eval.sh \
#       --openai.model_name anthropic/claude-sonnet-4
#
# Run a subset:
#   bash environments/benchmarks/tblite/run_eval.sh \
#       --env.task_filter broken-python,pandas-etl
#
# All terminal settings (backend, timeout, lifetime, pool size) are
# configured via env config fields -- no env vars needed.

# Strict mode: abort on any failing command, unset variable, or failing
# pipeline stage. Because of pipefail, a non-zero exit from the python
# evaluation below ends the script before the closing summary is printed.
set -euo pipefail

# Directory that holds this script. The evaluation entry point and its
# default config are resolved relative to it (they are assumed to live
# alongside this script -- TODO confirm), so the documented repo-root
# invocation works instead of looking for them in the caller's cwd.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"

# Output locations remain relative to the caller's working directory.
mkdir -p logs evals/openthoughts-tblite
LOG_FILE="logs/tblite_$(date +%Y%m%d_%H%M%S).log"

echo "OpenThoughts-TBLite Evaluation"
echo "Log file: $LOG_FILE"
echo ""

# Unbuffered python output so logs are written in real-time
export PYTHONUNBUFFERED=1

# Show INFO-level agent loop timing (api/tool durations per turn)
# These go to the log file; tqdm + [START]/[PASS]/[FAIL] go to terminal
# NOTE(review): the pipeline below merges stderr into stdout and tees the
# combined stream, so everything the process prints reaches both the
# terminal and the log file -- confirm any intended split happens inside
# tblite_env.py.
export LOGLEVEL=INFO

# Extra command-line arguments ("$@") are forwarded verbatim after the
# default --config, e.g. --openai.model_name or --env.task_filter.
python "$SCRIPT_DIR/tblite_env.py" evaluate \
  --config "$SCRIPT_DIR/default.yaml" \
  "$@" \
  2>&1 | tee "$LOG_FILE"

echo ""
echo "Log saved to: $LOG_FILE"
echo "Eval results: evals/openthoughts-tblite/"
feat: add OpenThoughts-TBLite evaluation script Introduced a new evaluation script for the OpenThoughts-TBLite environment, enabling users to run evaluations with customizable options. The script includes logging capabilities and real-time output, enhancing the evaluation process for terminal agents. This addition complements the existing benchmarking tools and improves usability for users. 2026-03-04 12:55:56 +00:00			`#!/bin/bash`

			`# OpenThoughts-TBLite Evaluation`
			`#`
			`# Run from repo root:`
			`# bash environments/benchmarks/tblite/run_eval.sh`
			`#`
			`# Override model:`
			`# bash environments/benchmarks/tblite/run_eval.sh \`
			`# --openai.model_name anthropic/claude-sonnet-4`
			`#`
			`# Run a subset:`
			`# bash environments/benchmarks/tblite/run_eval.sh \`
			`# --env.task_filter broken-python,pandas-etl`
			`#`
			`# All terminal settings (backend, timeout, lifetime, pool size) are`
			`# configured via env config fields -- no env vars needed.`

			`set -euo pipefail`

			`mkdir -p logs evals/openthoughts-tblite`
			`LOG_FILE="logs/tblite_$(date +%Y%m%d_%H%M%S).log"`

			`echo "OpenThoughts-TBLite Evaluation"`
			`echo "Log file: $LOG_FILE"`
			`echo ""`

			`# Unbuffered python output so logs are written in real-time`
			`export PYTHONUNBUFFERED=1`

			`# Show INFO-level agent loop timing (api/tool durations per turn)`
			`# These go to the log file; tqdm + [START]/[PASS]/[FAIL] go to terminal`
			`export LOGLEVEL=INFO`

			`python tblite_env.py evaluate \`
			`--config default.yaml \`
			`"$@" \`
			`2>&1 \| tee "$LOG_FILE"`

			`echo ""`
			`echo "Log saved to: $LOG_FILE"`
			`echo "Eval results: evals/openthoughts-tblite/"`