Files
hermes-agent/environments/benchmarks/tblite/local.yaml
dmahan93 ed27b826c5 feat: add eval_concurrency limit + Docker local config for TBLite
- Add eval_concurrency config field with asyncio.Semaphore
- Add local.yaml config using Docker backend (sandboxed, no cloud costs)
- Register docker_image alongside modal_image for backend flexibility
- Default: 8 parallel tasks for local runs
2026-03-11 06:52:26 -07:00

39 lines
1.3 KiB
YAML

# OpenThoughts-TBLite Evaluation -- Docker Backend (Local Compute)
#
# Runs tasks in Docker containers on the local machine.
# Sandboxed like Modal but no cloud costs. Good for dev/testing.
#
# Usage:
# python environments/benchmarks/tblite/tblite_env.py evaluate \
# --config environments/benchmarks/tblite/local.yaml
#
# # Override concurrency:
# python environments/benchmarks/tblite/tblite_env.py evaluate \
# --config environments/benchmarks/tblite/local.yaml \
# --env.eval_concurrency 4
env:
enabled_toolsets: ["terminal", "file"]
max_agent_turns: 60
max_token_length: 32000
agent_temperature: 0.8
terminal_backend: "docker"
terminal_timeout: 300
tool_pool_size: 16
dataset_name: "NousResearch/openthoughts-tblite"
test_timeout: 600
task_timeout: 1200
eval_concurrency: 8 # max 8 tasks at once
tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
use_wandb: false
wandb_name: "openthoughts-tblite-local"
ensure_scores_are_not_same: false
data_dir_to_save_evals: "environments/benchmarks/evals/openthoughts-tblite-local"
openai:
base_url: "https://openrouter.ai/api/v1"
model_name: "anthropic/claude-sonnet-4"
server_type: "openai"
health_check: false
# api_key loaded from OPENROUTER_API_KEY in .env