#!/bin/bash

# YC-Bench Evaluation
#
# Requires: pip install "hermes-agent[yc-bench]"
#
# Run from repo root:
#   bash environments/benchmarks/yc_bench/run_eval.sh
#
# Override model:
#   bash environments/benchmarks/yc_bench/run_eval.sh \
#       --openai.model_name anthropic/claude-opus-4-20250514
#
# Run a single preset:
#   bash environments/benchmarks/yc_bench/run_eval.sh \
#       --env.presets '["fast_test"]' --env.seeds '[1]'
#
# Environment:
#   LOGLEVEL  Passed to the evaluator process; defaults to INFO when unset
#             or empty.
#
# Any arguments given to this script are forwarded unchanged to
# yc_bench_env.py after the --config option. Combined stdout/stderr of the
# evaluator is shown on the terminal and also written to a timestamped file
# under logs/. The script exits with the status of the evaluation pipeline.

set -euo pipefail

# Paths are relative, so the script must be started from the repo root.
# NOTE(review): evals/yc-bench is presumably where the evaluator writes its
# results -- confirm against default.yaml.
mkdir -p logs evals/yc-bench
LOG_FILE="logs/yc_bench_$(date +%Y%m%d_%H%M%S).log"

echo "YC-Bench Evaluation"
echo "Log: $LOG_FILE"
echo ""

# Record the pipeline status instead of letting `set -e` abort right here,
# so the log location below is still reported when the evaluation fails.
# With `pipefail`, a non-zero exit from python (or tee) is what gets captured.
status=0
# PYTHONUNBUFFERED=1 keeps Python's output unbuffered so lines reach tee
# (and the log file) as they are produced.
PYTHONUNBUFFERED=1 LOGLEVEL="${LOGLEVEL:-INFO}" \
  python environments/benchmarks/yc_bench/yc_bench_env.py evaluate \
  --config environments/benchmarks/yc_bench/default.yaml \
  "$@" \
  2>&1 | tee "$LOG_FILE" || status=$?

echo ""
echo "Log saved to: $LOG_FILE"

# Propagate the evaluation result to the caller (0 on success).
exit "$status"