environments/benchmarks/yc_bench/run_eval.sh

#!/bin/bash

# YC-Bench Evaluation
#
# Requires: pip install "hermes-agent[yc-bench]"
#
# Run from repo root:
#   bash environments/benchmarks/yc_bench/run_eval.sh
#
# Override model:
#   bash environments/benchmarks/yc_bench/run_eval.sh \
#       --openai.model_name anthropic/claude-opus-4-20250514
#
# Run a single preset:
#   bash environments/benchmarks/yc_bench/run_eval.sh \
#       --env.presets '["fast_test"]' --env.seeds '[1]'

set -euo pipefail

# Create the output directories relative to the current working directory
# (the script is meant to be run from the repo root). Each run writes to its
# own timestamped log file.
mkdir -p logs evals/yc-bench
LOG_FILE="logs/yc_bench_$(date +%Y%m%d_%H%M%S).log"

echo "YC-Bench Evaluation"
echo "Log: $LOG_FILE"
echo ""

# Run the evaluation with the default config. Any arguments given to this
# script are forwarded verbatim after --config. stdout and stderr are both
# shown on the terminal and copied into the log file by tee.
#
# The pipeline status is captured explicitly instead of being left to
# `set -e`: with pipefail, a failing run would otherwise abort the script
# right here and the closing "Log saved to" message would never be printed,
# which is exactly when the log is most useful.
eval_status=0
PYTHONUNBUFFERED=1 LOGLEVEL="${LOGLEVEL:-INFO}" \
  python environments/benchmarks/yc_bench/yc_bench_env.py evaluate \
  --config environments/benchmarks/yc_bench/default.yaml \
  "$@" \
  2>&1 | tee "$LOG_FILE" || eval_status=$?

echo ""
echo "Log saved to: $LOG_FILE"

# Propagate a failed evaluation to the caller with its original exit status.
if [[ "$eval_status" -ne 0 ]]; then
  exit "$eval_status"
fi
feat: add YC-Bench long-horizon agent benchmark environment

Adds eval-only benchmark for YC-Bench (collinear-ai/yc-bench), a deterministic long-horizon benchmark where the agent acts as CEO of an AI startup over a simulated 1-3 year run.

Key design decisions verified against the official yc-bench repo:
- Uses 'sim init' (NOT 'yc-bench run') to avoid starting a competing built-in agent loop
- Correct DB table names: 'companies' and 'sim_events'
- Correct 4 domains: research, inference, data_environment, training
- Penalty values are preset-dependent (not hardcoded in system prompt)
- Sequential evaluation (each run is 100-500 turns)
- Follows TerminalBench2 patterns: KeyboardInterrupt handling, cleanup_all_environments(), tqdm logging handler, streaming JSONL

yc-bench added as optional dependency: pip install hermes-agent[yc-bench]

Closes #340

2026-03-06 19:25:56 -08:00

			`#!/bin/bash`

			`# YC-Bench Evaluation`
			`#`
			`# Requires: pip install "hermes-agent[yc-bench]"`
			`#`
			`# Run from repo root:`
			`# bash environments/benchmarks/yc_bench/run_eval.sh`
			`#`
			`# Override model:`
			`# bash environments/benchmarks/yc_bench/run_eval.sh \`
			`# --openai.model_name anthropic/claude-opus-4-20250514`
			`#`
			`# Run a single preset:`
			`# bash environments/benchmarks/yc_bench/run_eval.sh \`
			`# --env.presets '["fast_test"]' --env.seeds '[1]'`

			`set -euo pipefail`

			`mkdir -p logs evals/yc-bench`
			`LOG_FILE="logs/yc_bench_$(date +%Y%m%d_%H%M%S).log"`

			`echo "YC-Bench Evaluation"`
			`echo "Log: $LOG_FILE"`
			`echo ""`

			`PYTHONUNBUFFERED=1 LOGLEVEL="${LOGLEVEL:-INFO}" \`
			`python environments/benchmarks/yc_bench/yc_bench_env.py evaluate \`
			`--config environments/benchmarks/yc_bench/default.yaml \`
			`"$@" \`
			`2>&1 | tee "$LOG_FILE"`

			`echo ""`
			`echo "Log saved to: $LOG_FILE"`