diff --git a/environments/benchmarks/yc_bench/README.md b/environments/benchmarks/yc_bench/README.md new file mode 100644 index 000000000..7a8aba787 --- /dev/null +++ b/environments/benchmarks/yc_bench/README.md @@ -0,0 +1,115 @@ +# YC-Bench: Long-Horizon Agent Benchmark + +[YC-Bench](https://github.com/collinear-ai/yc-bench) by [Collinear AI](https://collinear.ai/) is a deterministic, long-horizon benchmark that tests LLM agents' ability to act as a tech startup CEO. The agent manages a simulated company over 1-3 years, making compounding decisions about resource allocation, cash flow, task management, and prestige specialisation across 4 skill domains. + +Unlike TerminalBench2 (which evaluates per-task coding ability with binary pass/fail), YC-Bench measures **long-term strategic coherence** — whether an agent can maintain consistent strategy, manage compounding consequences, and adapt plans over hundreds of turns. + +## Setup + +```bash +# Install yc-bench (optional dependency) +pip install "hermes-agent[yc-bench]" + +# Or install from source +git clone https://github.com/collinear-ai/yc-bench +cd yc-bench && pip install -e . 
+ +# Verify +yc-bench --help +``` + +## Running + +```bash +# From the repo root: +bash environments/benchmarks/yc_bench/run_eval.sh + +# Or directly: +python environments/benchmarks/yc_bench/yc_bench_env.py evaluate \ + --config environments/benchmarks/yc_bench/default.yaml + +# Override model: +bash environments/benchmarks/yc_bench/run_eval.sh \ + --openai.model_name anthropic/claude-opus-4-20250514 + +# Quick single-preset test: +bash environments/benchmarks/yc_bench/run_eval.sh \ + --env.presets '["fast_test"]' --env.seeds '[1]' +``` + +## How It Works + +### Architecture + +``` +HermesAgentLoop (our agent) + -> terminal tool -> subprocess("yc-bench company status") -> JSON output + -> terminal tool -> subprocess("yc-bench task accept --task-id X") -> JSON + -> terminal tool -> subprocess("yc-bench sim resume") -> JSON (advance time) + -> ... (100-500 turns per run) +``` + +The environment initialises the simulation via `yc-bench sim init` (NOT `yc-bench run`, which would start yc-bench's own built-in agent loop). Our `HermesAgentLoop` then drives all interaction through CLI commands. 
+ +### Simulation Mechanics + +- **4 skill domains**: research, inference, data_environment, training +- **Prestige system** (1.0-10.0): Gates access to higher-paying tasks +- **Employee management**: Junior/Mid/Senior with domain-specific skill rates +- **Throughput splitting**: `effective_rate = base_rate / N` active tasks per employee +- **Financial pressure**: Monthly payroll, bankruptcy = game over +- **Deterministic**: SHA256-based RNG — same seed + preset = same world + +### Difficulty Presets + +| Preset | Employees | Tasks | Focus | +|-----------|-----------|-------|-------| +| tutorial | 3 | 50 | Basic loop mechanics | +| easy | 5 | 100 | Throughput awareness | +| **medium**| 5 | 150 | Prestige climbing + domain specialisation | +| **hard** | 7 | 200 | Precise ETA reasoning | +| nightmare | 8 | 300 | Sustained perfection under payroll pressure | +| fast_test | (varies) | (varies) | Quick validation (~50 turns) | + +Default eval runs **fast_test + medium + hard** × 3 seeds = 9 runs. + +### Scoring + +``` +composite = 0.5 × survival + 0.5 × normalised_funds +``` + +- **Survival** (binary): Did the company avoid bankruptcy? +- **Normalised funds** (0.0-1.0): Log-scale relative to initial $250K capital + +## Configuration + +Key fields in `default.yaml`: + +| Field | Default | Description | +|-------|---------|-------------| +| `presets` | `["fast_test", "medium", "hard"]` | Which presets to evaluate | +| `seeds` | `[1, 2, 3]` | RNG seeds per preset | +| `max_agent_turns` | 200 | Max LLM calls per run | +| `run_timeout` | 3600 | Wall-clock timeout per run (seconds) | +| `survival_weight` | 0.5 | Weight of survival in composite score | +| `funds_weight` | 0.5 | Weight of normalised funds in composite | +| `horizon_years` | null | Override horizon (null = auto from preset) | + +## Cost & Time Estimates + +A full run can take 100-500 LLM turns; note that the default `max_agent_turns` of 200 caps a run below the upper estimates, so raise it for the longer presets. Approximate costs per run at typical API rates: + +| Preset | Turns | Time | Est. 
Cost | +|--------|-------|------|-----------| +| fast_test | ~50 | 5-10 min | $1-5 | +| medium | ~200 | 20-40 min | $5-15 | +| hard | ~300 | 30-60 min | $10-25 | + +Full default eval (9 runs): ~3-6 hours, $50-200 depending on model. + +## References + +- [collinear-ai/yc-bench](https://github.com/collinear-ai/yc-bench) — Official repository +- [Collinear AI](https://collinear.ai/) — Company behind yc-bench +- [TerminalBench2](../terminalbench_2/) — Per-task coding benchmark (complementary) diff --git a/environments/benchmarks/yc_bench/__init__.py b/environments/benchmarks/yc_bench/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/environments/benchmarks/yc_bench/default.yaml b/environments/benchmarks/yc_bench/default.yaml new file mode 100644 index 000000000..871c1eaa8 --- /dev/null +++ b/environments/benchmarks/yc_bench/default.yaml @@ -0,0 +1,43 @@ +# YC-Bench Evaluation -- Default Configuration +# +# Long-horizon agent benchmark: agent plays CEO of an AI startup over +# a simulated 1-3 year run, interacting via yc-bench CLI subcommands. 
#!/bin/bash

# YC-Bench Evaluation
#
# Requires: pip install "hermes-agent[yc-bench]"
#
# Run from repo root:
#   bash environments/benchmarks/yc_bench/run_eval.sh
#
# Override model:
#   bash environments/benchmarks/yc_bench/run_eval.sh \
#       --openai.model_name anthropic/claude-opus-4-20250514
#
# Run a single preset:
#   bash environments/benchmarks/yc_bench/run_eval.sh \
#       --env.presets '["fast_test"]' --env.seeds '[1]'

set -euo pipefail

# Must match `data_dir_to_save_evals` in default.yaml, otherwise the directory
# created here is not the one the evaluation writes into.
EVAL_DIR="environments/benchmarks/evals/yc-bench"

mkdir -p logs "$EVAL_DIR"
LOG_FILE="logs/yc_bench_$(date +%Y%m%d_%H%M%S).log"

echo "YC-Bench Evaluation"
echo "Log: $LOG_FILE"
echo ""

# Capture the pipeline status instead of letting `set -e` abort immediately,
# so the log location is still reported when the evaluation fails.
status=0
PYTHONUNBUFFERED=1 LOGLEVEL="${LOGLEVEL:-INFO}" \
    python environments/benchmarks/yc_bench/yc_bench_env.py evaluate \
    --config environments/benchmarks/yc_bench/default.yaml \
    "$@" \
    2>&1 | tee "$LOG_FILE" || status=$?

echo ""
echo "Log saved to: $LOG_FILE"

# Propagate the evaluation's exit code to the caller.
exit "$status"
Aggregates per-preset and overall metrics + c. Logs results via evaluate_log() and wandb + +Key features: + - CLI-only interface: agent calls yc-bench subcommands via terminal tool + - Deterministic: same seed + preset = same world (SHA256-based RNG) + - Multi-dimensional scoring: survival + normalised final funds + - Per-preset difficulty breakdown in results + - Isolated SQLite DB per run (no cross-run state leakage) + +Requires: pip install hermes-agent[yc-bench] +""" + +import asyncio +import datetime +import json +import logging +import math +import os +import sqlite3 +import subprocess +import sys +import threading +import time +import uuid +from collections import defaultdict +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +_repo_root = Path(__file__).resolve().parent.parent.parent.parent +if str(_repo_root) not in sys.path: + sys.path.insert(0, str(_repo_root)) + +from pydantic import Field + +from atroposlib.envs.base import EvalHandlingEnum +from atroposlib.envs.server_handling.server_manager import APIServerConfig + +from environments.agent_loop import HermesAgentLoop +from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig + +logger = logging.getLogger(__name__) + +# ============================================================================= +# System prompt +# ============================================================================= + +YC_BENCH_SYSTEM_PROMPT = """\ +You are the autonomous CEO of an early-stage AI startup in a deterministic +business simulation. You manage the company exclusively through the `yc-bench` +CLI tool. Your primary goal is to **survive** until the simulation horizon ends +without going bankrupt, while **maximising final funds**. + +## Simulation Mechanics + +- **Funds**: You start with $250,000 seed capital. Revenue comes from completing + tasks. Rewards scale with your prestige: `base × (1 + scale × (prestige − 1))`. 
+- **Domains**: There are 4 skill domains: **research**, **inference**, + **data_environment**, and **training**. Each has its own prestige level + (1.0-10.0). Higher prestige unlocks better-paying tasks. +- **Employees**: You have employees (Junior/Mid/Senior) with domain-specific + skill rates. **Throughput splits**: `effective_rate = base_rate / N` where N + is the number of active tasks assigned to that employee. Focus beats breadth. +- **Payroll**: Deducted automatically on the first business day of each month. + Running out of funds = bankruptcy = game over. +- **Time**: The simulation runs on business days (Mon-Fri), 09:00-18:00. + Time only advances when you call `yc-bench sim resume`. + +## Task Lifecycle + +1. Browse market tasks with `market browse` +2. Accept a task with `task accept` (this sets its deadline) +3. Assign employees with `task assign` +4. Dispatch with `task dispatch` to start work +5. Call `sim resume` to advance time and let employees make progress +6. Tasks complete when all domain requirements are fulfilled + +**Penalties for failure vary by difficulty preset.** Completing a task on time +earns full reward + prestige gain. Missing a deadline or cancelling a task +incurs prestige penalties -- cancelling is always more costly than letting a +task fail, so cancel only as a last resort. 
+ +## CLI Commands + +### Observe +- `yc-bench company status` -- funds, prestige, runway +- `yc-bench employee list` -- skills, salary, active tasks +- `yc-bench market browse [--domain D] [--required-prestige-lte N]` -- available tasks +- `yc-bench task list [--status active|planned]` -- your tasks +- `yc-bench task inspect --task-id UUID` -- progress, deadline, assignments +- `yc-bench finance ledger [--category monthly_payroll|task_reward]` -- transaction history +- `yc-bench report monthly` -- monthly P&L + +### Act +- `yc-bench task accept --task-id UUID` -- accept from market +- `yc-bench task assign --task-id UUID --employee-id UUID` -- assign employee +- `yc-bench task dispatch --task-id UUID` -- start work (needs >=1 assignment) +- `yc-bench task cancel --task-id UUID --reason "text"` -- cancel (prestige penalty) +- `yc-bench sim resume` -- advance simulation clock + +### Memory (persists across context truncation) +- `yc-bench scratchpad read` -- read your persistent notes +- `yc-bench scratchpad write --content "text"` -- overwrite notes +- `yc-bench scratchpad append --content "text"` -- append to notes +- `yc-bench scratchpad clear` -- clear notes + +## Strategy Guidelines + +1. **Specialise in 2-3 domains** to climb the prestige ladder faster and unlock + high-reward tasks. Don't spread thin across all 4 domains early on. +2. **Focus employees** -- assigning one employee to many tasks halves their + throughput per additional task. Keep assignments concentrated. +3. **Use the scratchpad** to track your strategy, upcoming deadlines, and + employee assignments. This persists even if conversation context is truncated. +4. **Monitor runway** -- always know how many months of payroll you can cover. + Accept high-reward tasks before payroll dates. +5. **Don't over-accept** -- taking too many tasks and missing deadlines cascades + into prestige loss, locking you out of profitable contracts. +6. 
Use `finance ledger` and `report monthly` to track revenue trends. + +## Your Turn + +Each turn: +1. Call `yc-bench company status` and `yc-bench task list` to orient yourself. +2. Check for completed tasks and pending deadlines. +3. Browse market for profitable tasks within your prestige level. +4. Accept, assign, and dispatch tasks strategically. +5. Call `yc-bench sim resume` to advance time. +6. Repeat until the simulation ends. + +Think step by step before acting.""" + +# Starting funds in cents ($250,000) +INITIAL_FUNDS_CENTS = 25_000_000 + +# Default horizon per preset (years) +_PRESET_HORIZONS = { + "tutorial": 1, + "easy": 1, + "medium": 1, + "hard": 1, + "nightmare": 1, + "fast_test": 1, + "default": 3, + "high_reward": 1, +} + + +# ============================================================================= +# Configuration +# ============================================================================= + +class YCBenchEvalConfig(HermesAgentEnvConfig): + """ + Configuration for the YC-Bench evaluation environment. + + Extends HermesAgentEnvConfig with YC-Bench-specific settings for + preset selection, seed control, scoring, and simulation parameters. + """ + + presets: List[str] = Field( + default=["fast_test", "medium", "hard"], + description="YC-Bench preset names to evaluate.", + ) + seeds: List[int] = Field( + default=[1, 2, 3], + description="Random seeds -- each preset x seed = one run.", + ) + run_timeout: int = Field( + default=3600, + description="Maximum wall-clock seconds per run. 
def _read_final_score(db_path: str) -> Dict[str, Any]:
    """
    Read the final game state from a YC-Bench SQLite database.

    Args:
        db_path: Filesystem path of the per-run SQLite database.

    Returns:
        Dict with ``final_funds_cents`` (int), ``survived`` (bool) and
        ``terminal_reason`` (str). ``terminal_reason`` is the most recent
        'bankruptcy' / 'horizon_end' event type, ``"unknown"`` when no such
        event exists, ``"db_missing"`` when the file is absent,
        ``"no_company_row"`` when the database holds no usable company funds,
        or ``"db_error: ..."`` when reading fails.

    Never raises: every failure is reported as a non-surviving result so one
    broken run cannot abort the whole evaluation.

    Note: yc-bench table names are plural -- 'companies' not 'company',
    'sim_events' not 'simulation_log'.
    """
    if not os.path.exists(db_path):
        logger.warning("DB not found at %s", db_path)
        return {
            "final_funds_cents": 0,
            "survived": False,
            "terminal_reason": "db_missing",
        }

    conn = None
    try:
        conn = sqlite3.connect(db_path)
        cur = conn.cursor()

        # Read final funds from the 'companies' table
        cur.execute("SELECT funds_cents FROM companies LIMIT 1")
        row = cur.fetchone()
        if row is None or row[0] is None:
            # Without a company row (or with NULL funds) there is no game
            # state to score; do not award survival credit for it.
            return {
                "final_funds_cents": 0,
                "survived": False,
                "terminal_reason": "no_company_row",
            }
        funds = row[0]

        # Determine terminal reason from the 'sim_events' table
        terminal_reason = "unknown"
        try:
            cur.execute(
                "SELECT event_type FROM sim_events "
                "WHERE event_type IN ('bankruptcy', 'horizon_end') "
                "ORDER BY scheduled_at DESC LIMIT 1"
            )
            event_row = cur.fetchone()
            if event_row:
                terminal_reason = event_row[0]
        except sqlite3.OperationalError:
            # Table may not exist if simulation didn't progress
            pass

        survived = funds >= 0 and terminal_reason != "bankruptcy"
        return {
            "final_funds_cents": funds,
            "survived": survived,
            "terminal_reason": terminal_reason,
        }

    except Exception as e:
        # Boundary handler: log and degrade to a failed result.
        logger.error("Failed to read DB %s: %s", db_path, e)
        return {
            "final_funds_cents": 0,
            "survived": False,
            "terminal_reason": f"db_error: {e}",
        }
    finally:
        if conn:
            conn.close()
+ + Score = survival_weight * survival_score + + funds_weight * normalised_funds_score + + Normalised funds uses log-scale relative to initial capital: + - funds <= 0: 0.0 + - funds == initial: ~0.15 + - funds == 10x: ~0.52 + - funds == 100x: 1.0 + """ + survival_score = 1.0 if survived else 0.0 + + if final_funds_cents <= 0: + funds_score = 0.0 + else: + max_ratio = 100.0 + ratio = final_funds_cents / max(initial_funds_cents, 1) + funds_score = min(math.log1p(ratio) / math.log1p(max_ratio), 1.0) + + return survival_weight * survival_score + funds_weight * funds_score + + +# ============================================================================= +# Main Environment +# ============================================================================= + +class YCBenchEvalEnv(HermesAgentBaseEnv): + """ + YC-Bench long-horizon agent benchmark environment (eval-only). + + Each eval item is a (preset, seed) pair. The environment initialises the + simulation via ``yc-bench sim init`` (NOT ``yc-bench run`` which would start + a competing built-in agent loop). The HermesAgentLoop then drives the + interaction by calling individual yc-bench CLI commands via the terminal tool. + + After the agent loop ends, the SQLite DB is read to extract the final score. 
async def setup(self):
    """
    Verify yc-bench is installed and build the eval matrix.

    Probes the ``yc-bench`` CLI, builds one eval item per (preset, seed)
    pair from the config, creates the DB directory, and opens a streaming
    JSONL file next to this module for crash-safe result persistence.

    Raises:
        RuntimeError: if ``yc-bench --help`` cannot be launched, times out,
            or exits with a non-zero status.
    """
    install_hint = (
        "yc-bench CLI not found. Install with:\n"
        ' pip install "hermes-agent[yc-bench]"\n'
        "Or: git clone https://github.com/collinear-ai/yc-bench "
        "&& cd yc-bench && pip install -e ."
    )

    # Verify yc-bench CLI is available
    try:
        probe = subprocess.run(
            ["yc-bench", "--help"], capture_output=True, text=True, timeout=10
        )
    except (FileNotFoundError, subprocess.TimeoutExpired) as exc:
        # Chain the cause so the traceback shows why the probe failed.
        raise RuntimeError(install_hint) from exc
    if probe.returncode != 0:
        raise RuntimeError(install_hint)
    print("yc-bench CLI verified.")

    # Build eval matrix: preset x seed
    self.all_eval_items = [
        {"preset": preset, "seed": seed}
        for preset in self.config.presets
        for seed in self.config.seeds
    ]
    self.iter = 0

    os.makedirs(self.config.db_dir, exist_ok=True)
    self.eval_metrics: List[Tuple[str, float]] = []

    # Streaming JSONL log for crash-safe result persistence
    log_dir = os.path.join(os.path.dirname(__file__), "logs")
    os.makedirs(log_dir, exist_ok=True)
    run_ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    self._streaming_path = os.path.join(log_dir, f"samples_{run_ts}.jsonl")
    # Results are dumped with ensure_ascii=False, so the file must be UTF-8
    # regardless of the platform's default locale encoding.
    self._streaming_file = open(self._streaming_path, "w", encoding="utf-8")
    self._streaming_lock = threading.Lock()

    print(f"\nYC-Bench eval matrix: {len(self.all_eval_items)} runs")
    for item in self.all_eval_items:
        print(f" preset={item['preset']!r} seed={item['seed']}")
    print(f"Streaming results to: {self._streaming_path}\n")
`yc-bench company status` -- see your starting funds and prestige\n" + "2. `yc-bench employee list` -- see your team and their skills\n" + "3. `yc-bench market browse --required-prestige-lte 1` -- find tasks " + "you can take\n\n" + "Then accept 2-3 tasks, assign employees, dispatch them, and call " + "`yc-bench sim resume` to advance time. Repeat this loop until the " + "simulation ends (horizon reached or bankruptcy)." + ) + + async def compute_reward(self, item, result, ctx) -> float: + return 0.0 + + async def collect_trajectories(self, item): + return None, [] + + async def score(self, rollout_group_data): + return None + + # ========================================================================= + # Per-run evaluation + # ========================================================================= + + async def rollout_and_score_eval(self, eval_item: Dict[str, Any]) -> Dict: + """ + Evaluate a single (preset, seed) run. + + 1. Sets DATABASE_URL and YC_BENCH_EXPERIMENT env vars + 2. Initialises the simulation via ``yc-bench sim init`` (NOT ``run``) + 3. Runs HermesAgentLoop with terminal tool + 4. Reads SQLite DB to compute final score + 5. 
Returns result dict with survival, funds, and composite score + """ + preset = eval_item["preset"] + seed = eval_item["seed"] + run_id = str(uuid.uuid4())[:8] + run_key = f"{preset}_seed{seed}_{run_id}" + + from tqdm import tqdm + tqdm.write(f" [START] preset={preset!r} seed={seed} (run_id={run_id})") + run_start = time.time() + + # Isolated DB per run -- prevents cross-run state leakage + db_path = os.path.join(self.config.db_dir, f"yc_bench_{run_key}.db") + os.environ["DATABASE_URL"] = f"sqlite:///{db_path}" + os.environ["YC_BENCH_EXPERIMENT"] = preset + + # Determine horizon: explicit config override > preset lookup > default 1 + horizon = self.config.horizon_years or _PRESET_HORIZONS.get(preset, 1) + + try: + # ---------------------------------------------------------- + # Step 1: Initialise the simulation via CLI + # IMPORTANT: We use `sim init`, NOT `yc-bench run`. + # `yc-bench run` starts yc-bench's own LLM agent loop (via + # LiteLLM), which would compete with our HermesAgentLoop. + # `sim init` just sets up the world and returns. 
+ # ---------------------------------------------------------- + init_cmd = [ + "yc-bench", "sim", "init", + "--seed", str(seed), + "--start-date", self.config.start_date, + "--company-name", self.config.company_name, + "--horizon-years", str(horizon), + ] + init_result = subprocess.run( + init_cmd, capture_output=True, text=True, timeout=30, + ) + if init_result.returncode != 0: + error_msg = (init_result.stderr or init_result.stdout).strip() + raise RuntimeError(f"yc-bench sim init failed: {error_msg}") + + tqdm.write(f" Simulation initialized (horizon={horizon}yr)") + + # ---------------------------------------------------------- + # Step 2: Run the HermesAgentLoop + # ---------------------------------------------------------- + tools, valid_names = self._resolve_tools_for_group() + + messages: List[Dict[str, Any]] = [ + {"role": "system", "content": YC_BENCH_SYSTEM_PROMPT}, + {"role": "user", "content": self.format_prompt(eval_item)}, + ] + + agent = HermesAgentLoop( + server=self.server, + tool_schemas=tools, + valid_tool_names=valid_names, + max_turns=self.config.max_agent_turns, + task_id=run_id, + temperature=self.config.agent_temperature, + max_tokens=self.config.max_token_length, + extra_body=self.config.extra_body, + ) + result = await agent.run(messages) + + # ---------------------------------------------------------- + # Step 3: Read final score from the simulation DB + # ---------------------------------------------------------- + score_data = _read_final_score(db_path) + final_funds = score_data["final_funds_cents"] + survived = score_data["survived"] + terminal_reason = score_data["terminal_reason"] + + composite = _compute_composite_score( + final_funds_cents=final_funds, + survived=survived, + survival_weight=self.config.survival_weight, + funds_weight=self.config.funds_weight, + ) + + elapsed = time.time() - run_start + status = "SURVIVED" if survived else "BANKRUPT" + if final_funds >= 0: + funds_str = f"${final_funds / 100:,.0f}" + else: + 
async def _run_with_timeout(self, item: Dict[str, Any]) -> Dict:
    """
    Wrap a single rollout with a wall-clock timeout.

    Args:
        item: Eval item with ``"preset"`` and ``"seed"`` keys.

    Returns:
        The dict produced by ``rollout_and_score_eval`` when it finishes
        within ``config.run_timeout`` seconds. Otherwise a failed-run record
        (zero score, ``survived=False``, ``error="timeout"``), which is also
        written to the streaming results file.
    """
    preset = item["preset"]
    seed = item["seed"]
    started = time.time()
    try:
        return await asyncio.wait_for(
            self.rollout_and_score_eval(item),
            timeout=self.config.run_timeout,
        )
    except asyncio.TimeoutError:
        from tqdm import tqdm
        tqdm.write(
            f" [TIMEOUT] preset={preset!r} seed={seed} "
            f"(exceeded {self.config.run_timeout}s)"
        )
        out = {
            "preset": preset,
            "seed": seed,
            "survived": False,
            "final_funds_cents": 0,
            "final_funds_usd": 0.0,
            "terminal_reason": f"timeout ({self.config.run_timeout}s)",
            "composite_score": 0.0,
            "turns_used": 0,
            "error": "timeout",
            # Keep the record schema aligned with success/error results,
            # which both report elapsed_seconds.
            "elapsed_seconds": time.time() - started,
        }
        self._save_result(out)
        return out
time.time() + + # --- Compute metrics --- + valid = [r for r in results if r is not None] + if not valid: + print("Warning: No valid results.") + return + + total = len(valid) + survived_total = sum(1 for r in valid if r.get("survived")) + survival_rate = survived_total / total if total else 0.0 + avg_score = ( + sum(r.get("composite_score", 0) for r in valid) / total + if total + else 0.0 + ) + + preset_results: Dict[str, List[Dict]] = defaultdict(list) + for r in valid: + preset_results[r["preset"]].append(r) + + eval_metrics = { + "eval/survival_rate": survival_rate, + "eval/avg_composite_score": avg_score, + "eval/total_runs": total, + "eval/survived_runs": survived_total, + "eval/evaluation_time_seconds": end_time - start_time, + } + + for preset, items in sorted(preset_results.items()): + ps = sum(1 for r in items if r.get("survived")) + pt = len(items) + pa = ( + sum(r.get("composite_score", 0) for r in items) / pt + if pt + else 0 + ) + key = preset.replace("-", "_") + eval_metrics[f"eval/survival_rate_{key}"] = ps / pt if pt else 0 + eval_metrics[f"eval/avg_score_{key}"] = pa + + self.eval_metrics = [(k, v) for k, v in eval_metrics.items()] + + # --- Print summary --- + print(f"\n{'='*60}") + print("YC-Bench Evaluation Results") + print(f"{'='*60}") + print( + f"Overall survival rate: {survival_rate:.1%} " + f"({survived_total}/{total})" + ) + print(f"Average composite score: {avg_score:.4f}") + print(f"Evaluation time: {end_time - start_time:.1f}s") + + print("\nPer-preset breakdown:") + for preset, items in sorted(preset_results.items()): + ps = sum(1 for r in items if r.get("survived")) + pt = len(items) + pa = ( + sum(r.get("composite_score", 0) for r in items) / pt + if pt + else 0 + ) + print(f" {preset}: {ps}/{pt} survived avg_score={pa:.4f}") + for r in items: + status = "SURVIVED" if r.get("survived") else "BANKRUPT" + funds = r.get("final_funds_usd", 0) + print( + f" seed={r['seed']} [{status}] " + f"${funds:,.0f} " + 
f"score={r.get('composite_score', 0):.3f}" + ) + + print(f"{'='*60}\n") + + # --- Log results --- + samples = [ + {k: v for k, v in r.items() if k != "messages"} for r in valid + ] + + try: + await self.evaluate_log( + metrics=eval_metrics, + samples=samples, + start_time=start_time, + end_time=end_time, + generation_parameters={ + "temperature": self.config.agent_temperature, + "max_tokens": self.config.max_token_length, + "max_agent_turns": self.config.max_agent_turns, + }, + ) + except Exception as e: + print(f"Error logging results: {e}") + + # --- Cleanup (TB2 pattern) --- + if hasattr(self, "_streaming_file") and not self._streaming_file.closed: + self._streaming_file.close() + print(f"Results saved to: {self._streaming_path}") + + try: + from tools.terminal_tool import cleanup_all_environments + cleanup_all_environments() + except Exception: + pass + + try: + from environments.agent_loop import _tool_executor + _tool_executor.shutdown(wait=False, cancel_futures=True) + except Exception: + pass + + # ========================================================================= + # Wandb logging + # ========================================================================= + + async def wandb_log(self, wandb_metrics: Optional[Dict] = None): + """Log YC-Bench-specific metrics to wandb.""" + if wandb_metrics is None: + wandb_metrics = {} + for k, v in self.eval_metrics: + wandb_metrics[k] = v + self.eval_metrics = [] + await super().wandb_log(wandb_metrics) + + +if __name__ == "__main__": + YCBenchEvalEnv.cli() diff --git a/pyproject.toml b/pyproject.toml index 7c2016544..5f86cabd2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,7 @@ pty = ["ptyprocess>=0.7.0"] honcho = ["honcho-ai>=2.0.1"] mcp = ["mcp>=1.2.0"] homeassistant = ["aiohttp>=3.9.0"] +yc-bench = ["yc-bench @ git+https://github.com/collinear-ai/yc-bench.git"] all = [ "hermes-agent[modal]", "hermes-agent[daytona]",