environments/hermes_swe_env/hermes_swe_env.py

"""
HermesSweEnv -- SWE-Bench Style Environment with Modal Sandboxes

A concrete environment for software engineering tasks where the model writes code
and the reward function runs tests to verify correctness. Uses Modal terminal
backend for cloud-isolated sandboxes per rollout.

The reward function uses ToolContext.terminal() to run test commands in the same
Modal sandbox the model used during its agentic loop. All filesystem state from
the model's tool calls is preserved for verification.

Usage:
    # Phase 1: OpenAI server type
    vllm serve YourModel --enable-auto-tool-choice --tool-call-parser hermes
    run-api
    python environments/hermes_swe_env/hermes_swe_env.py serve \\
        --openai.base_url http://localhost:8000/v1 \\
        --openai.model_name YourModel \\
        --openai.server_type openai \\
        --env.dataset_name bigcode/humanevalpack \\
        --env.terminal_backend modal

    # Phase 2: VLLM server type (full RL training)
    python environments/hermes_swe_env/hermes_swe_env.py serve \\
        --openai.base_url http://localhost:8000/v1 \\
        --openai.model_name YourModel \\
        --openai.server_type vllm \\
        --env.tool_call_parser hermes \\
        --env.terminal_backend modal
"""

import logging
import shlex
import sys
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

# Ensure repo root is on sys.path for imports.
# Three .parent hops from this file: its directory -> that directory's
# parent -> the grandparent, which is treated as the repository root. Putting
# it at the front of sys.path lets the `environments.*` imports below resolve
# when this file is executed directly as a script.
_repo_root = Path(__file__).resolve().parent.parent.parent
if str(_repo_root) not in sys.path:
    sys.path.insert(0, str(_repo_root))

from datasets import load_dataset

from atroposlib.envs.base import ScoredDataGroup
from atroposlib.envs.server_handling.server_manager import APIServerConfig
from atroposlib.type_definitions import Item

from environments.agent_loop import AgentResult
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
from environments.tool_context import ToolContext

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)


class HermesSweEnvConfig(HermesAgentEnvConfig):
    """
    Configuration type for SWE-bench style tasks.

    Declares no fields of its own: everything is inherited from
    HermesAgentEnvConfig, and the SWE-specific default values are supplied
    where the config is instantiated (see config_init on the environment).
    """


class HermesSweEnv(HermesAgentBaseEnv):
    """
    SWE-bench style environment using Modal terminal backend.

    The model gets a coding task, uses terminal + file + web tools to solve it,
    and the reward function runs tests in the same Modal sandbox to verify.

    Subclass this for specific SWE datasets (HumanEval, SWE-bench, etc.)
    and customize format_prompt() and compute_reward() as needed.
    """

    name = "hermes-swe"
    env_config_cls = HermesSweEnvConfig

    # Dataset fields that may carry test code, in lookup priority order.
    # Shared by format_prompt() and compute_reward() so both agree on which
    # field is "the tests" for an item.
    _TEST_FIELDS = ("test", "test_code", "tests")

    @classmethod
    def config_init(cls) -> Tuple[HermesSweEnvConfig, List[APIServerConfig]]:
        """
        Build the default configuration for the SWE environment.

        Uses Modal terminal backend for cloud isolation and terminal + file + web toolsets.

        Returns:
            A ``(env_config, server_configs)`` tuple: the environment config and
            a single-element list describing the inference server to talk to.
        """
        env_config = HermesSweEnvConfig(
            # Toolsets: terminal for running code, file for reading/writing, web for docs
            enabled_toolsets=["terminal", "file", "web"],
            disabled_toolsets=None,
            distribution=None,
            # Agent settings -- SWE tasks need more turns
            max_agent_turns=30,
            max_token_length=4096,
            agent_temperature=1.0,
            system_prompt=(
                "You are a skilled software engineer. You have access to a terminal, "
                "file tools, and web search. Use these tools to complete the coding task. "
                "Write clean, working code and verify it runs correctly before finishing."
            ),
            # Modal backend for cloud-isolated sandboxes
            terminal_backend="modal",
            # Dataset -- override via CLI for your specific SWE dataset
            dataset_name="bigcode/humanevalpack",
            dataset_split="test",
            prompt_field="prompt",
            # Atropos settings
            group_size=4,
            tokenizer_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview",
            tool_call_parser="hermes",
            steps_per_eval=50,
            total_steps=500,
            use_wandb=True,
            wandb_name="hermes-swe",
        )

        server_configs = [
            APIServerConfig(
                base_url="http://localhost:8000/v1",
                model_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview",
                server_type="openai",  # Phase 1; switch to "vllm" for Phase 2
                api_key="",
            )
        ]

        return env_config, server_configs

    async def setup(self) -> None:
        """
        Load the SWE dataset and reset per-run state.

        When ``config.dataset_name`` is empty, an empty list is stored as a
        placeholder; get_next_item() will then raise. Also resets the item
        cursor and the buffer of rewards awaiting logging.
        """
        if self.config.dataset_name:
            self.dataset = load_dataset(
                self.config.dataset_name, split=self.config.dataset_split
            )
        else:
            # Placeholder if no dataset specified
            self.dataset = []
        self.iter = 0
        self.reward_buffer: List[float] = []

    async def get_next_item(self) -> Dict[str, Any]:
        """
        Return the next dataset item, wrapping around at the end.

        Raises:
            ValueError: If no dataset (or an empty one) was loaded in setup().
        """
        if not self.dataset:
            raise ValueError("No dataset loaded. Set dataset_name in config.")
        item = self.dataset[self.iter % len(self.dataset)]
        self.iter += 1
        return item

    @classmethod
    def _find_test_code(cls, item: Dict[str, Any]) -> Any:
        """
        Return the first non-empty test field of *item*, or ``""`` if none.

        Fields are tried in ``_TEST_FIELDS`` order. Unlike a nested
        ``item.get(a, item.get(b, ...))``, a field that is present but empty
        (``""``/``None``) does not mask a later, populated field.
        """
        for field in cls._TEST_FIELDS:
            value = item.get(field)
            if value:
                return value
        return ""

    def format_prompt(self, item: Dict[str, Any]) -> str:
        """
        Format the SWE task prompt.

        Override this in subclasses for different dataset formats.
        Default reads the field named by ``config.prompt_field`` and, when the
        item carries test code (see ``_TEST_FIELDS``), appends it under a
        "Tests to pass:" heading.

        Args:
            item: One dataset row.

        Returns:
            The prompt text to hand to the agent.
        """
        # A present-but-None prompt field would otherwise make `+=` below fail.
        prompt = item.get(self.config.prompt_field, "") or ""

        # If the dataset has test information, include it in the prompt
        test_info = self._find_test_code(item)
        if test_info:
            prompt += f"\n\nTests to pass:\n{test_info}"

        return prompt

    async def compute_reward(
        self, item: Dict[str, Any], result: AgentResult, ctx: ToolContext
    ) -> float:
        """
        Score by running tests in the model's Modal sandbox.

        Default implementation:
        - If the dataset item has a 'test', 'test_code' or 'tests' field, run it
          with ``python3 -c`` from /workspace; exit code 0 scores 1.0
        - Otherwise (no tests, or tests failed): 0.1 partial credit if any
          Python file under /workspace is newer than /tmp/.start_marker
        - Else 0.0

        Every returned reward is also appended to ``self.reward_buffer`` for
        wandb_log().

        Override this in subclasses for more sophisticated reward logic.

        Args:
            item: The dataset row the rollout was generated for.
            result: The agent rollout result (unused by the default scoring).
            ctx: Tool context used to run commands; its ``terminal()`` is
                expected to return a mapping with an ``"exit_code"`` key and
                optionally ``"output"``.

        Returns:
            1.0, 0.1 or 0.0 as described above.
        """
        reward = self._score(item, ctx)
        self.reward_buffer.append(reward)
        return reward

    def _score(self, item: Dict[str, Any], ctx: ToolContext) -> float:
        """Compute the reward for *item* without touching the reward buffer."""
        test_code = self._find_test_code(item)

        if test_code:
            # Quote the test source for the shell: interpolating it between
            # double quotes lets any `"`, `$`, backtick or backslash in the
            # dataset's test code break (or hijack) the command.
            command = f"cd /workspace && python3 -c {shlex.quote(str(test_code))}"
            test_result = ctx.terminal(command, timeout=60)
            if test_result["exit_code"] == 0:
                return 1.0

        # Partial credit: check if the model created any Python files.
        # NOTE(review): assumes /tmp/.start_marker is created at rollout start
        # by code outside this file -- confirm. find's stderr is discarded, so
        # a missing marker is expected to yield empty output (no credit).
        file_check = ctx.terminal(
            "find /workspace -name '*.py' -newer /tmp/.start_marker 2>/dev/null | head -5"
        )
        listed_files = (file_check.get("output") or "").strip()
        if file_check["exit_code"] == 0 and listed_files:
            return 0.1

        return 0.0

    async def evaluate(self, *args, **kwargs) -> None:
        """
        Run evaluation on a held-out set.

        Placeholder: performs no evaluation work and logs a single
        ``eval/placeholder`` metric of 0.0, so the two timestamps are taken
        back to back. Override for dataset-specific evaluation logic.
        """
        start_time = time.time()
        end_time = time.time()

        eval_metrics = {"eval/placeholder": 0.0}
        await self.evaluate_log(
            metrics=eval_metrics,
            start_time=start_time,
            end_time=end_time,
        )

    async def wandb_log(self, wandb_metrics: Optional[Dict] = None) -> None:
        """
        Log SWE-specific metrics, then delegate to the base implementation.

        When rewards have accumulated since the last call, adds
        ``train/avg_reward`` (mean reward) and ``train/pass_rate`` (fraction
        of rewards equal to 1.0) and clears the buffer.

        Args:
            wandb_metrics: Metrics collected so far; a new dict is used if None.
        """
        if wandb_metrics is None:
            wandb_metrics = {}

        if self.reward_buffer:
            count = len(self.reward_buffer)
            passed = sum(1 for r in self.reward_buffer if r == 1.0)
            wandb_metrics["train/avg_reward"] = sum(self.reward_buffer) / count
            wandb_metrics["train/pass_rate"] = passed / count
            self.reward_buffer = []

        await super().wandb_log(wandb_metrics)


# Script entry point: when run directly (not imported), hand control to the
# environment's command-line interface. `cli()` is not defined in this file;
# presumably it is inherited from the base environment class.
if __name__ == "__main__":
    HermesSweEnv.cli()
Add support for Atropos Agentic RL environments (requires branch tool_call_support in Atropos atm) - Added new environments for reinforcement learning, including `HermesSweEnv` for software engineering tasks and `TerminalTestEnv` for inline testing. - Introduced `ToolContext` for unrestricted access to tools during reward computation. - Updated `.gitignore` to exclude `wandb/` directory. - Enhanced `README.md` with detailed architecture and usage instructions for Atropos environments. - Added configuration files for SWE and terminal test environments to streamline setup. - Removed unnecessary compiled Python files from `__pycache__`. 2026-02-07 09:17:16 +00:00			`"""`
			`HermesSweEnv -- SWE-Bench Style Environment with Modal Sandboxes`

			`A concrete environment for software engineering tasks where the model writes code`
			`and the reward function runs tests to verify correctness. Uses Modal terminal`
			`backend for cloud-isolated sandboxes per rollout.`

			`The reward function uses ToolContext.terminal() to run test commands in the same`
			`Modal sandbox the model used during its agentic loop. All filesystem state from`
			`the model's tool calls is preserved for verification.`

			`Usage:`
			`# Phase 1: OpenAI server type`
			`vllm serve YourModel --tool-parser hermes`
			`run-api`
			`python environments/hermes_swe_env.py serve \\`
			`--openai.base_url http://localhost:8000/v1 \\`
			`--openai.model_name YourModel \\`
			`--openai.server_type openai \\`
			`--env.dataset_name bigcode/humanevalpack \\`
			`--env.terminal_backend modal`

			`# Phase 2: VLLM server type (full RL training)`
			`python environments/hermes_swe_env.py serve \\`
			`--openai.base_url http://localhost:8000/v1 \\`
			`--openai.model_name YourModel \\`
			`--openai.server_type vllm \\`
			`--env.tool_call_parser hermes \\`
			`--env.terminal_backend modal`
			`"""`

			`import logging`
			`import sys`
			`import time`
			`from pathlib import Path`
			`from typing import Any, Dict, List, Optional, Tuple, Union`

			`# Ensure repo root is on sys.path for imports`
Add new environments and enhance tool context functionality - Introduced new environments: Terminal Test Environment and SWE Environment, each with default configurations for testing and software engineering tasks. - Added TerminalBench 2.0 evaluation environment with comprehensive setup for agentic LLMs, including task execution and verification. - Enhanced ToolContext with methods for uploading and downloading files, ensuring binary-safe operations. - Updated documentation across environments to reflect new features and usage instructions. - Refactored existing environment configurations for consistency and clarity. 2026-02-10 19:39:05 +00:00			`_repo_root = Path(__file__).resolve().parent.parent.parent`
Add support for Atropos Agentic RL environments (requires branch tool_call_support in Atropos atm) - Added new environments for reinforcement learning, including `HermesSweEnv` for software engineering tasks and `TerminalTestEnv` for inline testing. - Introduced `ToolContext` for unrestricted access to tools during reward computation. - Updated `.gitignore` to exclude `wandb/` directory. - Enhanced `README.md` with detailed architecture and usage instructions for Atropos environments. - Added configuration files for SWE and terminal test environments to streamline setup. - Removed unnecessary compiled Python files from `__pycache__`. 2026-02-07 09:17:16 +00:00			`if str(_repo_root) not in sys.path:`
			`sys.path.insert(0, str(_repo_root))`

			`from datasets import load_dataset`

			`from atroposlib.envs.base import ScoredDataGroup`
			`from atroposlib.envs.server_handling.server_manager import APIServerConfig`
			`from atroposlib.type_definitions import Item`

			`from environments.agent_loop import AgentResult`
			`from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig`
			`from environments.tool_context import ToolContext`

			`logger = logging.getLogger(__name__)`


			`class HermesSweEnvConfig(HermesAgentEnvConfig):`
			`"""Config with defaults for SWE-bench style tasks."""`

			`pass # Inherits all fields, overrides defaults in config_init`


			`class HermesSweEnv(HermesAgentBaseEnv):`
			`"""`
			`SWE-bench style environment using Modal terminal backend.`

			`The model gets a coding task, uses terminal + file + web tools to solve it,`
			`and the reward function runs tests in the same Modal sandbox to verify.`

			`Subclass this for specific SWE datasets (HumanEval, SWE-bench, etc.)`
			`and customize format_prompt() and compute_reward() as needed.`
			`"""`

			`name = "hermes-swe"`
			`env_config_cls = HermesSweEnvConfig`

			`@classmethod`
			`def config_init(cls) -> Tuple[HermesSweEnvConfig, List[APIServerConfig]]:`
			`"""`
			`Default configuration for the SWE environment.`

			`Uses Modal terminal backend for cloud isolation and terminal + file + web toolsets.`
			`"""`
			`env_config = HermesSweEnvConfig(`
			`# Toolsets: terminal for running code, file for reading/writing, web for docs`
			`enabled_toolsets=["terminal", "file", "web"],`
			`disabled_toolsets=None,`
			`distribution=None,`
			`# Agent settings -- SWE tasks need more turns`
			`max_agent_turns=30,`
			`max_token_length=4096,`
			`agent_temperature=1.0,`
			`system_prompt=(`
			`"You are a skilled software engineer. You have access to a terminal, "`
			`"file tools, and web search. Use these tools to complete the coding task. "`
			`"Write clean, working code and verify it runs correctly before finishing."`
			`),`
			`# Modal backend for cloud-isolated sandboxes`
			`terminal_backend="modal",`
			`# Dataset -- override via CLI for your specific SWE dataset`
			`dataset_name="bigcode/humanevalpack",`
			`dataset_split="test",`
			`prompt_field="prompt",`
			`# Atropos settings`
			`group_size=4,`
			`tokenizer_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview",`
			`tool_call_parser="hermes",`
			`steps_per_eval=50,`
			`total_steps=500,`
			`use_wandb=True,`
			`wandb_name="hermes-swe",`
			`)`

			`server_configs = [`
			`APIServerConfig(`
			`base_url="http://localhost:8000/v1",`
			`model_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview",`
			`server_type="openai", # Phase 1; switch to "vllm" for Phase 2`
			`api_key="",`
			`)`
			`]`

			`return env_config, server_configs`

			`async def setup(self):`
			`"""Load the SWE dataset."""`
			`if self.config.dataset_name:`
			`self.dataset = load_dataset(`
			`self.config.dataset_name, split=self.config.dataset_split`
			`)`
			`else:`
			`# Placeholder if no dataset specified`
			`self.dataset = []`
			`self.iter = 0`
			`self.reward_buffer: List[float] = []`

			`async def get_next_item(self) -> Dict[str, Any]:`
			`"""Cycle through the SWE dataset."""`
			`if not self.dataset:`
			`raise ValueError("No dataset loaded. Set dataset_name in config.")`
			`item = self.dataset[self.iter % len(self.dataset)]`
			`self.iter += 1`
			`return item`

			`def format_prompt(self, item: Dict[str, Any]) -> str:`
			`"""`
			`Format the SWE task prompt.`

			`Override this in subclasses for different dataset formats.`
			`Default assumes the dataset has a 'prompt' field and optionally a 'test' field.`
			`"""`
			`prompt = item.get(self.config.prompt_field, "")`

			`# If the dataset has test information, include it in the prompt`
			`test_info = item.get("test", item.get("test_code", item.get("tests", "")))`
			`if test_info:`
			`prompt += f"\n\nTests to pass:\n{test_info}"`

			`return prompt`

			`async def compute_reward(`
			`self, item: Dict[str, Any], result: AgentResult, ctx: ToolContext`
			`) -> float:`
			`"""`
			`Score by running tests in the model's Modal sandbox.`

			`Default implementation:`
			`- If the dataset item has a 'test' or 'test_code' field, run it`
			`- Check exit code: 0 = pass, non-zero = fail`
			`- Partial credit for file creation`

			`Override this in subclasses for more sophisticated reward logic.`
			`"""`
			`# Find the test command from the dataset item`
			`test_code = item.get("test", item.get("test_code", item.get("tests", "")))`

			`if test_code:`
			`# Run the test in the model's sandbox`
			`test_result = ctx.terminal(`
			`f'cd /workspace && python3 -c "{test_code}"', timeout=60`
			`)`

			`if test_result["exit_code"] == 0:`
			`self.reward_buffer.append(1.0)`
			`return 1.0`

			`# Partial credit: check if the model created any Python files`
			`file_check = ctx.terminal("find /workspace -name '*.py' -newer /tmp/.start_marker 2>/dev/null \| head -5")`
			`if file_check["exit_code"] == 0 and file_check.get("output", "").strip():`
			`self.reward_buffer.append(0.1)`
			`return 0.1`

			`self.reward_buffer.append(0.0)`
			`return 0.0`

			`async def evaluate(self, args, *kwargs):`
			`"""`
			`Run evaluation on a held-out set.`

			`Override for dataset-specific evaluation logic.`
			`"""`
			`start_time = time.time()`
			`end_time = time.time()`

			`eval_metrics = {"eval/placeholder": 0.0}`
			`await self.evaluate_log(`
			`metrics=eval_metrics,`
			`start_time=start_time,`
			`end_time=end_time,`
			`)`

			`async def wandb_log(self, wandb_metrics: Optional[Dict] = None):`
			`"""Log SWE-specific metrics."""`
			`if wandb_metrics is None:`
			`wandb_metrics = {}`

			`if self.reward_buffer:`
			`wandb_metrics["train/avg_reward"] = sum(self.reward_buffer) / len(`
			`self.reward_buffer`
			`)`
			`wandb_metrics["train/pass_rate"] = sum(`
			`1 for r in self.reward_buffer if r == 1.0`
			`) / len(self.reward_buffer)`
			`self.reward_buffer = []`

			`await super().wandb_log(wandb_metrics)`


			`if __name__ == "__main__":`
			`HermesSweEnv.cli()`