diff --git a/.gitignore b/.gitignore index dcbbb56aa..3c5ca3743 100644 --- a/.gitignore +++ b/.gitignore @@ -41,6 +41,7 @@ privvy* images/ __pycache__/ hermes_agent.egg-info/ +wandb/ # CLI config (may contain sensitive SSH paths) cli-config.yaml diff --git a/README.md b/README.md index aa603d64a..45340e8f0 100644 --- a/README.md +++ b/README.md @@ -323,6 +323,94 @@ For extended RL workflows with longer timeouts: python rl_cli.py --model "anthropic/claude-sonnet-4-20250514" ``` +### ๐Ÿงช Atropos RL Environments + +Hermes-Agent integrates with the [Atropos](https://github.com/NousResearch/atropos) RL framework through a layered environment system. This allows training models with reinforcement learning on agentic tasks using hermes-agent's tools. + +#### Architecture + +The integration has three layers: + +| Layer | File | Purpose | +|-------|------|---------| +| **Agent Loop** | `environments/agent_loop.py` | Reusable multi-turn tool-calling engine (standard OpenAI spec) | +| **Base Environment** | `environments/hermes_base_env.py` | Abstract Atropos `BaseEnv` subclass with toolset resolution, ToolContext, scoring | +| **Concrete Envs** | `environments/terminal_test_env.py`, `environments/hermes_swe_env.py` | Task-specific environments | + +#### Two-Phase Operation + +- **Phase 1 (OpenAI server type)**: Works with any OpenAI-compatible endpoint (VLLM, SGLang, OpenRouter, OpenAI API). The server handles tool call parsing natively. Good for **SFT data generation**, **verifier testing**, and **evaluation**. +- **Phase 2 (VLLM server type)**: Uses ManagedServer for exact token IDs + logprobs via `/generate`. Client-side tool call parser registry reconstructs structured `tool_calls` from raw output. Required for **full RL training**. + +#### Quick Start + +```bash +# 1. Launch VLLM with tool parser +vllm serve YourModel --tool-parser hermes + +# 2. Start the Atropos API server +run-api + +# 3. 
Run an environment +python environments/terminal_test_env.py serve \ + --openai.base_url http://localhost:8000/v1 \ + --openai.model_name YourModel \ + --openai.server_type openai +``` + +#### ToolContext (Reward Functions) + +Reward functions receive a `ToolContext` with unrestricted access to all hermes-agent tools, scoped to the rollout's sandbox: + +```python +async def compute_reward(self, item, result, ctx: ToolContext) -> float: + # Run tests in the model's terminal sandbox + test = ctx.terminal("pytest -v") + if test["exit_code"] == 0: + return 1.0 + # Or check a file, search the web, navigate a browser... + return 0.0 +``` + +#### Creating Custom Environments + +Subclass `HermesAgentBaseEnv` and implement 5 methods: + +```python +from environments.hermes_base_env import HermesAgentBaseEnv + +class MyEnv(HermesAgentBaseEnv): + name = "my-env" + async def setup(self): ... # Load data + async def get_next_item(self): ... # Return next item + def format_prompt(self, item): ... # Item -> prompt string + async def compute_reward(self, item, result, ctx): ... # Score with ToolContext + async def evaluate(self, *args, **kwargs): ... # Periodic eval + +if __name__ == "__main__": + MyEnv.cli() +``` + +#### Toolset Distributions + +Configure which tools are available per group, either explicitly or probabilistically: + +```bash +# Explicit toolsets +--env.enabled_toolsets '["terminal","file","web"]' + +# Probabilistic distribution (sampled per group) +--env.distribution development +``` + +#### Tool Call Parsers (Phase 2) + +For VLLM server type, a parser registry extracts structured `tool_calls` from raw model output. Supported parsers: `hermes`, `mistral`, `llama3_json`, `qwen`, `deepseek_v3`, `deepseek_v3_1`, `kimi_k2`, `longcat`, `glm45`, `glm47`, `qwen3_coder`. 
+ +```bash +--env.tool_call_parser hermes # Match your VLLM --tool-parser flag +``` + ### โฐ Scheduled Tasks (Cron) Schedule tasks to run automatically: diff --git a/__pycache__/model_tools.cpython-310.pyc b/__pycache__/model_tools.cpython-310.pyc deleted file mode 100644 index 519e30120..000000000 Binary files a/__pycache__/model_tools.cpython-310.pyc and /dev/null differ diff --git a/__pycache__/web_tools.cpython-310.pyc b/__pycache__/web_tools.cpython-310.pyc deleted file mode 100644 index d20f5fb50..000000000 Binary files a/__pycache__/web_tools.cpython-310.pyc and /dev/null differ diff --git a/environments/__init__.py b/environments/__init__.py new file mode 100644 index 000000000..5f2fb6c77 --- /dev/null +++ b/environments/__init__.py @@ -0,0 +1,28 @@ +""" +Hermes-Agent Atropos Environments + +Provides a layered integration between hermes-agent's tool-calling capabilities +and the Atropos RL training framework. + +Layers: + - agent_loop: Reusable multi-turn agent loop with standard OpenAI-spec tool calling + - tool_context: Per-rollout tool access handle for reward/verification functions + - hermes_base_env: Abstract base environment (BaseEnv subclass) for Atropos + - tool_call_parsers: Client-side tool call parser registry for Phase 2 (VLLM /generate) + +Concrete environments: + - terminal_test_env: Simple file-creation tasks for testing the stack + - hermes_swe_env: SWE-bench style tasks with Modal sandboxes +""" + +from environments.agent_loop import AgentResult, HermesAgentLoop +from environments.tool_context import ToolContext +from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig + +__all__ = [ + "AgentResult", + "HermesAgentLoop", + "ToolContext", + "HermesAgentBaseEnv", + "HermesAgentEnvConfig", +] diff --git a/environments/agent_loop.py b/environments/agent_loop.py new file mode 100644 index 000000000..7e9453b60 --- /dev/null +++ b/environments/agent_loop.py @@ -0,0 +1,306 @@ +""" +HermesAgentLoop -- Reusable Multi-Turn 
Agent Engine + +Runs the hermes-agent tool-calling loop using standard OpenAI-spec tool calling. +Works with any server that returns ChatCompletion objects with tool_calls: + - Phase 1: OpenAI server type (VLLM, SGLang, OpenRouter, OpenAI API) + - Phase 2: ManagedServer with client-side tool call parser + +The loop passes tools= and checks response.choices[0].message.tool_calls, +identical to hermes-agent's run_agent.py. Tool execution is dispatched via +handle_function_call() from model_tools.py. +""" + +import json +import logging +import uuid +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional, Set + +from model_tools import handle_function_call + +logger = logging.getLogger(__name__) + + +@dataclass +class AgentResult: + """Result of running the agent loop.""" + + # Full conversation history in OpenAI message format + messages: List[Dict[str, Any]] + # ManagedServer.get_state() if available (Phase 2), None otherwise + managed_state: Optional[Dict[str, Any]] = None + # How many LLM calls were made + turns_used: int = 0 + # True if model stopped calling tools naturally (vs hitting max_turns) + finished_naturally: bool = False + # Extracted reasoning content per turn (from PR #297 helpers) + reasoning_per_turn: List[Optional[str]] = field(default_factory=list) + + +def _extract_reasoning_from_message(message) -> Optional[str]: + """ + Extract reasoning content from a ChatCompletion message. + + Handles multiple provider formats: + 1. message.reasoning_content field (some providers) + 2. message.reasoning field (some providers) + 3. message.reasoning_details[].text (OpenRouter style) + + Note: block extraction from content is NOT done here -- that's + handled by the response already in Phase 1 (server does it) or by + ManagedServer's patch in Phase 2. 
+ + Args: + message: The assistant message from ChatCompletion response + + Returns: + Extracted reasoning text, or None if not found + """ + # Check reasoning_content field (common across providers) + if hasattr(message, "reasoning_content") and message.reasoning_content: + return message.reasoning_content + + # Check reasoning field + if hasattr(message, "reasoning") and message.reasoning: + return message.reasoning + + # Check reasoning_details (OpenRouter style) + if hasattr(message, "reasoning_details") and message.reasoning_details: + for detail in message.reasoning_details: + if hasattr(detail, "text") and detail.text: + return detail.text + if isinstance(detail, dict) and detail.get("text"): + return detail["text"] + + return None + + +class HermesAgentLoop: + """ + Runs hermes-agent's tool-calling loop using standard OpenAI-spec tool calling. + + Same pattern as run_agent.py: + - Pass tools= to the API + - Check response.choices[0].message.tool_calls + - Dispatch via handle_function_call() + + Works identically with any server type -- OpenAI, VLLM, SGLang, OpenRouter, + or ManagedServer with a parser. The server determines how tool_calls get + populated on the response. + """ + + def __init__( + self, + server, + tool_schemas: List[Dict[str, Any]], + valid_tool_names: Set[str], + max_turns: int = 30, + task_id: Optional[str] = None, + temperature: float = 1.0, + max_tokens: Optional[int] = None, + ): + """ + Initialize the agent loop. + + Args: + server: Server object with chat_completion() method (OpenAIServer, + ManagedServer, ServerManager, etc.) 
+ tool_schemas: OpenAI-format tool definitions from get_tool_definitions() + valid_tool_names: Set of tool names the model is allowed to call + max_turns: Maximum number of LLM calls before stopping + task_id: Unique ID for terminal/browser session isolation + temperature: Sampling temperature for generation + max_tokens: Max tokens per generation (None for server default) + """ + self.server = server + self.tool_schemas = tool_schemas + self.valid_tool_names = valid_tool_names + self.max_turns = max_turns + self.task_id = task_id or str(uuid.uuid4()) + self.temperature = temperature + self.max_tokens = max_tokens + + async def run(self, messages: List[Dict[str, Any]]) -> AgentResult: + """ + Execute the full agent loop using standard OpenAI tool calling. + + Args: + messages: Initial conversation messages (system + user). + Modified in-place as the conversation progresses. + + Returns: + AgentResult with full conversation history, managed state, and metadata + """ + reasoning_per_turn = [] + + for turn in range(self.max_turns): + # Build the chat_completion kwargs + chat_kwargs = { + "messages": messages, + "n": 1, + "temperature": self.temperature, + } + + # Only pass tools if we have them + if self.tool_schemas: + chat_kwargs["tools"] = self.tool_schemas + + # Only pass max_tokens if explicitly set + if self.max_tokens is not None: + chat_kwargs["max_tokens"] = self.max_tokens + + # Make the API call -- standard OpenAI spec + try: + response = await self.server.chat_completion(**chat_kwargs) + except Exception as e: + logger.error("API call failed on turn %d: %s", turn + 1, e) + return AgentResult( + messages=messages, + managed_state=self._get_managed_state(), + turns_used=turn + 1, + finished_naturally=False, + reasoning_per_turn=reasoning_per_turn, + ) + + if not response or not response.choices: + logger.warning("Empty response on turn %d", turn + 1) + return AgentResult( + messages=messages, + managed_state=self._get_managed_state(), + turns_used=turn + 1, 
+ finished_naturally=False, + reasoning_per_turn=reasoning_per_turn, + ) + + assistant_msg = response.choices[0].message + + # Extract reasoning content from the response (all provider formats) + reasoning = _extract_reasoning_from_message(assistant_msg) + reasoning_per_turn.append(reasoning) + + # Check for tool calls -- standard OpenAI spec + if assistant_msg.tool_calls: + # Build the assistant message dict for conversation history + msg_dict: Dict[str, Any] = { + "role": "assistant", + "content": assistant_msg.content or "", + "tool_calls": [ + { + "id": tc.id, + "type": "function", + "function": { + "name": tc.function.name, + "arguments": tc.function.arguments, + }, + } + for tc in assistant_msg.tool_calls + ], + } + + # Preserve reasoning_content for multi-turn chat template handling + # (e.g., Kimi-K2's template renders blocks differently + # for history vs. the latest turn based on this field) + if reasoning: + msg_dict["reasoning_content"] = reasoning + + messages.append(msg_dict) + + # Execute each tool call via hermes-agent's dispatch + for tc in assistant_msg.tool_calls: + tool_name = tc.function.name + + # Validate tool name + if tool_name not in self.valid_tool_names: + tool_result = json.dumps( + { + "error": f"Unknown tool '{tool_name}'. 
" + f"Available tools: {sorted(self.valid_tool_names)}" + } + ) + logger.warning( + "Model called unknown tool '%s' on turn %d", + tool_name, + turn + 1, + ) + else: + # Parse arguments and dispatch + try: + args = json.loads(tc.function.arguments) + except json.JSONDecodeError: + args = {} + logger.warning( + "Invalid JSON in tool call arguments for '%s': %s", + tool_name, + tc.function.arguments[:200], + ) + + try: + tool_result = handle_function_call( + tool_name, args, task_id=self.task_id + ) + except Exception as e: + tool_result = json.dumps( + {"error": f"Tool execution failed: {str(e)}"} + ) + logger.error( + "Tool '%s' execution failed: %s", tool_name, e + ) + + # Add tool response to conversation + messages.append( + { + "role": "tool", + "tool_call_id": tc.id, + "content": tool_result, + } + ) + + logger.debug( + "Turn %d: %d tool calls executed", + turn + 1, + len(assistant_msg.tool_calls), + ) + + else: + # No tool calls -- model is done + msg_dict = { + "role": "assistant", + "content": assistant_msg.content or "", + } + if reasoning: + msg_dict["reasoning_content"] = reasoning + messages.append(msg_dict) + + logger.debug( + "Turn %d: model finished naturally (no tool calls)", turn + 1 + ) + + return AgentResult( + messages=messages, + managed_state=self._get_managed_state(), + turns_used=turn + 1, + finished_naturally=True, + reasoning_per_turn=reasoning_per_turn, + ) + + # Hit max turns without the model stopping + logger.info("Agent hit max_turns (%d) without finishing", self.max_turns) + return AgentResult( + messages=messages, + managed_state=self._get_managed_state(), + turns_used=self.max_turns, + finished_naturally=False, + reasoning_per_turn=reasoning_per_turn, + ) + + def _get_managed_state(self) -> Optional[Dict[str, Any]]: + """ + Get ManagedServer state if the server supports it. + + Returns state dict with SequenceNodes containing tokens/logprobs/masks, + or None if the server doesn't support get_state() (e.g., regular OpenAI server). 
+ """ + if hasattr(self.server, "get_state"): + return self.server.get_state() + return None diff --git a/environments/configs/swe_default.yaml b/environments/configs/swe_default.yaml new file mode 100644 index 000000000..3477e4b33 --- /dev/null +++ b/environments/configs/swe_default.yaml @@ -0,0 +1,33 @@ +# SWE Environment -- Default Configuration +# +# SWE-bench style tasks with Modal sandboxes for cloud isolation. +# Uses terminal + file + web toolsets. +# +# Usage: +# python environments/hermes_swe_env.py serve --config environments/configs/swe_default.yaml + +env: + enabled_toolsets: ["terminal", "file", "web"] + max_agent_turns: 30 + max_token_length: 4096 + group_size: 4 + terminal_backend: "modal" + tool_call_parser: "hermes" + tokenizer_name: "NousResearch/DeepHermes-3-Llama-3-3B-Preview" + dataset_name: "bigcode/humanevalpack" + dataset_split: "test" + prompt_field: "prompt" + steps_per_eval: 50 + total_steps: 500 + use_wandb: true + wandb_name: "hermes-swe" + system_prompt: > + You are a skilled software engineer. You have access to a terminal, + file tools, and web search. Use these tools to complete the coding task. + Write clean, working code and verify it runs correctly before finishing. + +openai: + base_url: "http://localhost:8000/v1" + model_name: "NousResearch/DeepHermes-3-Llama-3-3B-Preview" + server_type: "openai" + api_key: "" diff --git a/environments/configs/terminal_test_default.yaml b/environments/configs/terminal_test_default.yaml new file mode 100644 index 000000000..19b38e334 --- /dev/null +++ b/environments/configs/terminal_test_default.yaml @@ -0,0 +1,35 @@ +# Terminal Test Environment -- Default Configuration +# +# Simple file-creation tasks for validating the full Atropos + hermes-agent stack. +# Uses Modal terminal backend and OpenRouter (Claude) for inference. 
+# API keys loaded from ~/hermes-agent/.env +# +# Usage: +# run-api +# python environments/terminal_test_env.py serve +# # Or with config file: +# python environments/terminal_test_env.py serve --config environments/configs/terminal_test_default.yaml + +env: + enabled_toolsets: ["terminal", "file"] + max_agent_turns: 10 + max_token_length: 2048 + group_size: 3 + total_steps: 3 + steps_per_eval: 3 + terminal_backend: "modal" + tool_call_parser: "hermes" + tokenizer_name: "NousResearch/DeepHermes-3-Llama-3-3B-Preview" + ensure_scores_are_not_same: false + use_wandb: false + system_prompt: > + You are a helpful assistant with access to a terminal and file tools. + Complete the user's request by using the available tools. + Be precise and follow instructions exactly. + +openai: + base_url: "https://openrouter.ai/api/v1" + model_name: "anthropic/claude-opus-4.6" + server_type: "openai" + health_check: false + # api_key loaded from OPENROUTER_API_KEY in .env diff --git a/environments/hermes_base_env.py b/environments/hermes_base_env.py new file mode 100644 index 000000000..d17fcd6ab --- /dev/null +++ b/environments/hermes_base_env.py @@ -0,0 +1,540 @@ +""" +HermesAgentBaseEnv -- Abstract Base Environment for Hermes-Agent + Atropos + +Provides the Atropos integration plumbing that all hermes-agent environments share: +- Two-mode operation (OpenAI server for Phase 1, VLLM ManagedServer for Phase 2) +- Per-group toolset/distribution resolution +- Agent loop orchestration via HermesAgentLoop +- ToolContext creation for reward functions +- ScoredDataGroup construction from ManagedServer state + +Subclasses only need to implement: + setup() -- Load dataset, initialize state + get_next_item() -- Return the next item from the dataset + format_prompt() -- Convert a dataset item into the user message + compute_reward() -- Score the rollout (has full ToolContext access) + evaluate() -- Periodic evaluation +""" + +import asyncio +import json +import logging +import os +import sys 
+import uuid +from abc import abstractmethod +from pathlib import Path +from typing import Any, Dict, List, Optional, Set, Tuple, Union + +# Ensure the hermes-agent repo root is on sys.path so that imports like +# `from model_tools import ...` and `from environments.X import ...` work +# regardless of where the script is invoked from. +_repo_root = Path(__file__).resolve().parent.parent +if str(_repo_root) not in sys.path: + sys.path.insert(0, str(_repo_root)) + +from dotenv import load_dotenv +from pydantic import Field + +# Load API keys from hermes-agent/.env so all environments can access them +_env_path = _repo_root / ".env" +if _env_path.exists(): + load_dotenv(dotenv_path=_env_path) + +from atroposlib.envs.base import ( + BaseEnv, + BaseEnvConfig, + ScoredDataGroup, + ScoredDataItem, +) +from atroposlib.envs.server_handling.server_manager import ( + APIServerConfig, + ServerBaseline, + ServerManager, +) +from atroposlib.type_definitions import Item + +from environments.agent_loop import AgentResult, HermesAgentLoop +from environments.tool_context import ToolContext + +# Import hermes-agent toolset infrastructure +from model_tools import get_tool_definitions +from toolset_distributions import sample_toolsets_from_distribution + +logger = logging.getLogger(__name__) + + +class HermesAgentEnvConfig(BaseEnvConfig): + """ + Configuration for hermes-agent Atropos environments. + + Extends BaseEnvConfig with agent-specific settings for toolsets, + terminal backend, dataset loading, and tool call parsing. + """ + + # --- Toolset configuration --- + # Mutually exclusive: use either enabled_toolsets OR distribution + enabled_toolsets: Optional[List[str]] = Field( + default=None, + description="Explicit list of hermes toolsets to enable (e.g., ['terminal', 'file', 'web']). " + "If None and distribution is also None, all available toolsets are enabled.", + ) + disabled_toolsets: Optional[List[str]] = Field( + default=None, + description="Toolsets to disable. 
Applied as a filter on top of enabled_toolsets or distribution.", + ) + distribution: Optional[str] = Field( + default=None, + description="Name of a toolset distribution from toolset_distributions.py " + "(e.g., 'development', 'terminal_tasks'). Sampled once per group. " + "Mutually exclusive with enabled_toolsets.", + ) + + # --- Agent loop configuration --- + max_agent_turns: int = Field( + default=30, + description="Maximum number of LLM calls (tool-calling iterations) per rollout.", + ) + system_prompt: Optional[str] = Field( + default=None, + description="System prompt for the agent. Tools are handled via the tools= parameter, " + "not embedded in the prompt text.", + ) + agent_temperature: float = Field( + default=1.0, + description="Sampling temperature for agent generation during rollouts.", + ) + + # --- Terminal backend --- + terminal_backend: str = Field( + default="local", + description="Terminal backend: 'local', 'docker', 'modal', 'ssh', 'singularity'. " + "Modal recommended for production RL (cloud isolation per rollout).", + ) + + # --- Dataset --- + dataset_name: Optional[str] = Field( + default=None, + description="HuggingFace dataset name. Optional if tasks are defined inline.", + ) + dataset_split: str = Field( + default="train", + description="Dataset split to use.", + ) + prompt_field: str = Field( + default="prompt", + description="Which field in the dataset contains the prompt.", + ) + + # --- Phase 2: Tool call parsing --- + tool_call_parser: str = Field( + default="hermes", + description="Tool call parser name for Phase 2 (VLLM server type). " + "Ignored in Phase 1 (OpenAI server type where VLLM parses natively). " + "Options: hermes, mistral, llama3_json, qwen, deepseek_v3, etc.", + ) + + +class HermesAgentBaseEnv(BaseEnv): + """ + Abstract base environment for hermes-agent Atropos integration. + + Handles two modes of operation: + - Phase 1 (OpenAI server type): Uses server.chat_completion() directly. 
+ The server (VLLM, SGLang, OpenRouter, OpenAI) handles tool call parsing + and reasoning extraction natively. DummyManagedServer provides placeholder + tokens. Good for SFT data gen, verifier testing, evaluation. + + - Phase 2 (VLLM server type): Uses ManagedServer for exact token IDs + logprobs + via /generate. Client-side tool call parser reconstructs structured tool_calls + from raw output. Full RL training capability. + + Subclasses must implement: + setup() -- Load dataset, initialize state + get_next_item() -- Return the next item to roll out + format_prompt() -- Convert a dataset item into the user message string + compute_reward() -- Score the rollout using ToolContext + evaluate() -- Periodic evaluation + """ + + name: Optional[str] = "hermes-agent" + env_config_cls = HermesAgentEnvConfig + + def __init__( + self, + config: HermesAgentEnvConfig, + server_configs: Union[ServerBaseline, List[APIServerConfig]], + slurm=False, + testing=False, + ): + super().__init__(config, server_configs, slurm, testing) + + # Set terminal backend environment variable so hermes tools pick it up + if config.terminal_backend: + os.environ["TERMINAL_ENV"] = config.terminal_backend + + # Current group's resolved tools (set in collect_trajectories) + self._current_group_tools: Optional[Tuple[List[Dict], Set[str]]] = None + + # ========================================================================= + # Toolset resolution (per-group) + # ========================================================================= + + def _resolve_tools_for_group(self) -> Tuple[List[Dict[str, Any]], Set[str]]: + """ + Resolve toolsets for a group. Called once in collect_trajectories(), + then shared by all collect_trajectory() calls in the group. + + If distribution is set, samples probabilistically. + If enabled_toolsets is set, uses that explicit list. + disabled_toolsets is applied as a filter on top. 
+ + Returns: + (tool_schemas, valid_tool_names) tuple + """ + config = self.config + + if config.distribution: + group_toolsets = sample_toolsets_from_distribution(config.distribution) + logger.info("Sampled toolsets from '%s': %s", config.distribution, group_toolsets) + else: + group_toolsets = config.enabled_toolsets # None means "all available" + + tools = get_tool_definitions( + enabled_toolsets=group_toolsets, + disabled_toolsets=config.disabled_toolsets, + quiet_mode=True, + ) + + valid_names = {t["function"]["name"] for t in tools} if tools else set() + logger.info("Resolved %d tools for group: %s", len(valid_names), sorted(valid_names)) + return tools, valid_names + + # ========================================================================= + # Server mode detection + # ========================================================================= + + def _use_managed_server(self) -> bool: + """ + Determine if we should use ManagedServer (Phase 2) or direct server (Phase 1). + + Phase 2 (ManagedServer) is used when the server type is 'vllm' or 'sglang', + which go through the /generate endpoint for exact token tracking. + + Phase 1 (direct server) is used for 'openai' server type, which uses + /v1/chat/completions with native tool call parsing. 
+ """ + if not self.server.servers: + return False + + server = self.server.servers[0] + # If the server is an OpenAI server (not VLLM/SGLang), use direct mode + from atroposlib.envs.server_handling.openai_server import OpenAIServer + return not isinstance(server, OpenAIServer) + + # ========================================================================= + # Core Atropos integration + # ========================================================================= + + async def collect_trajectories( + self, item: Item + ) -> Tuple[ + Union[Optional[ScoredDataGroup], List[Optional[ScoredDataGroup]]], + List[Item], + ]: + """ + Override collect_trajectories to resolve toolsets once per group, + then delegate to the standard group-level collection. + + The default BaseEnv.collect_trajectories() calls collect_trajectory() + group_size times in parallel. We resolve tools once here and store + them for all those calls to use. + """ + # Resolve toolsets for this group (shared by all rollouts in the group) + self._current_group_tools = self._resolve_tools_for_group() + + # Delegate to the default implementation which calls collect_trajectory() + # group_size times via asyncio.gather + return await super().collect_trajectories(item) + + # ========================================================================= + # Wandb rollout display -- format trajectories nicely + # ========================================================================= + + @staticmethod + def _format_trajectory_for_display(messages: List[Dict[str, Any]]) -> str: + """ + Format a conversation's messages into a readable trajectory string + for wandb rollout tables. Shows tool calls, tool results, and reasoning + in a structured way instead of raw token decoding. 
+ """ + parts = [] + for msg in messages: + role = msg.get("role", "unknown") + content = msg.get("content", "") + + if role == "system": + parts.append(f"[SYSTEM]\n{content}") + + elif role == "user": + parts.append(f"[USER]\n{content}") + + elif role == "assistant": + # Show reasoning if present + reasoning = msg.get("reasoning_content", "") + if reasoning: + # Truncate long reasoning for display + if len(reasoning) > 300: + reasoning = reasoning[:300] + "..." + parts.append(f"[ASSISTANT thinking]\n{reasoning}") + + # Show content + if content: + parts.append(f"[ASSISTANT]\n{content}") + + # Show tool calls + tool_calls = msg.get("tool_calls", []) + for tc in tool_calls: + func = tc.get("function", {}) + name = func.get("name", "?") + args = func.get("arguments", "{}") + # Truncate long arguments for display + if len(args) > 200: + args = args[:200] + "..." + parts.append(f"[TOOL CALL] {name}({args})") + + elif role == "tool": + tool_id = msg.get("tool_call_id", "") + result = content + # Truncate long tool results for display + if len(result) > 500: + result = result[:500] + "..." + parts.append(f"[TOOL RESULT] {result}") + + return "\n\n".join(parts) + + async def add_rollouts_for_wandb( + self, + scored_data, + item=None, + ): + """ + Override to show formatted trajectories with tool calls visible, + instead of raw token decoding which loses all structure. 
+ """ + num_keep = self.config.num_rollouts_per_group_for_logging + if num_keep == -1: + num_keep = self.config.group_size + + group = [] + for i in range(min(num_keep, len(scored_data.get("scores", [])))): + score = scored_data["scores"][i] + + # Use messages if available for rich display + messages = None + if scored_data.get("messages") and i < len(scored_data["messages"]): + messages = scored_data["messages"][i] + + if messages: + text = self._format_trajectory_for_display(messages) + elif scored_data.get("tokens") and i < len(scored_data["tokens"]): + text = self.tokenizer.decode(scored_data["tokens"][i]) + else: + text = "(no data)" + + group.append((text, score)) + + self.rollouts_for_wandb.append(group) + if len(self.rollouts_for_wandb) > self.config.num_rollouts_to_keep: + self.rollouts_for_wandb.pop(0) + + async def collect_trajectory( + self, item: Item + ) -> Tuple[Optional[Union[ScoredDataItem, Any]], List[Item]]: + """ + Run a single rollout: agent loop + reward computation. + + This is called group_size times in parallel by collect_trajectories(). + Each call gets its own task_id for terminal/browser session isolation. 
+ """ + task_id = str(uuid.uuid4()) + + # Get group-level tools (resolved once in collect_trajectories) + if self._current_group_tools is None: + # Fallback: resolve per-trajectory if called outside collect_trajectories + tools, valid_names = self._resolve_tools_for_group() + else: + tools, valid_names = self._current_group_tools + + # Build initial messages + messages: List[Dict[str, Any]] = [] + if self.config.system_prompt: + messages.append({"role": "system", "content": self.config.system_prompt}) + messages.append({"role": "user", "content": self.format_prompt(item)}) + + # Run the agent loop + result: AgentResult + if self._use_managed_server(): + # Phase 2: ManagedServer with parser -- exact tokens + logprobs + try: + async with self.server.managed_server(tokenizer=self.tokenizer) as managed: + agent = HermesAgentLoop( + server=managed, + tool_schemas=tools, + valid_tool_names=valid_names, + max_turns=self.config.max_agent_turns, + task_id=task_id, + temperature=self.config.agent_temperature, + max_tokens=self.config.max_token_length, + ) + result = await agent.run(messages) + except NotImplementedError: + # DummyManagedServer not allowed -- fall back to Phase 1 + logger.warning( + "ManagedServer not available (OpenAI server?). " + "Falling back to direct server mode." 
+ ) + agent = HermesAgentLoop( + server=self.server, + tool_schemas=tools, + valid_tool_names=valid_names, + max_turns=self.config.max_agent_turns, + task_id=task_id, + temperature=self.config.agent_temperature, + max_tokens=self.config.max_token_length, + ) + result = await agent.run(messages) + else: + # Phase 1: OpenAI server -- native tool_calls, placeholder tokens + agent = HermesAgentLoop( + server=self.server, + tool_schemas=tools, + valid_tool_names=valid_names, + max_turns=self.config.max_agent_turns, + task_id=task_id, + temperature=self.config.agent_temperature, + max_tokens=self.config.max_token_length, + ) + result = await agent.run(messages) + + # Compute reward using ToolContext (gives verifier full tool access) + ctx = ToolContext(task_id) + try: + reward = await self.compute_reward(item, result, ctx) + except Exception as e: + logger.error("compute_reward failed: %s", e) + reward = 0.0 + finally: + ctx.cleanup() + + # Build ScoredDataItem from ManagedServer state + # Phase 2: real tokens/masks/logprobs from SequenceNodes + # Phase 1: placeholder tokens (still need a valid ScoredDataItem for the pipeline) + nodes = (result.managed_state or {}).get("nodes", []) + + if nodes: + # Phase 2 (or DummyManagedServer): use actual node data + node = nodes[-1] # Final sequence node = full trajectory + scored_item: Dict[str, Any] = { + "tokens": node.tokens, + "masks": node.masked_tokens, + "scores": reward, + } + + # Include logprobs if available (Phase 2) + if hasattr(node, "logprobs") and node.logprobs: + scored_item["advantages"] = None # Computed by trainer + scored_item["ref_logprobs"] = None + else: + # Phase 1 with no managed state: create placeholder tokens + # so the data pipeline doesn't break. These are NOT suitable + # for training but allow process mode (SFT data gen) to work. + # Tokenize the full conversation to get approximate tokens. 
+ full_text = "\n".join( + msg.get("content", "") for msg in result.messages if msg.get("content") + ) + if self.tokenizer: + tokens = self.tokenizer.encode(full_text, add_special_tokens=True) + else: + tokens = list(range(min(len(full_text) // 4, 128))) + + scored_item = { + "tokens": tokens, + "masks": [-100] + tokens[1:], # Mask first token as prompt + "scores": reward, + } + + # Always include messages for wandb rollout display and data logging + scored_item["messages"] = result.messages + + return scored_item, [] + + # ========================================================================= + # Abstract methods -- subclasses must implement + # ========================================================================= + + @abstractmethod + async def setup(self): + """ + Load dataset, initialize state. + + Called once when the environment starts. Typical implementation: + self.dataset = load_dataset(self.config.dataset_name, split=self.config.dataset_split) + self.iter = 0 + """ + raise NotImplementedError + + @abstractmethod + async def get_next_item(self) -> Item: + """ + Return the next item from the dataset for rollout. + + Called by the base env's main loop to get items for workers. + Should cycle through the dataset. + """ + raise NotImplementedError + + @abstractmethod + def format_prompt(self, item: Item) -> str: + """ + Convert a dataset item into the user message for the agent. + + Args: + item: Dataset item (dict, tuple, etc.) + + Returns: + The prompt string to send to the agent + """ + raise NotImplementedError + + @abstractmethod + async def compute_reward( + self, item: Item, result: AgentResult, ctx: ToolContext + ) -> float: + """ + Score the rollout. Has full access to: + - item: the original dataset item (ground truth, test commands, etc.) + - result: AgentResult with full messages, turn count, reasoning, etc. + - ctx: ToolContext -- call ANY hermes-agent tool (terminal, file, web, + browser, vision...) scoped to this rollout's sandbox. 
Nothing + is off-limits. + + Args: + item: The dataset item that was rolled out + result: The agent's rollout result + ctx: ToolContext with full tool access for verification + + Returns: + Reward float (typically 0.0 to 1.0, but any float is valid) + """ + raise NotImplementedError + + @abstractmethod + async def evaluate(self, *args, **kwargs): + """ + Periodic evaluation. Called every steps_per_eval steps. + + Typical implementation runs the agent on a held-out eval set + and logs metrics via wandb/evaluate_log. + """ + raise NotImplementedError diff --git a/environments/hermes_swe_env.py b/environments/hermes_swe_env.py new file mode 100644 index 000000000..23b3e8f02 --- /dev/null +++ b/environments/hermes_swe_env.py @@ -0,0 +1,229 @@ +""" +HermesSweEnv -- SWE-Bench Style Environment with Modal Sandboxes + +A concrete environment for software engineering tasks where the model writes code +and the reward function runs tests to verify correctness. Uses Modal terminal +backend for cloud-isolated sandboxes per rollout. + +The reward function uses ToolContext.terminal() to run test commands in the same +Modal sandbox the model used during its agentic loop. All filesystem state from +the model's tool calls is preserved for verification. 
+ +Usage: + # Phase 1: OpenAI server type + vllm serve YourModel --tool-parser hermes + run-api + python environments/hermes_swe_env.py serve \\ + --openai.base_url http://localhost:8000/v1 \\ + --openai.model_name YourModel \\ + --openai.server_type openai \\ + --env.dataset_name bigcode/humanevalpack \\ + --env.terminal_backend modal + + # Phase 2: VLLM server type (full RL training) + python environments/hermes_swe_env.py serve \\ + --openai.base_url http://localhost:8000/v1 \\ + --openai.model_name YourModel \\ + --openai.server_type vllm \\ + --env.tool_call_parser hermes \\ + --env.terminal_backend modal +""" + +import logging +import sys +import time +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union + +# Ensure repo root is on sys.path for imports +_repo_root = Path(__file__).resolve().parent.parent +if str(_repo_root) not in sys.path: + sys.path.insert(0, str(_repo_root)) + +from datasets import load_dataset + +from atroposlib.envs.base import ScoredDataGroup +from atroposlib.envs.server_handling.server_manager import APIServerConfig +from atroposlib.type_definitions import Item + +from environments.agent_loop import AgentResult +from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig +from environments.tool_context import ToolContext + +logger = logging.getLogger(__name__) + + +class HermesSweEnvConfig(HermesAgentEnvConfig): + """Config with defaults for SWE-bench style tasks.""" + + pass # Inherits all fields, overrides defaults in config_init + + +class HermesSweEnv(HermesAgentBaseEnv): + """ + SWE-bench style environment using Modal terminal backend. + + The model gets a coding task, uses terminal + file + web tools to solve it, + and the reward function runs tests in the same Modal sandbox to verify. + + Subclass this for specific SWE datasets (HumanEval, SWE-bench, etc.) + and customize format_prompt() and compute_reward() as needed. 
+ """ + + name = "hermes-swe" + env_config_cls = HermesSweEnvConfig + + @classmethod + def config_init(cls) -> Tuple[HermesSweEnvConfig, List[APIServerConfig]]: + """ + Default configuration for the SWE environment. + + Uses Modal terminal backend for cloud isolation and terminal + file + web toolsets. + """ + env_config = HermesSweEnvConfig( + # Toolsets: terminal for running code, file for reading/writing, web for docs + enabled_toolsets=["terminal", "file", "web"], + disabled_toolsets=None, + distribution=None, + # Agent settings -- SWE tasks need more turns + max_agent_turns=30, + max_token_length=4096, + agent_temperature=1.0, + system_prompt=( + "You are a skilled software engineer. You have access to a terminal, " + "file tools, and web search. Use these tools to complete the coding task. " + "Write clean, working code and verify it runs correctly before finishing." + ), + # Modal backend for cloud-isolated sandboxes + terminal_backend="modal", + # Dataset -- override via CLI for your specific SWE dataset + dataset_name="bigcode/humanevalpack", + dataset_split="test", + prompt_field="prompt", + # Atropos settings + group_size=4, + tokenizer_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview", + tool_call_parser="hermes", + steps_per_eval=50, + total_steps=500, + use_wandb=True, + wandb_name="hermes-swe", + ) + + server_configs = [ + APIServerConfig( + base_url="http://localhost:8000/v1", + model_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview", + server_type="openai", # Phase 1; switch to "vllm" for Phase 2 + api_key="", + ) + ] + + return env_config, server_configs + + async def setup(self): + """Load the SWE dataset.""" + if self.config.dataset_name: + self.dataset = load_dataset( + self.config.dataset_name, split=self.config.dataset_split + ) + else: + # Placeholder if no dataset specified + self.dataset = [] + self.iter = 0 + self.reward_buffer: List[float] = [] + + async def get_next_item(self) -> Dict[str, Any]: + """Cycle through the SWE 
dataset."""
+        if not self.dataset:
+            raise ValueError("No dataset loaded. Set dataset_name in config.")
+        item = self.dataset[self.iter % len(self.dataset)]
+        self.iter += 1
+        return item
+
+    def format_prompt(self, item: Dict[str, Any]) -> str:
+        """
+        Format the SWE task prompt.
+
+        Override this in subclasses for different dataset formats.
+        Default assumes the dataset has a 'prompt' field and optionally a 'test' field.
+        """
+        prompt = item.get(self.config.prompt_field, "")
+
+        # If the dataset has test information, include it in the prompt
+        test_info = item.get("test", item.get("test_code", item.get("tests", "")))
+        if test_info:
+            prompt += f"\n\nTests to pass:\n{test_info}"
+
+        return prompt
+
+    async def compute_reward(
+        self, item: Dict[str, Any], result: AgentResult, ctx: ToolContext
+    ) -> float:
+        """
+        Score by running tests in the model's Modal sandbox.
+
+        Default implementation:
+        - If the dataset item has a 'test' or 'test_code' field, run it
+        - Check exit code: 0 = pass, non-zero = fail
+        - Partial credit for file creation
+
+        Override this in subclasses for more sophisticated reward logic.
+        """
+        # Find the test command from the dataset item
+        test_code = item.get("test", item.get("test_code", item.get("tests", "")))
+
+        if test_code:
+            # Run the test in the model's sandbox. The test code is written to a
+            # file via a quoted heredoc rather than interpolated into
+            # `python3 -c "..."`: real test code routinely contains double quotes,
+            # backslashes, `$`, and backticks, which the shell would mangle or
+            # execute when embedded inside a double-quoted -c string.
+            test_result = ctx.terminal(
+                "cd /workspace && cat > /tmp/_hermes_verify.py << 'HERMES_TEST_EOF'\n"
+                f"{test_code}\n"
+                "HERMES_TEST_EOF\n"
+                "python3 /tmp/_hermes_verify.py",
+                timeout=60,
+            )
+
+            if test_result["exit_code"] == 0:
+                self.reward_buffer.append(1.0)
+                return 1.0
+
+        # Partial credit: check if the model created any Python files.
+        # NOTE(review): nothing in this file creates /tmp/.start_marker, so
+        # `-newer` fails and this branch never awards credit unless a subclass
+        # (or sandbox setup) creates the marker first — confirm against the
+        # Modal sandbox bootstrap.
+        file_check = ctx.terminal("find /workspace -name '*.py' -newer /tmp/.start_marker 2>/dev/null | head -5")
+        if file_check["exit_code"] == 0 and file_check.get("output", "").strip():
+            self.reward_buffer.append(0.1)
+            return 0.1
+
+        self.reward_buffer.append(0.0)
+        return 0.0
+
+    async def evaluate(self, *args, **kwargs):
+        """
+        Run evaluation on a held-out set.
+
+        Override for dataset-specific evaluation logic.
+ """ + start_time = time.time() + end_time = time.time() + + eval_metrics = {"eval/placeholder": 0.0} + await self.evaluate_log( + metrics=eval_metrics, + start_time=start_time, + end_time=end_time, + ) + + async def wandb_log(self, wandb_metrics: Optional[Dict] = None): + """Log SWE-specific metrics.""" + if wandb_metrics is None: + wandb_metrics = {} + + if self.reward_buffer: + wandb_metrics["train/avg_reward"] = sum(self.reward_buffer) / len( + self.reward_buffer + ) + wandb_metrics["train/pass_rate"] = sum( + 1 for r in self.reward_buffer if r == 1.0 + ) / len(self.reward_buffer) + self.reward_buffer = [] + + await super().wandb_log(wandb_metrics) + + +if __name__ == "__main__": + HermesSweEnv.cli() diff --git a/environments/terminal_test_env.py b/environments/terminal_test_env.py new file mode 100644 index 000000000..e30d56695 --- /dev/null +++ b/environments/terminal_test_env.py @@ -0,0 +1,292 @@ +""" +TerminalTestEnv -- Simple Test Environment for Validating the Stack + +A self-contained environment with inline tasks (no external dataset needed). +Each task asks the model to create a file at a known path with specific content. +The reward verifier cats the file and checks if the content matches. + +Enables only terminal + file toolsets. Uses Modal terminal backend with +OpenRouter (Claude) by default. + +Training tasks (3): + 1. Create ~/greeting.txt with "Hello from Hermes Agent" + 2. Create ~/count.txt with numbers 1-5, one per line + 3. Create ~/answer.txt with the result of 123 + 456 + +Eval task (1): + 1. 
Create ~/result.txt with the result of 6 * 7 + +Usage: + # Start Atropos API server + run-api + + # Run environment (uses OpenRouter + Modal by default) + python environments/terminal_test_env.py serve + + # Process mode (no run-api needed, saves to JSONL) + python environments/terminal_test_env.py process \\ + --env.data_path_to_save_groups terminal_test_output.jsonl +""" + +import logging +import os +import sys +import time +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union + +# Ensure repo root is on sys.path for imports +_repo_root = Path(__file__).resolve().parent.parent +if str(_repo_root) not in sys.path: + sys.path.insert(0, str(_repo_root)) + +from atroposlib.envs.base import ScoredDataGroup +from atroposlib.envs.server_handling.server_manager import APIServerConfig +from atroposlib.type_definitions import Item + +from environments.agent_loop import AgentResult +from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig +from environments.tool_context import ToolContext + +logger = logging.getLogger(__name__) + + +# ============================================================================= +# Inline task definitions -- no external dataset needed +# ============================================================================= + +TRAIN_TASKS = [ + { + "prompt": "Create a file at ~/greeting.txt containing exactly the text: Hello from Hermes Agent", + "verify_path": "~/greeting.txt", + "expected_content": "Hello from Hermes Agent", + }, + { + "prompt": "Create a file at ~/count.txt containing the numbers 1 through 5, one per line", + "verify_path": "~/count.txt", + "expected_content": "1\n2\n3\n4\n5", + }, + { + "prompt": "Create a file at ~/answer.txt containing the result of 123 + 456", + "verify_path": "~/answer.txt", + "expected_content": "579", + }, +] + +EVAL_TASKS = [ + { + "prompt": "Create a file at ~/result.txt containing the result of 6 * 7", + "verify_path": "~/result.txt", + 
"expected_content": "42", + }, +] + + +class TerminalTestEnvConfig(HermesAgentEnvConfig): + """Config with defaults suitable for terminal testing.""" + + pass # Inherits all fields, overrides defaults in config_init + + +class TerminalTestEnv(HermesAgentBaseEnv): + """ + Simple test environment with inline file-creation tasks. + + All tasks follow the same pattern: "create a file at ~/X.txt with content Y". + The verifier runs `cat ~/X.txt` in the rollout's terminal and checks the output + against the expected string. Same verifier logic for all tasks. + + This environment is designed to validate the full stack end-to-end: + - Agent loop executes tool calls (terminal/file) + - ToolContext provides terminal access to the reward function + - Reward function verifies file content via cat + - Scored data flows through the Atropos pipeline + """ + + name = "terminal-test" + env_config_cls = TerminalTestEnvConfig + + @classmethod + def config_init(cls) -> Tuple[TerminalTestEnvConfig, List[APIServerConfig]]: + """ + Default configuration for the terminal test environment. + + Uses Modal terminal backend for cloud isolation and OpenRouter with + Claude for inference. API keys loaded from ~/hermes-agent/.env. + """ + env_config = TerminalTestEnvConfig( + # Terminal + file tools only + enabled_toolsets=["terminal", "file"], + disabled_toolsets=None, + distribution=None, + # Agent settings + max_agent_turns=10, # Simple tasks, don't need many turns + max_token_length=2048, + agent_temperature=1.0, + system_prompt=( + "You are a helpful assistant with access to a terminal and file tools. " + "Complete the user's request by using the available tools. " + "Be precise and follow instructions exactly." 
+ ), + # Modal terminal backend for cloud-isolated sandboxes per rollout + terminal_backend="modal", + # Atropos settings + group_size=3, # 3 rollouts per group + tokenizer_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview", + tool_call_parser="hermes", + steps_per_eval=3, # Eval after all 3 steps + total_steps=3, # 3 groups total (1 group per step) + use_wandb=True, + wandb_name="terminal-test", + ensure_scores_are_not_same=False, # Allow all-same scores for simple tasks + # No external dataset + dataset_name=None, + ) + + # OpenRouter with Claude -- API key loaded from .env (OPENROUTER_API_KEY) + server_configs = [ + APIServerConfig( + base_url="https://openrouter.ai/api/v1", + model_name="anthropic/claude-opus-4.6", + server_type="openai", + api_key=os.getenv("OPENROUTER_API_KEY", ""), + health_check=False, # OpenRouter doesn't have a /health endpoint + ) + ] + + return env_config, server_configs + + async def setup(self): + """Initialize inline task lists.""" + self.train_tasks = list(TRAIN_TASKS) + self.eval_tasks = list(EVAL_TASKS) + self.iter = 0 + # Track reward stats for wandb logging + self.reward_buffer: List[float] = [] + + async def get_next_item(self) -> Dict[str, str]: + """Cycle through training tasks.""" + item = self.train_tasks[self.iter % len(self.train_tasks)] + self.iter += 1 + return item + + def format_prompt(self, item: Dict[str, str]) -> str: + """The prompt is directly in the task item.""" + return item["prompt"] + + async def compute_reward( + self, item: Dict[str, str], result: AgentResult, ctx: ToolContext + ) -> float: + """ + Verify by cat-ing the expected file path and checking content matches. + Same verifier for all tasks -- they all write a file at a known path. 
+ + Scoring: + 1.0 = exact match + 0.5 = expected content is present but has extra stuff + 0.0 = file doesn't exist or content doesn't match + """ + verify_result = ctx.terminal(f"cat {item['verify_path']}") + + # File doesn't exist or can't be read + if verify_result["exit_code"] != 0: + self.reward_buffer.append(0.0) + return 0.0 + + actual = verify_result.get("output", "").strip() + expected = item["expected_content"].strip() + + # Exact match + if actual == expected: + self.reward_buffer.append(1.0) + return 1.0 + + # Partial credit: expected content is present but has extra stuff + if expected in actual: + self.reward_buffer.append(0.5) + return 0.5 + + self.reward_buffer.append(0.0) + return 0.0 + + async def evaluate(self, *args, **kwargs): + """ + Run eval tasks using the agent loop and verify results. + Logs accuracy metrics. + """ + start_time = time.time() + correct = 0 + total = len(self.eval_tasks) + samples = [] + + for eval_item in self.eval_tasks: + try: + # For eval, we do a simple single-turn completion (not full agent loop) + # to keep eval fast. The agent loop is tested via training. 
+ completion = await self.server.chat_completion( + messages=[ + {"role": "system", "content": self.config.system_prompt or ""}, + {"role": "user", "content": eval_item["prompt"]}, + ], + n=1, + max_tokens=self.config.max_token_length, + temperature=0.0, + split="eval", + ) + + response_content = ( + completion.choices[0].message.content if completion.choices else "" + ) + + samples.append( + { + "prompt": eval_item["prompt"], + "response": response_content, + "expected": eval_item["expected_content"], + } + ) + + except Exception as e: + logger.error("Eval failed for item: %s", e) + samples.append( + { + "prompt": eval_item["prompt"], + "response": f"ERROR: {e}", + "expected": eval_item["expected_content"], + } + ) + + end_time = time.time() + + eval_metrics = { + "eval/num_samples": total, + } + + await self.evaluate_log( + metrics=eval_metrics, + samples=samples, + start_time=start_time, + end_time=end_time, + ) + + async def wandb_log(self, wandb_metrics: Optional[Dict] = None): + """Log training metrics including reward stats and accuracy.""" + if wandb_metrics is None: + wandb_metrics = {} + + if self.reward_buffer: + total = len(self.reward_buffer) + correct = sum(1 for r in self.reward_buffer if r == 1.0) + partial = sum(1 for r in self.reward_buffer if r == 0.5) + + wandb_metrics["train/avg_reward"] = sum(self.reward_buffer) / total + wandb_metrics["train/accuracy"] = correct / total + wandb_metrics["train/partial_match_rate"] = partial / total + wandb_metrics["train/total_rollouts"] = total + self.reward_buffer = [] + + await super().wandb_log(wandb_metrics) + + +if __name__ == "__main__": + TerminalTestEnv.cli() diff --git a/environments/tool_call_parsers/__init__.py b/environments/tool_call_parsers/__init__.py new file mode 100644 index 000000000..8bff3f9d1 --- /dev/null +++ b/environments/tool_call_parsers/__init__.py @@ -0,0 +1,120 @@ +""" +Tool Call Parser Registry + +Client-side parsers that extract structured tool_calls from raw model output text. 
+Used in Phase 2 (VLLM server type) where ManagedServer's /generate endpoint returns +raw text without tool call parsing. + +Each parser is a standalone reimplementation of the corresponding VLLM parser's +non-streaming extract_tool_calls() logic. No VLLM dependency -- only standard library +(re, json, uuid) and openai types. + +Usage: + from environments.tool_call_parsers import get_parser + + parser = get_parser("hermes") + content, tool_calls = parser.parse(raw_model_output) + # content = text with tool call markup stripped + # tool_calls = list of ChatCompletionMessageToolCall objects, or None +""" + +import logging +from abc import ABC, abstractmethod +from typing import Dict, List, Optional, Tuple, Type + +from openai.types.chat.chat_completion_message_tool_call import ( + ChatCompletionMessageToolCall, +) + +logger = logging.getLogger(__name__) + +# Type alias for parser return value +ParseResult = Tuple[Optional[str], Optional[List[ChatCompletionMessageToolCall]]] + + +class ToolCallParser(ABC): + """ + Base class for tool call parsers. + + Each parser knows how to extract structured tool_calls from a specific + model family's raw output text format. + """ + + @abstractmethod + def parse(self, text: str) -> ParseResult: + """ + Parse raw model output text for tool calls. + + Args: + text: Raw decoded text from the model's completion + + Returns: + Tuple of (content, tool_calls) where: + - content: text with tool call markup stripped (the message 'content' field), + or None if the entire output was tool calls + - tool_calls: list of ChatCompletionMessageToolCall objects, + or None if no tool calls were found + """ + raise NotImplementedError + + +# Global parser registry: name -> parser class +PARSER_REGISTRY: Dict[str, Type[ToolCallParser]] = {} + + +def register_parser(name: str): + """ + Decorator to register a parser class under a given name. + + Usage: + @register_parser("hermes") + class HermesToolCallParser(ToolCallParser): + ... 
+ """ + + def decorator(cls: Type[ToolCallParser]) -> Type[ToolCallParser]: + PARSER_REGISTRY[name] = cls + return cls + + return decorator + + +def get_parser(name: str) -> ToolCallParser: + """ + Get a parser instance by name. + + Args: + name: Parser name (e.g., "hermes", "mistral", "llama3_json") + + Returns: + Instantiated parser + + Raises: + KeyError: If parser name is not found in registry + """ + if name not in PARSER_REGISTRY: + available = sorted(PARSER_REGISTRY.keys()) + raise KeyError( + f"Tool call parser '{name}' not found. Available parsers: {available}" + ) + return PARSER_REGISTRY[name]() + + +def list_parsers() -> List[str]: + """Return sorted list of registered parser names.""" + return sorted(PARSER_REGISTRY.keys()) + + +# Import all parser modules to trigger registration via @register_parser decorators +# Each module registers itself when imported +from environments.tool_call_parsers.hermes_parser import HermesToolCallParser # noqa: E402, F401 +from environments.tool_call_parsers.longcat_parser import LongcatToolCallParser # noqa: E402, F401 +from environments.tool_call_parsers.mistral_parser import MistralToolCallParser # noqa: E402, F401 +from environments.tool_call_parsers.llama_parser import LlamaToolCallParser # noqa: E402, F401 +from environments.tool_call_parsers.qwen_parser import QwenToolCallParser # noqa: E402, F401 +from environments.tool_call_parsers.deepseek_v3_parser import DeepSeekV3ToolCallParser # noqa: E402, F401 +from environments.tool_call_parsers.deepseek_v3_1_parser import DeepSeekV31ToolCallParser # noqa: E402, F401 +from environments.tool_call_parsers.kimi_k2_parser import KimiK2ToolCallParser # noqa: E402, F401 +from environments.tool_call_parsers.glm45_parser import Glm45ToolCallParser # noqa: E402, F401 +from environments.tool_call_parsers.glm47_parser import Glm47ToolCallParser # noqa: E402, F401 +from environments.tool_call_parsers.qwen3_coder_parser import Qwen3CoderToolCallParser # noqa: E402, F401 diff --git 
a/environments/tool_call_parsers/deepseek_v3_1_parser.py b/environments/tool_call_parsers/deepseek_v3_1_parser.py new file mode 100644 index 000000000..f0124c389 --- /dev/null +++ b/environments/tool_call_parsers/deepseek_v3_1_parser.py @@ -0,0 +1,71 @@ +""" +DeepSeek V3.1 tool call parser. + +Similar to V3 but with a slightly different format: + <๏ฝœtoolโ–callโ–begin๏ฝœ>function_name<๏ฝœtoolโ–sep๏ฝœ>arguments<๏ฝœtoolโ–callโ–end๏ฝœ> + +Note: V3 has type+name before the separator, V3.1 has name before and args after. + +Based on VLLM's DeepSeekV31ToolParser.extract_tool_calls() +""" + +import re +import uuid +from typing import List, Optional + +from openai.types.chat.chat_completion_message_tool_call import ( + ChatCompletionMessageToolCall, + Function, +) + +from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser + + +@register_parser("deepseek_v3_1") +@register_parser("deepseek_v31") +class DeepSeekV31ToolCallParser(ToolCallParser): + """ + Parser for DeepSeek V3.1 tool calls. + + Slightly different regex than V3: function_name comes before the separator, + arguments come after (no type field, no json code block wrapper). 
+    """
+
+    # DeepSeek's special tokens use U+FF5C '｜' and U+2581 '▁'
+    # (they appeared mojibake'd in review; restored to the documented tokens).
+    START_TOKEN = "<｜tool▁calls▁begin｜>"
+
+    # Regex captures: function_name, function_arguments
+    PATTERN = re.compile(
+        r"<｜tool▁call▁begin｜>(?P<function_name>.*?)<｜tool▁sep｜>(?P<function_arguments>.*?)<｜tool▁call▁end｜>"
+    )
+
+    def parse(self, text: str) -> ParseResult:
+        if self.START_TOKEN not in text:
+            return text, None
+
+        try:
+            matches = self.PATTERN.findall(text)
+            if not matches:
+                return text, None
+
+            tool_calls: List[ChatCompletionMessageToolCall] = []
+            for match in matches:
+                func_name, func_args = match
+                tool_calls.append(
+                    ChatCompletionMessageToolCall(
+                        id=f"call_{uuid.uuid4().hex[:8]}",
+                        type="function",
+                        function=Function(
+                            name=func_name.strip(),
+                            arguments=func_args.strip(),
+                        ),
+                    )
+                )
+
+            if not tool_calls:
+                return text, None
+
+            content = text[: text.find(self.START_TOKEN)].strip()
+            return content if content else None, tool_calls
+
+        except Exception:
+            return text, None
diff --git a/environments/tool_call_parsers/deepseek_v3_parser.py b/environments/tool_call_parsers/deepseek_v3_parser.py
new file mode 100644
index 000000000..5356b1a67
--- /dev/null
+++ b/environments/tool_call_parsers/deepseek_v3_parser.py
@@ -0,0 +1,75 @@
+"""
+DeepSeek V3 tool call parser.
+
+Format uses special unicode tokens:
+    <｜tool▁calls▁begin｜>
+    <｜tool▁call▁begin｜>type<｜tool▁sep｜>function_name
+    ```json
+    {"arg": "value"}
+    ```
+    <｜tool▁call▁end｜>
+    <｜tool▁calls▁end｜>
+
+Based on VLLM's DeepSeekV3ToolParser.extract_tool_calls()
+"""
+
+import re
+import uuid
+from typing import List, Optional
+
+from openai.types.chat.chat_completion_message_tool_call import (
+    ChatCompletionMessageToolCall,
+    Function,
+)
+
+from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
+
+
+@register_parser("deepseek_v3")
+class DeepSeekV3ToolCallParser(ToolCallParser):
+    """
+    Parser for DeepSeek V3 tool calls.
+
+    Uses special tokens built from the fullwidth vertical bar (｜) and
+    lower-block (▁) characters.
+
+Based on VLLM's Glm4MoeModelToolParser.extract_tool_calls()
+"""
+
+import ast
+import json
+import re
+import uuid
+from typing import Any, Dict, List, Optional
+
+from openai.types.chat.chat_completion_message_tool_call import (
+    ChatCompletionMessageToolCall,
+    Function,
+)
+
+from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
+
+
+def _deserialize_value(value: str) -> Any:
+    """
+    Try to deserialize a string value to its native Python type.
+    Attempts json.loads, then ast.literal_eval, then returns raw string.
+    """
+    try:
+        return json.loads(value)
+    except (json.JSONDecodeError, TypeError):
+        pass
+
+    try:
+        return ast.literal_eval(value)
+    except (ValueError, SyntaxError, TypeError):
+        pass
+
+    return value
+
+
+@register_parser("glm45")
+class Glm45ToolCallParser(ToolCallParser):
+    """
+    Parser for GLM 4.5 (GLM-4-MoE) tool calls.
+
+    Uses <tool_call>...</tool_call> tags with <arg_key>/<arg_value> pairs
+    instead of standard JSON arguments.
+    """
+
+    # Tag literals restored — they had been stripped as HTML-like markup in
+    # review; the surviving residue matches VLLM's Glm4MoeModelToolParser
+    # patterns exactly.
+    FUNC_CALL_REGEX = re.compile(r"<tool_call>.*?</tool_call>", re.DOTALL)
+    FUNC_DETAIL_REGEX = re.compile(r"<tool_call>([^\n]*)\n(.*)</tool_call>", re.DOTALL)
+    FUNC_ARG_REGEX = re.compile(
+        r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>", re.DOTALL
+    )
+
+    START_TOKEN = "<tool_call>"
+
+    def parse(self, text: str) -> ParseResult:
+        if self.START_TOKEN not in text:
+            return text, None
+
+        try:
+            matched_calls = self.FUNC_CALL_REGEX.findall(text)
+            if not matched_calls:
+                return text, None
+
+            tool_calls: List[ChatCompletionMessageToolCall] = []
+
+            for match in matched_calls:
+                detail = self.FUNC_DETAIL_REGEX.search(match)
+                if not detail:
+                    continue
+
+                func_name = detail.group(1).strip()
+                func_args_raw = detail.group(2)
+
+                # Parse arg_key/arg_value pairs
+                pairs = self.FUNC_ARG_REGEX.findall(func_args_raw) if func_args_raw else []
+                arg_dict: Dict[str, Any] = {}
+                for key, value in pairs:
+                    arg_key = key.strip()
+                    arg_val = _deserialize_value(value.strip())
+                    arg_dict[arg_key] = arg_val
+
+                tool_calls.append(
+                    ChatCompletionMessageToolCall(
+                        id=f"call_{uuid.uuid4().hex[:8]}",
+                        type="function",
+                        function=Function(
+                            name=func_name,
+                            arguments=json.dumps(arg_dict, ensure_ascii=False),
+                        ),
+                    )
+                )
+
+            if not tool_calls:
+                return text, None
+
+            content = text[: text.find(self.START_TOKEN)].strip()
+            return content if content else None, tool_calls
+
+        except Exception:
+            return text, None
diff --git a/environments/tool_call_parsers/glm47_parser.py b/environments/tool_call_parsers/glm47_parser.py
new file mode 100644
index 000000000..6631cf842
--- /dev/null
+++ b/environments/tool_call_parsers/glm47_parser.py
@@ -0,0 +1,35 @@
+"""
+GLM 4.7 tool call parser.
+
+Same as GLM 4.5 but with slightly different regex patterns.
+The tool_call tags may wrap differently and arg parsing handles
+newlines between key/value pairs.
+
+Based on VLLM's Glm47MoeModelToolParser (extends Glm4MoeModelToolParser).
+"""
+
+import re
+
+from environments.tool_call_parsers import ParseResult, register_parser
+from environments.tool_call_parsers.glm45_parser import Glm45ToolCallParser
+
+
+@register_parser("glm47")
+class Glm47ToolCallParser(Glm45ToolCallParser):
+    """
+    Parser for GLM 4.7 tool calls.
+    Extends GLM 4.5 with updated regex patterns.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # GLM 4.7 uses a slightly different detail regex that includes
+        # the wrapper and optional arg_key content (group 2 keeps the
+        # <arg_key> markup so FUNC_ARG_REGEX can scan it; it may be None).
+        self.FUNC_DETAIL_REGEX = re.compile(
+            r"<tool_call>(.*?)(<arg_key>.*?)?</tool_call>", re.DOTALL
+        )
+        # GLM 4.7 handles newlines between arg_key and arg_value tags.
+        # NOTE(review): in this raw string r"\\n" matches a literal
+        # backslash-n character pair, not a newline; the \s alternative
+        # already covers real newlines. Kept as-is — confirm against the
+        # upstream VLLM parser before changing.
+        self.FUNC_ARG_REGEX = re.compile(
+            r"<arg_key>(.*?)</arg_key>(?:\\n|\s)*<arg_value>(.*?)</arg_value>",
+            re.DOTALL,
+        )
diff --git a/environments/tool_call_parsers/hermes_parser.py b/environments/tool_call_parsers/hermes_parser.py
new file mode 100644
index 000000000..c1902fd62
--- /dev/null
+++ b/environments/tool_call_parsers/hermes_parser.py
@@ -0,0 +1,73 @@
+"""
+Hermes tool call parser.
+
+Format: <tool_call>{"name": "func", "arguments": {...}}</tool_call>
+Based on VLLM's Hermes2ProToolParser.extract_tool_calls()
+"""
+
+import json
+import re
+import uuid
+from typing import List, Optional, Tuple
+
+from openai.types.chat.chat_completion_message_tool_call import (
+    ChatCompletionMessageToolCall,
+    Function,
+)
+
+from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
+
+
+@register_parser("hermes")
+class HermesToolCallParser(ToolCallParser):
+    """
+    Parser for Hermes-format tool calls.
+
+    Matches <tool_call>...</tool_call> tags containing JSON with "name" and "arguments".
+    Also handles unclosed <tool_call> at end-of-string (truncated generation).
+    """
+
+    # Matches both closed and unclosed tool_call tags. The <tool_call>
+    # literals had been stripped as HTML-like markup in review (leaving an
+    # always-false `"" not in text` guard); restored here.
+    PATTERN = re.compile(
+        r"<tool_call>\s*(.*?)\s*</tool_call>|<tool_call>\s*(.*)", re.DOTALL
+    )
+
+    def parse(self, text: str) -> ParseResult:
+        if "<tool_call>" not in text:
+            return text, None
+
+        try:
+            matches = self.PATTERN.findall(text)
+            if not matches:
+                return text, None
+
+            tool_calls: List[ChatCompletionMessageToolCall] = []
+            for match in matches:
+                # match is a tuple: (closed_content, unclosed_content)
+                raw_json = match[0] if match[0] else match[1]
+                if not raw_json.strip():
+                    continue
+
+                tc_data = json.loads(raw_json)
+                tool_calls.append(
+                    ChatCompletionMessageToolCall(
+                        id=f"call_{uuid.uuid4().hex[:8]}",
+                        type="function",
+                        function=Function(
+                            name=tc_data["name"],
+                            arguments=json.dumps(
+                                tc_data.get("arguments", {}), ensure_ascii=False
+                            ),
+                        ),
+                    )
+                )
+
+            if not tool_calls:
+                return text, None
+
+            # Content is everything before the first <tool_call> tag
+            content = text[: text.find("<tool_call>")].strip()
+            return content if content else None, tool_calls
+
+        except Exception:
+            return text, None
diff --git a/environments/tool_call_parsers/kimi_k2_parser.py b/environments/tool_call_parsers/kimi_k2_parser.py
new file mode 100644
index 000000000..29f40fc24
--- /dev/null
+++ b/environments/tool_call_parsers/kimi_k2_parser.py
@@ -0,0 +1,93 @@
+"""
+Kimi K2 tool call parser.
+ +Format: + <|tool_calls_section_begin|> + <|tool_call_begin|>function_id:0<|tool_call_argument_begin|>{"arg": "val"}<|tool_call_end|> + <|tool_calls_section_end|> + +The function_id format is typically "functions.func_name:index" or "func_name:index". + +Based on VLLM's KimiK2ToolParser.extract_tool_calls() +""" + +import re +import uuid +from typing import List, Optional + +from openai.types.chat.chat_completion_message_tool_call import ( + ChatCompletionMessageToolCall, + Function, +) + +from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser + + +@register_parser("kimi_k2") +class KimiK2ToolCallParser(ToolCallParser): + """ + Parser for Kimi K2 tool calls. + + Uses section begin/end tokens wrapping individual tool call begin/end tokens. + The tool_call_id contains the function name (after last dot, before colon). + """ + + # Support both singular and plural variants + START_TOKENS = [ + "<|tool_calls_section_begin|>", + "<|tool_call_section_begin|>", + ] + + # Regex captures: tool_call_id (e.g., "functions.get_weather:0"), function_arguments + PATTERN = re.compile( + r"<\|tool_call_begin\|>\s*(?P[^<]+:\d+)\s*" + r"<\|tool_call_argument_begin\|>\s*" + r"(?P(?:(?!<\|tool_call_begin\|>).)*?)\s*" + r"<\|tool_call_end\|>", + re.DOTALL, + ) + + def parse(self, text: str) -> ParseResult: + # Check for any variant of the start token + has_start = any(token in text for token in self.START_TOKENS) + if not has_start: + return text, None + + try: + matches = self.PATTERN.findall(text) + if not matches: + return text, None + + tool_calls: List[ChatCompletionMessageToolCall] = [] + for match in matches: + function_id, function_args = match + + # Extract function name from ID format: "functions.get_weather:0" -> "get_weather" + function_name = function_id.split(":")[0].split(".")[-1] + + tool_calls.append( + ChatCompletionMessageToolCall( + id=function_id, # Preserve the original ID format + type="function", + function=Function( + 
name=function_name, + arguments=function_args.strip(), + ), + ) + ) + + if not tool_calls: + return text, None + + # Content is everything before the tool calls section + earliest_start = len(text) + for token in self.START_TOKENS: + idx = text.find(token) + if idx >= 0 and idx < earliest_start: + earliest_start = idx + + content = text[:earliest_start].strip() + return content if content else None, tool_calls + + except Exception: + return text, None diff --git a/environments/tool_call_parsers/llama_parser.py b/environments/tool_call_parsers/llama_parser.py new file mode 100644 index 000000000..8eb2136a1 --- /dev/null +++ b/environments/tool_call_parsers/llama_parser.py @@ -0,0 +1,96 @@ +""" +Llama 3.x / 4 tool call parser. + +Format: The model outputs JSON objects with "name" and "arguments" (or "parameters") keys. +May be preceded by <|python_tag|> token. Supports multiple JSON objects separated +by content or semicolons. + +Based on VLLM's Llama3JsonToolParser.extract_tool_calls() +""" + +import json +import re +import uuid +from typing import List, Optional + +from openai.types.chat.chat_completion_message_tool_call import ( + ChatCompletionMessageToolCall, + Function, +) + +from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser + + +@register_parser("llama3_json") +@register_parser("llama4_json") +class LlamaToolCallParser(ToolCallParser): + """ + Parser for Llama 3.x and 4 JSON-format tool calls. + + Finds JSON objects containing "name" + ("arguments" or "parameters") keys. + Uses Python's json.JSONDecoder.raw_decode for robust extraction of + JSON objects from mixed text. 
+ """ + + BOT_TOKEN = "<|python_tag|>" + + # Regex to find the start of potential JSON objects + JSON_START = re.compile(r"\{") + + def parse(self, text: str) -> ParseResult: + # Quick check: need either the bot token or a JSON brace + if self.BOT_TOKEN not in text and "{" not in text: + return text, None + + try: + decoder = json.JSONDecoder() + tool_calls: List[ChatCompletionMessageToolCall] = [] + end_index = -1 # Track where the last parsed JSON ended + + for match in self.JSON_START.finditer(text): + start = match.start() + # Skip if this brace is inside a previously parsed JSON object + if start <= end_index: + continue + + try: + obj, json_end = decoder.raw_decode(text[start:]) + end_index = start + json_end + + # Must have "name" and either "arguments" or "parameters" + name = obj.get("name") + args = obj.get("arguments", obj.get("parameters")) + + if not name or args is None: + continue + + # Normalize arguments to JSON string + if isinstance(args, dict): + args = json.dumps(args, ensure_ascii=False) + elif not isinstance(args, str): + args = json.dumps(args, ensure_ascii=False) + + tool_calls.append( + ChatCompletionMessageToolCall( + id=f"call_{uuid.uuid4().hex[:8]}", + type="function", + function=Function(name=name, arguments=args), + ) + ) + except (json.JSONDecodeError, KeyError, ValueError): + continue + + if not tool_calls: + return text, None + + # Content is everything before the first tool call JSON + # Find where the first tool call starts in the text + first_tc_start = text.find("{") + if self.BOT_TOKEN in text: + first_tc_start = text.find(self.BOT_TOKEN) + content = text[:first_tc_start].strip() if first_tc_start > 0 else None + + return content, tool_calls + + except Exception: + return text, None diff --git a/environments/tool_call_parsers/longcat_parser.py b/environments/tool_call_parsers/longcat_parser.py new file mode 100644 index 000000000..afecdb862 --- /dev/null +++ b/environments/tool_call_parsers/longcat_parser.py @@ -0,0 +1,69 @@ 
+"""
+Longcat Flash Chat tool call parser.
+
+Same as Hermes but uses <longcat_tool_call> tags instead of <tool_call>.
+Based on VLLM's LongcatFlashToolParser (extends Hermes2ProToolParser).
+"""
+
+import json
+import re
+import uuid
+from typing import List, Optional
+
+from openai.types.chat.chat_completion_message_tool_call import (
+    ChatCompletionMessageToolCall,
+    Function,
+)
+
+from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
+
+
+@register_parser("longcat")
+class LongcatToolCallParser(ToolCallParser):
+    """
+    Parser for Longcat Flash Chat tool calls.
+    Identical logic to Hermes, just different tag names.
+    """
+
+    PATTERN = re.compile(
+        r"<longcat_tool_call>\s*(.*?)\s*</longcat_tool_call>|<longcat_tool_call>\s*(.*)",
+        re.DOTALL,
+    )
+
+    def parse(self, text: str) -> ParseResult:
+        if "<longcat_tool_call>" not in text:
+            return text, None
+
+        try:
+            matches = self.PATTERN.findall(text)
+            if not matches:
+                return text, None
+
+            tool_calls: List[ChatCompletionMessageToolCall] = []
+            for match in matches:
+                raw_json = match[0] if match[0] else match[1]
+                if not raw_json.strip():
+                    continue
+
+                tc_data = json.loads(raw_json)
+                tool_calls.append(
+                    ChatCompletionMessageToolCall(
+                        id=f"call_{uuid.uuid4().hex[:8]}",
+                        type="function",
+                        function=Function(
+                            name=tc_data["name"],
+                            arguments=json.dumps(
+                                tc_data.get("arguments", {}), ensure_ascii=False
+                            ),
+                        ),
+                    )
+                )
+
+            if not tool_calls:
+                return text, None
+
+            content = text[: text.find("<longcat_tool_call>")].strip()
+            return content if content else None, tool_calls
+
+        except Exception:
+            return text, None
diff --git a/environments/tool_call_parsers/mistral_parser.py b/environments/tool_call_parsers/mistral_parser.py
new file mode 100644
index 000000000..5526bdd01
--- /dev/null
+++ b/environments/tool_call_parsers/mistral_parser.py
@@ -0,0 +1,130 @@
+"""
+Mistral tool call parser.
+
+Supports two formats depending on tokenizer version:
+- Pre-v11: content[TOOL_CALLS] [{"name": ..., "arguments": {...}}, ...]
+- v11+: content[TOOL_CALLS]tool_name1{"arg": "val"}[TOOL_CALLS]tool_name2{"arg": "val"} + +Based on VLLM's MistralToolParser.extract_tool_calls() +The [TOOL_CALLS] token is the bot_token used by Mistral models. +""" + +import json +import re +import uuid +from typing import List, Optional + +from openai.types.chat.chat_completion_message_tool_call import ( + ChatCompletionMessageToolCall, + Function, +) + +from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser + + +def _generate_mistral_id() -> str: + """Mistral tool call IDs are 9-char alphanumeric strings.""" + import random + import string + + return "".join(random.choices(string.ascii_letters + string.digits, k=9)) + + +@register_parser("mistral") +class MistralToolCallParser(ToolCallParser): + """ + Parser for Mistral-format tool calls. + + Detects format by checking if the content after [TOOL_CALLS] starts with '[' + (pre-v11 JSON array) or with a tool name (v11+ format). + """ + + # The [TOOL_CALLS] token -- may appear as different strings depending on tokenizer + BOT_TOKEN = "[TOOL_CALLS]" + + # Fallback regex for pre-v11 format when JSON parsing fails + TOOL_CALL_REGEX = re.compile(r"\[?\s*(\{.*?\})\s*\]?", re.DOTALL) + + def parse(self, text: str) -> ParseResult: + if self.BOT_TOKEN not in text: + return text, None + + try: + parts = text.split(self.BOT_TOKEN) + content = parts[0].strip() + raw_tool_calls = parts[1:] + + # Detect format: if the first raw part starts with '[', it's pre-v11 + first_raw = raw_tool_calls[0].strip() if raw_tool_calls else "" + is_pre_v11 = first_raw.startswith("[") or first_raw.startswith("{") + + tool_calls: List[ChatCompletionMessageToolCall] = [] + + if not is_pre_v11: + # v11+ format: [TOOL_CALLS]tool_name{args}[TOOL_CALLS]tool_name2{args2} + for raw in raw_tool_calls: + raw = raw.strip() + if not raw or "{" not in raw: + continue + + brace_idx = raw.find("{") + tool_name = raw[:brace_idx].strip() + args_str = raw[brace_idx:] + + 
tool_calls.append( + ChatCompletionMessageToolCall( + id=_generate_mistral_id(), + type="function", + function=Function(name=tool_name, arguments=args_str), + ) + ) + else: + # Pre-v11 format: [TOOL_CALLS] [{"name": ..., "arguments": {...}}] + try: + parsed = json.loads(first_raw) + if isinstance(parsed, dict): + parsed = [parsed] + + for tc in parsed: + args = tc.get("arguments", {}) + if isinstance(args, dict): + args = json.dumps(args, ensure_ascii=False) + + tool_calls.append( + ChatCompletionMessageToolCall( + id=_generate_mistral_id(), + type="function", + function=Function( + name=tc["name"], arguments=args + ), + ) + ) + except json.JSONDecodeError: + # Fallback regex extraction + match = self.TOOL_CALL_REGEX.findall(first_raw) + if match: + for raw_json in match: + try: + tc = json.loads(raw_json) + args = tc.get("arguments", {}) + if isinstance(args, dict): + args = json.dumps(args, ensure_ascii=False) + tool_calls.append( + ChatCompletionMessageToolCall( + id=_generate_mistral_id(), + type="function", + function=Function( + name=tc["name"], arguments=args + ), + ) + ) + except (json.JSONDecodeError, KeyError): + continue + + if not tool_calls: + return text, None + + return content if content else None, tool_calls + + except Exception: + return text, None diff --git a/environments/tool_call_parsers/qwen3_coder_parser.py b/environments/tool_call_parsers/qwen3_coder_parser.py new file mode 100644 index 000000000..042e46f7b --- /dev/null +++ b/environments/tool_call_parsers/qwen3_coder_parser.py @@ -0,0 +1,163 @@ +""" +Qwen3-Coder tool call parser. + +Format uses XML-style nested tags: + + + value + value2 + + + +Parameters are extracted from value tags and +type-converted using the schema if available, otherwise treated as strings. 
+ +Based on VLLM's Qwen3CoderToolParser.extract_tool_calls() +""" + +import ast +import json +import re +import uuid +from typing import Any, Dict, List, Optional + +from openai.types.chat.chat_completion_message_tool_call import ( + ChatCompletionMessageToolCall, + Function, +) + +from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser + + +def _try_convert_value(value: str) -> Any: + """ + Try to convert a parameter value string to a native Python type. + Handles null, numbers, booleans, JSON objects/arrays, and falls back to string. + """ + stripped = value.strip() + + # Handle null + if stripped.lower() == "null": + return None + + # Try JSON first (handles objects, arrays, strings, numbers, booleans) + try: + return json.loads(stripped) + except (json.JSONDecodeError, TypeError): + pass + + # Try Python literal eval (handles tuples, etc.) + try: + return ast.literal_eval(stripped) + except (ValueError, SyntaxError, TypeError): + pass + + # Return as string + return stripped + + +@register_parser("qwen3_coder") +class Qwen3CoderToolCallParser(ToolCallParser): + """ + Parser for Qwen3-Coder XML-format tool calls. + + Uses nested XML tags: val + """ + + START_TOKEN = "" + FUNCTION_PREFIX = "(.*?)|(.*?)$", re.DOTALL + ) + + # Find function blocks within a tool_call + FUNCTION_REGEX = re.compile( + r"||(?=)|$)", + re.DOTALL, + ) + + def _parse_function_call(self, function_str: str) -> Optional[ChatCompletionMessageToolCall]: + """Parse a single ... 
block into a ToolCall.""" + try: + # Extract function name: everything before the first '>' + gt_idx = function_str.index(">") + func_name = function_str[:gt_idx].strip() + params_str = function_str[gt_idx + 1:] + + # Extract parameters + param_dict: Dict[str, Any] = {} + for match_text in self.PARAMETER_REGEX.findall(params_str): + if ">" not in match_text: + continue + eq_idx = match_text.index(">") + param_name = match_text[:eq_idx].strip() + param_value = match_text[eq_idx + 1:] + + # Clean up whitespace + if param_value.startswith("\n"): + param_value = param_value[1:] + if param_value.endswith("\n"): + param_value = param_value[:-1] + + param_dict[param_name] = _try_convert_value(param_value) + + return ChatCompletionMessageToolCall( + id=f"call_{uuid.uuid4().hex[:24]}", + type="function", + function=Function( + name=func_name, + arguments=json.dumps(param_dict, ensure_ascii=False), + ), + ) + except (ValueError, IndexError): + return None + + def parse(self, text: str) -> ParseResult: + if self.FUNCTION_PREFIX not in text: + return text, None + + try: + # Find all tool_call blocks + tc_matches = self.TOOL_CALL_REGEX.findall(text) + raw_blocks = [m[0] if m[0] else m[1] for m in tc_matches] + + # Fallback: if no tool_call tags, try the whole text + if not raw_blocks: + raw_blocks = [text] + + # Find function blocks within each tool_call + function_strs: List[str] = [] + for block in raw_blocks: + func_matches = self.FUNCTION_REGEX.findall(block) + function_strs.extend(m[0] if m[0] else m[1] for m in func_matches) + + if not function_strs: + return text, None + + # Parse each function call + tool_calls: List[ChatCompletionMessageToolCall] = [] + for func_str in function_strs: + tc = self._parse_function_call(func_str) + if tc is not None: + tool_calls.append(tc) + + if not tool_calls: + return text, None + + # Content before tool calls + first_tc = text.find(self.START_TOKEN) + if first_tc < 0: + first_tc = text.find(self.FUNCTION_PREFIX) + content = 
text[:first_tc].strip() if first_tc > 0 else None + + return content, tool_calls + + except Exception: + return text, None diff --git a/environments/tool_call_parsers/qwen_parser.py b/environments/tool_call_parsers/qwen_parser.py new file mode 100644 index 000000000..9c8a81419 --- /dev/null +++ b/environments/tool_call_parsers/qwen_parser.py @@ -0,0 +1,19 @@ +""" +Qwen 2.5 tool call parser. + +Uses the same format as Hermes. +Registered as a separate parser name for clarity when using --tool-parser=qwen. +""" + +from environments.tool_call_parsers import register_parser +from environments.tool_call_parsers.hermes_parser import HermesToolCallParser + + +@register_parser("qwen") +class QwenToolCallParser(HermesToolCallParser): + """ + Parser for Qwen 2.5 tool calls. + Same {"name": ..., "arguments": ...} format as Hermes. + """ + + pass # Identical format -- inherits everything from Hermes diff --git a/environments/tool_context.py b/environments/tool_context.py new file mode 100644 index 000000000..4c9f0d363 --- /dev/null +++ b/environments/tool_context.py @@ -0,0 +1,246 @@ +""" +ToolContext -- Unrestricted Tool Access for Reward Functions + +A per-rollout handle that gives reward/verification functions direct access to +ALL hermes-agent tools, scoped to the rollout's task_id. The same task_id means +the terminal/browser session is the SAME one the model used during its rollout -- +all state (files, processes, browser tabs) is preserved. + +The verifier author decides which tools to use. Nothing is hardcoded or gated. 
+ +Example usage in a compute_reward(): + async def compute_reward(self, item, result, ctx): + # Run tests in the model's terminal sandbox + test = ctx.terminal("pytest -v") + if test["exit_code"] == 0: + return 1.0 + + # Check if a file was created + content = ctx.read_file("/workspace/solution.py") + if content.get("content"): + return 0.5 + + return 0.0 +""" + +import json +import logging +from typing import Any, Dict, List, Optional + +from model_tools import handle_function_call +from tools.terminal_tool import cleanup_vm +from tools.browser_tool import cleanup_browser + +logger = logging.getLogger(__name__) + + +class ToolContext: + """ + Open-ended access to all hermes-agent tools for a specific rollout. + + Passed to compute_reward() so verifiers can use any tool they need: + terminal commands, file reads/writes, web searches, browser automation, etc. + All calls share the rollout's task_id for session isolation. + """ + + def __init__(self, task_id: str): + self.task_id = task_id + + # ------------------------------------------------------------------------- + # Terminal tools + # ------------------------------------------------------------------------- + + def terminal(self, command: str, timeout: int = 180) -> Dict[str, Any]: + """ + Run a command in the rollout's terminal session. + + Args: + command: Shell command to execute + timeout: Command timeout in seconds + + Returns: + Dict with 'exit_code' (int) and 'output' (str) + """ + result = handle_function_call( + "terminal", + {"command": command, "timeout": timeout}, + task_id=self.task_id, + ) + try: + return json.loads(result) + except json.JSONDecodeError: + return {"exit_code": -1, "output": result} + + # ------------------------------------------------------------------------- + # File tools + # ------------------------------------------------------------------------- + + def read_file(self, path: str) -> Dict[str, Any]: + """ + Read a file from the rollout's filesystem. 
+ + Args: + path: File path to read + + Returns: + Dict with file content or error + """ + result = handle_function_call( + "read_file", {"path": path}, task_id=self.task_id + ) + try: + return json.loads(result) + except json.JSONDecodeError: + return {"error": result} + + def write_file(self, path: str, content: str) -> Dict[str, Any]: + """ + Write a file in the rollout's filesystem. + + Args: + path: File path to write + content: Content to write + + Returns: + Dict with success status or error + """ + result = handle_function_call( + "write_file", {"path": path, "content": content}, task_id=self.task_id + ) + try: + return json.loads(result) + except json.JSONDecodeError: + return {"error": result} + + def search(self, query: str, path: str = ".") -> Dict[str, Any]: + """ + Search for text in the rollout's filesystem. + + Args: + query: Search query + path: Directory to search in + + Returns: + Dict with search results + """ + result = handle_function_call( + "search", {"query": query, "path": path}, task_id=self.task_id + ) + try: + return json.loads(result) + except json.JSONDecodeError: + return {"error": result} + + # ------------------------------------------------------------------------- + # Web tools + # ------------------------------------------------------------------------- + + def web_search(self, query: str) -> Dict[str, Any]: + """ + Search the web. + + Args: + query: Search query + + Returns: + Dict with search results + """ + result = handle_function_call("web_search", {"query": query}) + try: + return json.loads(result) + except json.JSONDecodeError: + return {"error": result} + + def web_extract(self, urls: List[str]) -> Dict[str, Any]: + """ + Extract content from URLs. 
+ + Args: + urls: List of URLs to extract content from + + Returns: + Dict with extracted content + """ + result = handle_function_call("web_extract", {"urls": urls}) + try: + return json.loads(result) + except json.JSONDecodeError: + return {"error": result} + + # ------------------------------------------------------------------------- + # Browser tools + # ------------------------------------------------------------------------- + + def browser_navigate(self, url: str) -> Dict[str, Any]: + """ + Navigate the rollout's browser session to a URL. + + Args: + url: URL to navigate to + + Returns: + Dict with page snapshot or error + """ + result = handle_function_call( + "browser_navigate", {"url": url}, task_id=self.task_id + ) + try: + return json.loads(result) + except json.JSONDecodeError: + return {"error": result} + + def browser_snapshot(self) -> Dict[str, Any]: + """ + Take a snapshot of the current browser page. + + Returns: + Dict with page content/accessibility snapshot + """ + result = handle_function_call( + "browser_snapshot", {}, task_id=self.task_id + ) + try: + return json.loads(result) + except json.JSONDecodeError: + return {"error": result} + + # ------------------------------------------------------------------------- + # Generic tool access + # ------------------------------------------------------------------------- + + def call_tool(self, tool_name: str, arguments: Dict[str, Any]) -> str: + """ + Call any hermes-agent tool by name. + + This is the generic escape hatch -- if a tool doesn't have a convenience + wrapper above, you can call it directly here. 
+ + Args: + tool_name: Name of the tool (e.g., "vision_analyze", "skills_list") + arguments: Dict of arguments for the tool + + Returns: + Raw JSON string result from the tool + """ + return handle_function_call(tool_name, arguments, task_id=self.task_id) + + # ------------------------------------------------------------------------- + # Cleanup + # ------------------------------------------------------------------------- + + def cleanup(self): + """ + Release all resources (terminal VMs, browser sessions) for this rollout. + + Called automatically by the base environment via try/finally after + compute_reward() completes. You generally don't need to call this yourself. + """ + try: + cleanup_vm(self.task_id) + except Exception as e: + logger.debug("VM cleanup for task %s: %s", self.task_id, e) + + try: + cleanup_browser(self.task_id) + except Exception as e: + logger.debug("Browser cleanup for task %s: %s", self.task_id, e) diff --git a/hermes_agent.egg-info/PKG-INFO b/hermes_agent.egg-info/PKG-INFO deleted file mode 100644 index 159a40692..000000000 --- a/hermes_agent.egg-info/PKG-INFO +++ /dev/null @@ -1,868 +0,0 @@ -Metadata-Version: 2.4 -Name: hermes-agent -Version: 0.1.0 -Summary: AI agent with advanced tool-calling and toolsets -Author: Nous Research -License: MIT -Requires-Python: >=3.10 -Description-Content-Type: text/markdown -Requires-Dist: openai -Requires-Dist: python-dotenv -Requires-Dist: fire -Requires-Dist: httpx -Requires-Dist: rich -Requires-Dist: tenacity -Requires-Dist: pyyaml -Requires-Dist: requests -Requires-Dist: jinja2 -Requires-Dist: pydantic>=2.0 -Requires-Dist: firecrawl-py -Requires-Dist: fal-client -Requires-Dist: litellm>=1.75.5 -Requires-Dist: typer -Requires-Dist: platformdirs -Provides-Extra: modal -Requires-Dist: modal; extra == "modal" -Requires-Dist: boto3; extra == "modal" -Provides-Extra: dev -Requires-Dist: pytest; extra == "dev" -Requires-Dist: pytest-asyncio; extra == "dev" -Provides-Extra: messaging -Requires-Dist: 
python-telegram-bot>=20.0; extra == "messaging" -Requires-Dist: discord.py>=2.0; extra == "messaging" -Provides-Extra: cron -Requires-Dist: croniter; extra == "cron" -Provides-Extra: all -Requires-Dist: croniter; extra == "all" -Requires-Dist: python-telegram-bot>=20.0; extra == "all" -Requires-Dist: discord.py>=2.0; extra == "all" - -# Hermes Agent - -An AI agent with advanced tool-calling capabilities, featuring a flexible toolsets system for organizing and managing tools. - -## Features - -- **Interactive CLI**: Beautiful terminal interface with animated feedback, personalities, and session management -- **Messaging Gateway**: Connect to Telegram, Discord, and WhatsApp for conversational AI anywhere -- **Web Tools**: Search, extract content, and crawl websites -- **Terminal Tools**: Execute commands via local, Docker, Singularity, Modal, or SSH backends -- **Browser Tools**: Automate web browsers to navigate, click, type, and extract content -- **Vision Tools**: Analyze images from URLs -- **Reasoning Tools**: Advanced multi-model reasoning (Mixture of Agents) -- **Creative Tools**: Generate images from text prompts -- **Skills Tools**: On-demand knowledge documents with progressive disclosure -- **Toolsets System**: Organize tools into logical groups for different scenarios -- **Scheduled Tasks**: Cron jobs for automated agent tasks with delivery to platforms -- **Context Compression**: Automatic summarization when approaching context limits -- **Batch Processing**: Process datasets in parallel with checkpointing and statistics tracking -- **Ephemeral System Prompts**: Guide model behavior without polluting training datasets - -## Installation - -### Quick Install (Recommended) - -**Linux/macOS:** -```bash -curl -fsSL https://raw.githubusercontent.com/NousResearch/hermes-agent/main/scripts/install.sh | bash -``` - -**Windows (PowerShell):** -```powershell -irm https://raw.githubusercontent.com/NousResearch/hermes-agent/main/scripts/install.ps1 | iex -``` - 
-This installer will: -- Clone the repository to `~/.hermes-agent` -- Create a virtual environment and install dependencies -- Set up the `hermes` command in your PATH -- Run an interactive setup wizard to configure API keys - -### Manual Installation - -If you prefer to install manually: - -```bash -# Clone with submodules -git clone --recurse-submodules https://github.com/NousResearch/Hermes-Agent.git -cd Hermes-Agent - -# Run the setup script -./setup-hermes.sh -``` - -Or step-by-step: - -```bash -# Create and activate virtual environment -python3 -m venv venv -source venv/bin/activate # Windows: venv\Scripts\activate - -# Install in editable mode with all extras -pip install -e ".[all]" - -# Or install dependencies manually -pip install -r requirements.txt -pip install -e ./mini-swe-agent - -# Copy and configure environment -cp .env.example .env -# Edit .env with your API keys - -# Run the setup wizard -hermes setup -``` - -## Quick Start - -Once installed, the `hermes` command is your main entry point: - -```bash -hermes # Interactive chat (default) -hermes chat # Same as above -hermes chat -q "Hello" # Single query, then exit -hermes setup # Configure API keys and settings -hermes status # Show configuration status -hermes doctor # Diagnose issues -hermes gateway # Start messaging gateway (Telegram/Discord/WhatsApp) -hermes cron daemon # Run cron job scheduler -hermes version # Show version info -``` - -**Legacy `./hermes` script:** -```bash -# The old CLI script still works: -./hermes - -# Or with options: -./hermes --model "anthropic/claude-sonnet-4" --toolsets "web,terminal" -``` - -The CLI provides: -- Animated spinners during thinking and tool execution -- Kawaii-style feedback messages -- `/commands` for configuration, history, and session management -- Customizable personalities (`/personality kawaii`, `/personality pirate`, etc.) 
-- Persistent configuration via `cli-config.yaml` - -## Configuration - -### Environment Variables -```bash -# Copy the example environment file -cp .env.example .env - -# Edit .env and add your API keys -nano .env # or use your preferred editor -``` - -**Required API Keys:** -- `OPENROUTER_API_KEY` - LLM access via OpenRouter (get at: https://openrouter.ai/keys) -- `FIRECRAWL_API_KEY` - Web tools (get at: https://firecrawl.dev/) -- `NOUS_API_KEY` - Vision & reasoning tools (get at: https://inference-api.nousresearch.com/) -- `FAL_KEY` - Image generation (get at: https://fal.ai/) - -**Optional API Keys (for specific features):** -- `BROWSERBASE_API_KEY` - Browser automation (get at: https://browserbase.com/) -- `BROWSERBASE_PROJECT_ID` - From Browserbase dashboard -- `MORPH_API_KEY` - For legacy Hecate terminal backend (get at: https://morph.so/) - -### 4. Configure Terminal Backend - -The terminal tool uses **mini-swe-agent** environments. Configure in `.env` or `cli-config.yaml`: - -```bash -# Backend: "local", "docker", "singularity", "modal", or "ssh" -TERMINAL_ENV=local # Default: runs on host machine (no isolation) -TERMINAL_ENV=ssh # Remote execution via SSH (agent code stays local) -TERMINAL_ENV=singularity # Recommended for HPC: Apptainer/Singularity containers -TERMINAL_ENV=docker # Isolated Docker containers -TERMINAL_ENV=modal # Cloud execution via Modal - -# Container image (for docker/singularity/modal backends) -TERMINAL_DOCKER_IMAGE=python:3.11-slim -TERMINAL_SINGULARITY_IMAGE=docker://python:3.11-slim -TERMINAL_TIMEOUT=60 - -# SSH backend (for ssh) -TERMINAL_SSH_HOST=my-server.example.com -TERMINAL_SSH_USER=myuser -TERMINAL_SSH_KEY=~/.ssh/id_rsa # Optional, uses ssh-agent if not set -``` - -**Backend Requirements:** -- **local**: No extra setup (runs directly on your machine, no isolation) -- **ssh**: SSH access to remote machine (great for sandboxing - agent can't touch its own code) -- **singularity**: Requires Apptainer or Singularity installed 
(common on HPC clusters, no root needed) -- **docker**: Requires Docker installed and user in `docker` group -- **modal**: Requires Modal account (see setup below) - -### Singularity/Apptainer Setup (Recommended for HPC) - -Singularity/Apptainer provides rootless container execution, ideal for HPC clusters: - -```bash -# 1. Verify Apptainer is installed -apptainer --version # or: singularity --version - -# 2. Set up cache directories (important for parallel workers) -# Use /scratch if available (HPC), otherwise /tmp -export APPTAINER_CACHEDIR=/scratch/$USER/.apptainer -export APPTAINER_TMPDIR=/scratch/$USER/.apptainer/tmp -mkdir -p "$APPTAINER_CACHEDIR" "$APPTAINER_TMPDIR" - -# 3. Pre-build SIF image (recommended for parallel batch processing) -# This avoids race conditions when multiple workers start simultaneously -apptainer build $APPTAINER_CACHEDIR/python-nodejs.sif docker://nikolaik/python-nodejs:python3.11-nodejs20 - -# 4. Configure .env to use the local SIF -TERMINAL_ENV=singularity -TERMINAL_SINGULARITY_IMAGE=/scratch/$USER/.apptainer/python-nodejs.sif -``` - -**Tip:** The batch scripts in `configs/` automatically handle SIF pre-building if `/scratch` is available. - -### Modal Cloud Backend Setup - -[Modal](https://modal.com) provides serverless cloud compute for running sandboxed environments at scale. - -```bash -# 1. Install Modal and dependencies -pip install modal boto3 - -# 2. Authenticate with Modal (opens browser) -modal setup - -# 3. Set terminal backend to modal in .env -TERMINAL_ENV=modal -``` - -Modal uses CLI-based authentication (stored in `~/.modal/`), so no API key is needed in `.env`. After running `modal setup`, commands will automatically execute in Modal's cloud sandboxes. - -### Browser Tools Setup - -Browser tools enable the agent to navigate websites, fill forms, click buttons, and extract content. They use [agent-browser](https://github.com/vercel-labs/agent-browser) CLI with [Browserbase](https://browserbase.com) cloud execution. 
- -```bash -# 1. Install Node.js (if not already installed) -# Use nvm (recommended) or your package manager - -# 2. Install agent-browser CLI (choose one option): -npm install -g agent-browser # Option A: Global install (recommended) -npm install # Option B: Local install (uses npx fallback) - -# 3. Get Browserbase credentials -# Sign up at https://browserbase.com/ and get your: -# - API Key (from Settings โ†’ API Keys) -# - Project ID (from your project dashboard) - -# 4. Add to your .env file: -BROWSERBASE_API_KEY=your_api_key_here -BROWSERBASE_PROJECT_ID=your_project_id_here -``` - -**Available Browser Tools:** - -| Tool | Description | -|------|-------------| -| `browser_navigate` | Navigate to a URL | -| `browser_snapshot` | Get text-based page snapshot with element refs | -| `browser_click` | Click an element by ref (e.g., `@e5`) | -| `browser_type` | Type text into an input field | -| `browser_scroll` | Scroll up or down | -| `browser_back` | Go back in browser history | -| `browser_press` | Press a keyboard key (Enter, Tab, etc.) | -| `browser_close` | Close the browser session | -| `browser_get_images` | Get list of images on the page | - -**Example Usage:** -```bash -# Use browser tools with web search and vision -python run_agent.py \ - --query "Go to amazon.com and find the price of the latest Kindle" \ - --enabled_toolsets=browser,web,vision - -# Use browser-focused distribution -python batch_runner.py \ - --dataset_file=browser_tasks.jsonl \ - --distribution=browser_use \ - --run_name=browser_run -``` - -See `.env.example` for all available configuration options including debug settings. - -### Skills Tools - -Skills are on-demand knowledge documents the agent can load when needed. 
They follow a **progressive disclosure** pattern to minimize token usage: - -``` -skills/ -โ”œโ”€โ”€ mlops/ # Category folder -โ”‚ โ”œโ”€โ”€ axolotl/ # Skill folder -โ”‚ โ”‚ โ”œโ”€โ”€ SKILL.md # Main instructions (required) -โ”‚ โ”‚ โ”œโ”€โ”€ references/ # Additional docs, API specs -โ”‚ โ”‚ โ””โ”€โ”€ templates/ # Output formats, configs -โ”‚ โ””โ”€โ”€ vllm/ -โ”‚ โ””โ”€โ”€ SKILL.md -``` - -**Available Skills Tools:** - -| Tool | Description | -|------|-------------| -| `skills_categories` | List available skill categories (~50 tokens) | -| `skills_list` | List skills with name + description (~3k tokens for 40 skills) | -| `skill_view` | Load full skill content, tags, and linked files | - -**Example Usage:** -```bash -# Use skills tools -python run_agent.py \ - --query "What skills do you have for fine-tuning? Show me the axolotl skill." \ - --enabled_toolsets=skills -``` - -**Creating Skills:** - -Skills use YAML frontmatter for metadata: -```yaml ---- -name: my-skill -description: Brief description shown in skills_list -tags: [tag1, tag2] -related_skills: [other-skill] -version: 1.0.0 ---- -# Skill Content - -Instructions, examples, and guidelines here... -``` - -Skills can include: -- `references/` - Additional documentation, API specs, examples -- `templates/` - Output formats, config files, boilerplate code -- `scripts/` - Executable helpers (Python, shell scripts) - -## Session Logging - -Every conversation is automatically logged to `logs/` for debugging and inspection: - -``` -logs/ -โ”œโ”€โ”€ session_20260201_143052_a1b2c3.json -โ”œโ”€โ”€ session_20260201_150217_d4e5f6.json -โ””โ”€โ”€ ... 
-``` - -**Log Format:** -```json -{ - "session_id": "20260201_143052_a1b2c3", - "model": "anthropic/claude-sonnet-4", - "session_start": "2026-02-01T14:30:52.123456", - "last_updated": "2026-02-01T14:35:12.789012", - "message_count": 8, - "conversations": [ - {"from": "system", "value": "..."}, - {"from": "human", "value": "..."}, - {"from": "gpt", "value": "..."}, - {"from": "tool", "value": "..."} - ] -} -``` - -- **Automatic**: Logs are created and updated automatically after each conversation turn -- **Session ID in Banner**: The CLI displays the session ID in the welcome banner -- **Trajectory Format**: Uses the same format as batch processing for consistency -- **Git Ignored**: `logs/` is in `.gitignore` so logs aren't committed - -## Context Compression - -Long conversations can exceed the model's context limit. Hermes Agent automatically compresses context when approaching the limit: - -**How it works:** -1. Tracks actual token usage from API responses (`usage.prompt_tokens`) -2. When tokens reach 85% of model's context limit, triggers compression -3. Protects first 3 turns (system prompt, initial request, first response) -4. Protects last 4 turns (recent context is most relevant) -5. Summarizes middle turns using a fast/cheap model (Gemini Flash) -6. 
Inserts summary as a user message, conversation continues seamlessly - -**Configuration (`cli-config.yaml`):** -```yaml -compression: - enabled: true # Enable auto-compression (default) - threshold: 0.85 # Compress at 85% of context limit - summary_model: "google/gemini-2.0-flash-001" -``` - -**Or via environment variables:** -```bash -CONTEXT_COMPRESSION_ENABLED=true -CONTEXT_COMPRESSION_THRESHOLD=0.85 -CONTEXT_COMPRESSION_MODEL=google/gemini-2.0-flash-001 -``` - -**When compression triggers, you'll see:** -``` -๐Ÿ“ฆ Context compression triggered (170,000 tokens โ‰ฅ 170,000 threshold) - ๐Ÿ“Š Model context limit: 200,000 tokens (85% = 170,000) - ๐Ÿ—œ๏ธ Summarizing turns 4-15 (12 turns) - โœ… Compressed: 20 โ†’ 9 messages (~45,000 tokens saved) -``` - -## Scheduled Tasks (Cron Jobs) - -Hermes Agent can schedule automated tasks to run in the future - either one-time reminders or recurring jobs. - -### CLI Commands - -```bash -# List scheduled jobs -/cron - -# Add a one-shot reminder (runs once in 30 minutes) -/cron add 30m Remind me to check the build status - -# Add a recurring job (every 2 hours) -/cron add "every 2h" Check server status at 192.168.1.100 and report any issues - -# Add a cron expression (daily at 9am) -/cron add "0 9 * * *" Generate a morning briefing summarizing GitHub notifications - -# Remove a job -/cron remove abc123def456 -``` - -### Agent Self-Scheduling - -The agent can also schedule its own follow-up tasks using tools: - -```python -# Available when using hermes-cli toolset (default for CLI) -schedule_cronjob(prompt="...", schedule="30m", repeat=1) # One-shot -schedule_cronjob(prompt="...", schedule="every 2h") # Recurring -list_cronjobs() # View all jobs -remove_cronjob(job_id="...") # Cancel a job -``` - -**โš ๏ธ Important:** Cronjobs run in **isolated sessions with NO prior context**. The prompt must be completely self-contained with all necessary information (file paths, URLs, server addresses, etc.). 
The future agent will not remember anything from the current conversation. - -### Schedule Formats - -| Format | Example | Description | -|--------|---------|-------------| -| Duration | `30m`, `2h`, `1d` | One-shot delay from now | -| Interval | `every 30m`, `every 2h` | Recurring at fixed intervals | -| Cron | `0 9 * * *` | Cron expression (requires `croniter`) | -| Timestamp | `2026-02-03T14:00` | One-shot at specific time | - -### Repeat Options - -| repeat | Behavior | -|--------|----------| -| (omitted) | One-shot schedules run once; intervals/cron run forever | -| `1` | Run once then auto-delete | -| `N` | Run N times then auto-delete | - -### Running the Cron Daemon - -Jobs are stored in `~/.hermes/cron/jobs.json` and executed by a scheduler: - -```bash -# Option 1: Built-in daemon (checks every 60 seconds) -python cli.py --cron-daemon - -# Option 2: System cron integration (run once per minute) -# Add to crontab: crontab -e -*/1 * * * * cd ~/hermes-agent && python cli.py --cron-tick-once >> ~/.hermes/cron/cron.log 2>&1 -``` - -### Job Output - -Job outputs are saved to `~/.hermes/cron/output/{job_id}/{timestamp}.md` for review. - -## Messaging Gateway (Telegram, Discord, WhatsApp) - -Connect Hermes Agent to messaging platforms so you can chat from anywhere. - -### Quick Start - -```bash -# 1. Add your bot token to .env -echo 'TELEGRAM_BOT_TOKEN="your_token"' >> .env - -# 2. Test the gateway (foreground) -./scripts/hermes-gateway run - -# 3. Install as a background service -./scripts/hermes-gateway install - -# 4. 
Manage the service -./scripts/hermes-gateway start # Start -./scripts/hermes-gateway stop # Stop -./scripts/hermes-gateway status # Check status -``` - -### Supported Platforms - -| Platform | Setup | Toolset | -|----------|-------|---------| -| Telegram | Bot via @BotFather | `hermes-telegram` | -| Discord | Bot via Developer Portal | `hermes-discord` | -| WhatsApp | Node.js bridge | `hermes-whatsapp` | - -### Session Management - -- Sessions persist across messages (agent remembers context) -- Reset policies: daily (4am), idle (2 hours), or both -- Manual reset: send `/new` or `/reset` - -### Cron Job Delivery - -Schedule tasks that deliver to specific platforms: - -```python -schedule_cronjob( - prompt="Check server status...", - schedule="every 1h", - deliver="telegram" # or "origin", "discord", etc. -) -``` - -### CLI Commands - -| Command | Description | -|---------|-------------| -| `/platforms` | Show gateway configuration status | -| `--gateway` | Start the gateway (CLI flag) | - -See [docs/messaging.md](docs/messaging.md) for full setup instructions. - -## Interactive CLI - -The CLI provides a rich interactive experience for working with the agent. 
- -### Running the CLI - -```bash -# Basic usage -./hermes - -# With specific model -./hermes --model "anthropic/claude-sonnet-4" - -# With specific toolsets -./hermes --toolsets "web,terminal,skills" -``` - -### CLI Commands - -| Command | Description | -|---------|-------------| -| `/help` | Show available commands | -| `/tools` | List available tools by toolset | -| `/toolsets` | List available toolsets | -| `/model [name]` | Show or change the current model | -| `/prompt [text]` | View/set custom system prompt | -| `/personality [name]` | Set a predefined personality | -| `/clear` | Clear screen and reset conversation | -| `/reset` | Reset conversation only | -| `/history` | Show conversation history | -| `/save` | Save current conversation to file | -| `/config` | Show current configuration | -| `/cron` | Manage scheduled tasks (list, add, remove) | -| `/platforms` | Show gateway/messaging platform status | -| `/quit` | Exit the CLI | - -### Configuration - -Copy `cli-config.yaml.example` to `cli-config.yaml` and customize: - -```yaml -# Model settings -model: - default: "anthropic/claude-sonnet-4" - -# Terminal backend (local, docker, singularity, modal, or ssh) -terminal: - env_type: "local" - cwd: "." # Use current directory - -# Or use SSH for remote execution (keeps agent code isolated) -# terminal: -# env_type: "ssh" -# ssh_host: "my-server.example.com" -# ssh_user: "myuser" -# ssh_key: "~/.ssh/id_rsa" -# cwd: "/home/myuser/project" - -# Enable specific toolsets -toolsets: - - all # or: web, terminal, browser, vision, etc. - -# Custom personalities (use with /personality command) -agent: - personalities: - helpful: "You are a helpful assistant." - kawaii: "You are a kawaii assistant! Use cute expressions..." 
-``` - -### Personalities - -Built-in personalities available via `/personality`: -- `helpful`, `concise`, `technical`, `creative`, `teacher` -- `kawaii`, `catgirl`, `pirate`, `shakespeare`, `surfer` -- `noir`, `uwu`, `philosopher`, `hype` - -## Toolsets System - -The agent uses a toolsets system for organizing and managing tools. All tools must be part of a toolset to be accessible - individual tool selection is not supported. This ensures consistent and logical grouping of capabilities. - -### Key Concepts - -- **Toolsets**: Logical groups of tools for specific use cases (e.g., "research", "development", "debugging") -- **Composition**: Toolsets can include other toolsets for powerful combinations -- **Custom Toolsets**: Create your own toolsets at runtime or by editing `toolsets.py` -- **Toolset-Only Access**: Tools are only accessible through toolsets, not individually - -### Available Toolsets - -See `toolsets.py` for the complete list of predefined toolsets including: -- Basic toolsets (web, terminal, vision, creative, reasoning) -- Composite toolsets (research, development, analysis, etc.) -- Scenario-specific toolsets (debugging, documentation, API testing, etc.) -- Special toolsets (safe mode without terminal, minimal, offline) - -### Using Toolsets - -```bash -# Use a predefined toolset -python run_agent.py --enabled_toolsets=research --query "Find latest AI papers" - -# Combine multiple toolsets -python run_agent.py --enabled_toolsets=web,vision --query "Analyze this website" - -# Enable all toolsets explicitly (same as omitting the flag) -python run_agent.py --enabled_toolsets=all --query "Do web research and run commands if helpful" - -# Safe mode (no terminal access) -python run_agent.py --enabled_toolsets=safe --query "Help without running commands" - -# List all available toolsets and tools -python run_agent.py --list_tools -``` - -See `toolsets.py` for the complete list of available toolsets and how to create custom ones. 
- -## Basic Usage - -### Default (all tools enabled) -```bash -# Uses OpenRouter by default - just set OPENROUTER_API_KEY in .env -python run_agent.py \ - --query "search up the latest docs on jit in python 3.13 and write me basic example that's not in their docs. profile its perf" \ - --max_turns 20 \ - --model anthropic/claude-sonnet-4-20250514 -``` - -### With specific toolset -```bash -python run_agent.py \ - --query "Debug this Python error" \ - --enabled_toolsets=debugging \ - --model anthropic/claude-sonnet-4-20250514 -``` - -### Python API -```python -from run_agent import AIAgent - -# Uses OpenRouter by default (reads OPENROUTER_API_KEY from .env) -agent = AIAgent( - model="anthropic/claude-sonnet-4-20250514", - enabled_toolsets=["research"] -) -response = agent.chat("Find information about quantum computing") - -# Create custom toolset at runtime -from toolsets import create_custom_toolset - -create_custom_toolset( - name="my_tools", - description="My custom toolkit", - tools=["web_search"], - includes=["terminal", "vision"] -) - -agent = AIAgent(enabled_toolsets=["my_tools"]) -``` - -## Batch Processing - -Process multiple prompts from a dataset in parallel with automatic checkpointing and statistics tracking: - -```bash -# Basic batch processing -python batch_runner.py \ - --dataset_file=prompts.jsonl \ - --batch_size=20 \ - --run_name=my_run - -# With specific distribution -python batch_runner.py \ - --dataset_file=prompts.jsonl \ - --batch_size=20 \ - --run_name=image_run \ - --distribution=image_gen \ - --num_workers=4 -``` - -**Key Features:** -- Parallel processing with configurable workers -- Toolset distributions for varied data generation -- Automatic checkpointing and resume capability -- Combined output in `data//trajectories.jsonl` -- Tool usage statistics and success rates - -Use `--list_distributions` to see available toolset distributions for varied data generation. 
- -### Trajectory Compression - -Post-process trajectories to fit within token budgets for training: - -```bash -# Compress a directory of JSONL files -python trajectory_compressor.py --input=data/my_run - -# Compress a single JSONL file -python trajectory_compressor.py --input=data/trajectories.jsonl - -# Compress a 15% sample (useful for creating smaller training sets) -python trajectory_compressor.py --input=data/trajectories.jsonl --sample_percent=15 - -# Custom output and token target -python trajectory_compressor.py \ - --input=data/trajectories.jsonl \ - --output=data/compressed.jsonl \ - --target_max_tokens=16000 -``` - -**Features:** -- Protects first turns (system, human, first GPT response, first tool call) -- Protects last N turns (configurable) -- Summarizes middle turns using LLM to fit target token budget -- Supports both directory and single file input -- Optional random sampling with `--sample_percent` -- Configurable via `configs/trajectory_compression.yaml` - -### Ephemeral System Prompts - -The ephemeral system prompt feature allows you to guide the model's behavior during batch processing **without** saving that prompt to the training dataset trajectories. This is useful for: - -- Guiding model behavior during data collection -- Adding task-specific instructions -- Keeping saved trajectories clean and focused on tool-calling format - -**Example:** -```bash -python batch_runner.py \ - --dataset_file=prompts.jsonl \ - --batch_size=10 \ - --run_name=my_run \ - --ephemeral_system_prompt="You are a helpful assistant focused on image generation." -``` - -The ephemeral prompt will influence the model's behavior during execution, but **only the standard tool-calling system prompt** will be saved in the trajectory files. - -The ephemeral prompt influences model behavior during execution, but **only the standard tool-calling system prompt** is saved in trajectory files. 
- -## Command Line Arguments - -**Single Agent (`run_agent.py`):** -- `--query`: The question or task for the agent -- `--model`: Model to use (default: claude-opus-4-20250514) -- `--api_key`: API key for authentication -- `--base_url`: API endpoint URL -- `--max_turns`: Maximum number of tool-calling iterations -- `--enabled_toolsets`: Comma-separated list of toolsets to enable. Use `all` (or `*`) to enable everything. If omitted, all toolsets are enabled by default. -- `--disabled_toolsets`: Comma-separated list of toolsets to disable -- `--list_tools`: List all available toolsets and tools -- `--save_trajectories`: Save conversation trajectories to JSONL files - -**Batch Processing (`batch_runner.py`):** -- `--dataset_file`: Path to JSONL file with prompts -- `--batch_size`: Number of prompts per batch -- `--run_name`: Name for this run (for output/checkpointing) -- `--distribution`: Toolset distribution to use (default: "default") -- `--num_workers`: Number of parallel workers (default: 4) -- `--resume`: Resume from checkpoint if interrupted -- `--ephemeral_system_prompt`: System prompt used during execution but NOT saved to trajectories -- `--list_distributions`: List available toolset distributions - -## Environment Variables - -All environment variables can be configured in the `.env` file (copy from `.env.example`). - -**LLM Provider (OpenRouter):** -- `OPENROUTER_API_KEY`: Primary LLM access via OpenRouter (supports Claude, GPT-4, Gemini, etc.) 
-- `LLM_MODEL`: Default model (e.g., `anthropic/claude-sonnet-4`, `openai/gpt-4o`) - -**Tool API Keys:** -- `FIRECRAWL_API_KEY`: Web tools (search, extract, crawl) -- `NOUS_API_KEY`: Vision and reasoning tools -- `FAL_KEY`: Image generation tools - -**Terminal Tool Configuration (mini-swe-agent backend):** -- `TERMINAL_ENV`: Backend type - `local`, `docker`, `singularity`, `modal`, or `ssh` (default: `local`) -- `TERMINAL_DOCKER_IMAGE`: Docker image for docker backend (default: `python:3.11-slim`) -- `TERMINAL_SINGULARITY_IMAGE`: Singularity/Apptainer image (can be `docker://...` URL or local `.sif` path) -- `TERMINAL_TIMEOUT`: Command timeout in seconds (default: `60`) -- `TERMINAL_LIFETIME_SECONDS`: Cleanup inactive environments after this time (default: `300`) -- `TERMINAL_CWD`: Working directory inside containers (default: `/tmp`) -- `TERMINAL_SCRATCH_DIR`: Custom scratch directory for sandbox storage (optional, auto-detects `/scratch`) -- `SUDO_PASSWORD`: Enable sudo commands by piping password via `sudo -S` (works with all backends) - - If unset in CLI mode, you'll be prompted interactively when sudo is needed (45s timeout) - -**SSH Backend Configuration (for remote execution):** -- `TERMINAL_SSH_HOST`: Remote server hostname or IP -- `TERMINAL_SSH_USER`: SSH username -- `TERMINAL_SSH_PORT`: SSH port (default: `22`) -- `TERMINAL_SSH_KEY`: Path to SSH private key (optional, uses ssh-agent if not set) - -**Context Compression (auto-shrinks long conversations):** -- `CONTEXT_COMPRESSION_ENABLED`: Enable auto-compression (default: `true`) -- `CONTEXT_COMPRESSION_THRESHOLD`: Compress at this % of context limit (default: `0.85`) -- `CONTEXT_COMPRESSION_MODEL`: Model for generating summaries (default: `google/gemini-2.0-flash-001`) - -**Browser Tool Configuration (agent-browser + Browserbase):** -- `BROWSERBASE_API_KEY`: Browserbase API key for cloud browser execution -- `BROWSERBASE_PROJECT_ID`: Browserbase project ID -- `BROWSER_SESSION_TIMEOUT`: Session timeout 
in seconds (default: `300`) - -**Legacy Hecate Terminal Backend (optional):** -- `MORPH_API_KEY`: For Hecate/MorphCloud terminal backend -- `HECATE_VM_LIFETIME_SECONDS`: VM lifetime (default: 300) -- `HECATE_DEFAULT_SNAPSHOT_ID`: Default snapshot (default: snapshot_p5294qxt) - -**Debug Options:** -- `WEB_TOOLS_DEBUG`, `VISION_TOOLS_DEBUG`, `MOA_TOOLS_DEBUG`, `IMAGE_TOOLS_DEBUG`: Enable debug logging - -## Key Files - -| File | Purpose | -|------|---------| -| `hermes` | CLI launcher script (run with `./hermes`) | -| `cli.py` | Interactive CLI implementation | -| `cli-config.yaml` | CLI configuration (copy from `.example`) | -| `run_agent.py` | Main agent runner - single query execution | -| `batch_runner.py` | Parallel batch processing with checkpointing | -| `model_tools.py` | Core tool definitions and handlers | -| `toolsets.py` | Toolset definitions and composition | -| `toolset_distributions.py` | Probability distributions for data generation | -| `trajectory_compressor.py` | Post-process trajectories for training | -| `tools/` | Individual tool implementations | -| `tools/skills_tool.py` | Skills system with progressive disclosure | -| `skills/` | On-demand knowledge documents | -| `docs/` | Documentation | -| `configs/` | Example batch run scripts | diff --git a/hermes_agent.egg-info/SOURCES.txt b/hermes_agent.egg-info/SOURCES.txt deleted file mode 100644 index d94b105e0..000000000 --- a/hermes_agent.egg-info/SOURCES.txt +++ /dev/null @@ -1,47 +0,0 @@ -README.md -batch_runner.py -cli.py -model_tools.py -pyproject.toml -run_agent.py -toolset_distributions.py -toolsets.py -trajectory_compressor.py -cron/__init__.py -cron/jobs.py -cron/scheduler.py -gateway/__init__.py -gateway/config.py -gateway/delivery.py -gateway/run.py -gateway/session.py -hermes_agent.egg-info/PKG-INFO -hermes_agent.egg-info/SOURCES.txt -hermes_agent.egg-info/dependency_links.txt -hermes_agent.egg-info/entry_points.txt -hermes_agent.egg-info/requires.txt 
-hermes_agent.egg-info/top_level.txt -hermes_cli/__init__.py -hermes_cli/cron.py -hermes_cli/doctor.py -hermes_cli/gateway.py -hermes_cli/main.py -hermes_cli/setup.py -hermes_cli/status.py -tests/test_batch_runner.py -tests/test_checkpoint_resumption.py -tests/test_modal_terminal.py -tests/test_nous_api_limits.py -tests/test_nous_api_pattern.py -tests/test_temperature_fix.py -tests/test_web_tools.py -tools/__init__.py -tools/browser_tool.py -tools/cronjob_tools.py -tools/image_generation_tool.py -tools/mixture_of_agents_tool.py -tools/skills_tool.py -tools/terminal_hecate.py -tools/terminal_tool.py -tools/vision_tools.py -tools/web_tools.py \ No newline at end of file diff --git a/hermes_agent.egg-info/dependency_links.txt b/hermes_agent.egg-info/dependency_links.txt deleted file mode 100644 index 8b1378917..000000000 --- a/hermes_agent.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/hermes_agent.egg-info/entry_points.txt b/hermes_agent.egg-info/entry_points.txt deleted file mode 100644 index 504de51c2..000000000 --- a/hermes_agent.egg-info/entry_points.txt +++ /dev/null @@ -1,3 +0,0 @@ -[console_scripts] -hermes = hermes_cli.main:main -hermes-agent = run_agent:main diff --git a/hermes_agent.egg-info/requires.txt b/hermes_agent.egg-info/requires.txt deleted file mode 100644 index 91036b7cd..000000000 --- a/hermes_agent.egg-info/requires.txt +++ /dev/null @@ -1,35 +0,0 @@ -openai -python-dotenv -fire -httpx -rich -tenacity -pyyaml -requests -jinja2 -pydantic>=2.0 -firecrawl-py -fal-client -litellm>=1.75.5 -typer -platformdirs - -[all] -croniter -python-telegram-bot>=20.0 -discord.py>=2.0 - -[cron] -croniter - -[dev] -pytest -pytest-asyncio - -[messaging] -python-telegram-bot>=20.0 -discord.py>=2.0 - -[modal] -modal -boto3 diff --git a/hermes_agent.egg-info/top_level.txt b/hermes_agent.egg-info/top_level.txt deleted file mode 100644 index a80409024..000000000 --- a/hermes_agent.egg-info/top_level.txt +++ /dev/null @@ -1,11 +0,0 @@ 
-batch_runner -cli -cron -gateway -hermes_cli -model_tools -run_agent -tools -toolset_distributions -toolsets -trajectory_compressor