Update environment configuration and enhance tool definitions

- Modified `.env.example` to set default terminal environment to 'local' and updated Docker, Singularity, and Modal image references to use 'python:3.11-slim'.
- Updated `package.json` to include Node.js engine requirements and modified post-install script for better user guidance.
- Enhanced `pyproject.toml` to reflect new dependencies and optional dependencies for modal and development environments.
- Improved `README.md` with additional setup instructions for Singularity and Node.js dependencies, along with clearer toolset documentation.
- Refactored `model_tools.py` to include new tool definitions and ensure consistency across toolsets.
- Updated architecture documentation to clarify tool structure and registration processes.
2026-01-29 22:36:07 +00:00
parent f8846f85a1
commit 7ea17bb957
8 changed files with 535 additions and 257 deletions
--- a/.env.example
+++ b/.env.example
@@ -37,12 +37,12 @@ FAL_KEY=
 # - singularity: Runs in Apptainer/Singularity containers (HPC clusters, no root needed)
 # - docker: Runs in Docker containers (isolated, requires Docker + docker group)
 # - modal: Runs in Modal cloud sandboxes (scalable, requires Modal account)
-TERMINAL_ENV=singularity
+TERMINAL_ENV=local

 # Container images (for singularity/docker/modal backends)
-TERMINAL_DOCKER_IMAGE=python:3.11
-TERMINAL_SINGULARITY_IMAGE=docker://python:3.11
-TERMINAL_MODAL_IMAGE=python:3.11
+TERMINAL_DOCKER_IMAGE=python:3.11-slim
+TERMINAL_SINGULARITY_IMAGE=docker://python:3.11-slim
+TERMINAL_MODAL_IMAGE=python:3.11-slim

 # Working directory inside the container
 TERMINAL_CWD=/tmp
@@ -53,13 +53,19 @@ TERMINAL_TIMEOUT=60
 # Cleanup inactive environments after this many seconds
 TERMINAL_LIFETIME_SECONDS=300

+# Scratch directory for Singularity sandboxes (optional)
+# If not set, uses /scratch (if available) or /tmp
+# TERMINAL_SCRATCH_DIR=/scratch/myuser
+
+# Disk usage warning threshold in GB (default: 500)
+TERMINAL_DISK_WARNING_GB=500
+
 # =============================================================================
 # MODAL CLOUD BACKEND (Optional - for TERMINAL_ENV=modal)
 # =============================================================================
 # Modal uses CLI authentication, not environment variables.
 # Run: pip install modal && modal setup
 # This will authenticate via browser and store credentials locally.
-# No API key needed in .env - Modal handles auth automatically.

 # =============================================================================
 # BROWSER TOOL CONFIGURATION (agent-browser + Browserbase)
@@ -79,60 +85,19 @@ BROWSERBASE_API_KEY=
 BROWSERBASE_PROJECT_ID=

 # Enable residential proxies for better CAPTCHA solving (default: true)
-# Routes traffic through residential IPs, significantly improves success rate
 BROWSERBASE_PROXIES=true

 # Enable advanced stealth mode (default: false, requires Scale Plan)
-# Uses custom Chromium build to avoid bot detection altogether
 BROWSERBASE_ADVANCED_STEALTH=false

-# Browser session timeout in seconds (optional, default: 300)
-# Sessions are cleaned up after this duration of inactivity
+# Browser session timeout in seconds (default: 300)
 BROWSER_SESSION_TIMEOUT=300

 # =============================================================================
-# Browser automation requires Browserbase cloud service for remote browser execution.
-# This allows the agent to navigate websites, fill forms, and extract information.
-
-# Browserbase API Key - Cloud browser execution
-# Get at: https://browserbase.com/
-BROWSERBASE_API_KEY=
-
-# Browserbase Project ID - From your Browserbase dashboard
-BROWSERBASE_PROJECT_ID=
-
-# Enable proxies for better CAPTCHA solving and anti-bot avoidance (default: true)
-# Proxies route traffic through residential IPs for more reliable access
-BROWSERBASE_PROXIES=true
-
-# Enable advanced stealth mode (default: false, requires Scale Plan)
-# Uses custom Chromium build to avoid bot detection altogether
-BROWSERBASE_ADVANCED_STEALTH=false
-
-# Browser session timeout in seconds (optional, default: 300)
-# Sessions are cleaned up after this duration of inactivity
-BROWSER_SESSION_TIMEOUT=300
-
-# =============================================================================
-# Browser automation requires Browserbase cloud service for remote browser execution.
-# This allows the agent to navigate websites, fill forms, and extract information.
-
-# Browserbase API Key - Cloud browser execution
-# Get at: https://browserbase.com/
-BROWSERBASE_API_KEY=
-
-# Browserbase Project ID - From your Browserbase dashboard
-BROWSERBASE_PROJECT_ID=
-
-# Browser session timeout in seconds (optional, default: 300)
-# Sessions are cleaned up after this duration of inactivity
-BROWSER_SESSION_TIMEOUT=300
-
-# =============================================================================
-# LEGACY/OPTIONAL API KEYS
+# LEGACY/OPTIONAL
 # =============================================================================

-# Morph API Key - For legacy Hecate terminal backend (terminal-hecate tool)
+# Morph API Key - For legacy Hecate terminal backend
 # Get at: https://morph.so/
 MORPH_API_KEY=

@@ -147,12 +112,3 @@ WEB_TOOLS_DEBUG=false
 VISION_TOOLS_DEBUG=false
 MOA_TOOLS_DEBUG=false
 IMAGE_TOOLS_DEBUG=false
-
-# Scratch directory for Singularity sandboxes (optional)
-# If not set, uses /scratch (if available) or /tmp
-# Set this to a directory with lots of space for large pip installs
-# TERMINAL_SCRATCH_DIR=/scratch/myuser
-
-# Disk usage warning threshold in GB (default: 500)
-# Warning is printed when total sandbox disk usage exceeds this
-TERMINAL_DISK_WARNING_GB=500
--- a/README.md
+++ b/README.md
@@ -32,11 +32,14 @@ git submodule update --init --recursive
 python3 -m venv venv
 source venv/bin/activate  # On Windows: venv\Scripts\activate

-# Install required packages
+# Install Python packages
 pip install -r requirements.txt

 # Install mini-swe-agent for terminal tools
 pip install -e ./mini-swe-agent
+
+# Install Node.js dependencies for browser tools (requires Node.js)
+npm install
 ```

 ### 3. Configure Environment Variables
@@ -82,6 +85,31 @@ TERMINAL_TIMEOUT=60
 - **docker**: Requires Docker installed and user in `docker` group
 - **modal**: Requires Modal account (see setup below)

+### Singularity/Apptainer Setup (Recommended for HPC)
+
+Singularity/Apptainer provides rootless container execution, ideal for HPC clusters:
+
+```bash
+# 1. Verify Apptainer is installed
+apptainer --version  # or: singularity --version
+
+# 2. Set up cache directories (important for parallel workers)
+# Use /scratch if available (HPC), otherwise /tmp
+export APPTAINER_CACHEDIR=/scratch/$USER/.apptainer
+export APPTAINER_TMPDIR=/scratch/$USER/.apptainer/tmp
+mkdir -p "$APPTAINER_CACHEDIR" "$APPTAINER_TMPDIR"
+
+# 3. Pre-build SIF image (recommended for parallel batch processing)
+# This avoids race conditions when multiple workers start simultaneously
+apptainer build "$APPTAINER_CACHEDIR/python-nodejs.sif" docker://nikolaik/python-nodejs:python3.11-nodejs20
+
+# 4. Configure .env to use the local SIF
+#    (write the full path in .env; a bare $USER may not be expanded there)
+TERMINAL_ENV=singularity
+TERMINAL_SINGULARITY_IMAGE=/scratch/$USER/.apptainer/python-nodejs.sif
+```
+
+**Tip:** The batch scripts in `configs/` automatically handle SIF pre-building if `/scratch` is available.
+
 ### Modal Cloud Backend Setup

 [Modal](https://modal.com) provides serverless cloud compute for running sandboxed environments at scale.
@@ -107,8 +135,9 @@ Browser tools enable the agent to navigate websites, fill forms, click buttons,
 # 1. Install Node.js (if not already installed)
 # Use nvm (recommended) or your package manager

-# 2. Install agent-browser CLI globally
-npm install -g agent-browser
+# 2. Install agent-browser CLI (choose one option):
+npm install -g agent-browser     # Option A: Global install (recommended)
+npm install                      # Option B: Local install (uses npx fallback)

 # 3. Get Browserbase credentials
 # Sign up at https://browserbase.com/ and get your:
@@ -188,7 +217,7 @@ python run_agent.py --enabled_toolsets=safe --query "Help without running comman
 python run_agent.py --list_tools
 ```

-For detailed documentation on toolsets, see `TOOLSETS_README.md`.
+See `toolsets.py` for the complete list of available toolsets and how to create custom ones.

 ## Basic Usage

@@ -260,8 +289,36 @@ python batch_runner.py \
 - Combined output in `data/<run_name>/trajectories.jsonl`
 - Tool usage statistics and success rates

-**Quick Start:** See [QUICKSTART_BATCH.md](QUICKSTART_BATCH.md) for a 5-minute getting started guide.  
-**Full Documentation:** See [BATCH_PROCESSING.md](BATCH_PROCESSING.md) for comprehensive documentation.
+Use `--list_distributions` to see available toolset distributions for varied data generation.
+
+### Trajectory Compression
+
+Post-process trajectories to fit within token budgets for training:
+
+```bash
+# Compress a directory of JSONL files
+python trajectory_compressor.py --input=data/my_run
+
+# Compress a single JSONL file
+python trajectory_compressor.py --input=data/trajectories.jsonl
+
+# Compress a 15% sample (useful for creating smaller training sets)
+python trajectory_compressor.py --input=data/trajectories.jsonl --sample_percent=15
+
+# Custom output and token target
+python trajectory_compressor.py \
+  --input=data/trajectories.jsonl \
+  --output=data/compressed.jsonl \
+  --target_max_tokens=16000
+```
+
+**Features:**
+- Protects first turns (system, human, first GPT response, first tool call)
+- Protects last N turns (configurable)
+- Summarizes middle turns using LLM to fit target token budget
+- Supports both directory and single file input
+- Optional random sampling with `--sample_percent`
+- Configurable via `configs/trajectory_compression.yaml`

 ### Ephemeral System Prompts

@@ -282,7 +339,7 @@ python batch_runner.py \

 The ephemeral prompt will influence the model's behavior during execution, but **only the standard tool-calling system prompt** will be saved in the trajectory files.

-**Documentation:** See [docs/ephemeral_system_prompt.md](docs/ephemeral_system_prompt.md) for complete details.
+See the "Ephemeral System Prompts" section of [architecture/message_graph.md](architecture/message_graph.md) for how the ephemeral prompt is kept out of saved trajectories.

 ## Command Line Arguments

@@ -321,11 +378,13 @@ All environment variables can be configured in the `.env` file (copy from `.env.
 - `FAL_KEY`: Image generation tools

 **Terminal Tool Configuration (mini-swe-agent backend):**
- `TERMINAL_ENV`: Backend type - `local`, `docker`, or `modal` (default: `local`)
- `TERMINAL_DOCKER_IMAGE`: Docker image to use (default: `python:3.11-slim`)
+- `TERMINAL_ENV`: Backend type - `local`, `docker`, `singularity`, or `modal` (default: `local`)
+- `TERMINAL_DOCKER_IMAGE`: Docker image for docker backend (default: `python:3.11-slim`)
+- `TERMINAL_SINGULARITY_IMAGE`: Singularity/Apptainer image (can be `docker://...` URL or local `.sif` path)
 - `TERMINAL_TIMEOUT`: Command timeout in seconds (default: `60`)
 - `TERMINAL_LIFETIME_SECONDS`: Cleanup inactive environments after this time (default: `300`)
 - `TERMINAL_CWD`: Working directory inside containers (default: `/tmp`)
+- `TERMINAL_SCRATCH_DIR`: Custom scratch directory for sandbox storage (optional, auto-detects `/scratch`)

 **Browser Tool Configuration (agent-browser + Browserbase):**
 - `BROWSERBASE_API_KEY`: Browserbase API key for cloud browser execution
@@ -340,18 +399,16 @@ All environment variables can be configured in the `.env` file (copy from `.env.
 **Debug Options:**
 - `WEB_TOOLS_DEBUG`, `VISION_TOOLS_DEBUG`, `MOA_TOOLS_DEBUG`, `IMAGE_TOOLS_DEBUG`: Enable debug logging

-## Documentation
+## Key Files

-**Single Agent Usage:**
- `TOOLSETS_README.md`: Comprehensive guide to the toolsets system
- `toolsets.py`: View and modify available toolsets
- `model_tools.py`: Core tool definitions and handlers
-
-**Batch Processing:**
- `QUICKSTART_BATCH.md`: 5-minute quick start guide
- `BATCH_PROCESSING.md`: Complete batch processing documentation
- `toolset_distributions.py`: Toolset distributions for data generation
-
-## Examples
-
-See `TOOLSETS_README.md` for extensive examples of using different toolsets for various scenarios.
+| File | Purpose |
+|------|---------|
+| `run_agent.py` | Main agent runner - single query execution |
+| `batch_runner.py` | Parallel batch processing with checkpointing |
+| `model_tools.py` | Core tool definitions and handlers |
+| `toolsets.py` | Toolset definitions and composition |
+| `toolset_distributions.py` | Probability distributions for data generation |
+| `trajectory_compressor.py` | Post-process trajectories for training |
+| `tools/` | Individual tool implementations |
+| `architecture/` | Design documentation |
+| `configs/` | Example batch run scripts |
--- a/architecture/agents.md
+++ b/architecture/agents.md
@@ -1,55 +1,104 @@
 # Agents

-Agents can be viewed as an FSM using an LLM to generate inputs into the system that operates over a DAG.
+The agent is the core loop that orchestrates LLM calls and tool execution.

-What this really means is that the agent is just a function without memory that uses text inputs and outputs in a
-defined order.
+## AIAgent Class
+
+The main agent is implemented in `run_agent.py`:

 ```python
-def my_agent(*args, **kwargs) -> str:
-    # do whatever you want!
-    return "Hi I'm an agent!"
+class AIAgent:
+    def __init__(
+        self,
+        model: str = "anthropic/claude-sonnet-4",
+        api_key: str = None,
+        base_url: str = "https://openrouter.ai/api/v1",
+        max_turns: int = 20,
+        enabled_toolsets: list = None,
+        disabled_toolsets: list = None,
+        verbose_logging: bool = False,
+    ):
+        # Initialize OpenAI client, load tools based on toolsets
+        ...
+    
+    def chat(self, user_message: str, task_id: str = None) -> str:
+        # Main entry point - runs the agent loop
+        ...
 ```

-Now obviously, that's like saying water's wet, but we're going to be using that definition to inform our design of the
-library, namely, that we should *not* store agent state outside the function call.
+## Agent Loop

-## The Agent Class
+The core loop in `_run_agent_loop()`:

-So we don't have state, why are we using a class?
-
-Well, we want to initialize things, we want to have some configuration, and we want to have some helper functions.
-Preferably all in a single place.
+```
+1. Add user message to conversation
+2. Call LLM with tools
+3. If LLM returns tool calls:
+   - Execute each tool
+   - Add tool results to conversation
+   - Go to step 2
+4. If LLM returns text response:
+   - Return response to user
+```

 ```python
-class BaseAgent:
-    def agent_primitives(self) -> list[BaseAgent]:
-        # Returns a list of Agents that are utilized by this agent to generate inputs
-        # We use agent primitives here instead of subagents because these are going to be part
-        # of the message graph, not a subagent tool call.
-        raise NotImplementedError
+while turns < max_turns:
+    response = client.chat.completions.create(
+        model=model,
+        messages=messages,
+        tools=tool_schemas,
+    )
    
-    def tools(self) -> list[BaseTool]:
-        # Returns a list of tools that the agent needs to run
-        raise NotImplementedError
-    
-    
-    def run(self, config, *args, **kwargs) -> ConversationGraph:
-        llm = get_llm(config)
-        tools = self.tools()
-        for agent in self.agent_primitives():
-            tools.extend(agent.tools())
-        tools = remove_duplicates(tools)
-        tools = initialize_tools(tools, config)
-        return self(llm, tools, config, *args, **kwargs)
-    
-    @staticmethod
-    def __call__(self, llm, tools, config, *args, **kwargs) -> ConversationGraph:
-        # Returns a ConversationGraph that can be parsed to get the output of the agent
-        # Use w/e args/kwargs you want, as long as llm/tools/config are satisfied. 
-        raise NotImplementedError
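+    message = response.choices[0].message
+    if message.tool_calls:
+        # Keep the assistant's tool-call turn in history before adding results
+        messages.append(message)
+        for tool_call in message.tool_calls:
+            result = await execute_tool(tool_call)
+            messages.append(tool_result_message(result))
+        turns += 1
+    else:
+        return message.content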
 ```

-Doesn't seem too bad (I hope), it is a bit annoying that we don't initialize everything in the constructor, but
-hopefully we all kinda like it :)
+## Conversation Management

+Messages are stored as a list of dicts following OpenAI format:
+
+```python
+messages = [
+    {"role": "system", "content": "You are a helpful assistant..."},
+    {"role": "user", "content": "Search for Python tutorials"},
+    {"role": "assistant", "content": None, "tool_calls": [...]},
+    {"role": "tool", "tool_call_id": "...", "content": "..."},
+    {"role": "assistant", "content": "Here's what I found..."},
+]
+```
+
+## Reasoning Context
+
+For models that support reasoning (chain-of-thought), the agent:
+1. Extracts `reasoning_content` from API responses
+2. Stores it in `assistant_msg["reasoning"]` for trajectory export
+3. Passes it back via `reasoning_content` field on subsequent turns
+
+## Trajectory Export
+
+Conversations can be exported for training:
+
+```python
+agent = AIAgent(save_trajectories=True)
+agent.chat("Do something")
+# Saves to trajectories/*.jsonl in ShareGPT format
+```
+
+## Batch Processing
+
+For processing multiple prompts, use `batch_runner.py`:
+
+```bash
+python batch_runner.py \
+    --dataset_file=prompts.jsonl \
+    --batch_size=20 \
+    --num_workers=4 \
+    --run_name=my_run
+```
+
+See `batch_runner.py` for parallel execution with checkpointing.
--- a/architecture/llm_client.md
+++ b/architecture/llm_client.md
@@ -1,14 +1,124 @@
 # LLM Client

-A quick wrapper over openai apis
+Hermes Agent uses the OpenAI Python SDK with OpenRouter as the backend, providing access to many models through a single API.

-## Responsibilities
+## Configuration

- Transform "normal" chat/completions requests into graphs
- Translate graphs into LLM requests
- Keep a history of graphs parsed by it
-  - On Policy Data
-  - Deduplicating graphs, so we don't keep previous history as separate graphs
+```python
+from openai import OpenAI

-## How to use
-Exactly the same as the openai api! Just with the additional support of graph inputs and outputs.
+client = OpenAI(
+    api_key=os.getenv("OPENROUTER_API_KEY"),
+    base_url="https://openrouter.ai/api/v1"
+)
+```
+
+## Supported Models
+
+Any model available on [OpenRouter](https://openrouter.ai/models):
+
+```python
+# Anthropic
+model = "anthropic/claude-sonnet-4"
+model = "anthropic/claude-opus-4"
+
+# OpenAI
+model = "openai/gpt-4o"
+model = "openai/o1"
+
+# Google
+model = "google/gemini-2.0-flash"
+
+# Open models
+model = "meta-llama/llama-3.3-70b-instruct"
+model = "deepseek/deepseek-chat-v3"
+model = "moonshotai/kimi-k2.5"
+```
+
+## Tool Calling
+
+Standard OpenAI function calling format:
+
+```python
+response = client.chat.completions.create(
+    model=model,
+    messages=messages,
+    tools=[
+        {
+            "type": "function",
+            "function": {
+                "name": "web_search",
+                "description": "Search the web",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "query": {"type": "string"}
+                    },
+                    "required": ["query"]
+                }
+            }
+        }
+    ],
+)
+
+# Check for tool calls
+if response.choices[0].message.tool_calls:
+    for tool_call in response.choices[0].message.tool_calls:
+        name = tool_call.function.name
+        args = json.loads(tool_call.function.arguments)
+        # Execute tool...
+```
+
+## Reasoning Models
+
+Some models return reasoning/thinking content:
+
+```python
+# Access reasoning if available
+message = response.choices[0].message
+if hasattr(message, 'reasoning_content') and message.reasoning_content:
+    reasoning = message.reasoning_content
+    # Store for trajectory export
+```
+
+## Provider Selection
+
+OpenRouter allows selecting specific providers:
+
+```python
+response = client.chat.completions.create(
+    model=model,
+    messages=messages,
+    extra_body={
+        "provider": {
+            "order": ["Anthropic", "Google"],  # Preferred providers
+            "ignore": ["Novita"],              # Providers to skip
+        }
+    }
+)
+```
+
+## Error Handling
+
+Common errors and handling:
+
+```python
+try:
+    response = client.chat.completions.create(...)
+except openai.RateLimitError:
+    # Back off and retry
+except openai.APIError as e:
+    # Check e.code for specific errors
+    # 400 = bad request (often provider-specific)
+    # 502 = bad gateway (retry with different provider)
+```
+
+## Cost Tracking
+
+OpenRouter returns usage info:
+
+```python
+usage = response.usage
+print(f"Tokens: {usage.prompt_tokens} + {usage.completion_tokens}")
+print(f"Cost: ${usage.cost:.6f}")  # If available
+```
--- a/architecture/message_graph.md
+++ b/architecture/message_graph.md
@@ -1,114 +1,121 @@
-# Message Graph
+# Message Format & Trajectories

-```mermaid
-graph TD
-    %% Message nodes
-    SystemMsg["📋 System Message<br/>Role: System<br/>Content: Messages are nodes in a graph"]
-    UserMsg["👤 User Message<br/>Role: User<br/>Content: But messages aren't the only thing in the graph"]
-    subgraph PrevMessages["Previous Messages"]
-        PrevSystemMsg["📋 System Message<br/>Role: System<br/>Content: Edits are kept in the graph as context"]
-        PrevUserMsg["👤 User Message<br/>Role: User<br/>Content: So we can ensure they're immutable while keeping them editable"]
-    end
-    
-    %% Chat Response as a subgraph
-    subgraph ChatResponseBox["💬 Chat Response"]
-        ChatMetadata["📊 Metadata<br/>Temp: 1.0<br/>..."]
-        ChatResponseText["📝 Response<br/>Hello, Here's a subagent call: &lt;tool&gt;subagent&lt;/tool&gt;"]
-        ChatContent["Content: Hello, Here's a subagent call..."]
-    end
-    
-    %% Tool Response as a subgraph
-    subgraph ToolResponseBox["🔧 Tool Response"]
-        subgraph ToolMetadata["📊 Tool Metadata"]
-            ToolMetadataLength["Length: 3"]
-            subgraph ToolChat["💭 Subagent Chat"]
-                SubagentSystem["📋 System<br/>Content: Subagent call received"]
-                SubagentUser["👤 User<br/>Content: Process this request"]
-                SubagentAssistant["🤖 Assistant<br/>Content: Processing..."]
-                SubagentSystem --> SubagentUser
-                SubagentUser --> SubagentAssistant
-            end
-        end
-        ToolContent["Content: Subagent call output"]
-    end
-    
-    %% Graph flow connections
-    SystemMsg --> UserMsg
-    PrevSystemMsg --> PrevUserMsg
-    PrevMessages -.-> UserMsg
-    UserMsg --> ChatResponseBox
-    ChatResponseBox --> ToolResponseBox
-    
-    class SystemMsg,UserMsg messageNode
-    class ChatResponseBox responseNode
-    class ToolResponseBox responseNode
-    class ChatMetadata,ChatResponseText,ChatContent,ToolMetadata,ToolChat,ToolContent,ToolMetadataLength metadataNode
-```
+Hermes Agent uses two message formats: the **API format** for LLM calls and the **trajectory format** for training data export.

-Messages should be a graph (DAG, specifically) of immutable elements.
+## API Message Format

-## Why immutable elements?
-We want to train on policy
- This means the context cannot change after we call a response.
-
-## Why a graph?
-Nodes and connections are a natural way to represent the flow of information in an agent conversation.
-
-## Will this be annoying to deal with?
-
-It shouldn't be! While there will be internal stuff that may look ???, for the interface, it should be as simple as your
-normal context window edits, so `message_history[2]['content'] = my_edit`, but internally we'll deal with the recordkeeping
-and how this ends up parsing into on policy training data, if requested.
-
-## Edges
-
-Edges are the connections between nodes, and there are two types we are concerned with:
- **Sequential edges**: These represent the flow of conversation, connecting messages in the order they were sent. For example, a user message followed by an assistant response.
- **Parallel edges**: These represent versioning, e.g. edit history, context squishing, etc.
-We, however, are only concerned about parallel edges when we break the prefix, and ignore any other parallel edges.
-
-## So what does this look like in practice?
+Standard OpenAI chat format used during execution:

 ```python
-import copy
+messages = [
+    # System prompt
+    {"role": "system", "content": "You are a helpful assistant with tools..."},
+    
+    # User query
+    {"role": "user", "content": "Search for Python tutorials"},
+    
+    # Assistant with tool call
+    {
+        "role": "assistant",
+        "content": None,
+        "tool_calls": [{
+            "id": "call_abc123",
+            "type": "function",
+            "function": {
+                "name": "web_search",
+                "arguments": "{\"query\": \"Python tutorials\"}"
+            }
+        }]
+    },
+    
+    # Tool result
+    {
+        "role": "tool",
+        "tool_call_id": "call_abc123",
+        "content": "{\"results\": [...]}"
+    },
+    
+    # Final response
+    {"role": "assistant", "content": "Here's what I found..."}
+]
+```

+## Trajectory Format (ShareGPT)

-class MessageGraph:
-    def __init__(self):
-        self.messages = []
-        self.prev_graph = None
+Exported for training in ShareGPT format:

-    def append(self, message):
-        self.messages.append(message)
+```json
+{
+    "conversations": [
+        {"from": "system", "value": "You are a helpful assistant..."},
+        {"from": "human", "value": "Search for Python tutorials"},
+        {"from": "gpt", "value": "<tool_call>\n{\"name\": \"web_search\", \"arguments\": {\"query\": \"Python tutorials\"}}\n</tool_call>"},
+        {"from": "tool", "value": "<tool_response>\n{\"results\": [...]}\n</tool_response>"},
+        {"from": "gpt", "value": "Here's what I found..."}
+    ],
+    "tools": "[{\"type\": \"function\", \"function\": {...}}]",
+    "source": "hermes-agent"
+}
+```

-    def __getitem__(self, index):
-        return self.messages[index]
+## Reasoning Content

-    def __setitem__(self, key, value):
-        # check if an assistant message is after this indx
-        needs_new_graph = False
-        first_idx = -1
-        for i in range(key, len(self.messages)):
-            if (i == key) and (value['role'] == 'assistant') and (value['content'] == self.messages[i]['content']):
-                # no op
-                return
-            needs_new_graph = needs_new_graph or (self.messages[i]['role'] == 'assistant')
-            if needs_new_graph and first_idx == -1:
-                first_idx = i
-        if needs_new_graph:
-            self.prev_graph = copy.deepcopy(self)
-        self.messages[key] = value
+For models that output reasoning/chain-of-thought:

-    def __len__(self):
-        return len(self.messages)
+**During execution** (API format):
+```python
+# Stored internally but not sent back to model in content
+assistant_msg = {
+    "role": "assistant",
+    "content": "Here's what I found...",
+    "reasoning": "Let me think about this step by step..."  # Internal only
+}
+```

-    def __eq__(self, other):
-        return "\n\n".join(f"{msg['role']}: {msg['content']}" for msg in self) == "\n\n".join(
-            f"{msg['role']}: {msg['content']}" for msg in other)
+**In trajectory export** (reasoning wrapped in tags):
+```json
+{
+    "from": "gpt",
+    "value": "<think>\nLet me think about this step by step...\n</think>\nHere's what I found..."
+}
+```

+## Conversion Flow

-# in use
-messages = MessageGraph()
-messages.append({'role': 'system', 'content': 'Hello, I am a system message'})
-messages[0] = {'role': 'user', 'content': 'Hello, I am a user message'}
-```
+```
+API Response → Internal Storage → Trajectory Export
+     ↓              ↓                    ↓
+tool_calls    reasoning field      <tool_call> tags
+reasoning_content                  <think> tags
+```
+
+The conversion happens in `_convert_to_trajectory_format()` in `run_agent.py`.
+
+## Ephemeral System Prompts
+
+Batch processing supports ephemeral system prompts that guide behavior during execution but are NOT saved to trajectories:
+
+```python
+# During execution: full system prompt + ephemeral guidance
+messages = [
+    {"role": "system", "content": SYSTEM_PROMPT + "\n\n" + ephemeral_prompt},
+    ...
+]
+
+# In saved trajectory: only the base system prompt
+trajectory = {
+    "conversations": [
+        {"from": "system", "value": SYSTEM_PROMPT},  # No ephemeral
+        ...
+    ]
+}
+```
+
+## Trajectory Compression
+
+Long trajectories can be compressed for training using `trajectory_compressor.py`:
+
+- Protects first/last N turns
+- Summarizes middle turns with LLM
+- Targets specific token budget
+- See `configs/trajectory_compression.yaml` for settings
--- a/architecture/tools.md
+++ b/architecture/tools.md
@@ -1,16 +1,102 @@
 # Tools

-Not much on this, yet. Tools are just a stateful wrapper around a function, so we can do things like:
- Keep a docker container running
- Keep a game online
+Tools are functions that extend the agent's capabilities. Each tool is defined with an OpenAI-compatible JSON schema and an async handler function.
+
+## Tool Structure
+
+Each tool module in `tools/` exports:
+1. **Schema definitions** - OpenAI function-calling format
+2. **Handler functions** - Async functions that execute the tool

 ```python
-class BaseTool:
-    def definitions(self) -> List[Dict[str, Any]]:
-        # OpenAI API compatible definitions
-        raise NotImplementedError
-    
-    def __call__(self, *args, **kwargs) -> Dict[str, Any]:
-        # Returns at minimum {'role': 'tool', 'content': '...'}
-        raise NotImplementedError
-```
+# Example: tools/web_tools.py
+
+# Schema definition
+WEB_SEARCH_SCHEMA = {
+    "type": "function",
+    "function": {
+        "name": "web_search",
+        "description": "Search the web for information",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "query": {"type": "string", "description": "Search query"}
+            },
+            "required": ["query"]
+        }
+    }
+}
+
+# Handler function
+async def web_search(query: str) -> dict:
+    """Execute web search and return results."""
+    # Implementation...
+    return {"results": [...]}
+```
+
+## Tool Categories
+
+| Category | Module | Tools |
+|----------|--------|-------|
+| **Web** | `web_tools.py` | `web_search`, `web_extract`, `web_crawl` |
+| **Terminal** | `terminal_tool.py` | `terminal` (local/docker/singularity/modal backends) |
+| **Browser** | `browser_tool.py` | `browser_navigate`, `browser_click`, `browser_type`, etc. |
+| **Vision** | `vision_tools.py` | `vision_analyze` |
+| **Image Gen** | `image_generation_tool.py` | `image_generate` |
+| **Reasoning** | `mixture_of_agents_tool.py` | `mixture_of_agents` |
+
+## Tool Registration
+
+Tools are registered in `model_tools.py`:
+
+```python
+# model_tools.py
+TOOL_SCHEMAS = [
+    *WEB_TOOL_SCHEMAS,
+    *TERMINAL_TOOL_SCHEMAS,
+    *BROWSER_TOOL_SCHEMAS,
+    # ...
+]
+
+TOOL_HANDLERS = {
+    "web_search": web_search,
+    "terminal": terminal_tool,
+    "browser_navigate": browser_navigate,
+    # ...
+}
+```
+
+## Toolsets
+
+Tools are grouped into **toolsets** for logical organization (see `toolsets.py`):
+
+```python
+TOOLSETS = {
+    "web": {
+        "description": "Web search and content extraction",
+        "tools": ["web_search", "web_extract", "web_crawl"]
+    },
+    "terminal": {
+        "description": "Command execution",
+        "tools": ["terminal"]
+    },
+    # ...
+}
+```
+
+## Adding a New Tool
+
+1. Create handler function in `tools/your_tool.py`
+2. Define JSON schema following OpenAI format
+3. Register in `model_tools.py` (schemas and handlers)
+4. Add to appropriate toolset in `toolsets.py`
+5. Update `tools/__init__.py` exports
+
+## Stateful Tools
+
+Some tools maintain state across calls within a session:
+
+- **Terminal**: Keeps container/sandbox running between commands
+- **Browser**: Maintains browser session for multi-step navigation
+
+State is managed per `task_id` and cleaned up automatically.
--- a/package.json
+++ b/package.json
@@ -2,27 +2,23 @@
  "name": "hermes-agent",
  "version": "1.0.0",
  "description": "An AI agent with advanced tool-calling capabilities, featuring a flexible toolsets system for organizing and managing tools.",
-  "main": "index.js",
-  "directories": {
-    "doc": "docs",
-    "example": "examples",
-    "test": "tests"
-  },
+  "private": true,
  "scripts": {
-    "test": "echo \"Error: no test specified\" && exit 1"
+    "postinstall": "echo '✅ Browser tools ready. Run: python run_agent.py --help'"
  },
  "repository": {
    "type": "git",
    "url": "git+https://github.com/NousResearch/Hermes-Agent.git"
  },
-  "keywords": [],
-  "author": "",
-  "license": "ISC",
+  "license": "MIT",
  "bugs": {
    "url": "https://github.com/NousResearch/Hermes-Agent/issues"
  },
  "homepage": "https://github.com/NousResearch/Hermes-Agent#readme",
  "dependencies": {
    "agent-browser": "^0.7.6"
+  },
+  "engines": {
+    "node": ">=18.0.0"
  }
 }
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,21 +8,38 @@ version = "0.1.0"
 description = "AI agent with advanced tool-calling and toolsets"
 readme = "README.md"
 requires-python = ">=3.10"
-authors = [{ name = "Hermes Agent" }]
+authors = [{ name = "Nous Research" }]
 license = { text = "MIT" }
 dependencies = [
-  "firecrawl-py",
+  # Core
  "openai",
-  "fal-client",
  "python-dotenv",
-  "fire"
+  "fire",
+  "httpx",
+  "rich",
+  "tenacity",
+  "pyyaml",
+  "requests",
+  "jinja2",
+  "pydantic>=2.0",
+  # Tools
+  "firecrawl-py",
+  "fal-client",
+  # mini-swe-agent deps (terminal tool)
+  "litellm>=1.75.5",
+  "typer",
+  "platformdirs",
 ]

+[project.optional-dependencies]
+modal = ["modal", "boto3"]
+dev = ["pytest", "pytest-asyncio"]
+
 [project.scripts]
 hermes-agent = "run_agent:main"

 [tool.setuptools]
-py-modules = ["run_agent", "model_tools", "toolsets"]
+py-modules = ["run_agent", "model_tools", "toolsets", "batch_runner", "trajectory_compressor", "toolset_distributions"]

 [tool.setuptools.packages.find]
 include = ["tools"]