Update environment configuration and enhance tool definitions

- Modified `.env.example` to set default terminal environment to 'local' and updated Docker, Singularity, and Modal image references to use 'python:3.11-slim'.
- Updated `package.json` to include Node.js engine requirements and modified post-install script for better user guidance.
- Enhanced `pyproject.toml` to reflect new dependencies and optional dependencies for modal and development environments.
- Improved `README.md` with additional setup instructions for Singularity and Node.js dependencies, along with clearer toolset documentation.
- Refactored `model_tools.py` to include new tool definitions and ensure consistency across toolsets.
- Updated architecture documentation to clarify tool structure and registration processes.
This commit is contained in:
teknium
2026-01-29 22:36:07 +00:00
parent f8846f85a1
commit 7ea17bb957
8 changed files with 535 additions and 257 deletions

View File

@@ -37,12 +37,12 @@ FAL_KEY=
# - singularity: Runs in Apptainer/Singularity containers (HPC clusters, no root needed)
# - docker: Runs in Docker containers (isolated, requires Docker + docker group)
# - modal: Runs in Modal cloud sandboxes (scalable, requires Modal account)
TERMINAL_ENV=singularity
TERMINAL_ENV=local
# Container images (for singularity/docker/modal backends)
TERMINAL_DOCKER_IMAGE=python:3.11
TERMINAL_SINGULARITY_IMAGE=docker://python:3.11
TERMINAL_MODAL_IMAGE=python:3.11
TERMINAL_DOCKER_IMAGE=python:3.11-slim
TERMINAL_SINGULARITY_IMAGE=docker://python:3.11-slim
TERMINAL_MODAL_IMAGE=python:3.11-slim
# Working directory inside the container
TERMINAL_CWD=/tmp
@@ -53,13 +53,19 @@ TERMINAL_TIMEOUT=60
# Cleanup inactive environments after this many seconds
TERMINAL_LIFETIME_SECONDS=300
# Scratch directory for Singularity sandboxes (optional)
# If not set, uses /scratch (if available) or /tmp
# TERMINAL_SCRATCH_DIR=/scratch/myuser
# Disk usage warning threshold in GB (default: 500)
TERMINAL_DISK_WARNING_GB=500
# =============================================================================
# MODAL CLOUD BACKEND (Optional - for TERMINAL_ENV=modal)
# =============================================================================
# Modal uses CLI authentication, not environment variables.
# Run: pip install modal && modal setup
# This will authenticate via browser and store credentials locally.
# No API key needed in .env - Modal handles auth automatically.
# =============================================================================
# BROWSER TOOL CONFIGURATION (agent-browser + Browserbase)
@@ -79,60 +85,19 @@ BROWSERBASE_API_KEY=
BROWSERBASE_PROJECT_ID=
# Enable residential proxies for better CAPTCHA solving (default: true)
# Routes traffic through residential IPs, significantly improves success rate
BROWSERBASE_PROXIES=true
# Enable advanced stealth mode (default: false, requires Scale Plan)
# Uses custom Chromium build to avoid bot detection altogether
BROWSERBASE_ADVANCED_STEALTH=false
# Browser session timeout in seconds (optional, default: 300)
# Sessions are cleaned up after this duration of inactivity
# Browser session timeout in seconds (default: 300)
BROWSER_SESSION_TIMEOUT=300
# =============================================================================
# Browser automation requires Browserbase cloud service for remote browser execution.
# This allows the agent to navigate websites, fill forms, and extract information.
# Browserbase API Key - Cloud browser execution
# Get at: https://browserbase.com/
BROWSERBASE_API_KEY=
# Browserbase Project ID - From your Browserbase dashboard
BROWSERBASE_PROJECT_ID=
# Enable proxies for better CAPTCHA solving and anti-bot avoidance (default: true)
# Proxies route traffic through residential IPs for more reliable access
BROWSERBASE_PROXIES=true
# Enable advanced stealth mode (default: false, requires Scale Plan)
# Uses custom Chromium build to avoid bot detection altogether
BROWSERBASE_ADVANCED_STEALTH=false
# Browser session timeout in seconds (optional, default: 300)
# Sessions are cleaned up after this duration of inactivity
BROWSER_SESSION_TIMEOUT=300
# =============================================================================
# Browser automation requires Browserbase cloud service for remote browser execution.
# This allows the agent to navigate websites, fill forms, and extract information.
# Browserbase API Key - Cloud browser execution
# Get at: https://browserbase.com/
BROWSERBASE_API_KEY=
# Browserbase Project ID - From your Browserbase dashboard
BROWSERBASE_PROJECT_ID=
# Browser session timeout in seconds (optional, default: 300)
# Sessions are cleaned up after this duration of inactivity
BROWSER_SESSION_TIMEOUT=300
# =============================================================================
# LEGACY/OPTIONAL API KEYS
# LEGACY/OPTIONAL
# =============================================================================
# Morph API Key - For legacy Hecate terminal backend (terminal-hecate tool)
# Morph API Key - For legacy Hecate terminal backend
# Get at: https://morph.so/
MORPH_API_KEY=
@@ -147,12 +112,3 @@ WEB_TOOLS_DEBUG=false
VISION_TOOLS_DEBUG=false
MOA_TOOLS_DEBUG=false
IMAGE_TOOLS_DEBUG=false
# Scratch directory for Singularity sandboxes (optional)
# If not set, uses /scratch (if available) or /tmp
# Set this to a directory with lots of space for large pip installs
# TERMINAL_SCRATCH_DIR=/scratch/myuser
# Disk usage warning threshold in GB (default: 500)
# Warning is printed when total sandbox disk usage exceeds this
TERMINAL_DISK_WARNING_GB=500

103
README.md
View File

@@ -32,11 +32,14 @@ git submodule update --init --recursive
python3 -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
# Install required packages
# Install Python packages
pip install -r requirements.txt
# Install mini-swe-agent for terminal tools
pip install -e ./mini-swe-agent
# Install Node.js dependencies for browser tools (requires Node.js)
npm install
```
### 3. Configure Environment Variables
@@ -82,6 +85,31 @@ TERMINAL_TIMEOUT=60
- **docker**: Requires Docker installed and user in `docker` group
- **modal**: Requires Modal account (see setup below)
### Singularity/Apptainer Setup (Recommended for HPC)
Singularity/Apptainer provides rootless container execution, ideal for HPC clusters:
```bash
# 1. Verify Apptainer is installed
apptainer --version # or: singularity --version
# 2. Set up cache directories (important for parallel workers)
# Use /scratch if available (HPC), otherwise /tmp
export APPTAINER_CACHEDIR=/scratch/$USER/.apptainer
export APPTAINER_TMPDIR=/scratch/$USER/.apptainer/tmp
mkdir -p "$APPTAINER_CACHEDIR" "$APPTAINER_TMPDIR"
# 3. Pre-build SIF image (recommended for parallel batch processing)
# This avoids race conditions when multiple workers start simultaneously
apptainer build $APPTAINER_CACHEDIR/python-nodejs.sif docker://nikolaik/python-nodejs:python3.11-nodejs20
# 4. Configure .env to use the local SIF
TERMINAL_ENV=singularity
TERMINAL_SINGULARITY_IMAGE=/scratch/$USER/.apptainer/python-nodejs.sif
```
**Tip:** The batch scripts in `configs/` automatically handle SIF pre-building if `/scratch` is available.
### Modal Cloud Backend Setup
[Modal](https://modal.com) provides serverless cloud compute for running sandboxed environments at scale.
@@ -107,8 +135,9 @@ Browser tools enable the agent to navigate websites, fill forms, click buttons,
# 1. Install Node.js (if not already installed)
# Use nvm (recommended) or your package manager
# 2. Install agent-browser CLI globally
npm install -g agent-browser
# 2. Install agent-browser CLI (choose one option):
npm install -g agent-browser # Option A: Global install (recommended)
npm install # Option B: Local install (uses npx fallback)
# 3. Get Browserbase credentials
# Sign up at https://browserbase.com/ and get your:
@@ -188,7 +217,7 @@ python run_agent.py --enabled_toolsets=safe --query "Help without running comman
python run_agent.py --list_tools
```
For detailed documentation on toolsets, see `TOOLSETS_README.md`.
See `toolsets.py` for the complete list of available toolsets and how to create custom ones.
## Basic Usage
@@ -260,8 +289,36 @@ python batch_runner.py \
- Combined output in `data/<run_name>/trajectories.jsonl`
- Tool usage statistics and success rates
**Quick Start:** See [QUICKSTART_BATCH.md](QUICKSTART_BATCH.md) for a 5-minute getting started guide.
**Full Documentation:** See [BATCH_PROCESSING.md](BATCH_PROCESSING.md) for comprehensive documentation.
Use `--list_distributions` to see available toolset distributions for varied data generation.
### Trajectory Compression
Post-process trajectories to fit within token budgets for training:
```bash
# Compress a directory of JSONL files
python trajectory_compressor.py --input=data/my_run
# Compress a single JSONL file
python trajectory_compressor.py --input=data/trajectories.jsonl
# Compress a 15% sample (useful for creating smaller training sets)
python trajectory_compressor.py --input=data/trajectories.jsonl --sample_percent=15
# Custom output and token target
python trajectory_compressor.py \
--input=data/trajectories.jsonl \
--output=data/compressed.jsonl \
--target_max_tokens=16000
```
**Features:**
- Protects first turns (system, human, first GPT response, first tool call)
- Protects last N turns (configurable)
- Summarizes middle turns using LLM to fit target token budget
- Supports both directory and single file input
- Optional random sampling with `--sample_percent`
- Configurable via `configs/trajectory_compression.yaml`
### Ephemeral System Prompts
@@ -282,7 +339,7 @@ python batch_runner.py \
The ephemeral prompt will influence the model's behavior during execution, but **only the standard tool-calling system prompt** will be saved in the trajectory files.
**Documentation:** See [docs/ephemeral_system_prompt.md](docs/ephemeral_system_prompt.md) for complete details.
The ephemeral prompt influences model behavior during execution, but **only the standard tool-calling system prompt** is saved in trajectory files.
## Command Line Arguments
@@ -321,11 +378,13 @@ All environment variables can be configured in the `.env` file (copy from `.env.
- `FAL_KEY`: Image generation tools
**Terminal Tool Configuration (mini-swe-agent backend):**
- `TERMINAL_ENV`: Backend type - `local`, `docker`, or `modal` (default: `local`)
- `TERMINAL_DOCKER_IMAGE`: Docker image to use (default: `python:3.11-slim`)
- `TERMINAL_ENV`: Backend type - `local`, `docker`, `singularity`, or `modal` (default: `local`)
- `TERMINAL_DOCKER_IMAGE`: Docker image for docker backend (default: `python:3.11-slim`)
- `TERMINAL_SINGULARITY_IMAGE`: Singularity/Apptainer image (can be `docker://...` URL or local `.sif` path)
- `TERMINAL_TIMEOUT`: Command timeout in seconds (default: `60`)
- `TERMINAL_LIFETIME_SECONDS`: Cleanup inactive environments after this time (default: `300`)
- `TERMINAL_CWD`: Working directory inside containers (default: `/tmp`)
- `TERMINAL_SCRATCH_DIR`: Custom scratch directory for sandbox storage (optional, auto-detects `/scratch`)
**Browser Tool Configuration (agent-browser + Browserbase):**
- `BROWSERBASE_API_KEY`: Browserbase API key for cloud browser execution
@@ -340,18 +399,16 @@ All environment variables can be configured in the `.env` file (copy from `.env.
**Debug Options:**
- `WEB_TOOLS_DEBUG`, `VISION_TOOLS_DEBUG`, `MOA_TOOLS_DEBUG`, `IMAGE_TOOLS_DEBUG`: Enable debug logging
## Documentation
## Key Files
**Single Agent Usage:**
- `TOOLSETS_README.md`: Comprehensive guide to the toolsets system
- `toolsets.py`: View and modify available toolsets
- `model_tools.py`: Core tool definitions and handlers
**Batch Processing:**
- `QUICKSTART_BATCH.md`: 5-minute quick start guide
- `BATCH_PROCESSING.md`: Complete batch processing documentation
- `toolset_distributions.py`: Toolset distributions for data generation
## Examples
See `TOOLSETS_README.md` for extensive examples of using different toolsets for various scenarios.
| File | Purpose |
|------|---------|
| `run_agent.py` | Main agent runner - single query execution |
| `batch_runner.py` | Parallel batch processing with checkpointing |
| `model_tools.py` | Core tool definitions and handlers |
| `toolsets.py` | Toolset definitions and composition |
| `toolset_distributions.py` | Probability distributions for data generation |
| `trajectory_compressor.py` | Post-process trajectories for training |
| `tools/` | Individual tool implementations |
| `architecture/` | Design documentation |
| `configs/` | Example batch run scripts |

View File

@@ -1,55 +1,104 @@
# Agents
Agents can be viewed as an FSM using an LLM to generate inputs into the system that operates over a DAG.
The agent is the core loop that orchestrates LLM calls and tool execution.
What this really means is that the agent is just a function without memory that uses text inputs and outputs in a
defined order.
## AIAgent Class
The main agent is implemented in `run_agent.py`:
```python
def my_agent(*args, **kwargs) -> str:
# do whatever you want!
return "Hi I'm an agent!"
class AIAgent:
def __init__(
self,
model: str = "anthropic/claude-sonnet-4",
api_key: str = None,
base_url: str = "https://openrouter.ai/api/v1",
max_turns: int = 20,
enabled_toolsets: list = None,
disabled_toolsets: list = None,
verbose_logging: bool = False,
):
# Initialize OpenAI client, load tools based on toolsets
...
def chat(self, user_message: str, task_id: str = None) -> str:
# Main entry point - runs the agent loop
...
```
Now obviously, that's like saying water's wet, but we're going to be using that definition to inform our design of the
library, namely, that we should *not* store agent state outside the function call.
## Agent Loop
## The Agent Class
The core loop in `_run_agent_loop()`:
So we don't have state, why are we using a class?
Well, we want to initialize things, we want to have some configuration, and we want to have some helper functions.
Preferably all in a single place.
```
1. Add user message to conversation
2. Call LLM with tools
3. If LLM returns tool calls:
- Execute each tool
- Add tool results to conversation
- Go to step 2
4. If LLM returns text response:
- Return response to user
```
```python
class BaseAgent:
def agent_primitives(self) -> list[BaseAgent]:
# Returns a list of Agents that are utilized by this agent to generate inputs
# We use agent primitives here instead of subagents because these are going to be part
# of the message graph, not a subagent tool call.
raise NotImplementedError
while turns < max_turns:
response = client.chat.completions.create(
model=model,
messages=messages,
tools=tool_schemas,
)
def tools(self) -> list[BaseTool]:
# Returns a list of tools that the agent needs to run
raise NotImplementedError
def run(self, config, *args, **kwargs) -> ConversationGraph:
llm = get_llm(config)
tools = self.tools()
for agent in self.agent_primitives():
tools.extend(agent.tools())
tools = remove_duplicates(tools)
tools = initialize_tools(tools, config)
return self(llm, tools, config, *args, **kwargs)
@staticmethod
def __call__(self, llm, tools, config, *args, **kwargs) -> ConversationGraph:
# Returns a ConversationGraph that can be parsed to get the output of the agent
# Use w/e args/kwargs you want, as long as llm/tools/config are satisfied.
raise NotImplementedError
if response.tool_calls:
for tool_call in response.tool_calls:
result = await execute_tool(tool_call)
messages.append(tool_result_message(result))
turns += 1
else:
return response.content
```
Doesn't seem too bad (I hope), it is a bit annoying that we don't initialize everything in the constructor, but
hopefully we all kinda like it :)
## Conversation Management
Messages are stored as a list of dicts following OpenAI format:
```python
messages = [
{"role": "system", "content": "You are a helpful assistant..."},
{"role": "user", "content": "Search for Python tutorials"},
{"role": "assistant", "content": None, "tool_calls": [...]},
{"role": "tool", "tool_call_id": "...", "content": "..."},
{"role": "assistant", "content": "Here's what I found..."},
]
```
## Reasoning Context
For models that support reasoning (chain-of-thought), the agent:
1. Extracts `reasoning_content` from API responses
2. Stores it in `assistant_msg["reasoning"]` for trajectory export
3. Passes it back via `reasoning_content` field on subsequent turns
## Trajectory Export
Conversations can be exported for training:
```python
agent = AIAgent(save_trajectories=True)
agent.chat("Do something")
# Saves to trajectories/*.jsonl in ShareGPT format
```
## Batch Processing
For processing multiple prompts, use `batch_runner.py`:
```bash
python batch_runner.py \
--dataset_file=prompts.jsonl \
--batch_size=20 \
--num_workers=4 \
--run_name=my_run
```
See `batch_runner.py` for parallel execution with checkpointing.

View File

@@ -1,14 +1,124 @@
# LLM Client
A quick wrapper over openai apis
Hermes Agent uses the OpenAI Python SDK with OpenRouter as the backend, providing access to many models through a single API.
## Responsibilities
## Configuration
- Transform "normal" chat/completions requests into graphs
- Translate graphs into LLM requests
- Keep a history of graphs parsed by it
- On Policy Data
- Deduplicating graphs, so we don't keep previous history as separate graphs
```python
from openai import OpenAI
## How to use
Exactly the same as the openai api! Just with the additional support of graph inputs and outputs.
client = OpenAI(
api_key=os.getenv("OPENROUTER_API_KEY"),
base_url="https://openrouter.ai/api/v1"
)
```
## Supported Models
Any model available on [OpenRouter](https://openrouter.ai/models):
```python
# Anthropic
model = "anthropic/claude-sonnet-4"
model = "anthropic/claude-opus-4"
# OpenAI
model = "openai/gpt-4o"
model = "openai/o1"
# Google
model = "google/gemini-2.0-flash"
# Open models
model = "meta-llama/llama-3.3-70b-instruct"
model = "deepseek/deepseek-chat-v3"
model = "moonshotai/kimi-k2.5"
```
## Tool Calling
Standard OpenAI function calling format:
```python
response = client.chat.completions.create(
model=model,
messages=messages,
tools=[
{
"type": "function",
"function": {
"name": "web_search",
"description": "Search the web",
"parameters": {
"type": "object",
"properties": {
"query": {"type": "string"}
},
"required": ["query"]
}
}
}
],
)
# Check for tool calls
if response.choices[0].message.tool_calls:
for tool_call in response.choices[0].message.tool_calls:
name = tool_call.function.name
args = json.loads(tool_call.function.arguments)
# Execute tool...
```
## Reasoning Models
Some models return reasoning/thinking content:
```python
# Access reasoning if available
message = response.choices[0].message
if hasattr(message, 'reasoning_content') and message.reasoning_content:
reasoning = message.reasoning_content
# Store for trajectory export
```
## Provider Selection
OpenRouter allows selecting specific providers:
```python
response = client.chat.completions.create(
model=model,
messages=messages,
extra_body={
"provider": {
"order": ["Anthropic", "Google"], # Preferred providers
"ignore": ["Novita"], # Providers to skip
}
}
)
```
## Error Handling
Common errors and handling:
```python
try:
response = client.chat.completions.create(...)
except openai.RateLimitError:
# Back off and retry
except openai.APIError as e:
# Check e.code for specific errors
# 400 = bad request (often provider-specific)
# 502 = bad gateway (retry with different provider)
```
## Cost Tracking
OpenRouter returns usage info:
```python
usage = response.usage
print(f"Tokens: {usage.prompt_tokens} + {usage.completion_tokens}")
print(f"Cost: ${usage.cost:.6f}") # If available
```

View File

@@ -1,114 +1,121 @@
# Message Graph
# Message Format & Trajectories
```mermaid
graph TD
%% Message nodes
SystemMsg["📋 System Message<br/>Role: System<br/>Content: Messages are nodes in a graph"]
UserMsg["👤 User Message<br/>Role: User<br/>Content: But messages aren't the only thing in the graph"]
subgraph PrevMessages["Previous Messages"]
PrevSystemMsg["📋 System Message<br/>Role: System<br/>Content: Edits are kept in the graph as context"]
PrevUserMsg["👤 User Message<br/>Role: User<br/>Content: So we can ensure they're immutable while keeping them editable"]
end
%% Chat Response as a subgraph
subgraph ChatResponseBox["💬 Chat Response"]
ChatMetadata["📊 Metadata<br/>Temp: 1.0<br/>..."]
ChatResponseText["📝 Response<br/>Hello, Here's a subagent call: &lt;tool&gt;subagent&lt;/tool&gt;"]
ChatContent["Content: Hello, Here's a subagent call..."]
end
%% Tool Response as a subgraph
subgraph ToolResponseBox["🔧 Tool Response"]
subgraph ToolMetadata["📊 Tool Metadata"]
ToolMetadataLength["Length: 3"]
subgraph ToolChat["💭 Subagent Chat"]
SubagentSystem["📋 System<br/>Content: Subagent call received"]
SubagentUser["👤 User<br/>Content: Process this request"]
SubagentAssistant["🤖 Assistant<br/>Content: Processing..."]
SubagentSystem --> SubagentUser
SubagentUser --> SubagentAssistant
end
end
ToolContent["Content: Subagent call output"]
end
%% Graph flow connections
SystemMsg --> UserMsg
PrevSystemMsg --> PrevUserMsg
PrevMessages -.-> UserMsg
UserMsg --> ChatResponseBox
ChatResponseBox --> ToolResponseBox
class SystemMsg,UserMsg messageNode
class ChatResponseBox responseNode
class ToolResponseBox responseNode
class ChatMetadata,ChatResponseText,ChatContent,ToolMetadata,ToolChat,ToolContent,ToolMetadataLength metadataNode
```
Hermes Agent uses two message formats: the **API format** for LLM calls and the **trajectory format** for training data export.
Messages should be a graph (DAG, specifically) of immutable elements.
## API Message Format
## Why immutable elements?
We want to train on policy
- This means the context cannot change after we call a response.
## Why a graph?
Nodes and connections are a natural way to represent the flow of information in an agent conversation.
## Will this be annoying to deal with?
It shouldn't be! While there will be internal stuff that may look ???, for the interface, it should be as simple as your
normal context window edits, so `message_history[2]['content'] = my_edit`, but internally we'll deal with the recordkeeping
and how this ends up parsing into on policy training data, if requested.
## Edges
Edges are the connections between nodes, and there are two types we are concerned with:
- **Sequential edges**: These represent the flow of conversation, connecting messages in the order they were sent. For example, a user message followed by an assistant response.
- **Parallel edges**: These represent versioning, e.g. edit history, context squishing, etc.
We, however, are only concerned about parallel edges when we break the prefix, and ignore any other parallel edges.
## So what does this look like in practice?
Standard OpenAI chat format used during execution:
```python
import copy
messages = [
# System prompt
{"role": "system", "content": "You are a helpful assistant with tools..."},
# User query
{"role": "user", "content": "Search for Python tutorials"},
# Assistant with tool call
{
"role": "assistant",
"content": None,
"tool_calls": [{
"id": "call_abc123",
"type": "function",
"function": {
"name": "web_search",
"arguments": "{\"query\": \"Python tutorials\"}"
}
}]
},
# Tool result
{
"role": "tool",
"tool_call_id": "call_abc123",
"content": "{\"results\": [...]}"
},
# Final response
{"role": "assistant", "content": "Here's what I found..."}
]
```
## Trajectory Format (ShareGPT)
class MessageGraph:
def __init__(self):
self.messages = []
self.prev_graph = None
Exported for training in ShareGPT format:
def append(self, message):
self.messages.append(message)
```json
{
"conversations": [
{"from": "system", "value": "You are a helpful assistant..."},
{"from": "human", "value": "Search for Python tutorials"},
{"from": "gpt", "value": "<tool_call>\n{\"name\": \"web_search\", \"arguments\": {\"query\": \"Python tutorials\"}}\n</tool_call>"},
{"from": "tool", "value": "<tool_response>\n{\"results\": [...]}\n</tool_response>"},
{"from": "gpt", "value": "Here's what I found..."}
],
"tools": "[{\"type\": \"function\", \"function\": {...}}]",
"source": "hermes-agent"
}
```
def __getitem__(self, index):
return self.messages[index]
## Reasoning Content
def __setitem__(self, key, value):
# check if an assistant message is after this indx
needs_new_graph = False
first_idx = -1
for i in range(key, len(self.messages)):
if (i == key) and (value['role'] == 'assistant') and (value['content'] == self.messages[i]['content']):
# no op
return
needs_new_graph = needs_new_graph or (self.messages[i]['role'] == 'assistant')
if needs_new_graph and first_idx == -1:
first_idx = i
if needs_new_graph:
self.prev_graph = copy.deepcopy(self)
self.messages[key] = value
For models that output reasoning/chain-of-thought:
def __len__(self):
return len(self.messages)
**During execution** (API format):
```python
# Stored internally but not sent back to model in content
assistant_msg = {
"role": "assistant",
"content": "Here's what I found...",
"reasoning": "Let me think about this step by step..." # Internal only
}
```
def __eq__(self, other):
return "\n\n".join(f"{msg['role']}: {msg['content']}" for msg in self) == "\n\n".join(
f"{msg['role']}: {msg['content']}" for msg in other)
**In trajectory export** (reasoning wrapped in tags):
```json
{
"from": "gpt",
"value": "<think>\nLet me think about this step by step...\n</think>\nHere's what I found..."
}
```
## Conversion Flow
# in use
messages = MessageGraph()
messages.append({'role': 'system', 'content': 'Hello, I am a system message'})
messages[0] = {'role': 'user', 'content': 'Hello, I am a user message'}
```
```
API Response → Internal Storage → Trajectory Export
↓ ↓ ↓
tool_calls reasoning field <tool_call> tags
reasoning_content <think> tags
```
The conversion happens in `_convert_to_trajectory_format()` in `run_agent.py`.
## Ephemeral System Prompts
Batch processing supports ephemeral system prompts that guide behavior during execution but are NOT saved to trajectories:
```python
# During execution: full system prompt + ephemeral guidance
messages = [
{"role": "system", "content": SYSTEM_PROMPT + "\n\n" + ephemeral_prompt},
...
]
# In saved trajectory: only the base system prompt
trajectory = {
"conversations": [
{"from": "system", "value": SYSTEM_PROMPT}, # No ephemeral
...
]
}
```
## Trajectory Compression
Long trajectories can be compressed for training using `trajectory_compressor.py`:
- Protects first/last N turns
- Summarizes middle turns with LLM
- Targets specific token budget
- See `configs/trajectory_compression.yaml` for settings

View File

@@ -1,16 +1,102 @@
# Tools
Not much on this, yet. Tools are just a stateful wrapper around a function, so we can do things like:
- Keep a docker container running
- Keep a game online
Tools are functions that extend the agent's capabilities. Each tool is defined with an OpenAI-compatible JSON schema and an async handler function.
## Tool Structure
Each tool module in `tools/` exports:
1. **Schema definitions** - OpenAI function-calling format
2. **Handler functions** - Async functions that execute the tool
```python
class BaseTool:
def definitions(self) -> List[Dict[str, Any]]:
# OpenAI API compatible definitions
raise NotImplementedError
def __call__(self, *args, **kwargs) -> Dict[str, Any]:
# Returns at minimum {'role': 'tool', 'content': '...'}
raise NotImplementedError
```
# Example: tools/web_tools.py
# Schema definition
WEB_SEARCH_SCHEMA = {
"type": "function",
"function": {
"name": "web_search",
"description": "Search the web for information",
"parameters": {
"type": "object",
"properties": {
"query": {"type": "string", "description": "Search query"}
},
"required": ["query"]
}
}
}
# Handler function
async def web_search(query: str) -> dict:
"""Execute web search and return results."""
# Implementation...
return {"results": [...]}
```
## Tool Categories
| Category | Module | Tools |
|----------|--------|-------|
| **Web** | `web_tools.py` | `web_search`, `web_extract`, `web_crawl` |
| **Terminal** | `terminal_tool.py` | `terminal` (local/docker/singularity/modal backends) |
| **Browser** | `browser_tool.py` | `browser_navigate`, `browser_click`, `browser_type`, etc. |
| **Vision** | `vision_tools.py` | `vision_analyze` |
| **Image Gen** | `image_generation_tool.py` | `image_generate` |
| **Reasoning** | `mixture_of_agents_tool.py` | `mixture_of_agents` |
## Tool Registration
Tools are registered in `model_tools.py`:
```python
# model_tools.py
TOOL_SCHEMAS = [
*WEB_TOOL_SCHEMAS,
*TERMINAL_TOOL_SCHEMAS,
*BROWSER_TOOL_SCHEMAS,
# ...
]
TOOL_HANDLERS = {
"web_search": web_search,
"terminal": terminal_tool,
"browser_navigate": browser_navigate,
# ...
}
```
## Toolsets
Tools are grouped into **toolsets** for logical organization (see `toolsets.py`):
```python
TOOLSETS = {
"web": {
"description": "Web search and content extraction",
"tools": ["web_search", "web_extract", "web_crawl"]
},
"terminal": {
"description": "Command execution",
"tools": ["terminal"]
},
# ...
}
```
## Adding a New Tool
1. Create handler function in `tools/your_tool.py`
2. Define JSON schema following OpenAI format
3. Register in `model_tools.py` (schemas and handlers)
4. Add to appropriate toolset in `toolsets.py`
5. Update `tools/__init__.py` exports
## Stateful Tools
Some tools maintain state across calls within a session:
- **Terminal**: Keeps container/sandbox running between commands
- **Browser**: Maintains browser session for multi-step navigation
State is managed per `task_id` and cleaned up automatically.

View File

@@ -2,27 +2,23 @@
"name": "hermes-agent",
"version": "1.0.0",
"description": "An AI agent with advanced tool-calling capabilities, featuring a flexible toolsets system for organizing and managing tools.",
"main": "index.js",
"directories": {
"doc": "docs",
"example": "examples",
"test": "tests"
},
"private": true,
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
"postinstall": "echo '✅ Browser tools ready. Run: python run_agent.py --help'"
},
"repository": {
"type": "git",
"url": "git+https://github.com/NousResearch/Hermes-Agent.git"
},
"keywords": [],
"author": "",
"license": "ISC",
"license": "MIT",
"bugs": {
"url": "https://github.com/NousResearch/Hermes-Agent/issues"
},
"homepage": "https://github.com/NousResearch/Hermes-Agent#readme",
"dependencies": {
"agent-browser": "^0.7.6"
},
"engines": {
"node": ">=18.0.0"
}
}

View File

@@ -8,21 +8,38 @@ version = "0.1.0"
description = "AI agent with advanced tool-calling and toolsets"
readme = "README.md"
requires-python = ">=3.10"
authors = [{ name = "Hermes Agent" }]
authors = [{ name = "Nous Research" }]
license = { text = "MIT" }
dependencies = [
"firecrawl-py",
# Core
"openai",
"fal-client",
"python-dotenv",
"fire"
"fire",
"httpx",
"rich",
"tenacity",
"pyyaml",
"requests",
"jinja2",
"pydantic>=2.0",
# Tools
"firecrawl-py",
"fal-client",
# mini-swe-agent deps (terminal tool)
"litellm>=1.75.5",
"typer",
"platformdirs",
]
[project.optional-dependencies]
modal = ["modal", "boto3"]
dev = ["pytest", "pytest-asyncio"]
[project.scripts]
hermes-agent = "run_agent:main"
[tool.setuptools]
py-modules = ["run_agent", "model_tools", "toolsets"]
py-modules = ["run_agent", "model_tools", "toolsets", "batch_runner", "trajectory_compressor", "toolset_distributions"]
[tool.setuptools.packages.find]
include = ["tools"]