diff --git a/.env.example b/.env.example index e77eb617e..4774800d1 100644 --- a/.env.example +++ b/.env.example @@ -37,12 +37,12 @@ FAL_KEY= # - singularity: Runs in Apptainer/Singularity containers (HPC clusters, no root needed) # - docker: Runs in Docker containers (isolated, requires Docker + docker group) # - modal: Runs in Modal cloud sandboxes (scalable, requires Modal account) -TERMINAL_ENV=singularity +TERMINAL_ENV=local # Container images (for singularity/docker/modal backends) -TERMINAL_DOCKER_IMAGE=python:3.11 -TERMINAL_SINGULARITY_IMAGE=docker://python:3.11 -TERMINAL_MODAL_IMAGE=python:3.11 +TERMINAL_DOCKER_IMAGE=python:3.11-slim +TERMINAL_SINGULARITY_IMAGE=docker://python:3.11-slim +TERMINAL_MODAL_IMAGE=python:3.11-slim # Working directory inside the container TERMINAL_CWD=/tmp @@ -53,13 +53,19 @@ TERMINAL_TIMEOUT=60 # Cleanup inactive environments after this many seconds TERMINAL_LIFETIME_SECONDS=300 +# Scratch directory for Singularity sandboxes (optional) +# If not set, uses /scratch (if available) or /tmp +# TERMINAL_SCRATCH_DIR=/scratch/myuser + +# Disk usage warning threshold in GB (default: 500) +TERMINAL_DISK_WARNING_GB=500 + # ============================================================================= # MODAL CLOUD BACKEND (Optional - for TERMINAL_ENV=modal) # ============================================================================= # Modal uses CLI authentication, not environment variables. # Run: pip install modal && modal setup # This will authenticate via browser and store credentials locally. -# No API key needed in .env - Modal handles auth automatically. 
# ============================================================================= # BROWSER TOOL CONFIGURATION (agent-browser + Browserbase) @@ -79,60 +85,19 @@ BROWSERBASE_API_KEY= BROWSERBASE_PROJECT_ID= # Enable residential proxies for better CAPTCHA solving (default: true) -# Routes traffic through residential IPs, significantly improves success rate BROWSERBASE_PROXIES=true # Enable advanced stealth mode (default: false, requires Scale Plan) -# Uses custom Chromium build to avoid bot detection altogether BROWSERBASE_ADVANCED_STEALTH=false -# Browser session timeout in seconds (optional, default: 300) -# Sessions are cleaned up after this duration of inactivity +# Browser session timeout in seconds (default: 300) BROWSER_SESSION_TIMEOUT=300 # ============================================================================= -# Browser automation requires Browserbase cloud service for remote browser execution. -# This allows the agent to navigate websites, fill forms, and extract information. - -# Browserbase API Key - Cloud browser execution -# Get at: https://browserbase.com/ -BROWSERBASE_API_KEY= - -# Browserbase Project ID - From your Browserbase dashboard -BROWSERBASE_PROJECT_ID= - -# Enable proxies for better CAPTCHA solving and anti-bot avoidance (default: true) -# Proxies route traffic through residential IPs for more reliable access -BROWSERBASE_PROXIES=true - -# Enable advanced stealth mode (default: false, requires Scale Plan) -# Uses custom Chromium build to avoid bot detection altogether -BROWSERBASE_ADVANCED_STEALTH=false - -# Browser session timeout in seconds (optional, default: 300) -# Sessions are cleaned up after this duration of inactivity -BROWSER_SESSION_TIMEOUT=300 - -# ============================================================================= -# Browser automation requires Browserbase cloud service for remote browser execution. -# This allows the agent to navigate websites, fill forms, and extract information. 
- -# Browserbase API Key - Cloud browser execution -# Get at: https://browserbase.com/ -BROWSERBASE_API_KEY= - -# Browserbase Project ID - From your Browserbase dashboard -BROWSERBASE_PROJECT_ID= - -# Browser session timeout in seconds (optional, default: 300) -# Sessions are cleaned up after this duration of inactivity -BROWSER_SESSION_TIMEOUT=300 - -# ============================================================================= -# LEGACY/OPTIONAL API KEYS +# LEGACY/OPTIONAL # ============================================================================= -# Morph API Key - For legacy Hecate terminal backend (terminal-hecate tool) +# Morph API Key - For legacy Hecate terminal backend # Get at: https://morph.so/ MORPH_API_KEY= @@ -147,12 +112,3 @@ WEB_TOOLS_DEBUG=false VISION_TOOLS_DEBUG=false MOA_TOOLS_DEBUG=false IMAGE_TOOLS_DEBUG=false - -# Scratch directory for Singularity sandboxes (optional) -# If not set, uses /scratch (if available) or /tmp -# Set this to a directory with lots of space for large pip installs -# TERMINAL_SCRATCH_DIR=/scratch/myuser - -# Disk usage warning threshold in GB (default: 500) -# Warning is printed when total sandbox disk usage exceeds this -TERMINAL_DISK_WARNING_GB=500 diff --git a/README.md b/README.md index 9b675494c..699bd88bf 100644 --- a/README.md +++ b/README.md @@ -32,11 +32,14 @@ git submodule update --init --recursive python3 -m venv venv source venv/bin/activate # On Windows: venv\Scripts\activate -# Install required packages +# Install Python packages pip install -r requirements.txt # Install mini-swe-agent for terminal tools pip install -e ./mini-swe-agent + +# Install Node.js dependencies for browser tools (requires Node.js) +npm install ``` ### 3. 
Configure Environment Variables @@ -82,6 +85,31 @@ TERMINAL_TIMEOUT=60 - **docker**: Requires Docker installed and user in `docker` group - **modal**: Requires Modal account (see setup below) +### Singularity/Apptainer Setup (Recommended for HPC) + +Singularity/Apptainer provides rootless container execution, ideal for HPC clusters: + +```bash +# 1. Verify Apptainer is installed +apptainer --version # or: singularity --version + +# 2. Set up cache directories (important for parallel workers) +# Use /scratch if available (HPC), otherwise /tmp +export APPTAINER_CACHEDIR=/scratch/$USER/.apptainer +export APPTAINER_TMPDIR=/scratch/$USER/.apptainer/tmp +mkdir -p "$APPTAINER_CACHEDIR" "$APPTAINER_TMPDIR" + +# 3. Pre-build SIF image (recommended for parallel batch processing) +# This avoids race conditions when multiple workers start simultaneously +apptainer build $APPTAINER_CACHEDIR/python-nodejs.sif docker://nikolaik/python-nodejs:python3.11-nodejs20 + +# 4. Configure .env to use the local SIF +TERMINAL_ENV=singularity +TERMINAL_SINGULARITY_IMAGE=/scratch/$USER/.apptainer/python-nodejs.sif +``` + +**Tip:** The batch scripts in `configs/` automatically handle SIF pre-building if `/scratch` is available. + ### Modal Cloud Backend Setup [Modal](https://modal.com) provides serverless cloud compute for running sandboxed environments at scale. @@ -107,8 +135,9 @@ Browser tools enable the agent to navigate websites, fill forms, click buttons, # 1. Install Node.js (if not already installed) # Use nvm (recommended) or your package manager -# 2. Install agent-browser CLI globally -npm install -g agent-browser +# 2. Install agent-browser CLI (choose one option): +npm install -g agent-browser # Option A: Global install (recommended) +npm install # Option B: Local install (uses npx fallback) # 3. 
Get Browserbase credentials # Sign up at https://browserbase.com/ and get your: @@ -188,7 +217,7 @@ python run_agent.py --enabled_toolsets=safe --query "Help without running comman python run_agent.py --list_tools ``` -For detailed documentation on toolsets, see `TOOLSETS_README.md`. +See `toolsets.py` for the complete list of available toolsets and how to create custom ones. ## Basic Usage @@ -260,8 +289,36 @@ python batch_runner.py \ - Combined output in `data//trajectories.jsonl` - Tool usage statistics and success rates -**Quick Start:** See [QUICKSTART_BATCH.md](QUICKSTART_BATCH.md) for a 5-minute getting started guide. -**Full Documentation:** See [BATCH_PROCESSING.md](BATCH_PROCESSING.md) for comprehensive documentation. +Use `--list_distributions` to see available toolset distributions for varied data generation. + +### Trajectory Compression + +Post-process trajectories to fit within token budgets for training: + +```bash +# Compress a directory of JSONL files +python trajectory_compressor.py --input=data/my_run + +# Compress a single JSONL file +python trajectory_compressor.py --input=data/trajectories.jsonl + +# Compress a 15% sample (useful for creating smaller training sets) +python trajectory_compressor.py --input=data/trajectories.jsonl --sample_percent=15 + +# Custom output and token target +python trajectory_compressor.py \ + --input=data/trajectories.jsonl \ + --output=data/compressed.jsonl \ + --target_max_tokens=16000 +``` + +**Features:** +- Protects first turns (system, human, first GPT response, first tool call) +- Protects last N turns (configurable) +- Summarizes middle turns using LLM to fit target token budget +- Supports both directory and single file input +- Optional random sampling with `--sample_percent` +- Configurable via `configs/trajectory_compression.yaml` ### Ephemeral System Prompts @@ -282,7 +339,7 @@ python batch_runner.py \ The ephemeral prompt will influence the model's behavior during execution, but **only the standard 
tool-calling system prompt** will be saved in the trajectory files. -**Documentation:** See [docs/ephemeral_system_prompt.md](docs/ephemeral_system_prompt.md) for complete details. +The ephemeral prompt influences model behavior during execution, but **only the standard tool-calling system prompt** is saved in trajectory files. ## Command Line Arguments @@ -321,11 +378,13 @@ All environment variables can be configured in the `.env` file (copy from `.env. - `FAL_KEY`: Image generation tools **Terminal Tool Configuration (mini-swe-agent backend):** -- `TERMINAL_ENV`: Backend type - `local`, `docker`, or `modal` (default: `local`) -- `TERMINAL_DOCKER_IMAGE`: Docker image to use (default: `python:3.11-slim`) +- `TERMINAL_ENV`: Backend type - `local`, `docker`, `singularity`, or `modal` (default: `local`) +- `TERMINAL_DOCKER_IMAGE`: Docker image for docker backend (default: `python:3.11-slim`) +- `TERMINAL_SINGULARITY_IMAGE`: Singularity/Apptainer image (can be `docker://...` URL or local `.sif` path) - `TERMINAL_TIMEOUT`: Command timeout in seconds (default: `60`) - `TERMINAL_LIFETIME_SECONDS`: Cleanup inactive environments after this time (default: `300`) - `TERMINAL_CWD`: Working directory inside containers (default: `/tmp`) +- `TERMINAL_SCRATCH_DIR`: Custom scratch directory for sandbox storage (optional, auto-detects `/scratch`) **Browser Tool Configuration (agent-browser + Browserbase):** - `BROWSERBASE_API_KEY`: Browserbase API key for cloud browser execution @@ -340,18 +399,16 @@ All environment variables can be configured in the `.env` file (copy from `.env. 
**Debug Options:** - `WEB_TOOLS_DEBUG`, `VISION_TOOLS_DEBUG`, `MOA_TOOLS_DEBUG`, `IMAGE_TOOLS_DEBUG`: Enable debug logging -## Documentation +## Key Files -**Single Agent Usage:** -- `TOOLSETS_README.md`: Comprehensive guide to the toolsets system -- `toolsets.py`: View and modify available toolsets -- `model_tools.py`: Core tool definitions and handlers - -**Batch Processing:** -- `QUICKSTART_BATCH.md`: 5-minute quick start guide -- `BATCH_PROCESSING.md`: Complete batch processing documentation -- `toolset_distributions.py`: Toolset distributions for data generation - -## Examples - -See `TOOLSETS_README.md` for extensive examples of using different toolsets for various scenarios. +| File | Purpose | +|------|---------| +| `run_agent.py` | Main agent runner - single query execution | +| `batch_runner.py` | Parallel batch processing with checkpointing | +| `model_tools.py` | Core tool definitions and handlers | +| `toolsets.py` | Toolset definitions and composition | +| `toolset_distributions.py` | Probability distributions for data generation | +| `trajectory_compressor.py` | Post-process trajectories for training | +| `tools/` | Individual tool implementations | +| `architecture/` | Design documentation | +| `configs/` | Example batch run scripts | diff --git a/architecture/agents.md b/architecture/agents.md index 625d4ccf5..2691d8cb1 100644 --- a/architecture/agents.md +++ b/architecture/agents.md @@ -1,55 +1,104 @@ # Agents -Agents can be viewed as an FSM using an LLM to generate inputs into the system that operates over a DAG. +The agent is the core loop that orchestrates LLM calls and tool execution. -What this really means is that the agent is just a function without memory that uses text inputs and outputs in a -defined order. +## AIAgent Class + +The main agent is implemented in `run_agent.py`: ```python -def my_agent(*args, **kwargs) -> str: - # do whatever you want! - return "Hi I'm an agent!" 
+class AIAgent: + def __init__( + self, + model: str = "anthropic/claude-sonnet-4", + api_key: str = None, + base_url: str = "https://openrouter.ai/api/v1", + max_turns: int = 20, + enabled_toolsets: list = None, + disabled_toolsets: list = None, + verbose_logging: bool = False, + ): + # Initialize OpenAI client, load tools based on toolsets + ... + + def chat(self, user_message: str, task_id: str = None) -> str: + # Main entry point - runs the agent loop + ... ``` -Now obviously, that's like saying water's wet, but we're going to be using that definition to inform our design of the -library, namely, that we should *not* store agent state outside the function call. +## Agent Loop -## The Agent Class +The core loop in `_run_agent_loop()`: -So we don't have state, why are we using a class? - -Well, we want to initialize things, we want to have some configuration, and we want to have some helper functions. -Preferably all in a single place. +``` +1. Add user message to conversation +2. Call LLM with tools +3. If LLM returns tool calls: + - Execute each tool + - Add tool results to conversation + - Go to step 2 +4. If LLM returns text response: + - Return response to user +``` ```python -class BaseAgent: - def agent_primitives(self) -> list[BaseAgent]: - # Returns a list of Agents that are utilized by this agent to generate inputs - # We use agent primitives here instead of subagents because these are going to be part - # of the message graph, not a subagent tool call. 
- raise NotImplementedError +while turns < max_turns: + response = client.chat.completions.create( + model=model, + messages=messages, + tools=tool_schemas, + ) - def tools(self) -> list[BaseTool]: - # Returns a list of tools that the agent needs to run - raise NotImplementedError - - - def run(self, config, *args, **kwargs) -> ConversationGraph: - llm = get_llm(config) - tools = self.tools() - for agent in self.agent_primitives(): - tools.extend(agent.tools()) - tools = remove_duplicates(tools) - tools = initialize_tools(tools, config) - return self(llm, tools, config, *args, **kwargs) - - @staticmethod - def __call__(self, llm, tools, config, *args, **kwargs) -> ConversationGraph: - # Returns a ConversationGraph that can be parsed to get the output of the agent - # Use w/e args/kwargs you want, as long as llm/tools/config are satisfied. - raise NotImplementedError + if response.tool_calls: + for tool_call in response.tool_calls: + result = await execute_tool(tool_call) + messages.append(tool_result_message(result)) + turns += 1 + else: + return response.content ``` -Doesn't seem too bad (I hope), it is a bit annoying that we don't initialize everything in the constructor, but -hopefully we all kinda like it :) +## Conversation Management +Messages are stored as a list of dicts following OpenAI format: + +```python +messages = [ + {"role": "system", "content": "You are a helpful assistant..."}, + {"role": "user", "content": "Search for Python tutorials"}, + {"role": "assistant", "content": None, "tool_calls": [...]}, + {"role": "tool", "tool_call_id": "...", "content": "..."}, + {"role": "assistant", "content": "Here's what I found..."}, +] +``` + +## Reasoning Context + +For models that support reasoning (chain-of-thought), the agent: +1. Extracts `reasoning_content` from API responses +2. Stores it in `assistant_msg["reasoning"]` for trajectory export +3. 
Passes it back via `reasoning_content` field on subsequent turns + +## Trajectory Export + +Conversations can be exported for training: + +```python +agent = AIAgent(save_trajectories=True) +agent.chat("Do something") +# Saves to trajectories/*.jsonl in ShareGPT format +``` + +## Batch Processing + +For processing multiple prompts, use `batch_runner.py`: + +```bash +python batch_runner.py \ + --dataset_file=prompts.jsonl \ + --batch_size=20 \ + --num_workers=4 \ + --run_name=my_run +``` + +See `batch_runner.py` for parallel execution with checkpointing. diff --git a/architecture/llm_client.md b/architecture/llm_client.md index fe15d23a4..8566b71ab 100644 --- a/architecture/llm_client.md +++ b/architecture/llm_client.md @@ -1,14 +1,124 @@ # LLM Client -A quick wrapper over openai apis +Hermes Agent uses the OpenAI Python SDK with OpenRouter as the backend, providing access to many models through a single API. -## Responsibilities +## Configuration -- Transform "normal" chat/completions requests into graphs -- Translate graphs into LLM requests -- Keep a history of graphs parsed by it - - On Policy Data - - Deduplicating graphs, so we don't keep previous history as separate graphs +```python +from openai import OpenAI -## How to use -Exactly the same as the openai api! Just with the additional support of graph inputs and outputs. 
\ No newline at end of file +client = OpenAI( + api_key=os.getenv("OPENROUTER_API_KEY"), + base_url="https://openrouter.ai/api/v1" +) +``` + +## Supported Models + +Any model available on [OpenRouter](https://openrouter.ai/models): + +```python +# Anthropic +model = "anthropic/claude-sonnet-4" +model = "anthropic/claude-opus-4" + +# OpenAI +model = "openai/gpt-4o" +model = "openai/o1" + +# Google +model = "google/gemini-2.0-flash" + +# Open models +model = "meta-llama/llama-3.3-70b-instruct" +model = "deepseek/deepseek-chat-v3" +model = "moonshotai/kimi-k2.5" +``` + +## Tool Calling + +Standard OpenAI function calling format: + +```python +response = client.chat.completions.create( + model=model, + messages=messages, + tools=[ + { + "type": "function", + "function": { + "name": "web_search", + "description": "Search the web", + "parameters": { + "type": "object", + "properties": { + "query": {"type": "string"} + }, + "required": ["query"] + } + } + } + ], +) + +# Check for tool calls +if response.choices[0].message.tool_calls: + for tool_call in response.choices[0].message.tool_calls: + name = tool_call.function.name + args = json.loads(tool_call.function.arguments) + # Execute tool... +``` + +## Reasoning Models + +Some models return reasoning/thinking content: + +```python +# Access reasoning if available +message = response.choices[0].message +if hasattr(message, 'reasoning_content') and message.reasoning_content: + reasoning = message.reasoning_content + # Store for trajectory export +``` + +## Provider Selection + +OpenRouter allows selecting specific providers: + +```python +response = client.chat.completions.create( + model=model, + messages=messages, + extra_body={ + "provider": { + "order": ["Anthropic", "Google"], # Preferred providers + "ignore": ["Novita"], # Providers to skip + } + } +) +``` + +## Error Handling + +Common errors and handling: + +```python +try: + response = client.chat.completions.create(...) 
+except openai.RateLimitError: + ... # Back off and retry +except openai.APIError as e: + ... # Check e.code for specific errors + # 400 = bad request (often provider-specific) + # 502 = bad gateway (retry with different provider) +``` + +## Cost Tracking + +OpenRouter returns usage info: + +```python +usage = response.usage +print(f"Tokens: {usage.prompt_tokens} + {usage.completion_tokens}") +print(f"Cost: ${usage.cost:.6f}") # If available +``` diff --git a/architecture/message_graph.md b/architecture/message_graph.md index 251a3a416..a615d27f1 100644 --- a/architecture/message_graph.md +++ b/architecture/message_graph.md @@ -1,114 +1,121 @@ -# Message Graph +# Message Format & Trajectories -```mermaid -graph TD - %% Message nodes - SystemMsg["📋 System Message
Role: System
Content: Messages are nodes in a graph"] - UserMsg["👤 User Message
Role: User
Content: But messages aren't the only thing in the graph"] - subgraph PrevMessages["Previous Messages"] - PrevSystemMsg["📋 System Message
Role: System
Content: Edits are kept in the graph as context"] - PrevUserMsg["👤 User Message
Role: User
Content: So we can ensure they're immutable while keeping them editable"] - end - - %% Chat Response as a subgraph - subgraph ChatResponseBox["💬 Chat Response"] - ChatMetadata["📊 Metadata
Temp: 1.0
..."] - ChatResponseText["📝 Response
Hello, Here's a subagent call: <tool>subagent</tool>"] - ChatContent["Content: Hello, Here's a subagent call..."] - end - - %% Tool Response as a subgraph - subgraph ToolResponseBox["🔧 Tool Response"] - subgraph ToolMetadata["📊 Tool Metadata"] - ToolMetadataLength["Length: 3"] - subgraph ToolChat["💭 Subagent Chat"] - SubagentSystem["📋 System
Content: Subagent call received"] - SubagentUser["👤 User
Content: Process this request"] - SubagentAssistant["🤖 Assistant
Content: Processing..."] - SubagentSystem --> SubagentUser - SubagentUser --> SubagentAssistant - end - end - ToolContent["Content: Subagent call output"] - end - - %% Graph flow connections - SystemMsg --> UserMsg - PrevSystemMsg --> PrevUserMsg - PrevMessages -.-> UserMsg - UserMsg --> ChatResponseBox - ChatResponseBox --> ToolResponseBox - - class SystemMsg,UserMsg messageNode - class ChatResponseBox responseNode - class ToolResponseBox responseNode - class ChatMetadata,ChatResponseText,ChatContent,ToolMetadata,ToolChat,ToolContent,ToolMetadataLength metadataNode -``` +Hermes Agent uses two message formats: the **API format** for LLM calls and the **trajectory format** for training data export. -Messages should be a graph (DAG, specifically) of immutable elements. +## API Message Format -## Why immutable elements? -We want to train on policy -- This means the context cannot change after we call a response. - -## Why a graph? -Nodes and connections are a natural way to represent the flow of information in an agent conversation. - -## Will this be annoying to deal with? - -It shouldn't be! While there will be internal stuff that may look ???, for the interface, it should be as simple as your -normal context window edits, so `message_history[2]['content'] = my_edit`, but internally we'll deal with the recordkeeping -and how this ends up parsing into on policy training data, if requested. - -## Edges - -Edges are the connections between nodes, and there are two types we are concerned with: -- **Sequential edges**: These represent the flow of conversation, connecting messages in the order they were sent. For example, a user message followed by an assistant response. -- **Parallel edges**: These represent versioning, e.g. edit history, context squishing, etc. -We, however, are only concerned about parallel edges when we break the prefix, and ignore any other parallel edges. - -## So what does this look like in practice? 
+Standard OpenAI chat format used during execution: ```python -import copy +messages = [ + # System prompt + {"role": "system", "content": "You are a helpful assistant with tools..."}, + + # User query + {"role": "user", "content": "Search for Python tutorials"}, + + # Assistant with tool call + { + "role": "assistant", + "content": None, + "tool_calls": [{ + "id": "call_abc123", + "type": "function", + "function": { + "name": "web_search", + "arguments": "{\"query\": \"Python tutorials\"}" + } + }] + }, + + # Tool result + { + "role": "tool", + "tool_call_id": "call_abc123", + "content": "{\"results\": [...]}" + }, + + # Final response + {"role": "assistant", "content": "Here's what I found..."} +] +``` +## Trajectory Format (ShareGPT) -class MessageGraph: - def __init__(self): - self.messages = [] - self.prev_graph = None +Exported for training in ShareGPT format: - def append(self, message): - self.messages.append(message) +```json +{ + "conversations": [ + {"from": "system", "value": "You are a helpful assistant..."}, + {"from": "human", "value": "Search for Python tutorials"}, + {"from": "gpt", "value": "\n{\"name\": \"web_search\", \"arguments\": {\"query\": \"Python tutorials\"}}\n"}, + {"from": "tool", "value": "\n{\"results\": [...]}\n"}, + {"from": "gpt", "value": "Here's what I found..."} + ], + "tools": "[{\"type\": \"function\", \"function\": {...}}]", + "source": "hermes-agent" +} +``` - def __getitem__(self, index): - return self.messages[index] +## Reasoning Content - def __setitem__(self, key, value): - # check if an assistant message is after this indx - needs_new_graph = False - first_idx = -1 - for i in range(key, len(self.messages)): - if (i == key) and (value['role'] == 'assistant') and (value['content'] == self.messages[i]['content']): - # no op - return - needs_new_graph = needs_new_graph or (self.messages[i]['role'] == 'assistant') - if needs_new_graph and first_idx == -1: - first_idx = i - if needs_new_graph: - self.prev_graph = 
copy.deepcopy(self) - self.messages[key] = value +For models that output reasoning/chain-of-thought: - def __len__(self): - return len(self.messages) +**During execution** (API format): +```python +# Stored internally but not sent back to model in content +assistant_msg = { + "role": "assistant", + "content": "Here's what I found...", + "reasoning": "Let me think about this step by step..." # Internal only +} +``` - def __eq__(self, other): - return "\n\n".join(f"{msg['role']}: {msg['content']}" for msg in self) == "\n\n".join( - f"{msg['role']}: {msg['content']}" for msg in other) +**In trajectory export** (reasoning wrapped in tags): +```json +{ + "from": "gpt", + "value": "\nLet me think about this step by step...\n\nHere's what I found..." +} +``` +## Conversion Flow -# in use -messages = MessageGraph() -messages.append({'role': 'system', 'content': 'Hello, I am a system message'}) -messages[0] = {'role': 'user', 'content': 'Hello, I am a user message'} -``` \ No newline at end of file +``` +API Response → Internal Storage → Trajectory Export + ↓ ↓ ↓ +tool_calls reasoning field tags +reasoning_content tags +``` + +The conversion happens in `_convert_to_trajectory_format()` in `run_agent.py`. + +## Ephemeral System Prompts + +Batch processing supports ephemeral system prompts that guide behavior during execution but are NOT saved to trajectories: + +```python +# During execution: full system prompt + ephemeral guidance +messages = [ + {"role": "system", "content": SYSTEM_PROMPT + "\n\n" + ephemeral_prompt}, + ... +] + +# In saved trajectory: only the base system prompt +trajectory = { + "conversations": [ + {"from": "system", "value": SYSTEM_PROMPT}, # No ephemeral + ... 
+ ] +} +``` + +## Trajectory Compression + +Long trajectories can be compressed for training using `trajectory_compressor.py`: + +- Protects first/last N turns +- Summarizes middle turns with LLM +- Targets specific token budget +- See `configs/trajectory_compression.yaml` for settings diff --git a/architecture/tools.md b/architecture/tools.md index b899c5ebd..37a82a370 100644 --- a/architecture/tools.md +++ b/architecture/tools.md @@ -1,16 +1,102 @@ # Tools -Not much on this, yet. Tools are just a stateful wrapper around a function, so we can do things like: -- Keep a docker container running -- Keep a game online +Tools are functions that extend the agent's capabilities. Each tool is defined with an OpenAI-compatible JSON schema and an async handler function. + +## Tool Structure + +Each tool module in `tools/` exports: +1. **Schema definitions** - OpenAI function-calling format +2. **Handler functions** - Async functions that execute the tool ```python -class BaseTool: - def definitions(self) -> List[Dict[str, Any]]: - # OpenAI API compatible definitions - raise NotImplementedError - - def __call__(self, *args, **kwargs) -> Dict[str, Any]: - # Returns at minimum {'role': 'tool', 'content': '...'} - raise NotImplementedError -``` \ No newline at end of file +# Example: tools/web_tools.py + +# Schema definition +WEB_SEARCH_SCHEMA = { + "type": "function", + "function": { + "name": "web_search", + "description": "Search the web for information", + "parameters": { + "type": "object", + "properties": { + "query": {"type": "string", "description": "Search query"} + }, + "required": ["query"] + } + } +} + +# Handler function +async def web_search(query: str) -> dict: + """Execute web search and return results.""" + # Implementation... 
+ return {"results": [...]} +``` + +## Tool Categories + +| Category | Module | Tools | +|----------|--------|-------| +| **Web** | `web_tools.py` | `web_search`, `web_extract`, `web_crawl` | +| **Terminal** | `terminal_tool.py` | `terminal` (local/docker/singularity/modal backends) | +| **Browser** | `browser_tool.py` | `browser_navigate`, `browser_click`, `browser_type`, etc. | +| **Vision** | `vision_tools.py` | `vision_analyze` | +| **Image Gen** | `image_generation_tool.py` | `image_generate` | +| **Reasoning** | `mixture_of_agents_tool.py` | `mixture_of_agents` | + +## Tool Registration + +Tools are registered in `model_tools.py`: + +```python +# model_tools.py +TOOL_SCHEMAS = [ + *WEB_TOOL_SCHEMAS, + *TERMINAL_TOOL_SCHEMAS, + *BROWSER_TOOL_SCHEMAS, + # ... +] + +TOOL_HANDLERS = { + "web_search": web_search, + "terminal": terminal_tool, + "browser_navigate": browser_navigate, + # ... +} +``` + +## Toolsets + +Tools are grouped into **toolsets** for logical organization (see `toolsets.py`): + +```python +TOOLSETS = { + "web": { + "description": "Web search and content extraction", + "tools": ["web_search", "web_extract", "web_crawl"] + }, + "terminal": { + "description": "Command execution", + "tools": ["terminal"] + }, + # ... +} +``` + +## Adding a New Tool + +1. Create handler function in `tools/your_tool.py` +2. Define JSON schema following OpenAI format +3. Register in `model_tools.py` (schemas and handlers) +4. Add to appropriate toolset in `toolsets.py` +5. Update `tools/__init__.py` exports + +## Stateful Tools + +Some tools maintain state across calls within a session: + +- **Terminal**: Keeps container/sandbox running between commands +- **Browser**: Maintains browser session for multi-step navigation + +State is managed per `task_id` and cleaned up automatically. 
diff --git a/package.json b/package.json index e648b3aa3..d95916310 100644 --- a/package.json +++ b/package.json @@ -2,27 +2,23 @@ "name": "hermes-agent", "version": "1.0.0", "description": "An AI agent with advanced tool-calling capabilities, featuring a flexible toolsets system for organizing and managing tools.", - "main": "index.js", - "directories": { - "doc": "docs", - "example": "examples", - "test": "tests" - }, + "private": true, "scripts": { - "test": "echo \"Error: no test specified\" && exit 1" + "postinstall": "echo '✅ Browser tools ready. Run: python run_agent.py --help'" }, "repository": { "type": "git", "url": "git+https://github.com/NousResearch/Hermes-Agent.git" }, - "keywords": [], - "author": "", - "license": "ISC", + "license": "MIT", "bugs": { "url": "https://github.com/NousResearch/Hermes-Agent/issues" }, "homepage": "https://github.com/NousResearch/Hermes-Agent#readme", "dependencies": { "agent-browser": "^0.7.6" + }, + "engines": { + "node": ">=18.0.0" } } diff --git a/pyproject.toml b/pyproject.toml index 17ad4e69e..10e257f77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,21 +8,38 @@ version = "0.1.0" description = "AI agent with advanced tool-calling and toolsets" readme = "README.md" requires-python = ">=3.10" -authors = [{ name = "Hermes Agent" }] +authors = [{ name = "Nous Research" }] license = { text = "MIT" } dependencies = [ - "firecrawl-py", + # Core "openai", - "fal-client", "python-dotenv", - "fire" + "fire", + "httpx", + "rich", + "tenacity", + "pyyaml", + "requests", + "jinja2", + "pydantic>=2.0", + # Tools + "firecrawl-py", + "fal-client", + # mini-swe-agent deps (terminal tool) + "litellm>=1.75.5", + "typer", + "platformdirs", ] +[project.optional-dependencies] +modal = ["modal", "boto3"] +dev = ["pytest", "pytest-asyncio"] + [project.scripts] hermes-agent = "run_agent:main" [tool.setuptools] -py-modules = ["run_agent", "model_tools", "toolsets"] +py-modules = ["run_agent", "model_tools", "toolsets", "batch_runner", 
"trajectory_compressor", "toolset_distributions"] [tool.setuptools.packages.find] include = ["tools"]