From 9e85408c7bfd6024754709800ab762402d1a2816 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Tue, 17 Feb 2026 23:30:31 -0800 Subject: [PATCH] Add todo tool for task management and enhance CLI features - Introduced a new `todo_tool.py` for planning and tracking multi-step tasks, enhancing the agent's capabilities. - Updated CLI to include a floating autocomplete dropdown for commands and improved user instructions for better navigation. - Revised toolsets to incorporate the new `todo` tool and updated documentation to reflect changes in available tools and commands. - Enhanced user experience with new keybindings and clearer command descriptions in the CLI. --- AGENTS.md | 22 +++++++++++++++++++--- README.md | 24 ++++++++++++++++++++---- TODO.md | 8 ++++---- cli-config.yaml.example | 11 ++++++++--- docs/tools.md | 7 ++++++- hermes_cli/setup.py | 3 +++ toolsets.py | 20 ++++++++++++++++++++ 7 files changed, 80 insertions(+), 15 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index ea0c32edb..df36223d0 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -25,6 +25,7 @@ hermes-agent/ │ ├── uninstall.py # Uninstaller │ └── cron.py # Cron job management ├── tools/ # Tool implementations +│ ├── todo_tool.py # Planning & task management (in-memory TodoStore) │ ├── process_registry.py # Background process management (spawn, poll, wait, kill) │ ├── transcription_tools.py # Speech-to-text (Whisper API) ├── gateway/ # Messaging platform adapters @@ -151,13 +152,23 @@ For models that support chain-of-thought reasoning: The interactive CLI uses: - **Rich** - For the welcome banner and styled panels -- **prompt_toolkit** - For fixed input area with history and `patch_stdout` -- **KawaiiSpinner** (in run_agent.py) - Animated feedback during API calls and tool execution +- **prompt_toolkit** - For fixed input area with history, `patch_stdout`, slash command autocomplete, and floating completion menus +- **KawaiiSpinner** (in run_agent.py) - Animated kawaii faces during API calls; clean `┊` 
activity feed for tool execution results Key components: - `HermesCLI` class - Main CLI controller with commands and conversation loop +- `SlashCommandCompleter` - Autocomplete dropdown for `/commands` (type `/` to see all) - `load_cli_config()` - Loads config, sets environment variables for terminal - `build_welcome_banner()` - Displays ASCII art logo, tools, and skills summary + +CLI UX notes: +- Thinking spinner (during LLM API call) shows animated kawaii face + verb (`(⌐■_■) deliberating...`) +- When LLM returns tool calls, the spinner clears silently (no "got it!" noise) +- Tool execution results appear as a clean activity feed: `┊ {emoji} {verb} {detail} {duration}` +- "got it!" only appears when the LLM returns a final text response (`⚕ ready`) +- The prompt shows `⚕ ❯` when the agent is working, `❯` when idle +- Pasting 5+ lines auto-saves to `~/.hermes/pastes/` and collapses to a reference +- Multi-line input via Alt+Enter or Ctrl+J - `/commands` - Process user commands like `/help`, `/clear`, `/personality`, etc. CLI uses `quiet_mode=True` when creating AIAgent to suppress verbose logging. @@ -472,7 +483,12 @@ Follow this strict order to maintain consistency: - Add to `OPTIONAL_ENV_VARS` in `hermes_cli/config.py` - The tool will be auto-disabled if the key is missing -6. Optionally add to `toolset_distributions.py` for batch processing +6. Add your tool's name (e.g. `"todo"`) to the relevant platform toolsets (`hermes-cli`, `hermes-telegram`, etc.) + +7. Optionally add to `toolset_distributions.py` for batch processing + +**Special case: tools that need agent-level state** (like `todo`): +If your tool needs access to the AIAgent instance (e.g., in-memory state per session), intercept it directly in `run_agent.py`'s tool dispatch loop *before* `handle_function_call()`. Add a fallback error in `handle_function_call()` for safety. See `todo_tool.py` and the `if function_name == "todo":` block in `run_agent.py` for the pattern. 
For RL environments, add the same intercept in `environments/agent_loop.py`. ### Tool Implementation Pattern diff --git a/README.md b/README.md index 40cd6ae2f..14c294a8c 100644 --- a/README.md +++ b/README.md @@ -107,16 +107,32 @@ hermes version # Show version info ### CLI Commands (inside chat) +Type `/` to see an autocomplete dropdown of all commands. + | Command | Description | |---------|-------------| | `/help` | Show available commands | | `/tools` | List available tools | +| `/toolsets` | List available toolsets | | `/model [name]` | Show or change model | +| `/prompt` | View/set custom system prompt | | `/personality [name]` | Set personality (kawaii, pirate, etc.) | -| `/clear` | Clear screen and reset | -| `/cron` | Manage scheduled tasks | +| `/clear` | Clear screen and reset conversation | +| `/history` | Show conversation history | +| `/reset` | Reset conversation only (keep screen) | +| `/retry` | Retry the last message | +| `/undo` | Remove the last exchange | +| `/save` | Save the current conversation | | `/config` | Show current configuration | -| `/quit` | Exit | +| `/cron` | Manage scheduled tasks | +| `/platforms` | Show gateway/messaging platform status | +| `/quit` | Exit (also: `/exit`, `/q`) | + +**Keybindings:** +- `Enter` — send message +- `Alt+Enter` or `Ctrl+J` — new line (multi-line input) +- `Ctrl+C` — interrupt agent (double-press to force exit) +- `Ctrl+D` — exit --- @@ -134,7 +150,7 @@ hermes --toolsets "web,terminal" hermes --list-tools ``` -**Available toolsets:** `web`, `terminal`, `browser`, `vision`, `creative`, `reasoning`, `skills`, `tts`, `cronjob`, and more. +**Available toolsets:** `web`, `terminal`, `file`, `browser`, `vision`, `image_gen`, `moa`, `skills`, `tts`, `todo`, `cronjob`, and more. 
### 🔊 Text-to-Speech diff --git a/TODO.md b/TODO.md index 9f0f945c3..994e1cb3c 100644 --- a/TODO.md +++ b/TODO.md @@ -4,7 +4,7 @@ ## What We Already Have (for reference) -**42+ tools** across 12 toolsets: web (search, extract), terminal + process management, file ops (read, write, patch, search), vision, MoA reasoning, image gen, browser (10 tools via Browserbase), skills (41 skills), cronjobs, RL training (10 tools via Tinker-Atropos), TTS, cross-channel messaging. +**43+ tools** across 13 toolsets: web (search, extract), terminal + process management, file ops (read, write, patch, search), vision, MoA reasoning, image gen, browser (10 tools via Browserbase), skills (41 skills), **todo (task planning)**, cronjobs, RL training (10 tools via Tinker-Atropos), TTS, cross-channel messaging. **4 platform adapters**: Telegram, Discord, WhatsApp, Slack -- all with typing indicators, image/voice auto-analysis, dangerous command approval, interrupt support, background process watchers. @@ -41,9 +41,9 @@ The main agent becomes an orchestrator that delegates context-heavy tasks to sub --- -## 2. Planning & Task Management 📋 +## 2. Planning & Task Management 📋 ✅ -**Status:** Not started +**Status:** Implemented **Priority:** High -- every serious agent has this now A `todo` tool the agent uses to decompose complex tasks, track progress, and recover from failures. Must be **cache-friendly** -- no system prompt mutation, no injected messages that invalidate the KV cache prefix. @@ -935,7 +935,7 @@ This goes in the tool description: **Tier 1 (High impact, foundation for everything else):** 1. Programmatic Tool Calling (code-mediated tool use) -- #20 2. Memory System (Phase 1: MEMORY.md + USER.md) -- #5 -3. Planning & Task Management (todo tool) -- #2 +3. ~~Planning & Task Management (todo tool) -- #2~~ **DONE** 4. Session Transcript Search -- #6 5. 
Self-Learning from Errors -- #16 diff --git a/cli-config.yaml.example b/cli-config.yaml.example index 6da5297ba..069472380 100644 --- a/cli-config.yaml.example +++ b/cli-config.yaml.example @@ -185,15 +185,20 @@ agent: # # web - Web search and content extraction (web_search, web_extract) # search - Web search only, no scraping (web_search) -# terminal - Command execution (terminal) +# terminal - Command execution and process management (terminal, process) +# file - File operations: read, write, patch, search # browser - Full browser automation (navigate, click, type, screenshot, etc.) # vision - Image analysis (vision_analyze) # image_gen - Image generation with FLUX (image_generate) -# skills - Load skill documents (skills_categories, skills_list, skill_view) +# skills - Load skill documents (skills_list, skill_view) # moa - Mixture of Agents reasoning (mixture_of_agents) +# todo - Task planning and tracking for multi-step work +# tts - Text-to-speech (Edge TTS free, ElevenLabs, OpenAI) +# cronjob - Schedule and manage automated tasks (CLI-only) +# rl - RL training tools (Tinker-Atropos) # # Composite toolsets: -# debugging - terminal + web (for troubleshooting) +# debugging - terminal + web + file (for troubleshooting) # safe - web + vision + moa (no terminal access) # ----------------------------------------------------------------------------- diff --git a/docs/tools.md b/docs/tools.md index 3fc60e14e..a8682d9ce 100644 --- a/docs/tools.md +++ b/docs/tools.md @@ -47,6 +47,7 @@ async def web_search(query: str) -> dict: | **TTS** | `tts_tool.py` | `text_to_speech` (Edge TTS free / ElevenLabs / OpenAI) | | **Reasoning** | `mixture_of_agents_tool.py` | `mixture_of_agents` | | **Skills** | `skills_tool.py` | `skills_list`, `skill_view` | +| **Todo** | `todo_tool.py` | `todo` (read/write task list for multi-step planning) | | **Cronjob** | `cronjob_tools.py` | `schedule_cronjob`, `list_cronjobs`, `remove_cronjob` | | **RL Training** | `rl_training_tool.py` | 
`rl_list_environments`, `rl_start_training`, `rl_check_status`, etc. | @@ -83,7 +84,11 @@ TOOLSETS = { }, "terminal": { "description": "Command execution", - "tools": ["terminal"] + "tools": ["terminal", "process"] + }, + "todo": { + "description": "Task planning and tracking for multi-step work", + "tools": ["todo"] }, # ... } diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py index adf024279..dc5c26c0a 100644 --- a/hermes_cli/setup.py +++ b/hermes_cli/setup.py @@ -202,6 +202,9 @@ def _print_setup_summary(config: dict, hermes_home): # Terminal (always available if system deps met) tool_status.append(("Terminal/Commands", True, None)) + # Task planning (always available, in-memory) + tool_status.append(("Task Planning (todo)", True, None)) + # Skills (always available if skills dir exists) tool_status.append(("Skills Knowledge Base", True, None)) diff --git a/toolsets.py b/toolsets.py index 54b103431..1f0013c6a 100644 --- a/toolsets.py +++ b/toolsets.py @@ -189,6 +189,11 @@ TOOLSETS = { "image_generate", # Text-to-speech "text_to_speech", + # Browser automation (requires Browserbase API key) + "browser_navigate", "browser_snapshot", "browser_click", + "browser_type", "browser_scroll", "browser_back", + "browser_press", "browser_close", "browser_get_images", + "browser_vision", # Skills - access knowledge base "skills_list", "skill_view", # Planning & task management @@ -216,6 +221,11 @@ TOOLSETS = { "image_generate", # Text-to-speech "text_to_speech", + # Browser automation (requires Browserbase API key) + "browser_navigate", "browser_snapshot", "browser_click", + "browser_type", "browser_scroll", "browser_back", + "browser_press", "browser_close", "browser_get_images", + "browser_vision", # Skills - access knowledge base "skills_list", "skill_view", # Planning & task management @@ -243,6 +253,11 @@ TOOLSETS = { "image_generate", # Text-to-speech "text_to_speech", + # Browser automation (requires Browserbase API key) + "browser_navigate", "browser_snapshot", 
"browser_click", + "browser_type", "browser_scroll", "browser_back", + "browser_press", "browser_close", "browser_get_images", + "browser_vision", # Skills "skills_list", "skill_view", # Planning & task management @@ -270,6 +285,11 @@ TOOLSETS = { "image_generate", # Text-to-speech "text_to_speech", + # Browser automation (requires Browserbase API key) + "browser_navigate", "browser_snapshot", "browser_click", + "browser_type", "browser_scroll", "browser_back", + "browser_press", "browser_close", "browser_get_images", + "browser_vision", # Skills - access knowledge base "skills_list", "skill_view", # Planning & task management