From 2bf96ad244611ea067e2f24cd22a09e620a0fa09 Mon Sep 17 00:00:00 2001
From: teknium1 <teknium1@gmail.com>
Date: Mon, 23 Feb 2026 23:55:42 -0800
Subject: [PATCH] feat: add ephemeral prefill messages and system prompt
 loading

- Added loading of ephemeral prefill messages from a JSON file containing an array of {role, content} dicts, used for few-shot priming of the agent; relative paths are resolved from ~/.hermes/.
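- Added loading of an ephemeral system prompt from the HERMES_EPHEMERAL_SYSTEM_PROMPT environment variable, falling back to agent.system_prompt in the config file; it is appended to the system message at API-call time only, so it stays out of the cached/stored system prompt.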
- Updated the CLI and gateway to pass the prefill messages and ephemeral system prompt to AIAgent, which now injects the prefill messages right after the system prompt on every API call instead of storing them in the messages list.
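- Added the prefill_messages_file config key and the HERMES_PREFILL_MESSAGES_FILE and HERMES_EPHEMERAL_SYSTEM_PROMPT environment variables; both values are applied at API-call time only and are never persisted.
- Also capped tool results at 100,000 characters, made read_file point to vision_analyze instead of inlining images as base64, and lowered web tool failure logs from error to debug level.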
---
 cli.py                   | 49 +++++++++++++++++++++++---
 gateway/run.py           | 75 ++++++++++++++++++++++++++++++++++++++--
 hermes_cli/config.py     | 19 ++++++++++
 run_agent.py             | 66 +++++++++++++++++++++++++----------
 tools/file_operations.py | 27 +++++++++++++--
 tools/file_tools.py      |  2 +-
 tools/web_tools.py       | 16 ++++-----
 7 files changed, 218 insertions(+), 36 deletions(-)

diff --git a/cli.py b/cli.py
index ce0b00ad4..42394efa1 100755
--- a/cli.py
+++ b/cli.py
@@ -61,6 +61,35 @@ if env_path.exists():
 # Configuration Loading
 # =============================================================================
 
+def _load_prefill_messages(file_path: str) -> List[Dict[str, Any]]:
+    """Load ephemeral prefill messages from a JSON file.
+    
+    The file should contain a JSON array of {role, content} dicts, e.g.:
+        [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}]
+    
+    Relative paths are resolved from ~/.hermes/.
+    Returns an empty list if the path is empty or the file doesn't exist.
+    """
+    if not file_path:
+        return []
+    path = Path(file_path).expanduser()
+    if not path.is_absolute():
+        path = Path.home() / ".hermes" / path
+    if not path.exists():
+        logger.warning("Prefill messages file not found: %s", path)
+        return []
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+        if not isinstance(data, list):
+            logger.warning("Prefill messages file must contain a JSON array: %s", path)
+            return []
+        return data
+    except Exception as e:
+        logger.warning("Failed to load prefill messages from %s: %s", path, e)
+        return []
+
+
 def load_cli_config() -> Dict[str, Any]:
     """
     Load CLI configuration from config files.
@@ -116,6 +145,7 @@ def load_cli_config() -> Dict[str, Any]:
             "max_turns": 60,  # Default max tool-calling iterations
             "verbose": False,
             "system_prompt": "",
+            "prefill_messages_file": "",
             "personalities": {
                 "helpful": "You are a helpful, friendly AI assistant.",
                 "concise": "You are a concise assistant. Keep responses brief and to the point.",
@@ -753,10 +783,18 @@ class HermesCLI:
             if invalid:
                 self.console.print(f"[bold red]Warning: Unknown toolsets: {', '.join(invalid)}[/]")
         
-        # System prompt and personalities from config
-        self.system_prompt = CLI_CONFIG["agent"].get("system_prompt", "")
+        # Ephemeral system prompt: env var takes precedence, then config
+        self.system_prompt = (
+            os.getenv("HERMES_EPHEMERAL_SYSTEM_PROMPT", "")
+            or CLI_CONFIG["agent"].get("system_prompt", "")
+        )
         self.personalities = CLI_CONFIG["agent"].get("personalities", {})
         
+        # Ephemeral prefill messages (few-shot priming, never persisted)
+        self.prefill_messages = _load_prefill_messages(
+            CLI_CONFIG["agent"].get("prefill_messages_file", "")
+        )
+        
         # Agent will be initialized on first use
         self.agent: Optional[AIAgent] = None
         self._app = None  # prompt_toolkit Application (set in run())
@@ -848,10 +886,11 @@ class HermesCLI:
                 max_iterations=self.max_turns,
                 enabled_toolsets=self.enabled_toolsets,
                 verbose_logging=self.verbose,
-                quiet_mode=True,  # Suppress verbose output for clean CLI
+                quiet_mode=True,
                 ephemeral_system_prompt=self.system_prompt if self.system_prompt else None,
-                session_id=self.session_id,  # Pass CLI's session ID to agent
-                platform="cli",  # CLI interface — agent uses terminal-friendly formatting
+                prefill_messages=self.prefill_messages or None,
+                session_id=self.session_id,
+                platform="cli",
                 session_db=self._session_db,
                 clarify_callback=self._clarify_callback,
             )
diff --git a/gateway/run.py b/gateway/run.py
index b7be4478e..9c07d28e5 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -92,6 +92,11 @@ class GatewayRunner:
         self.config = config or load_gateway_config()
         self.adapters: Dict[Platform, BasePlatformAdapter] = {}
 
+        # Load ephemeral config from config.yaml / env vars.
+        # Both are injected at API-call time only and never persisted.
+        self._prefill_messages = self._load_prefill_messages()
+        self._ephemeral_system_prompt = self._load_ephemeral_system_prompt()
+
         # Wire process registry into session store for reset protection
         from tools.process_registry import process_registry
         self.session_store = SessionStore(
@@ -119,6 +124,66 @@ class GatewayRunner:
         from gateway.hooks import HookRegistry
         self.hooks = HookRegistry()
     
+    @staticmethod
+    def _load_prefill_messages() -> List[Dict[str, Any]]:
+        """Load ephemeral prefill messages from config or env var.
+        
+        Checks HERMES_PREFILL_MESSAGES_FILE env var first, then falls back to
+        the prefill_messages_file key in ~/.hermes/config.yaml.
+        Relative paths are resolved from ~/.hermes/.
+        """
+        import json as _json
+        file_path = os.getenv("HERMES_PREFILL_MESSAGES_FILE", "")
+        if not file_path:
+            try:
+                import yaml as _y
+                cfg_path = Path.home() / ".hermes" / "config.yaml"
+                if cfg_path.exists():
+                    with open(cfg_path) as _f:
+                        cfg = _y.safe_load(_f) or {}
+                    file_path = cfg.get("prefill_messages_file", "")
+            except Exception:
+                pass
+        if not file_path:
+            return []
+        path = Path(file_path).expanduser()
+        if not path.is_absolute():
+            path = Path.home() / ".hermes" / path
+        if not path.exists():
+            logger.warning("Prefill messages file not found: %s", path)
+            return []
+        try:
+            with open(path, "r", encoding="utf-8") as f:
+                data = _json.load(f)
+            if not isinstance(data, list):
+                logger.warning("Prefill messages file must contain a JSON array: %s", path)
+                return []
+            return data
+        except Exception as e:
+            logger.warning("Failed to load prefill messages from %s: %s", path, e)
+            return []
+
+    @staticmethod
+    def _load_ephemeral_system_prompt() -> str:
+        """Load ephemeral system prompt from config or env var.
+        
+        Checks HERMES_EPHEMERAL_SYSTEM_PROMPT env var first, then falls back to
+        agent.system_prompt in ~/.hermes/config.yaml.
+        """
+        prompt = os.getenv("HERMES_EPHEMERAL_SYSTEM_PROMPT", "")
+        if prompt:
+            return prompt
+        try:
+            import yaml as _y
+            cfg_path = Path.home() / ".hermes" / "config.yaml"
+            if cfg_path.exists():
+                with open(cfg_path) as _f:
+                    cfg = _y.safe_load(_f) or {}
+                return (cfg.get("agent", {}).get("system_prompt", "") or "").strip()
+        except Exception:
+            pass
+        return ""
+
     async def start(self) -> bool:
         """
         Start the gateway and all configured platform adapters.
@@ -1275,15 +1340,21 @@ class GatewayRunner:
             # Platform.LOCAL ("local") maps to "cli"; others pass through as-is.
             platform_key = "cli" if source.platform == Platform.LOCAL else source.platform.value
             
+            # Combine platform context with user-configured ephemeral system prompt
+            combined_ephemeral = context_prompt or ""
+            if self._ephemeral_system_prompt:
+                combined_ephemeral = (combined_ephemeral + "\n\n" + self._ephemeral_system_prompt).strip()
+            
             agent = AIAgent(
                 model=os.getenv("HERMES_MODEL", "anthropic/claude-opus-4.6"),
                 max_iterations=max_iterations,
                 quiet_mode=True,
                 enabled_toolsets=enabled_toolsets,
-                ephemeral_system_prompt=context_prompt,
+                ephemeral_system_prompt=combined_ephemeral or None,
+                prefill_messages=self._prefill_messages or None,
                 session_id=session_id,
                 tool_progress_callback=progress_callback if tool_progress_enabled else None,
-                platform=platform_key,  # Tells the agent which interface to format for
+                platform=platform_key,
             )
             
             # Store agent reference for interrupt support
diff --git a/hermes_cli/config.py b/hermes_cli/config.py
index ba45ad9f1..0b2868fae 100644
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@@ -122,6 +122,11 @@ DEFAULT_CONFIG = {
         "user_char_limit": 1375,     # ~500 tokens at 2.75 chars/token
     },
     
+    # Ephemeral prefill messages file — JSON list of {role, content} dicts
+    # injected at the start of every API call for few-shot priming.
+    # Never saved to sessions, logs, or trajectories.
+    "prefill_messages_file": "",
+    
     # Permanently allowed dangerous command patterns (added via "always" approval)
     "command_allowlist": [],
     
@@ -312,6 +317,20 @@ OPTIONAL_ENV_VARS = {
         "password": False,
         "category": "setting",
     },
+    "HERMES_PREFILL_MESSAGES_FILE": {
+        "description": "Path to JSON file with ephemeral prefill messages for few-shot priming",
+        "prompt": "Prefill messages file path",
+        "url": None,
+        "password": False,
+        "category": "setting",
+    },
+    "HERMES_EPHEMERAL_SYSTEM_PROMPT": {
+        "description": "Ephemeral system prompt injected at API-call time (never persisted to sessions)",
+        "prompt": "Ephemeral system prompt",
+        "url": None,
+        "password": False,
+        "category": "setting",
+    },
 }
 
 
diff --git a/run_agent.py b/run_agent.py
index 3b557451d..3b31b9947 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -677,12 +677,10 @@ class AIAgent:
             "value": user_query
         })
         
-        # Calculate where agent responses start in the messages list.
-        # Prefill messages are ephemeral (only used to prime model response style)
-        # so we skip them entirely in the saved trajectory.
-        # Layout: [*prefill_msgs, actual_user_msg, ...agent_responses...]
-        num_prefill = len(self.prefill_messages) if self.prefill_messages else 0
-        i = num_prefill + 1  # Skip prefill messages + the actual user message (already added above)
+        # Skip the first message (the user query) since we already added it above.
+        # Prefill messages are injected at API-call time only (not in the messages
+        # list), so no offset adjustment is needed here.
+        i = 1
         
         while i < len(messages):
             msg = messages[i]
@@ -1043,9 +1041,10 @@ class AIAgent:
         if tool_guidance:
             prompt_parts.append(" ".join(tool_guidance))
 
-        caller_prompt = system_message if system_message is not None else self.ephemeral_system_prompt
-        if caller_prompt:
-            prompt_parts.append(caller_prompt)
+        # Note: ephemeral_system_prompt is NOT included here. It's injected at
+        # API-call time only so it stays out of the cached/stored system prompt.
+        if system_message is not None:
+            prompt_parts.append(system_message)
 
         if self._memory_store:
             if self._memory_enabled:
@@ -1510,6 +1509,19 @@ class AIAgent:
                 logging.debug(f"Tool {function_name} completed in {tool_duration:.2f}s")
                 logging.debug(f"Tool result preview: {result_preview}...")
 
+            # Guard against tools returning absurdly large content that would
+            # blow up the context window. 100K chars ≈ 25K tokens — generous
+            # enough for any reasonable tool output but prevents catastrophic
+            # context explosions (e.g. accidental base64 image dumps).
+            MAX_TOOL_RESULT_CHARS = 100_000
+            if len(function_result) > MAX_TOOL_RESULT_CHARS:
+                original_len = len(function_result)
+                function_result = (
+                    function_result[:MAX_TOOL_RESULT_CHARS]
+                    + f"\n\n[Truncated: tool response was {original_len:,} chars, "
+                    f"exceeding the {MAX_TOOL_RESULT_CHARS:,} char limit]"
+                )
+
             tool_msg = {
                 "role": "tool",
                 "content": function_result,
@@ -1551,8 +1563,15 @@ class AIAgent:
 
         try:
             api_messages = messages.copy()
+            effective_system = self._cached_system_prompt or ""
             if self.ephemeral_system_prompt:
-                api_messages = [{"role": "system", "content": self.ephemeral_system_prompt}] + api_messages
+                effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip()
+            if effective_system:
+                api_messages = [{"role": "system", "content": effective_system}] + api_messages
+            if self.prefill_messages:
+                sys_offset = 1 if effective_system else 0
+                for idx, pfm in enumerate(self.prefill_messages):
+                    api_messages.insert(sys_offset + idx, pfm.copy())
 
             summary_extra_body = {}
             if "openrouter" in self.base_url.lower():
@@ -1628,11 +1647,10 @@ class AIAgent:
         if conversation_history and not self._todo_store.has_items():
             self._hydrate_todo_store(conversation_history)
         
-        # Inject prefill messages at the start of conversation (before user's actual prompt)
-        # This is used for few-shot priming, e.g., a greeting exchange to set response style
-        if self.prefill_messages and not conversation_history:
-            for prefill_msg in self.prefill_messages:
-                messages.append(prefill_msg.copy())
+        # Prefill messages (few-shot priming) are injected at API-call time only,
+        # never stored in the messages list. This keeps them ephemeral: they won't
+        # be saved to session DB, session logs, or batch trajectories, but they're
+        # automatically re-applied on every API call (including session continuations).
         
         # Track user turns for memory flush and periodic nudge logic
         self._user_turn_count += 1
@@ -1733,9 +1751,21 @@ class AIAgent:
                 # The signature field helps maintain reasoning continuity
                 api_messages.append(api_msg)
             
-            if active_system_prompt:
-                # Insert system message at the beginning
-                api_messages = [{"role": "system", "content": active_system_prompt}] + api_messages
+            # Build the final system message: cached prompt + ephemeral system prompt.
+            # The ephemeral part is appended here (not baked into the cached prompt)
+            # so it stays out of the session DB and logs.
+            effective_system = active_system_prompt or ""
+            if self.ephemeral_system_prompt:
+                effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip()
+            if effective_system:
+                api_messages = [{"role": "system", "content": effective_system}] + api_messages
+            
+            # Inject ephemeral prefill messages right after the system prompt
+            # but before conversation history. Same API-call-time-only pattern.
+            if self.prefill_messages:
+                sys_offset = 1 if effective_system else 0
+                for idx, pfm in enumerate(self.prefill_messages):
+                    api_messages.insert(sys_offset + idx, pfm.copy())
             
             # Apply Anthropic prompt caching for Claude models via OpenRouter.
             # Auto-detected: if model name contains "claude" and base_url is OpenRouter,
diff --git a/tools/file_operations.py b/tools/file_operations.py
index 64ce6d86a..ae7dad79c 100644
--- a/tools/file_operations.py
+++ b/tools/file_operations.py
@@ -412,9 +412,17 @@ class ShellFileOperations(FileOperations):
             # Still try to read, but warn
             pass
         
-        # Check if it's an image - return base64
+        # Images are never inlined — redirect to the vision tool
         if self._is_image(path):
-            return self._read_image(path)
+            return ReadResult(
+                is_image=True,
+                is_binary=True,
+                file_size=file_size,
+                hint=(
+                    "Image file detected. Automatically redirected to vision_analyze tool. "
+                    "Use vision_analyze with this file path to inspect the image contents."
+                ),
+            )
         
         # Read a sample to check for binary content
         sample_cmd = f"head -c 1000 {self._escape_shell_arg(path)} 2>/dev/null"
@@ -457,6 +465,10 @@ class ShellFileOperations(FileOperations):
             hint=hint
         )
     
+    # Images larger than this are too expensive to inline as base64 in the
+    # conversation context. Return metadata only and suggest vision_analyze.
+    MAX_IMAGE_BYTES = 512 * 1024  # 512 KB
+
     def _read_image(self, path: str) -> ReadResult:
         """Read an image file, returning base64 content."""
         # Get file size
@@ -467,6 +479,17 @@ class ShellFileOperations(FileOperations):
         except ValueError:
             file_size = 0
         
+        if file_size > self.MAX_IMAGE_BYTES:
+            return ReadResult(
+                is_image=True,
+                is_binary=True,
+                file_size=file_size,
+                hint=(
+                    f"Image is too large to inline ({file_size:,} bytes). "
+                    "Use vision_analyze to inspect the image, or reference it by path."
+                ),
+            )
+        
         # Get base64 content
         b64_cmd = f"base64 -w 0 {self._escape_shell_arg(path)} 2>/dev/null"
         b64_result = self._exec(b64_cmd, timeout=30)
diff --git a/tools/file_tools.py b/tools/file_tools.py
index c9708b2eb..91d69c411 100644
--- a/tools/file_tools.py
+++ b/tools/file_tools.py
@@ -199,7 +199,7 @@ def _check_file_reqs():
 
 READ_FILE_SCHEMA = {
     "name": "read_file",
-    "description": "Read a file with line numbers and pagination. Use this instead of cat/head/tail in terminal. Output format: 'LINE_NUM|CONTENT'. Suggests similar filenames if not found. Images (png/jpg/gif/webp) returned as base64. Use offset and limit for large files.",
+    "description": "Read a text file with line numbers and pagination. Use this instead of cat/head/tail in terminal. Output format: 'LINE_NUM|CONTENT'. Suggests similar filenames if not found. Use offset and limit for large files. NOTE: Cannot read images or binary files — use vision_analyze for images.",
     "parameters": {
         "type": "object",
         "properties": {
diff --git a/tools/web_tools.py b/tools/web_tools.py
index ed98b05c2..43a7da55a 100644
--- a/tools/web_tools.py
+++ b/tools/web_tools.py
@@ -159,7 +159,7 @@ async def process_content_with_llm(
         return processed_content
         
     except Exception as e:
-        logger.error("Error processing content with LLM: %s", e)
+        logger.debug("Error processing content with LLM: %s", e)
         return f"[Failed to process content: {str(e)[:100]}. Content size: {len(content):,} chars]"
 
 
@@ -318,7 +318,7 @@ async def _process_large_content_chunked(
             summaries.append(f"## Section {chunk_idx + 1}\n{summary}")
     
     if not summaries:
-        logger.error("All chunk summarizations failed")
+        logger.debug("All chunk summarizations failed")
         return "[Failed to process large content: all chunk summarizations failed]"
     
     logger.info("Got %d/%d chunk summaries", len(summaries), len(chunks))
@@ -532,7 +532,7 @@ def web_search_tool(query: str, limit: int = 5) -> str:
         
     except Exception as e:
         error_msg = f"Error searching web: {str(e)}"
-        logger.error("%s", error_msg)
+        logger.debug("%s", error_msg)
         
         debug_call_data["error"] = error_msg
         _debug.log_call("web_search_tool", debug_call_data)
@@ -673,7 +673,7 @@ async def web_extract_tool(
                 })
                 
             except Exception as scrape_err:
-                logger.error("Error scraping %s: %s", url, scrape_err)
+                logger.debug("Scrape failed for %s: %s", url, scrape_err)
                 results.append({
                     "url": url,
                     "title": "",
@@ -799,7 +799,7 @@ async def web_extract_tool(
             
     except Exception as e:
         error_msg = f"Error extracting content: {str(e)}"
-        logger.error("%s", error_msg)
+        logger.debug("%s", error_msg)
         
         debug_call_data["error"] = error_msg
         _debug.log_call("web_extract_tool", debug_call_data)
@@ -892,7 +892,7 @@ async def web_crawl_tool(
                 **crawl_params
             )
         except Exception as e:
-            logger.error("Crawl API call failed: %s", e)
+            logger.debug("Crawl API call failed: %s", e)
             raise
 
         pages: List[Dict[str, Any]] = []
@@ -1092,7 +1092,7 @@ async def web_crawl_tool(
         
     except Exception as e:
         error_msg = f"Error crawling website: {str(e)}"
-        logger.error("%s", error_msg)
+        logger.debug("%s", error_msg)
         
         debug_call_data["error"] = error_msg
         _debug.log_call("web_crawl_tool", debug_call_data)
@@ -1227,7 +1227,7 @@ WEB_SEARCH_SCHEMA = {
 
 WEB_EXTRACT_SCHEMA = {
     "name": "web_extract",
-    "description": "Extract content from web page URLs. Pages under 5000 chars return raw content; larger pages are LLM-summarized and capped at ~5000 chars per page. Pages over 2M chars are refused. Use browser tools only when pages require interaction or dynamic content.",
+    "description": "Extract content from web page URLs. Returns page content in markdown format. Pages under 5000 chars return full markdown; larger pages are LLM-summarized and capped at ~5000 chars per page. Pages over 2M chars are refused. If a URL fails or times out, use the browser tool to access it instead.",
     "parameters": {
         "type": "object",
         "properties": {