Merge branch 'main' into codex/align-codex-provider-conventions-mainrepo

2026-02-28 18:13:38 -08:00
parent 32070e6bc0 7f7643cf63
commit 5a79e423fe
96 changed files with 10884 additions and 447 deletions
--- a/run_agent.py
+++ b/run_agent.py
@@ -128,6 +128,7 @@ class AIAgent:
        session_id: str = None,
        tool_progress_callback: callable = None,
        clarify_callback: callable = None,
+        step_callback: callable = None,
        max_tokens: int = None,
        reasoning_config: Dict[str, Any] = None,
        prefill_messages: List[Dict[str, Any]] = None,
@@ -135,6 +136,7 @@ class AIAgent:
        skip_context_files: bool = False,
        skip_memory: bool = False,
        session_db=None,
+        honcho_session_key: str = None,
    ):
        """
        Initialize the AI Agent.
@@ -174,6 +176,8 @@ class AIAgent:
            skip_context_files (bool): If True, skip auto-injection of SOUL.md, AGENTS.md, and .cursorrules
                into the system prompt. Use this for batch processing and data generation to avoid
                polluting trajectories with user-specific persona or project instructions.
+            honcho_session_key (str): Session key for Honcho integration (e.g., "telegram:123456" or CLI session_id).
+                When provided and Honcho is enabled in config, enables persistent cross-session user modeling.
        """
        self.model = model
        self.max_iterations = max_iterations
@@ -200,8 +204,16 @@ class AIAgent:
            self.provider = "openai-codex"
        else:
            self.api_mode = "chat_completions"
+        if base_url and "api.anthropic.com" in base_url.strip().lower():
+            raise ValueError(
+                "Anthropic's native /v1/messages API is not supported yet (planned for a future release). "
+                "Hermes currently requires OpenAI-compatible /chat/completions endpoints. "
+                "To use Claude models now, route through OpenRouter (OPENROUTER_API_KEY) "
+                "or any OpenAI-compatible proxy that wraps the Anthropic API."
+            )
        self.tool_progress_callback = tool_progress_callback
        self.clarify_callback = clarify_callback
+        self.step_callback = step_callback
        self._last_reported_tool = None  # Track for "new tool" mode
        
        # Interrupt mechanism for breaking out of tool loops
@@ -304,7 +316,7 @@ class AIAgent:
            client_kwargs["default_headers"] = {
                "HTTP-Referer": "https://github.com/NousResearch/hermes-agent",
                "X-OpenRouter-Title": "Hermes Agent",
-                "X-OpenRouter-Categories": "cli-agent",
+                "X-OpenRouter-Categories": "productivity,cli-agent",
            }
        
        self._client_kwargs = client_kwargs  # stored for rebuilding after interrupt
@@ -435,6 +447,46 @@ class AIAgent:
            except Exception:
                pass  # Memory is optional -- don't break agent init
        
+        # Honcho AI-native memory (cross-session user modeling)
+        # Reads ~/.honcho/config.json as the single source of truth.
+        self._honcho = None  # HonchoSessionManager | None
+        self._honcho_session_key = honcho_session_key
+        if not skip_memory:
+            try:
+                from honcho_integration.client import HonchoClientConfig, get_honcho_client
+                hcfg = HonchoClientConfig.from_global_config()
+                if hcfg.enabled and hcfg.api_key:
+                    from honcho_integration.session import HonchoSessionManager
+                    client = get_honcho_client(hcfg)
+                    self._honcho = HonchoSessionManager(
+                        honcho=client,
+                        config=hcfg,
+                        context_tokens=hcfg.context_tokens,
+                    )
+                    # Resolve session key: explicit arg > global sessions map > fallback
+                    if not self._honcho_session_key:
+                        self._honcho_session_key = (
+                            hcfg.resolve_session_name()
+                            or "hermes-default"
+                        )
+                    # Ensure session exists in Honcho
+                    self._honcho.get_or_create(self._honcho_session_key)
+                    # Inject session context into the honcho tool module
+                    from tools.honcho_tools import set_session_context
+                    set_session_context(self._honcho, self._honcho_session_key)
+                    logger.info(
+                        "Honcho active (session: %s, user: %s, workspace: %s)",
+                        self._honcho_session_key, hcfg.peer_name, hcfg.workspace_id,
+                    )
+                else:
+                    if not hcfg.enabled:
+                        logger.debug("Honcho disabled in global config")
+                    elif not hcfg.api_key:
+                        logger.debug("Honcho enabled but no API key configured")
+            except Exception as e:
+                logger.debug("Honcho init failed (non-fatal): %s", e)
+                self._honcho = None
+
        # Skills config: nudge interval for skill creation reminders
        self._skill_nudge_interval = 15
        try:
@@ -446,9 +498,10 @@ class AIAgent:
        
        # Initialize context compressor for automatic context management
        # Compresses conversation when approaching model's context limit
-        # Configuration via environment variables (can be set in .env or cli-config.yaml)
+        # Configuration via config.yaml (compression section) or environment variables
        compression_threshold = float(os.getenv("CONTEXT_COMPRESSION_THRESHOLD", "0.85"))
        compression_enabled = os.getenv("CONTEXT_COMPRESSION_ENABLED", "true").lower() in ("true", "1", "yes")
+        compression_summary_model = os.getenv("CONTEXT_COMPRESSION_MODEL") or None
        
        self.context_compressor = ContextCompressor(
            model=self.model,
@@ -456,6 +509,7 @@ class AIAgent:
            protect_first_n=3,
            protect_last_n=4,
            summary_target_tokens=500,
+            summary_model_override=compression_summary_model,
            quiet_mode=self.quiet_mode,
        )
        self.compression_enabled = compression_enabled
@@ -467,6 +521,21 @@ class AIAgent:
            else:
                print(f"📊 Context limit: {self.context_compressor.context_length:,} tokens (auto-compression disabled)")
    
+    def _max_tokens_param(self, value: int) -> dict:
+        """Return the correct max tokens kwarg for the current provider.
+        
+        OpenAI's newer models (gpt-4o, o-series, gpt-5+) require
+        'max_completion_tokens'. OpenRouter, local models, and older
+        OpenAI models use 'max_tokens'.
+        """
+        _is_direct_openai = (
+            "api.openai.com" in self.base_url.lower()
+            and "openrouter" not in self.base_url.lower()
+        )
+        if _is_direct_openai:
+            return {"max_completion_tokens": value}
+        return {"max_tokens": value}
+
    def _has_content_after_think_block(self, content: str) -> bool:
        """
        Check if content has actual text after any <think></think> blocks.
@@ -669,7 +738,7 @@ class AIAgent:
        if not self._session_db:
            return
        try:
-            start_idx = (len(conversation_history) if conversation_history else 0) + 1
+            start_idx = len(conversation_history) if conversation_history else 0
            for msg in messages[start_idx:]:
                role = msg.get("role", "unknown")
                content = msg.get("content")
@@ -1016,8 +1085,6 @@ class AIAgent:
        if not content:
            return content
        content = convert_scratchpad_to_think(content)
-        # Strip extra newlines before/after think blocks
-        import re
        content = re.sub(r'\n+(<think>)', r'\n\1', content)
        content = re.sub(r'(</think>)\n+', r'\1\n', content)
        return content.strip()
@@ -1144,7 +1211,67 @@ class AIAgent:
    def is_interrupted(self) -> bool:
        """Check if an interrupt has been requested."""
        return self._interrupt_requested
-    
+
+    # ── Honcho integration helpers ──
+
+    def _honcho_prefetch(self, user_message: str) -> str:
+        """Fetch user context from Honcho for system prompt injection.
+
+        Returns a formatted context block, or empty string if unavailable.
+        """
+        if not self._honcho or not self._honcho_session_key:
+            return ""
+        try:
+            ctx = self._honcho.get_prefetch_context(self._honcho_session_key, user_message)
+            if not ctx:
+                return ""
+            parts = []
+            rep = ctx.get("representation", "")
+            card = ctx.get("card", "")
+            if rep:
+                parts.append(rep)
+            if card:
+                parts.append(card)
+            if not parts:
+                return ""
+            return "# Honcho User Context\n" + "\n\n".join(parts)
+        except Exception as e:
+            logger.debug("Honcho prefetch failed (non-fatal): %s", e)
+            return ""
+
+    def _honcho_save_user_observation(self, content: str) -> str:
+        """Route a memory tool target=user add to Honcho.
+
+        Sends the content as a user peer message so Honcho's reasoning
+        model can incorporate it into the user representation.
+        """
+        if not content or not content.strip():
+            return json.dumps({"success": False, "error": "Content cannot be empty."})
+        try:
+            session = self._honcho.get_or_create(self._honcho_session_key)
+            session.add_message("user", f"[observation] {content.strip()}")
+            self._honcho.save(session)
+            return json.dumps({
+                "success": True,
+                "target": "user",
+                "message": "Saved to Honcho user model.",
+            })
+        except Exception as e:
+            logger.debug("Honcho user observation failed: %s", e)
+            return json.dumps({"success": False, "error": f"Honcho save failed: {e}"})
+
+    def _honcho_sync(self, user_content: str, assistant_content: str) -> None:
+        """Sync the user/assistant message pair to Honcho."""
+        if not self._honcho or not self._honcho_session_key:
+            return
+        try:
+            session = self._honcho.get_or_create(self._honcho_session_key)
+            session.add_message("user", user_content)
+            session.add_message("assistant", assistant_content)
+            self._honcho.save(session)
+        except Exception as e:
+            logger.debug("Honcho sync failed (non-fatal): %s", e)
+
    def _build_system_prompt(self, system_message: str = None) -> str:
        """
        Assemble the full system prompt from all layers.
@@ -1184,6 +1311,7 @@ class AIAgent:
                mem_block = self._memory_store.format_for_system_prompt("memory")
                if mem_block:
                    prompt_parts.append(mem_block)
+            # USER.md is always included when enabled -- Honcho prefetch is additive.
            if self._user_profile_enabled:
                user_block = self._memory_store.format_for_system_prompt("user")
                if user_block:
@@ -1865,11 +1993,11 @@ class AIAgent:
            "model": self.model,
            "messages": api_messages,
            "tools": self.tools if self.tools else None,
-            "timeout": 600.0,
+            "timeout": 900.0,
        }

        if self.max_tokens is not None:
-            api_kwargs["max_tokens"] = self.max_tokens
+            api_kwargs.update(self._max_tokens_param(self.max_tokens))

        extra_body = {}

@@ -1994,7 +2122,8 @@ class AIAgent:
            "[System: The session is being compressed. "
            "Please save anything worth remembering to your memories.]"
        )
-        flush_msg = {"role": "user", "content": flush_content}
+        _sentinel = f"__flush_{id(self)}_{time.monotonic()}"
+        flush_msg = {"role": "user", "content": flush_content, "_flush_sentinel": _sentinel}
        messages.append(flush_msg)

        try:
@@ -2023,50 +2152,50 @@ class AIAgent:
                messages.pop()  # remove flush msg
                return

-            if self.api_mode == "codex_responses":
-                codex_kwargs = self._build_api_kwargs(api_messages)
-                codex_kwargs["tools"] = self._responses_tools([memory_tool_def])
-                response = self._run_codex_stream(codex_kwargs)
-                assistant_message, _ = self._normalize_codex_response(response)
-            else:
-                api_kwargs = {
-                    "model": self.model,
-                    "messages": api_messages,
-                    "tools": [memory_tool_def],
-                    "temperature": 0.3,
-                    "max_tokens": 1024,
-                }
-                response = self.client.chat.completions.create(**api_kwargs, timeout=30.0)
-                if not response.choices:
-                    assistant_message = None
-                else:
-                    assistant_message = response.choices[0].message
+            api_kwargs = {
+                "model": self.model,
+                "messages": api_messages,
+                "tools": [memory_tool_def],
+                "temperature": 0.3,
+                **self._max_tokens_param(1024),
+            }

-            if assistant_message and assistant_message.tool_calls:
-                # Execute only memory tool calls
-                for tc in assistant_message.tool_calls:
-                    if tc.function.name == "memory":
-                        try:
-                            args = json.loads(tc.function.arguments)
-                            from tools.memory_tool import memory_tool as _memory_tool
-                            _memory_tool(
-                                action=args.get("action"),
-                                target=args.get("target", "memory"),
-                                content=args.get("content"),
-                                old_text=args.get("old_text"),
-                                store=self._memory_store,
-                            )
-                            if not self.quiet_mode:
-                                print(f"  🧠 Memory flush: saved to {args.get('target', 'memory')}")
-                        except Exception as e:
-                            logger.debug("Memory flush tool call failed: %s", e)
+            response = self.client.chat.completions.create(**api_kwargs, timeout=30.0)
+
+            if response.choices:
+                assistant_message = response.choices[0].message
+                if assistant_message.tool_calls:
+                    # Execute only memory tool calls
+                    for tc in assistant_message.tool_calls:
+                        if tc.function.name == "memory":
+                            try:
+                                args = json.loads(tc.function.arguments)
+                                flush_target = args.get("target", "memory")
+                                from tools.memory_tool import memory_tool as _memory_tool
+                                result = _memory_tool(
+                                    action=args.get("action"),
+                                    target=flush_target,
+                                    content=args.get("content"),
+                                    old_text=args.get("old_text"),
+                                    store=self._memory_store,
+                                )
+                                # Also send user observations to Honcho when active
+                                if self._honcho and flush_target == "user" and args.get("action") == "add":
+                                    self._honcho_save_user_observation(args.get("content", ""))
+                                if not self.quiet_mode:
+                                    print(f"  🧠 Memory flush: saved to {args.get('target', 'memory')}")
+                            except Exception as e:
+                                logger.debug("Memory flush tool call failed: %s", e)
        except Exception as e:
            logger.debug("Memory flush API call failed: %s", e)
        finally:
-            # Strip flush artifacts: remove everything from the flush message onward
-            while messages and messages[-1] is not flush_msg and len(messages) > 0:
+            # Strip flush artifacts: remove everything from the flush message onward.
+            # Use sentinel marker instead of identity check for robustness.
+            while messages and messages[-1].get("_flush_sentinel") != _sentinel:
                messages.pop()
-            if messages and messages[-1] is flush_msg:
+                if not messages:
+                    break
+            if messages and messages[-1].get("_flush_sentinel") == _sentinel:
                messages.pop()

    def _compress_context(self, messages: list, system_message: str, *, approx_tokens: int = None) -> tuple:
@@ -2163,26 +2292,33 @@ class AIAgent:
                tool_duration = time.time() - tool_start_time
                if self.quiet_mode:
                    print(f"  {_get_cute_tool_message_impl('todo', function_args, tool_duration, result=function_result)}")
-            elif function_name == "session_search" and self._session_db:
-                from tools.session_search_tool import session_search as _session_search
-                function_result = _session_search(
-                    query=function_args.get("query", ""),
-                    role_filter=function_args.get("role_filter"),
-                    limit=function_args.get("limit", 3),
-                    db=self._session_db,
-                )
+            elif function_name == "session_search":
+                if not self._session_db:
+                    function_result = json.dumps({"success": False, "error": "Session database not available."})
+                else:
+                    from tools.session_search_tool import session_search as _session_search
+                    function_result = _session_search(
+                        query=function_args.get("query", ""),
+                        role_filter=function_args.get("role_filter"),
+                        limit=function_args.get("limit", 3),
+                        db=self._session_db,
+                    )
                tool_duration = time.time() - tool_start_time
                if self.quiet_mode:
                    print(f"  {_get_cute_tool_message_impl('session_search', function_args, tool_duration, result=function_result)}")
            elif function_name == "memory":
+                target = function_args.get("target", "memory")
                from tools.memory_tool import memory_tool as _memory_tool
                function_result = _memory_tool(
                    action=function_args.get("action"),
-                    target=function_args.get("target", "memory"),
+                    target=target,
                    content=function_args.get("content"),
                    old_text=function_args.get("old_text"),
                    store=self._memory_store,
                )
+                # Also send user observations to Honcho when active
+                if self._honcho and target == "user" and function_args.get("action") == "add":
+                    self._honcho_save_user_observation(function_args.get("content", ""))
                tool_duration = time.time() - tool_start_time
                if self.quiet_mode:
                    print(f"  {_get_cute_tool_message_impl('memory', function_args, tool_duration, result=function_result)}")
@@ -2258,12 +2394,19 @@ class AIAgent:
                try:
                    function_result = handle_function_call(function_name, function_args, effective_task_id)
                    _spinner_result = function_result
+                except Exception as tool_error:
+                    function_result = f"Error executing tool '{function_name}': {tool_error}"
+                    logger.error("handle_function_call raised for %s: %s", function_name, tool_error)
                finally:
                    tool_duration = time.time() - tool_start_time
                    cute_msg = _get_cute_tool_message_impl(function_name, function_args, tool_duration, result=_spinner_result)
                    spinner.stop(cute_msg)
            else:
-                function_result = handle_function_call(function_name, function_args, effective_task_id)
+                try:
+                    function_result = handle_function_call(function_name, function_args, effective_task_id)
+                except Exception as tool_error:
+                    function_result = f"Error executing tool '{function_name}': {tool_error}"
+                    logger.error("handle_function_call raised for %s: %s", function_name, tool_error)
                tool_duration = time.time() - tool_start_time

            result_preview = function_result[:200] if len(function_result) > 200 else function_result
@@ -2350,12 +2493,19 @@ class AIAgent:
            if _is_nous:
                summary_extra_body["tags"] = ["product=hermes-agent"]

-            if self.api_mode == "codex_responses":
-                summary_kwargs = self._build_api_kwargs(api_messages)
-                summary_kwargs["tools"] = None
-                summary_response = self._run_codex_stream(summary_kwargs)
-                assistant_message, _ = self._normalize_codex_response(summary_response)
-                final_response = assistant_message.content or ""
+            summary_kwargs = {
+                "model": self.model,
+                "messages": api_messages,
+            }
+            if self.max_tokens is not None:
+                summary_kwargs.update(self._max_tokens_param(self.max_tokens))
+            if summary_extra_body:
+                summary_kwargs["extra_body"] = summary_extra_body
+
+            summary_response = self.client.chat.completions.create(**summary_kwargs)
+
+            if summary_response.choices and summary_response.choices[0].message.content:
+                final_response = summary_response.choices[0].message.content
                if "<think>" in final_response:
                    final_response = re.sub(r'<think>.*?</think>\s*', '', final_response, flags=re.DOTALL).strip()
                if final_response:
@@ -2435,6 +2585,10 @@ class AIAgent:
        # Track user turns for memory flush and periodic nudge logic
        self._user_turn_count += 1

+        # Preserve the original user message before nudge injection.
+        # Honcho should receive the actual user input, not system nudges.
+        original_user_message = user_message
+
        # Periodic memory nudge: remind the model to consider saving memories.
        # Counter resets whenever the memory tool is actually used.
        if (self._memory_nudge_interval > 0
@@ -2459,6 +2613,14 @@ class AIAgent:
            )
            self._iters_since_skill = 0

+        # Honcho prefetch: retrieve user context for system prompt injection
+        self._honcho_context = ""
+        if self._honcho and self._honcho_session_key:
+            try:
+                self._honcho_context = self._honcho_prefetch(user_message)
+            except Exception as e:
+                logger.debug("Honcho prefetch failed (non-fatal): %s", e)
+
        # Add user message
        user_msg = {"role": "user", "content": user_message}
        messages.append(user_msg)
@@ -2501,6 +2663,22 @@ class AIAgent:
            
            api_call_count += 1

+            # Fire step_callback for gateway hooks (agent:step event)
+            if self.step_callback is not None:
+                try:
+                    prev_tools = []
+                    for _m in reversed(messages):
+                        if _m.get("role") == "assistant" and _m.get("tool_calls"):
+                            prev_tools = [
+                                tc["function"]["name"]
+                                for tc in _m["tool_calls"]
+                                if isinstance(tc, dict)
+                            ]
+                            break
+                    self.step_callback(api_call_count, prev_tools)
+                except Exception as _step_err:
+                    logger.debug("step_callback error (iteration %s): %s", api_call_count, _step_err)
+
            # Track tool-calling iterations for skill nudge.
            # Counter resets whenever skill_manage is actually used.
            if (self._skill_nudge_interval > 0
@@ -2538,6 +2716,8 @@ class AIAgent:
            effective_system = active_system_prompt or ""
            if self.ephemeral_system_prompt:
                effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip()
+            if self._honcho_context:
+                effective_system = (effective_system + "\n\n" + self._honcho_context).strip()
            if effective_system:
                api_messages = [{"role": "system", "content": effective_system}] + api_messages
            
@@ -2587,7 +2767,7 @@ class AIAgent:

            finish_reason = "stop"

-            while retry_count <= max_retries:
+            while retry_count < max_retries:
                try:
                    api_kwargs = self._build_api_kwargs(api_messages)
                    if self.api_mode == "codex_responses":
@@ -2699,6 +2879,7 @@ class AIAgent:
                            if self._interrupt_requested:
                                print(f"{self.log_prefix}⚡ Interrupt detected during retry wait, aborting.")
                                self._persist_session(messages, conversation_history)
+                                self.clear_interrupt()
                                return {
                                    "final_response": "Operation interrupted.",
                                    "messages": messages,
@@ -2837,6 +3018,7 @@ class AIAgent:
                    if self._interrupt_requested:
                        print(f"{self.log_prefix}⚡ Interrupt detected during error handling, aborting retries.")
                        self._persist_session(messages, conversation_history)
+                        self.clear_interrupt()
                        return {
                            "final_response": "Operation interrupted.",
                            "messages": messages,
@@ -2845,10 +3027,45 @@ class AIAgent:
                            "interrupted": True,
                        }
                    
+                    # Check for 413 payload-too-large BEFORE generic 4xx handler.
+                    # A 413 is a payload-size error — the correct response is to
+                    # compress history and retry, not abort immediately.
+                    status_code = getattr(api_error, "status_code", None)
+                    is_payload_too_large = (
+                        status_code == 413
+                        or 'request entity too large' in error_msg
+                        or 'payload too large' in error_msg
+                        or 'error code: 413' in error_msg
+                    )
+
+                    if is_payload_too_large:
+                        print(f"{self.log_prefix}⚠️  Request payload too large (413) - attempting compression...")
+
+                        original_len = len(messages)
+                        messages, active_system_prompt = self._compress_context(
+                            messages, system_message, approx_tokens=approx_tokens
+                        )
+
+                        if len(messages) < original_len:
+                            print(f"{self.log_prefix}   🗜️  Compressed {original_len} → {len(messages)} messages, retrying...")
+                            continue  # Retry with compressed messages
+                        else:
+                            print(f"{self.log_prefix}❌ Payload too large and cannot compress further.")
+                            logging.error(f"{self.log_prefix}413 payload too large. Cannot compress further.")
+                            self._persist_session(messages, conversation_history)
+                            return {
+                                "messages": messages,
+                                "completed": False,
+                                "api_calls": api_call_count,
+                                "error": "Request payload too large (413). Cannot compress further.",
+                                "partial": True
+                            }
+
                    # Check for non-retryable client errors (4xx HTTP status codes).
                    # These indicate a problem with the request itself (bad model ID,
                    # invalid API key, forbidden, etc.) and will never succeed on retry.
-                    is_client_status_error = isinstance(status_code, int) and 400 <= status_code < 500
+                    # Note: 413 is excluded — it's handled above via compression.
+                    is_client_status_error = isinstance(status_code, int) and 400 <= status_code < 500 and status_code != 413
                    is_client_error = is_client_status_error or any(phrase in error_msg for phrase in [
                        'error code: 400', 'error code: 401', 'error code: 403',
                        'error code: 404', 'error code: 422',
@@ -2856,7 +3073,7 @@ class AIAgent:
                        'invalid api key', 'invalid_api_key', 'authentication',
                        'unauthorized', 'forbidden', 'not found',
                    ])
-                    
+
                    if is_client_error:
                        self._dump_api_request_debug(
                            api_kwargs, reason="non_retryable_client_error", error=api_error,
@@ -2876,8 +3093,9 @@ class AIAgent:
                    
                    # Check for non-retryable errors (context length exceeded)
                    is_context_length_error = any(phrase in error_msg for phrase in [
-                        'context length', 'maximum context', 'token limit', 
-                        'too many tokens', 'reduce the length', 'exceeds the limit'
+                        'context length', 'maximum context', 'token limit',
+                        'too many tokens', 'reduce the length', 'exceeds the limit',
+                        'request entity too large',  # OpenRouter/Nous 413 safety net
                    ])
                    
                    if is_context_length_error:
@@ -2912,9 +3130,10 @@ class AIAgent:
                        raise api_error

                    wait_time = min(2 ** retry_count, 60)  # Exponential backoff: 2s, 4s, 8s, 16s, 32s, 60s, 60s
-                    print(f"⚠️  OpenAI-compatible API call failed (attempt {retry_count}/{max_retries}): {str(api_error)[:100]}")
-                    print(f"⏳ Retrying in {wait_time}s...")
                    logging.warning(f"API retry {retry_count}/{max_retries} after error: {api_error}")
+                    if retry_count >= max_retries:
+                        print(f"{self.log_prefix}⚠️  API call failed after {retry_count} attempts: {str(api_error)[:100]}")
+                        print(f"{self.log_prefix}⏳ Final retry in {wait_time}s...")
                    
                    # Sleep in small increments so we can respond to interrupts quickly
                    # instead of blocking the entire wait_time in one sleep() call
@@ -2923,6 +3142,7 @@ class AIAgent:
                        if self._interrupt_requested:
                            print(f"{self.log_prefix}⚡ Interrupt detected during retry wait, aborting.")
                            self._persist_session(messages, conversation_history)
+                            self.clear_interrupt()
                            return {
                                "final_response": "Operation interrupted.",
                                "messages": messages,
@@ -3194,7 +3414,8 @@ class AIAgent:
                                            tool_names.append(fn.get("name", "unknown"))
                                        msg["content"] = f"Calling the {', '.join(tool_names)} tool{'s' if len(tool_names) > 1 else ''}..."
                                        break
-                                final_response = fallback
+                                # Strip <think> blocks from fallback content for user display
+                                final_response = self._strip_think_blocks(fallback).strip()
                                break
                            
                            # No fallback -- append the empty message as-is
@@ -3253,6 +3474,9 @@ class AIAgent:

                    codex_ack_continuations = 0
                    
+                    # Strip <think> blocks from user-facing response (keep raw in messages for trajectory)
+                    final_response = self._strip_think_blocks(final_response).strip()
+                    
                    final_msg = self._build_assistant_message(assistant_message, finish_reason)
                    
                    messages.append(final_msg)
@@ -3327,7 +3551,11 @@ class AIAgent:

        # Persist session to both JSON log and SQLite
        self._persist_session(messages, conversation_history)
-        
+
+        # Sync conversation to Honcho for user modeling
+        if final_response and not interrupted:
+            self._honcho_sync(original_user_message, final_response)
+
        # Build result with interrupt info if applicable
        result = {
            "final_response": final_response,