diff --git a/cli.py b/cli.py
index 7f712d702..ddcc2157b 100644
--- a/cli.py
+++ b/cli.py
@@ -5508,13 +5508,25 @@ class HermesCLI:
             def run_agent():
                 nonlocal result
                 agent_message = _voice_prefix + message if _voice_prefix else message
-                result = self.agent.run_conversation(
-                    user_message=agent_message,
-                    conversation_history=self.conversation_history[:-1],  # Exclude the message we just added
-                    stream_callback=stream_callback,
-                    task_id=self.session_id,
-                    persist_user_message=message if _voice_prefix else None,
-                )
+                try:  # run_agent is the Thread target below, so failures are handled here
+                    result = self.agent.run_conversation(
+                        user_message=agent_message,
+                        conversation_history=self.conversation_history[:-1],  # Exclude the message we just added
+                        stream_callback=stream_callback,
+                        task_id=self.session_id,
+                        persist_user_message=message if _voice_prefix else None,
+                    )
+                except Exception as exc:  # broad on purpose: any failure becomes an error result
+                    logging.error("run_conversation raised: %s", exc, exc_info=True)  # keeps the traceback in the log
+                    _summary = getattr(self.agent, '_summarize_api_error', lambda e: str(e)[:300])(exc)  # agent's summarizer if present, else first 300 chars of str(exc)
+                    result = {  # same keys as the failure dict returned in run_agent.py
+                        "final_response": f"Error: {_summary}",
+                        "messages": [],  # NOTE(review): empty list; confirm the caller does not overwrite conversation_history with it
+                        "api_calls": 0,
+                        "completed": False,
+                        "failed": True,
+                        "error": _summary,
+                    }
 
             # Start agent in background thread
             agent_thread = threading.Thread(target=run_agent)
diff --git a/run_agent.py b/run_agent.py
index 28519905e..810fea4b6 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -6697,7 +6697,15 @@ class AIAgent:
                         self._dump_api_request_debug(
                             api_kwargs, reason="max_retries_exhausted", error=api_error,
                         )
-                        raise api_error
+                        self._persist_session(messages, conversation_history)  # persist before returning, now that this path no longer raises
+                        return {  # callers get a failure dict here instead of the re-raised api_error
+                            "final_response": f"API call failed after {max_retries} retries: {_final_summary}",
+                            "messages": messages,
+                            "api_calls": api_call_count,
+                            "completed": False,
+                            "failed": True,
+                            "error": _final_summary,  # NOTE(review): _final_summary is assigned outside this hunk; confirm it is always set on this path
+                        }
 
                     wait_time = min(2 ** retry_count, 60)  # Exponential backoff: 2s, 4s, 8s, 16s, 32s, 60s, 60s
                     logger.warning(
diff --git a/tests/test_run_agent.py b/tests/test_run_agent.py
index 3dd9a134b..d961244f3 100644
--- a/tests/test_run_agent.py
+++ b/tests/test_run_agent.py
@@ -1425,8 +1425,8 @@ class TestRetryExhaustion:
         assert "error" in result
         assert "Invalid API response" in result["error"]
 
-    def test_api_error_raises_after_retries(self, agent):
-        """Exhausted retries on API errors must raise, not fall through."""
+    def test_api_error_returns_gracefully_after_retries(self, agent):
+        """Exhausted retries on API errors must return an error result, not raise."""
         self._setup_agent(agent)
         agent.client.chat.completions.create.side_effect = RuntimeError("rate limited")
         with (
@@ -1435,8 +1435,11 @@ class TestRetryExhaustion:
             patch.object(agent, "_cleanup_task_resources"),
             patch("run_agent.time", self._make_fast_time_mock()),
         ):
-            with pytest.raises(RuntimeError, match="rate limited"):
-                agent.run_conversation("hello")
+            result = agent.run_conversation("hello")  # must return normally even though every API call raises
+        assert result.get("completed") is False
+        assert result.get("failed") is True
+        assert "error" in result
+        assert "rate limited" in result["error"]  # the original exception text must surface in the error summary
 
 
 # ---------------------------------------------------------------------------