Merge PR #754: fix: stabilize system prompt across gateway turns for cache hits

Prevents unnecessary Anthropic prompt cache misses by reusing stored system prompts for continuing sessions and stabilizing Honcho context per session instead of per turn.
2026-03-09 02:00:14 -07:00
parent aaf8f2d2d2 aedb773f0d
commit a2d0d07109
2 changed files with 178 additions and 10 deletions
--- a/run_agent.py
+++ b/run_agent.py
@@ -3092,9 +3092,14 @@ class AIAgent:
            )
            self._iters_since_skill = 0

-        # Honcho prefetch: retrieve user context for system prompt injection
+        # Honcho prefetch: retrieve user context for system prompt injection.
+        # Only on the FIRST turn of a session (empty history).  On subsequent
+        # turns the model already has all prior context in its conversation
+        # history, and the Honcho context is baked into the stored system
+        # prompt — re-fetching it would change the system message and break
+        # Anthropic prompt caching.
        self._honcho_context = ""
-        if self._honcho and self._honcho_session_key:
+        if self._honcho and self._honcho_session_key and not conversation_history:
            try:
                self._honcho_context = self._honcho_prefetch(user_message)
            except Exception as e:
@@ -3112,14 +3117,42 @@ class AIAgent:
        # Built once on first call, reused for all subsequent calls.
        # Only rebuilt after context compression events (which invalidate
        # the cache and reload memory from disk).
+        #
+        # For continuing sessions (gateway creates a fresh AIAgent per
+        # message), we load the stored system prompt from the session DB
+        # instead of rebuilding.  Rebuilding would pick up memory changes
+        # from disk that the model already knows about (it wrote them!),
+        # producing a different system prompt and breaking the Anthropic
+        # prefix cache.
        if self._cached_system_prompt is None:
-            self._cached_system_prompt = self._build_system_prompt(system_message)
-            # Store the system prompt snapshot in SQLite
-            if self._session_db:
+            stored_prompt = None
+            if conversation_history and self._session_db:
                try:
-                    self._session_db.update_system_prompt(self.session_id, self._cached_system_prompt)
-                except Exception as e:
-                    logger.debug("Session DB update_system_prompt failed: %s", e)
+                    session_row = self._session_db.get_session(self.session_id)
+                    if session_row:
+                        stored_prompt = session_row.get("system_prompt") or None
+                except Exception:
+                    pass  # Fall through to build fresh
+
+            if stored_prompt:
+                # Continuing session — reuse the exact system prompt from
+                # the previous turn so the Anthropic cache prefix matches.
+                self._cached_system_prompt = stored_prompt
+            else:
+                # No stored prompt to reuse (first turn, or none was loaded) — build from scratch.
+                self._cached_system_prompt = self._build_system_prompt(system_message)
+                # Bake Honcho context into the prompt so it's stable for
+                # the entire session (not re-fetched per turn).
+                if self._honcho_context:
+                    self._cached_system_prompt = (
+                        self._cached_system_prompt + "\n\n" + self._honcho_context
+                    ).strip()
+                # Store the system prompt snapshot in SQLite
+                if self._session_db:
+                    try:
+                        self._session_db.update_system_prompt(self.session_id, self._cached_system_prompt)
+                    except Exception as e:
+                        logger.debug("Session DB update_system_prompt failed: %s", e)

        active_system_prompt = self._cached_system_prompt

@@ -3244,11 +3277,13 @@ class AIAgent:
            # Build the final system message: cached prompt + ephemeral system prompt.
            # The ephemeral part is appended here (not baked into the cached prompt)
            # so it stays out of the session DB and logs.
+            # Note: Honcho context is baked into _cached_system_prompt on the first
+            # turn and stored in the session DB, so it does NOT need to be injected
+            # here.  This keeps the system message identical across all turns in a
+            # session, maximizing Anthropic prompt cache hits.
            effective_system = active_system_prompt or ""
            if self.ephemeral_system_prompt:
                effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip()
-            if self._honcho_context:
-                effective_system = (effective_system + "\n\n" + self._honcho_context).strip()
            if effective_system:
                api_messages = [{"role": "system", "content": effective_system}] + api_messages
            
--- a/tests/test_run_agent.py
+++ b/tests/test_run_agent.py
@@ -1040,3 +1040,136 @@ class TestMaxTokensParam:
        agent.base_url = "https://openrouter.ai/api/v1/api.openai.com"
        result = agent._max_tokens_param(4096)
        assert result == {"max_tokens": 4096}
+
+
+# ---------------------------------------------------------------------------
+# System prompt stability for prompt caching
+# ---------------------------------------------------------------------------
+
+class TestSystemPromptStability:
+    """Verify that the system prompt stays stable across turns for cache hits."""
+
+    def test_stored_prompt_reused_for_continuing_session(self, agent):
+        """When conversation_history is non-empty and session DB has a stored
+        prompt, it should be reused instead of rebuilding from disk."""
+        stored = "You are helpful. [stored from turn 1]"
+        mock_db = MagicMock()
+        mock_db.get_session.return_value = {"system_prompt": stored}
+        agent._session_db = mock_db
+
+        # Simulate a continuing session with history
+        history = [
+            {"role": "user", "content": "hello"},
+            {"role": "assistant", "content": "hi"},
+        ]
+
+        # First call — _cached_system_prompt is None, history is non-empty
+        agent._cached_system_prompt = None
+
+        # run_conversation is not called and nothing is patched here.  The
+        # prompt-caching block is reproduced inline below, so this checks a
+        # copy of that logic, which must be kept in sync with run_agent.py.
+        conversation_history = history
+
+        # The block under test (from run_conversation):
+        if agent._cached_system_prompt is None:
+            stored_prompt = None
+            if conversation_history and agent._session_db:
+                try:
+                    session_row = agent._session_db.get_session(agent.session_id)
+                    if session_row:
+                        stored_prompt = session_row.get("system_prompt") or None
+                except Exception:
+                    pass
+
+            if stored_prompt:
+                agent._cached_system_prompt = stored_prompt
+
+        assert agent._cached_system_prompt == stored
+        mock_db.get_session.assert_called_once_with(agent.session_id)
+
+    def test_fresh_build_when_no_history(self, agent):
+        """On the first turn (no history), system prompt should be built fresh."""
+        mock_db = MagicMock()
+        agent._session_db = mock_db
+
+        agent._cached_system_prompt = None
+        conversation_history = []
+
+        # The block under test:
+        if agent._cached_system_prompt is None:
+            stored_prompt = None
+            if conversation_history and agent._session_db:
+                session_row = agent._session_db.get_session(agent.session_id)
+                if session_row:
+                    stored_prompt = session_row.get("system_prompt") or None
+
+            if stored_prompt:
+                agent._cached_system_prompt = stored_prompt
+            else:
+                agent._cached_system_prompt = agent._build_system_prompt()
+
+        # Should have built fresh, not queried the DB
+        mock_db.get_session.assert_not_called()
+        assert agent._cached_system_prompt is not None
+        assert "Hermes Agent" in agent._cached_system_prompt
+
+    def test_fresh_build_when_db_has_no_prompt(self, agent):
+        """If the session DB has no stored prompt, build fresh even with history."""
+        mock_db = MagicMock()
+        mock_db.get_session.return_value = {"system_prompt": ""}
+        agent._session_db = mock_db
+
+        agent._cached_system_prompt = None
+        conversation_history = [{"role": "user", "content": "hi"}]
+
+        if agent._cached_system_prompt is None:
+            stored_prompt = None
+            if conversation_history and agent._session_db:
+                try:
+                    session_row = agent._session_db.get_session(agent.session_id)
+                    if session_row:
+                        stored_prompt = session_row.get("system_prompt") or None
+                except Exception:
+                    pass
+
+            if stored_prompt:
+                agent._cached_system_prompt = stored_prompt
+            else:
+                agent._cached_system_prompt = agent._build_system_prompt()
+
+        # Empty string is falsy, so should fall through to fresh build
+        assert "Hermes Agent" in agent._cached_system_prompt
+
+    def test_honcho_context_baked_into_prompt_on_first_turn(self, agent):
+        """Honcho context should be baked into _cached_system_prompt on
+        the first turn, not injected separately per API call."""
+        agent._honcho_context = "User prefers Python over JavaScript."
+        agent._cached_system_prompt = None
+
+        # Simulate first turn: build fresh and bake in Honcho
+        agent._cached_system_prompt = agent._build_system_prompt()
+        if agent._honcho_context:
+            agent._cached_system_prompt = (
+                agent._cached_system_prompt + "\n\n" + agent._honcho_context
+            ).strip()
+
+        assert "User prefers Python over JavaScript" in agent._cached_system_prompt
+
+    def test_honcho_prefetch_skipped_on_continuing_session(self):
+        """Honcho prefetch should not be called when conversation_history
+        is non-empty (continuing session)."""
+        conversation_history = [
+            {"role": "user", "content": "hello"},
+            {"role": "assistant", "content": "hi there"},
+        ]
+
+        # The guard: `not conversation_history` is False when history exists
+        should_prefetch = not conversation_history
+        assert should_prefetch is False
+
+    def test_honcho_prefetch_runs_on_first_turn(self):
+        """Honcho prefetch should run when conversation_history is empty."""
+        conversation_history = []
+        should_prefetch = not conversation_history
+        assert should_prefetch is True