fix: keep honcho recall out of cached system prefix (#1201)

Attach later-turn Honcho recall to the current-turn user message at API-call time instead of appending it to the system prompt. This keeps the cached system prefix stable while still making Honcho continuity context available for the turn. Also adds regression tests for the injection helper and for continued sessions, checking that Honcho recall stays out of the system prompt.
2026-03-13 21:07:00 -07:00
parent 07d70a0345
commit 938e887b4c
2 changed files with 87 additions and 7 deletions
--- a/run_agent.py
+++ b/run_agent.py
@@ -202,6 +202,32 @@ _NEVER_PARALLEL_TOOLS = frozenset({"clarify"})
 _MAX_TOOL_WORKERS = 8


+def _inject_honcho_turn_context(content, turn_context: str):
+    """Append Honcho recall to the current-turn user message without mutating history.
+
+    The returned content is sent to the API for this turn only. Keeping Honcho
+    recall out of the system prompt preserves the stable cache prefix while
+    still giving the model continuity context.
+    """
+    if not turn_context:
+        return content
+
+    note = (
+        "[System note: The following Honcho memory was retrieved from prior "
+        "sessions. It is continuity context for this turn only, not new user "
+        "input.]\n\n"
+        f"{turn_context}"
+    )
+
+    if isinstance(content, list):
+        return list(content) + [{"type": "text", "text": note}]
+
+    text = "" if content is None else str(content)
+    if not text.strip():
+        return note
+    return f"{text}\n\n{note}"
+
+
 class AIAgent:
    """
    AI Agent with tool calling capabilities.
@@ -3909,10 +3935,11 @@ class AIAgent:

        # Honcho prefetch consumption:
        # - First turn: bake into cached system prompt (stable for the session).
-        # - Later turns: inject as ephemeral system context for this API call only.
+        # - Later turns: attach recall to the current-turn user message at
+        #   API-call time only (never persisted to history / session DB).
        #
-        # This keeps the persisted/cached prompt stable while still allowing
-        # turn N to consume background prefetch results from turn N-1.
+        # This keeps the system-prefix cache stable while still allowing turn N
+        # to consume background prefetch results from turn N-1.
        self._honcho_context = ""
        self._honcho_turn_context = ""
        _recall_mode = (self._honcho_config.recall_mode if self._honcho_config else "hybrid")
@@ -3930,6 +3957,7 @@ class AIAgent:
        # Add user message
        user_msg = {"role": "user", "content": user_message}
        messages.append(user_msg)
+        current_turn_user_idx = len(messages) - 1
        
        if not self.quiet_mode:
            print(f"💬 Starting conversation: '{user_message[:60]}{'...' if len(user_message) > 60 else ''}'")
@@ -4079,9 +4107,14 @@ class AIAgent:
            # However, providers like Moonshot AI require a separate 'reasoning_content' field
            # on assistant messages with tool_calls. We handle both cases here.
            api_messages = []
-            for msg in messages:
+            for idx, msg in enumerate(messages):
                api_msg = msg.copy()

+                if idx == current_turn_user_idx and msg.get("role") == "user" and self._honcho_turn_context:
+                    api_msg["content"] = _inject_honcho_turn_context(
+                        api_msg.get("content", ""), self._honcho_turn_context
+                    )
+
                # For ALL assistant messages, pass reasoning back to the API
                # This ensures multi-turn reasoning context is preserved
                if msg.get("role") == "assistant":
@@ -4109,11 +4142,11 @@ class AIAgent:

            # Build the final system message: cached prompt + ephemeral system prompt.
            # Ephemeral additions are API-call-time only (not persisted to session DB).
+            # Honcho later-turn recall is intentionally kept OUT of the system prompt
+            # so the stable cache prefix remains unchanged.
            effective_system = active_system_prompt or ""
            if self.ephemeral_system_prompt:
                effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip()
-            if self._honcho_turn_context:
-                effective_system = (effective_system + "\n\n" + self._honcho_turn_context).strip()
            if effective_system:
                api_messages = [{"role": "system", "content": effective_system}] + api_messages