Merge PR #395: fix(gateway): use filtered history length for transcript message extraction

Authored by PercyDikec. Fixes #394. The transcript extraction used len(history) to find new messages, but history includes session_meta entries stripped before reaching the agent. This caused 1 message lost per turn from turn 2 onwards. Fix returns history_offset (filtered length) from _run_agent and uses it for the slice.
2026-03-04 16:25:09 -08:00
parent 4c72329412 d3504f84af
commit bd3025d669
2 changed files with 274 additions and 3 deletions
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -945,9 +945,12 @@ class GatewayRunner:
                    }
                )
            
-            # Find only the NEW messages from this turn (skip history we loaded)
-            history_len = len(history)
-            new_messages = agent_messages[history_len:] if len(agent_messages) > history_len else agent_messages
+            # Find only the NEW messages from this turn (skip history we loaded).
+            # Use the filtered history length (history_offset) that was actually
+            # passed to the agent, not len(history) which includes session_meta
+            # entries that were stripped before the agent saw them.
+            history_len = agent_result.get("history_offset", len(history))
+            new_messages = agent_messages[history_len:] if len(agent_messages) > history_len else []
            
            # If no new messages found (edge case), fall back to simple user/assistant
            if not new_messages:
@@ -2070,6 +2073,7 @@ class GatewayRunner:
                "messages": result_holder[0].get("messages", []) if result_holder[0] else [],
                "api_calls": result_holder[0].get("api_calls", 0) if result_holder[0] else 0,
                "tools": tools_holder[0] or [],
+                "history_offset": len(agent_history),
            }
        
        # Start progress message sender if enabled
--- a/tests/gateway/test_transcript_offset.py
+++ b/tests/gateway/test_transcript_offset.py
@@ -0,0 +1,267 @@
+"""Tests for transcript history offset fix.
+
+Regression tests for a bug where the gateway transcript lost 1 message
+per turn from turn 2 onwards.  The raw transcript history includes
+``session_meta`` entries that are filtered out before being passed to
+the agent.  The agent returns messages built from this filtered history
+plus new messages from the current turn.
+
+The old code used ``len(history)`` (raw count, includes session_meta)
+to slice ``agent_messages``, which caused the slice to skip valid new
+messages.  The fix adds ``history_offset`` (the filtered history length)
+to ``_run_agent``'s return dict and uses it for the slice.
+"""
+
+import pytest
+
+
+# ---------------------------------------------------------------------------
+# Helpers - replicate the filtering logic from _run_agent
+# ---------------------------------------------------------------------------
+
+def _filter_history(history: list) -> list:
+    """Replicate the agent_history filtering from GatewayRunner._run_agent.
+
+    Strips session_meta and system messages, exactly as the real code does.
+    """
+    agent_history = []
+    for msg in history:
+        role = msg.get("role")
+        if not role:
+            continue
+        if role in ("session_meta",):
+            continue
+        if role == "system":
+            continue
+
+        has_tool_calls = "tool_calls" in msg
+        has_tool_call_id = "tool_call_id" in msg
+        is_tool_message = role == "tool"
+
+        if has_tool_calls or has_tool_call_id or is_tool_message:
+            clean_msg = {k: v for k, v in msg.items() if k != "timestamp"}
+            agent_history.append(clean_msg)
+        else:
+            content = msg.get("content")
+            if content:
+                agent_history.append({"role": role, "content": content})
+    return agent_history
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+class TestTranscriptHistoryOffset:
+    """Verify the transcript extraction uses the filtered history length."""
+
+    def test_session_meta_causes_offset_mismatch(self):
+        """Turn 2: session_meta makes len(history) > len(agent_history).
+
+        - history (raw): 1 session_meta + 2 conversation = 3 entries
+        - agent_history (filtered): 2 entries
+        - Agent returns 2 old + 2 new = 4 messages
+        - OLD: agent_messages[3:] = 1 message (lost the user message)
+        - FIX: agent_messages[2:] = 2 messages (correct)
+        """
+        history = [
+            {"role": "session_meta", "tools": [], "model": "gpt-4",
+             "platform": "telegram", "timestamp": "t0"},
+            {"role": "user", "content": "Hello", "timestamp": "t1"},
+            {"role": "assistant", "content": "Hi there!", "timestamp": "t1"},
+        ]
+
+        agent_history = _filter_history(history)
+        assert len(agent_history) == 2  # session_meta stripped
+
+        # Agent returns: filtered history (2) + new turn (2)
+        agent_messages = [
+            {"role": "user", "content": "Hello"},
+            {"role": "assistant", "content": "Hi there!"},
+            {"role": "user", "content": "What is Python?"},
+            {"role": "assistant", "content": "A programming language."},
+        ]
+
+        # OLD behavior: len(history) = 3, skips too many
+        old_offset = len(history)
+        old_new = (agent_messages[old_offset:]
+                   if len(agent_messages) > old_offset
+                   else agent_messages)
+        assert len(old_new) == 1  # BUG: lost the user message
+
+        # FIXED behavior: history_offset = 2
+        history_offset = len(agent_history)
+        fixed_new = (agent_messages[history_offset:]
+                     if len(agent_messages) > history_offset
+                     else [])
+        assert len(fixed_new) == 2
+        assert fixed_new[0]["content"] == "What is Python?"
+        assert fixed_new[1]["content"] == "A programming language."
+
+    def test_no_session_meta_same_result(self):
+        """First turn has no session_meta, so both approaches agree."""
+        history = []
+        agent_history = _filter_history(history)
+        assert len(agent_history) == 0
+
+        agent_messages = [
+            {"role": "user", "content": "Hello"},
+            {"role": "assistant", "content": "Hi!"},
+        ]
+
+        old_new = (agent_messages[len(history):]
+                   if len(agent_messages) > len(history)
+                   else agent_messages)
+        fixed_new = (agent_messages[len(agent_history):]
+                     if len(agent_messages) > len(agent_history)
+                     else [])
+
+        assert old_new == fixed_new
+        assert len(fixed_new) == 2
+
+    def test_multiple_session_meta_larger_drift(self):
+        """Two session_meta entries double the offset error.
+
+        This can happen when the session spans tool definition changes
+        or model switches that each write a new session_meta record.
+        """
+        history = [
+            {"role": "session_meta", "tools": [], "timestamp": "t0"},
+            {"role": "user", "content": "msg1", "timestamp": "t1"},
+            {"role": "assistant", "content": "reply1", "timestamp": "t1"},
+            {"role": "session_meta", "tools": ["new_tool"], "timestamp": "t2"},
+            {"role": "user", "content": "msg2", "timestamp": "t3"},
+            {"role": "assistant", "content": "reply2", "timestamp": "t3"},
+        ]
+
+        agent_history = _filter_history(history)
+        assert len(agent_history) == 4
+        assert len(history) == 6  # 2 extra session_meta entries
+
+        # Agent returns 4 old + 2 new = 6 total
+        agent_messages = [
+            {"role": "user", "content": "msg1"},
+            {"role": "assistant", "content": "reply1"},
+            {"role": "user", "content": "msg2"},
+            {"role": "assistant", "content": "reply2"},
+            {"role": "user", "content": "msg3"},
+            {"role": "assistant", "content": "reply3"},
+        ]
+
+        # OLD: len(history) == len(agent_messages) == 6 -> else branch
+        old_offset = len(history)
+        old_new = (agent_messages[old_offset:]
+                   if len(agent_messages) > old_offset
+                   else agent_messages)
+        # BUG: treats ALL messages as new (duplicates entire history)
+        assert old_new == agent_messages
+
+        # FIXED: history_offset = 4
+        fixed_new = (agent_messages[len(agent_history):]
+                     if len(agent_messages) > len(agent_history)
+                     else [])
+        assert len(fixed_new) == 2
+        assert fixed_new[0]["content"] == "msg3"
+        assert fixed_new[1]["content"] == "reply3"
+
+    def test_system_messages_also_filtered(self):
+        """system messages in history are also stripped from agent_history."""
+        history = [
+            {"role": "session_meta", "tools": [], "timestamp": "t0"},
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Hi", "timestamp": "t1"},
+            {"role": "assistant", "content": "Hello!", "timestamp": "t1"},
+        ]
+
+        agent_history = _filter_history(history)
+        assert len(agent_history) == 2  # only user + assistant
+
+        agent_messages = [
+            {"role": "user", "content": "Hi"},
+            {"role": "assistant", "content": "Hello!"},
+            {"role": "user", "content": "New question"},
+            {"role": "assistant", "content": "New answer"},
+        ]
+
+        # OLD: len(history) = 4, skips everything
+        old_offset = len(history)
+        old_new = (agent_messages[old_offset:]
+                   if len(agent_messages) > old_offset
+                   else agent_messages)
+        assert old_new == agent_messages  # BUG: all treated as new
+
+        # FIXED
+        fixed_new = (agent_messages[len(agent_history):]
+                     if len(agent_messages) > len(agent_history)
+                     else [])
+        assert len(fixed_new) == 2
+        assert fixed_new[0]["content"] == "New question"
+
+    def test_else_branch_returns_empty_list(self):
+        """When agent has fewer messages than offset, return [] not all.
+
+        The old code had ``else agent_messages`` which would treat the
+        entire message list as new when the agent compressed or dropped
+        messages.  The fix changes this to ``else []``, falling through
+        to the simple user/assistant fallback path.
+        """
+        history = [
+            {"role": "session_meta", "tools": [], "timestamp": "t0"},
+            {"role": "user", "content": "Hello", "timestamp": "t1"},
+            {"role": "assistant", "content": "Hi!", "timestamp": "t1"},
+        ]
+
+        # Agent compressed and returned fewer messages than history
+        agent_messages = [
+            {"role": "user", "content": "Hello"},
+            {"role": "assistant", "content": "Hi!"},
+        ]
+
+        history_offset = len(_filter_history(history))  # 2
+        new_messages = (agent_messages[history_offset:]
+                        if len(agent_messages) > history_offset
+                        else [])
+        # 2 == 2, so no new messages - falls to fallback
+        assert new_messages == []
+
+    def test_tool_call_messages_preserved_in_filter(self):
+        """Tool call messages pass through the filter, keeping offset correct."""
+        history = [
+            {"role": "session_meta", "tools": [], "timestamp": "t0"},
+            {"role": "user", "content": "Search for cats", "timestamp": "t1"},
+            {"role": "assistant", "content": None, "timestamp": "t1",
+             "tool_calls": [{"id": "tc1", "function": {"name": "web_search"}}]},
+            {"role": "tool", "tool_call_id": "tc1",
+             "content": "Results about cats", "timestamp": "t1"},
+            {"role": "assistant", "content": "Here are results.",
+             "timestamp": "t1"},
+        ]
+
+        agent_history = _filter_history(history)
+        # session_meta filtered, but tool_calls/tool messages kept
+        assert len(agent_history) == 4
+        assert len(history) == 5  # 1 session_meta extra
+
+        agent_messages = [
+            {"role": "user", "content": "Search for cats"},
+            {"role": "assistant", "content": None,
+             "tool_calls": [{"id": "tc1", "function": {"name": "web_search"}}]},
+            {"role": "tool", "tool_call_id": "tc1", "content": "Results about cats"},
+            {"role": "assistant", "content": "Here are results."},
+            {"role": "user", "content": "Now search for dogs"},
+            {"role": "assistant", "content": "Dog results here."},
+        ]
+
+        # OLD: len(history) = 5, agent_messages[5:] = 1 message (lost user msg)
+        old_new = (agent_messages[len(history):]
+                   if len(agent_messages) > len(history)
+                   else agent_messages)
+        assert len(old_new) == 1  # BUG
+
+        # FIXED
+        fixed_new = (agent_messages[len(agent_history):]
+                     if len(agent_messages) > len(agent_history)
+                     else [])
+        assert len(fixed_new) == 2
+        assert fixed_new[0]["content"] == "Now search for dogs"
+        assert fixed_new[1]["content"] == "Dog results here."