The transcript extraction used len(history) to find new messages, but history includes session_meta entries that are stripped before passing to the agent. This mismatch caused 1 message to be lost from the transcript on every turn after the first, because the slice offset was too high. Use the filtered history length (history_offset) returned by _run_agent instead. Also changed the else branch from returning all agent_messages to returning an empty list, so compressed/shorter agent output does not duplicate the entire history into the transcript.
268 lines
11 KiB
Python
268 lines
11 KiB
Python
"""Tests for transcript history offset fix.
|
|
|
|
Regression tests for a bug where the gateway transcript lost 1 message
|
|
per turn from turn 2 onwards. The raw transcript history includes
|
|
``session_meta`` entries that are filtered out before being passed to
|
|
the agent. The agent returns messages built from this filtered history
|
|
plus new messages from the current turn.
|
|
|
|
The old code used ``len(history)`` (raw count, includes session_meta)
|
|
to slice ``agent_messages``, which caused the slice to skip valid new
|
|
messages. The fix adds ``history_offset`` (the filtered history length)
|
|
to ``_run_agent``'s return dict and uses it for the slice.
|
|
"""
|
|
|
|
import pytest
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers - replicate the filtering logic from _run_agent
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _filter_history(history: list) -> list:
|
|
"""Replicate the agent_history filtering from GatewayRunner._run_agent.
|
|
|
|
Strips session_meta and system messages, exactly as the real code does.
|
|
"""
|
|
agent_history = []
|
|
for msg in history:
|
|
role = msg.get("role")
|
|
if not role:
|
|
continue
|
|
if role in ("session_meta",):
|
|
continue
|
|
if role == "system":
|
|
continue
|
|
|
|
has_tool_calls = "tool_calls" in msg
|
|
has_tool_call_id = "tool_call_id" in msg
|
|
is_tool_message = role == "tool"
|
|
|
|
if has_tool_calls or has_tool_call_id or is_tool_message:
|
|
clean_msg = {k: v for k, v in msg.items() if k != "timestamp"}
|
|
agent_history.append(clean_msg)
|
|
else:
|
|
content = msg.get("content")
|
|
if content:
|
|
agent_history.append({"role": role, "content": content})
|
|
return agent_history
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestTranscriptHistoryOffset:
    """Verify the transcript extraction uses the filtered history length.

    Each test builds a raw ``history`` (as stored in the transcript), the
    ``agent_messages`` list an agent would return, and then compares the
    old slicing expression (offset ``len(history)``, ``else agent_messages``)
    with the fixed one (offset ``len(_filter_history(history))``, ``else []``).
    The slicing expressions are written inline on purpose so they read like
    the code under discussion.
    """

    def test_session_meta_causes_offset_mismatch(self):
        """Turn 2: session_meta makes len(history) > len(agent_history).

        - history (raw): 1 session_meta + 2 conversation = 3 entries
        - agent_history (filtered): 2 entries
        - Agent returns 2 old + 2 new = 4 messages
        - OLD: agent_messages[3:] = 1 message (lost the user message)
        - FIX: agent_messages[2:] = 2 messages (correct)
        """
        history = [
            {"role": "session_meta", "tools": [], "model": "gpt-4",
             "platform": "telegram", "timestamp": "t0"},
            {"role": "user", "content": "Hello", "timestamp": "t1"},
            {"role": "assistant", "content": "Hi there!", "timestamp": "t1"},
        ]

        agent_history = _filter_history(history)
        assert len(agent_history) == 2  # session_meta stripped

        # Agent returns: filtered history (2) + new turn (2)
        agent_messages = [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi there!"},
            {"role": "user", "content": "What is Python?"},
            {"role": "assistant", "content": "A programming language."},
        ]

        # OLD behavior: len(history) = 3, skips too many
        old_offset = len(history)
        old_new = (agent_messages[old_offset:]
                   if len(agent_messages) > old_offset
                   else agent_messages)
        assert len(old_new) == 1  # BUG: lost the user message

        # FIXED behavior: history_offset = 2
        history_offset = len(agent_history)
        fixed_new = (agent_messages[history_offset:]
                     if len(agent_messages) > history_offset
                     else [])
        assert len(fixed_new) == 2
        assert fixed_new[0]["content"] == "What is Python?"
        assert fixed_new[1]["content"] == "A programming language."

    def test_no_session_meta_same_result(self):
        """First turn has no session_meta, so both approaches agree."""
        history = []
        agent_history = _filter_history(history)
        assert len(agent_history) == 0

        agent_messages = [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi!"},
        ]

        old_new = (agent_messages[len(history):]
                   if len(agent_messages) > len(history)
                   else agent_messages)
        fixed_new = (agent_messages[len(agent_history):]
                     if len(agent_messages) > len(agent_history)
                     else [])

        assert old_new == fixed_new
        assert len(fixed_new) == 2

    def test_multiple_session_meta_larger_drift(self):
        """Two session_meta entries double the offset error.

        This can happen when the session spans tool definition changes
        or model switches that each write a new session_meta record.
        """
        history = [
            {"role": "session_meta", "tools": [], "timestamp": "t0"},
            {"role": "user", "content": "msg1", "timestamp": "t1"},
            {"role": "assistant", "content": "reply1", "timestamp": "t1"},
            {"role": "session_meta", "tools": ["new_tool"], "timestamp": "t2"},
            {"role": "user", "content": "msg2", "timestamp": "t3"},
            {"role": "assistant", "content": "reply2", "timestamp": "t3"},
        ]

        agent_history = _filter_history(history)
        assert len(agent_history) == 4
        assert len(history) == 6  # 2 extra session_meta entries

        # Agent returns 4 old + 2 new = 6 total
        agent_messages = [
            {"role": "user", "content": "msg1"},
            {"role": "assistant", "content": "reply1"},
            {"role": "user", "content": "msg2"},
            {"role": "assistant", "content": "reply2"},
            {"role": "user", "content": "msg3"},
            {"role": "assistant", "content": "reply3"},
        ]

        # OLD: len(history) == len(agent_messages) == 6 -> else branch
        old_offset = len(history)
        old_new = (agent_messages[old_offset:]
                   if len(agent_messages) > old_offset
                   else agent_messages)
        # BUG: treats ALL messages as new (duplicates entire history)
        assert old_new == agent_messages

        # FIXED: history_offset = 4
        fixed_new = (agent_messages[len(agent_history):]
                     if len(agent_messages) > len(agent_history)
                     else [])
        assert len(fixed_new) == 2
        assert fixed_new[0]["content"] == "msg3"
        assert fixed_new[1]["content"] == "reply3"

    def test_system_messages_also_filtered(self):
        """system messages in history are also stripped from agent_history."""
        history = [
            {"role": "session_meta", "tools": [], "timestamp": "t0"},
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Hi", "timestamp": "t1"},
            {"role": "assistant", "content": "Hello!", "timestamp": "t1"},
        ]

        agent_history = _filter_history(history)
        assert len(agent_history) == 2  # only user + assistant

        agent_messages = [
            {"role": "user", "content": "Hi"},
            {"role": "assistant", "content": "Hello!"},
            {"role": "user", "content": "New question"},
            {"role": "assistant", "content": "New answer"},
        ]

        # OLD: len(history) == len(agent_messages) == 4 -> else branch
        old_offset = len(history)
        old_new = (agent_messages[old_offset:]
                   if len(agent_messages) > old_offset
                   else agent_messages)
        assert old_new == agent_messages  # BUG: all treated as new

        # FIXED: history_offset = 2
        fixed_new = (agent_messages[len(agent_history):]
                     if len(agent_messages) > len(agent_history)
                     else [])
        assert len(fixed_new) == 2
        assert fixed_new[0]["content"] == "New question"
        assert fixed_new[1]["content"] == "New answer"

    def test_else_branch_returns_empty_list(self):
        """When agent has no more messages than the offset, return [] not all.

        The old code had ``else agent_messages`` which would treat the
        entire message list as new when the agent compressed or dropped
        messages. The fix changes this to ``else []``, falling through
        to the simple user/assistant fallback path.
        """
        history = [
            {"role": "session_meta", "tools": [], "timestamp": "t0"},
            {"role": "user", "content": "Hello", "timestamp": "t1"},
            {"role": "assistant", "content": "Hi!", "timestamp": "t1"},
        ]

        # Agent returned fewer messages than the raw history (2 vs 3),
        # i.e. exactly as many as the filtered history.
        agent_messages = [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi!"},
        ]

        history_offset = len(_filter_history(history))  # 2
        new_messages = (agent_messages[history_offset:]
                        if len(agent_messages) > history_offset
                        else [])
        # 2 == 2, so no new messages - falls to fallback
        assert new_messages == []

        # Strictly fewer messages than the offset (1 < 2) must also yield [].
        shorter_messages = agent_messages[:1]
        shorter_new = (shorter_messages[history_offset:]
                       if len(shorter_messages) > history_offset
                       else [])
        assert shorter_new == []

    def test_tool_call_messages_preserved_in_filter(self):
        """Tool call messages pass through the filter, keeping offset correct."""
        history = [
            {"role": "session_meta", "tools": [], "timestamp": "t0"},
            {"role": "user", "content": "Search for cats", "timestamp": "t1"},
            {"role": "assistant", "content": None, "timestamp": "t1",
             "tool_calls": [{"id": "tc1", "function": {"name": "web_search"}}]},
            {"role": "tool", "tool_call_id": "tc1",
             "content": "Results about cats", "timestamp": "t1"},
            {"role": "assistant", "content": "Here are results.",
             "timestamp": "t1"},
        ]

        agent_history = _filter_history(history)
        # session_meta filtered, but tool_calls/tool messages kept
        assert len(agent_history) == 4
        assert len(history) == 5  # 1 session_meta extra

        agent_messages = [
            {"role": "user", "content": "Search for cats"},
            {"role": "assistant", "content": None,
             "tool_calls": [{"id": "tc1", "function": {"name": "web_search"}}]},
            {"role": "tool", "tool_call_id": "tc1", "content": "Results about cats"},
            {"role": "assistant", "content": "Here are results."},
            {"role": "user", "content": "Now search for dogs"},
            {"role": "assistant", "content": "Dog results here."},
        ]

        # OLD: len(history) = 5, agent_messages[5:] = 1 message (lost user msg)
        old_new = (agent_messages[len(history):]
                   if len(agent_messages) > len(history)
                   else agent_messages)
        assert len(old_new) == 1  # BUG

        # FIXED: history_offset = 4
        fixed_new = (agent_messages[len(agent_history):]
                     if len(agent_messages) > len(agent_history)
                     else [])
        assert len(fixed_new) == 2
        assert fixed_new[0]["content"] == "Now search for dogs"
        assert fixed_new[1]["content"] == "Dog results here."
|