From bf52468a913ebbdea89bb20ad979bfa610631d82 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sat, 28 Feb 2026 16:49:49 -0800 Subject: [PATCH] fix(gateway): improve MEDIA tag handling to prevent duplication across turns Refactor the extraction of MEDIA paths to collect them from the history before processing the current turn's messages. This change ensures that MEDIA tags are deduplicated based on previously seen paths, preventing TTS voice messages from being re-attached in subsequent replies. This addresses the issue outlined in #160. --- gateway/run.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index cf2188a9e..ccd02bc5f 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -1739,8 +1739,18 @@ class GatewayRunner: content = f"[Delivered from {mirror_src}] {content}" agent_history.append({"role": role, "content": content}) - # Track history length to only scan NEW messages for MEDIA tags - history_len = len(agent_history) + # Collect MEDIA paths already in history so we can exclude them + # from the current turn's extraction. This is compression-safe: + # even if the message list shrinks, we know which paths are old. + _history_media_paths: set = set() + for _hm in agent_history: + if _hm.get("role") in ("tool", "function"): + _hc = _hm.get("content", "") + if "MEDIA:" in _hc: + for _match in re.finditer(r'MEDIA:(\S+)', _hc): + _p = _match.group(1).strip().rstrip('",}') + if _p: + _history_media_paths.add(_p) result = agent.run_conversation(message, conversation_history=agent_history) result_holder[0] = result @@ -1763,28 +1773,24 @@ class GatewayRunner: # append any that aren't already present in the final response, so the # adapter's extract_media() can find and deliver the files exactly once. # - # IMPORTANT: Only scan messages from the CURRENT turn (after history_len), - # not the full history. This prevents TTS voice messages from earlier - # turns being re-attached to every subsequent reply. (Fixes #160) + # Uses path-based deduplication against _history_media_paths (collected + # before run_conversation) instead of index slicing. This is safe even + # when context compression shrinks the message list. (Fixes #160) if "MEDIA:" not in final_response: media_tags = [] has_voice_directive = False - all_messages = result.get("messages", []) - # Only process new messages from this turn - new_messages = all_messages[history_len:] if len(all_messages) > history_len else [] - for msg in new_messages: - if msg.get("role") == "tool" or msg.get("role") == "function": + for msg in result.get("messages", []): + if msg.get("role") in ("tool", "function"): content = msg.get("content", "") if "MEDIA:" in content: for match in re.finditer(r'MEDIA:(\S+)', content): path = match.group(1).strip().rstrip('",}') - if path: + if path and path not in _history_media_paths: media_tags.append(f"MEDIA:{path}") if "[[audio_as_voice]]" in content: has_voice_directive = True if media_tags: - # Deduplicate while preserving order seen = set() unique_tags = [] for tag in media_tags: