Files
hermes-agent/tests/test_surrogate_sanitization.py
Teknium 857a5d7b47 fix: sanitize surrogate characters from clipboard paste to prevent UnicodeEncodeError (#3624)
Pasting text from rich-text editors (Google Docs, Word, etc.) can inject
lone surrogate characters (U+D800..U+DFFF) that are invalid UTF-8.
The OpenAI SDK serializes messages with ensure_ascii=False, then encodes
to UTF-8 for the HTTP body — surrogates crash this with:
  UnicodeEncodeError: 'utf-8' codec can't encode character '\udce2'

Three-layer fix:
1. Primary: sanitize user_message at the top of run_conversation()
2. CLI: sanitize in chat() before appending to conversation_history
3. Safety net: catch UnicodeEncodeError in the API error handler,
   sanitize the entire messages list in-place, and retry once.
   Also exclude UnicodeEncodeError from is_local_validation_error
   so it doesn't get classified as non-retryable.

Includes 14 new tests covering the sanitization helpers and the
integration with run_conversation().
2026-03-28 16:53:14 -07:00

155 lines
6.0 KiB
Python

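"""Tests for surrogate character sanitization in user input.

Lone surrogates (U+D800..U+DFFF) cannot be encoded as UTF-8. json.dumps()
with ensure_ascii=False passes them through unchanged, so the subsequent
UTF-8 encoding of the request body inside the OpenAI SDK raises
UnicodeEncodeError. They can appear via clipboard paste from rich-text
editors like Google Docs.
"""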
import json
import pytest
from unittest.mock import MagicMock, patch
from run_agent import (
_sanitize_surrogates,
_sanitize_messages_surrogates,
_SURROGATE_RE,
)
class TestSanitizeSurrogates:
    """Test the _sanitize_surrogates() helper."""

    def test_normal_text_unchanged(self):
        """Text without surrogates (accents, CJK, emoji) comes back unchanged."""
        text = "Hello, this is normal text with unicode: café ñ 日本語 🎉"
        assert _sanitize_surrogates(text) == text

    def test_empty_string(self):
        """The empty string is returned as the empty string."""
        assert _sanitize_surrogates("") == ""

    def test_single_surrogate_replaced(self):
        """A lone surrogate is replaced by U+FFFD; surrounding text is kept."""
        result = _sanitize_surrogates("Hello \udce2 world")
        assert result == "Hello \ufffd world"

    def test_multiple_surrogates_replaced(self):
        """Every surrogate in the string is replaced, one U+FFFD per surrogate."""
        result = _sanitize_surrogates("a\ud800b\udc00c\udfff")
        assert result == "a\ufffdb\ufffdc\ufffd"

    def test_all_surrogate_range(self):
        """Verify code points across the whole surrogate range are caught."""
        for cp in [0xD800, 0xD900, 0xDA00, 0xDB00, 0xDC00, 0xDD00, 0xDE00, 0xDF00, 0xDFFF]:
            text = f"test{chr(cp)}end"
            result = _sanitize_surrogates(text)
            # Compare the exact output: merely checking that U+FFFD is present
            # would still pass if the surrogate itself were left in place.
            assert result == "test\ufffdend", f"Surrogate U+{cp:04X} not caught"

    def test_result_is_json_serializable(self):
        """Sanitized text must survive json.dumps + utf-8 encoding."""
        dirty = "data \udce2\udcb0 from clipboard"
        clean = _sanitize_surrogates(dirty)
        serialized = json.dumps({"content": clean}, ensure_ascii=False)
        # Must not raise UnicodeEncodeError
        encoded = serialized.encode("utf-8")
        # The encoded body must decode back to the sanitized text.
        assert json.loads(encoded.decode("utf-8"))["content"] == clean

    def test_original_surrogates_fail_encoding(self):
        """Confirm the original bug: surrogates crash utf-8 encoding."""
        dirty = "data \udce2 from clipboard"
        # json.dumps itself succeeds with ensure_ascii=False; the failure
        # only happens when the resulting string is encoded to UTF-8.
        serialized = json.dumps({"content": dirty}, ensure_ascii=False)
        with pytest.raises(UnicodeEncodeError):
            serialized.encode("utf-8")
class TestSanitizeMessagesSurrogates:
    """Test the _sanitize_messages_surrogates() helper for message lists."""

    def test_clean_messages_returns_false(self):
        """A list with no surrogates reports that nothing was changed."""
        msgs = [
            {"role": "user", "content": "all clean"},
            {"role": "assistant", "content": "me too"},
        ]
        assert _sanitize_messages_surrogates(msgs) is False

    def test_dirty_string_content_sanitized(self):
        """String content is sanitized in place and the helper returns True."""
        msgs = [
            {"role": "user", "content": "text with \udce2 surrogate"},
        ]
        assert _sanitize_messages_surrogates(msgs) is True
        assert "\ufffd" in msgs[0]["content"]
        assert "\udce2" not in msgs[0]["content"]

    def test_dirty_multimodal_content_sanitized(self):
        """Text parts inside list-style (multimodal) content are sanitized."""
        msgs = [
            {"role": "user", "content": [
                {"type": "text", "text": "multimodal \udce2 content"},
                {"type": "image_url", "image_url": {"url": "http://example.com"}},
            ]},
        ]
        assert _sanitize_messages_surrogates(msgs) is True
        assert "\ufffd" in msgs[0]["content"][0]["text"]
        assert "\udce2" not in msgs[0]["content"][0]["text"]

    def test_mixed_clean_and_dirty(self):
        """Only the dirty message is modified; clean neighbours are untouched."""
        msgs = [
            {"role": "user", "content": "clean text"},
            {"role": "user", "content": "dirty \udce2 text"},
            {"role": "assistant", "content": "clean response"},
        ]
        assert _sanitize_messages_surrogates(msgs) is True
        assert msgs[0]["content"] == "clean text"
        assert "\ufffd" in msgs[1]["content"]
        # Same check as the single-message test: the surrogate must be gone,
        # not merely accompanied by a replacement character.
        assert "\udce2" not in msgs[1]["content"]
        assert msgs[2]["content"] == "clean response"

    def test_non_dict_items_skipped(self):
        """Non-dict entries in the list do not raise and do not count as dirty."""
        msgs = ["not a dict", {"role": "user", "content": "ok"}]
        assert _sanitize_messages_surrogates(msgs) is False

    def test_tool_messages_sanitized(self):
        """Tool results could also contain surrogates from file reads etc."""
        msgs = [
            {"role": "tool", "content": "result with \udce2 data", "tool_call_id": "x"},
        ]
        assert _sanitize_messages_surrogates(msgs) is True
        assert "\ufffd" in msgs[0]["content"]
        # Consistent with the user-message test: verify the surrogate was removed.
        assert "\udce2" not in msgs[0]["content"]
class TestRunConversationSurrogateSanitization:
    """Integration: verify run_conversation sanitizes user_message."""

    @patch("run_agent.AIAgent._build_system_prompt")
    @patch("run_agent.AIAgent._interruptible_streaming_api_call")
    @patch("run_agent.AIAgent._interruptible_api_call")
    def test_user_message_surrogates_sanitized(self, mock_api, mock_stream, mock_sys):
        """Surrogates in user_message are replaced before being stored.

        Both API-call paths and the system-prompt builder are patched, so the
        test only checks the messages returned by run_conversation().
        """
        from run_agent import AIAgent

        mock_sys.return_value = "system prompt"

        # Mock streaming to return a simple response
        mock_choice = MagicMock()
        mock_choice.message.content = "response"
        mock_choice.message.tool_calls = None
        mock_choice.message.refusal = None
        mock_choice.finish_reason = "stop"
        mock_choice.message.reasoning_content = None

        mock_response = MagicMock()
        mock_response.choices = [mock_choice]
        mock_response.usage = MagicMock(prompt_tokens=10, completion_tokens=5, total_tokens=15)
        mock_response.model = "test-model"
        mock_response.id = "test-id"

        # Same canned response whichever call path the agent takes.
        mock_stream.return_value = mock_response
        mock_api.return_value = mock_response

        agent = AIAgent(model="test/model", quiet_mode=True, skip_memory=True, skip_context_files=True)
        agent.client = MagicMock()

        # Pass a message with surrogates
        result = agent.run_conversation(
            user_message="test \udce2 message",
            conversation_history=[],
        )

        # The message stored in history should have surrogates replaced
        user_messages = [
            msg for msg in result.get("messages", [])
            if msg.get("role") == "user"
        ]
        # Without this guard the loop below passes vacuously when the result
        # has no "messages" key or no user-role entries.
        assert user_messages, "No user message found in returned messages"
        for msg in user_messages:
            assert "\udce2" not in msg["content"], "Surrogate leaked into stored message"
            assert "\ufffd" in msg["content"], "Replacement char not in stored message"