# Matches any surrogate code point (U+D800..U+DFFF).  A Python ``str`` can
# hold these, but they cannot be encoded as UTF-8, so encoding a request body
# that contains one raises UnicodeEncodeError.  Reported source: clipboard
# paste from Google Docs or other rich-text editors on some platforms.
_SURROGATE_RE = re.compile(r'[\ud800-\udfff]')


def _sanitize_surrogates(text: str) -> str:
    """Replace every surrogate code point in *text* with U+FFFD.

    Args:
        text: The string to clean.

    Returns:
        *text* itself (the same object) when it contains no surrogates,
        otherwise a new string in which each surrogate code point has been
        replaced by the Unicode replacement character.
    """
    # Search first so the common clean case returns without building a copy.
    if _SURROGATE_RE.search(text):
        return _SURROGATE_RE.sub('\ufffd', text)
    return text


def _sanitize_container_surrogates(container, seen: set) -> bool:
    """Replace surrogates in every string value nested inside *container*.

    Args:
        container: A ``dict`` or ``list`` to clean in-place.  Nested dicts and
            lists are walked recursively; other value types are left alone.
            Dict *keys* are not inspected.
        seen: ``id()`` values of containers already visited, so a container
            that is shared (or part of a reference cycle) is walked only once.

    Returns:
        True if at least one string value was changed, False otherwise.
    """
    if id(container) in seen:
        return False
    seen.add(id(container))

    if isinstance(container, dict):
        entries = list(container.items())
    else:
        entries = list(enumerate(container))

    found = False
    for key, value in entries:
        if isinstance(value, str):
            if _SURROGATE_RE.search(value):
                container[key] = _SURROGATE_RE.sub('\ufffd', value)
                found = True
        elif isinstance(value, (dict, list)):
            if _sanitize_container_surrogates(value, seen):
                found = True
    return found


def _sanitize_messages_surrogates(messages: list) -> bool:
    """Sanitize surrogate characters from every string value in *messages*.

    Walks each message dict in-place, including nested structures such as
    multimodal ``content`` parts and ``tool_calls[*].function.arguments``.
    Cleaning only ``content`` is not enough: a surrogate left in any other
    field makes the retried request fail with the same UnicodeEncodeError.

    Args:
        messages: List of chat message dicts.  Items that are not dicts are
            skipped and left untouched.

    Returns:
        True if any surrogates were found and replaced, False otherwise.
    """
    found = False
    seen: set = set()
    for msg in messages:
        if not isinstance(msg, dict):
            continue
        if _sanitize_container_surrogates(msg, seen):
            found = True
    return found
UnicodeEncodeError happens + # when the messages contain lone surrogates (U+D800..U+DFFF) + # that are invalid UTF-8. Common source: clipboard paste + # from Google Docs or similar rich-text editors. We sanitize + # the entire messages list in-place and retry once. + # ----------------------------------------------------------- + if isinstance(api_error, UnicodeEncodeError) and not getattr(self, '_surrogate_sanitized', False): + self._surrogate_sanitized = True + if _sanitize_messages_surrogates(messages): + self._vprint( + f"{self.log_prefix}⚠️ Stripped invalid surrogate characters from messages. Retrying...", + force=True, + ) + continue + # Surrogates weren't in messages — might be in system + # prompt or prefill. Fall through to normal error path. + status_code = getattr(api_error, "status_code", None) if ( self.api_mode == "codex_responses" @@ -7078,8 +7147,13 @@ class AIAgent: # 529 (Anthropic overloaded) is also transient. # Also catch local validation errors (ValueError, TypeError) — these # are programming bugs, not transient failures. + # Exclude UnicodeEncodeError — it's a ValueError subclass but is + # handled separately by the surrogate sanitization path above. _RETRYABLE_STATUS_CODES = {413, 429, 529} - is_local_validation_error = isinstance(api_error, (ValueError, TypeError)) + is_local_validation_error = ( + isinstance(api_error, (ValueError, TypeError)) + and not isinstance(api_error, UnicodeEncodeError) + ) # Detect generic 400s from Anthropic OAuth (transient server-side failures). # Real invalid_request_error responses include a descriptive message; # transient ones contain only "Error" or are empty. (ref: issue #1608) diff --git a/tests/test_surrogate_sanitization.py b/tests/test_surrogate_sanitization.py new file mode 100644 index 000000000..defad587e --- /dev/null +++ b/tests/test_surrogate_sanitization.py @@ -0,0 +1,154 @@ +"""Tests for surrogate character sanitization in user input. 
"""Tests for surrogate character sanitization in user input.

Surrogate code points (U+D800..U+DFFF) cannot be encoded as UTF-8, so a
request body containing one fails with UnicodeEncodeError when it is encoded.
They can appear via clipboard paste from rich-text editors like Google Docs.
"""
import json
import pytest
from unittest.mock import MagicMock, patch

from run_agent import (
    _sanitize_surrogates,
    _sanitize_messages_surrogates,
    _SURROGATE_RE,
)


class TestSanitizeSurrogates:
    """Test the _sanitize_surrogates() helper."""

    def test_normal_text_unchanged(self):
        text = "Hello, this is normal text with unicode: café ñ 日本語 🎉"
        assert _sanitize_surrogates(text) == text

    def test_empty_string(self):
        assert _sanitize_surrogates("") == ""

    def test_single_surrogate_replaced(self):
        result = _sanitize_surrogates("Hello \udce2 world")
        assert result == "Hello \ufffd world"

    def test_multiple_surrogates_replaced(self):
        result = _sanitize_surrogates("a\ud800b\udc00c\udfff")
        assert result == "a\ufffdb\ufffdc\ufffd"

    def test_all_surrogate_range(self):
        """Verify the regex catches the full surrogate range."""
        for cp in [0xD800, 0xD900, 0xDA00, 0xDB00, 0xDC00, 0xDD00, 0xDE00, 0xDF00, 0xDFFF]:
            text = f"test{chr(cp)}end"
            result = _sanitize_surrogates(text)
            # Exact match: the surrogate is gone and nothing else changed.
            assert result == "test\ufffdend", f"Surrogate U+{cp:04X} not caught"

    def test_regex_ignores_neighbouring_code_points(self):
        """U+D7FF and U+E000 border the surrogate range and must not match."""
        assert _SURROGATE_RE.search("\ud7ff\ue000") is None

    def test_result_is_json_serializable(self):
        """Sanitized text must survive json.dumps + utf-8 encoding."""
        dirty = "data \udce2\udcb0 from clipboard"
        clean = _sanitize_surrogates(dirty)
        serialized = json.dumps({"content": clean}, ensure_ascii=False)
        # Must not raise UnicodeEncodeError
        serialized.encode("utf-8")

    def test_original_surrogates_fail_encoding(self):
        """Confirm the original bug: surrogates crash utf-8 encoding."""
        dirty = "data \udce2 from clipboard"
        serialized = json.dumps({"content": dirty}, ensure_ascii=False)
        with pytest.raises(UnicodeEncodeError):
            serialized.encode("utf-8")


class TestSanitizeMessagesSurrogates:
    """Test the _sanitize_messages_surrogates() helper for message lists."""

    def test_clean_messages_returns_false(self):
        msgs = [
            {"role": "user", "content": "all clean"},
            {"role": "assistant", "content": "me too"},
        ]
        assert _sanitize_messages_surrogates(msgs) is False

    def test_dirty_string_content_sanitized(self):
        msgs = [
            {"role": "user", "content": "text with \udce2 surrogate"},
        ]
        assert _sanitize_messages_surrogates(msgs) is True
        assert "\ufffd" in msgs[0]["content"]
        assert "\udce2" not in msgs[0]["content"]

    def test_dirty_multimodal_content_sanitized(self):
        msgs = [
            {"role": "user", "content": [
                {"type": "text", "text": "multimodal \udce2 content"},
                {"type": "image_url", "image_url": {"url": "http://example.com"}},
            ]},
        ]
        assert _sanitize_messages_surrogates(msgs) is True
        assert "\ufffd" in msgs[0]["content"][0]["text"]
        assert "\udce2" not in msgs[0]["content"][0]["text"]

    def test_mixed_clean_and_dirty(self):
        msgs = [
            {"role": "user", "content": "clean text"},
            {"role": "user", "content": "dirty \udce2 text"},
            {"role": "assistant", "content": "clean response"},
        ]
        assert _sanitize_messages_surrogates(msgs) is True
        assert msgs[0]["content"] == "clean text"
        assert "\ufffd" in msgs[1]["content"]
        assert "\udce2" not in msgs[1]["content"]
        assert msgs[2]["content"] == "clean response"

    def test_non_dict_items_skipped(self):
        msgs = ["not a dict", {"role": "user", "content": "ok"}]
        assert _sanitize_messages_surrogates(msgs) is False

    def test_tool_messages_sanitized(self):
        """Tool results could also contain surrogates from file reads etc."""
        msgs = [
            {"role": "tool", "content": "result with \udce2 data", "tool_call_id": "x"},
        ]
        assert _sanitize_messages_surrogates(msgs) is True
        assert "\ufffd" in msgs[0]["content"]
        assert "\udce2" not in msgs[0]["content"]


class TestRunConversationSurrogateSanitization:
    """Integration: verify run_conversation sanitizes user_message."""

    @patch("run_agent.AIAgent._build_system_prompt")
    @patch("run_agent.AIAgent._interruptible_streaming_api_call")
    @patch("run_agent.AIAgent._interruptible_api_call")
    def test_user_message_surrogates_sanitized(self, mock_api, mock_stream, mock_sys):
        """Surrogates in user_message are stripped before API call."""
        from run_agent import AIAgent

        mock_sys.return_value = "system prompt"

        # Mock streaming to return a simple response
        mock_choice = MagicMock()
        mock_choice.message.content = "response"
        mock_choice.message.tool_calls = None
        mock_choice.message.refusal = None
        mock_choice.finish_reason = "stop"
        mock_choice.message.reasoning_content = None

        mock_response = MagicMock()
        mock_response.choices = [mock_choice]
        mock_response.usage = MagicMock(prompt_tokens=10, completion_tokens=5, total_tokens=15)
        mock_response.model = "test-model"
        mock_response.id = "test-id"

        mock_stream.return_value = mock_response
        mock_api.return_value = mock_response

        agent = AIAgent(model="test/model", quiet_mode=True, skip_memory=True, skip_context_files=True)
        agent.client = MagicMock()

        # Pass a message with surrogates
        result = agent.run_conversation(
            user_message="test \udce2 message",
            conversation_history=[],
        )

        user_messages = [
            msg for msg in result.get("messages", [])
            if msg.get("role") == "user"
        ]
        # Without this guard the loop below would pass vacuously when the
        # result carries no user message at all.
        assert user_messages, "run_conversation returned no user message to check"

        # The message stored in history should have surrogates replaced
        for msg in user_messages:
            assert "\udce2" not in msg["content"], "Surrogate leaked into stored message"
            assert "\ufffd" in msg["content"], "Replacement char not in stored message"