[loop-cycle-38] fix: add retry logic for Ollama 500 errors (#131) (#133)

Co-authored-by: hermes <hermes@timmy.local> Co-committed-by: hermes <hermes@timmy.local>
2026-03-15 09:38:21 -04:00
parent f4e5148825
commit 76df262563
2 changed files with 152 additions and 9 deletions
--- a/tests/timmy/test_agent_retry.py
+++ b/tests/timmy/test_agent_retry.py
@@ -0,0 +1,123 @@
+"""Tests for agent retry logic on transient errors."""
+
+from unittest.mock import MagicMock
+
+import httpx
+import pytest
+
+from timmy.agents.base import SubAgent
+
+
+@pytest.fixture
+def sub_agent():
+    """Build a SubAgent whose inner ``agent`` attribute is replaced by a MagicMock."""
+    agent = SubAgent(
+        agent_id="test-agent",
+        name="TestAgent",
+        role="test",
+        system_prompt="You are a test agent.",
+        model="llama3.2",
+    )
+    # Replace the whole underlying agent object; each test configures its ``run``
+    agent.agent = MagicMock()
+    return agent
+
+
+@pytest.mark.asyncio
+async def test_run_retries_on_transient_error(sub_agent):
+    """Fail the first two agent.run calls, then succeed; expect the final content back."""
+    # Arrange: calls 1 and 2 raise, call 3 returns a response
+    call_count = 0
+
+    def mock_run(*args, **kwargs):
+        nonlocal call_count
+        call_count += 1
+        if call_count <= 2:
+            raise Exception("Ollama 500 error: XML parse error")
+        # From the third call on, hand back a response object carrying ``content``
+        result = MagicMock()
+        result.content = "Success after retries"
+        return result
+
+    sub_agent.agent.run = mock_run
+
+    # Act
+    result = await sub_agent.run("test message")
+
+    # Assert: the returned value equals the mocked response's ``content``
+    assert result == "Success after retries"
+    assert call_count == 3  # 2 failures + 1 success
+
+
+@pytest.mark.asyncio
+async def test_run_exhausts_retries(sub_agent):
+    """Make agent.run fail on every call; expect the error to surface after 3 calls."""
+    # Arrange: every call raises the same generic exception
+    sub_agent.agent.run.side_effect = Exception("Ollama 500 error: XML parse error")
+
+    # Act & Assert: the original exception message must reach the caller
+    with pytest.raises(Exception, match="Ollama 500 error: XML parse error"):
+        await sub_agent.run("test message")
+
+    # Exactly 3 attempts are expected before the exception propagates
+    assert sub_agent.agent.run.call_count == 3
+
+
+@pytest.mark.asyncio
+async def test_run_no_retry_on_connection_error(sub_agent):
+    """Raise httpx.ConnectError from agent.run; expect it to propagate after one call."""
+    # Arrange: every call raises ConnectError
+    sub_agent.agent.run.side_effect = httpx.ConnectError("Connection refused")
+
+    # Act & Assert: the same exception type and message must reach the caller
+    with pytest.raises(httpx.ConnectError, match="Connection refused"):
+        await sub_agent.run("test message")
+
+    # A single call means no retry was attempted
+    assert sub_agent.agent.run.call_count == 1
+
+
+@pytest.mark.asyncio
+async def test_run_no_retry_on_read_error(sub_agent):
+    """Raise httpx.ReadError from agent.run; expect it to propagate after one call."""
+    # Arrange: every call raises ReadError
+    sub_agent.agent.run.side_effect = httpx.ReadError("Server disconnected")
+
+    # Act & Assert: the same exception type and message must reach the caller
+    with pytest.raises(httpx.ReadError, match="Server disconnected"):
+        await sub_agent.run("test message")
+
+    # A single call means no retry was attempted
+    assert sub_agent.agent.run.call_count == 1
+
+
+@pytest.mark.asyncio
+async def test_run_logs_retry_attempts(sub_agent, caplog):
+    """Fail once then succeed; expect the retry messages in the captured log text."""
+    import logging
+
+    # Capture records at WARNING and above for the duration of the run
+    with caplog.at_level(logging.WARNING):
+        # Arrange: the first call raises, the second returns a response
+        call_count = 0
+
+        def mock_run(*args, **kwargs):
+            nonlocal call_count
+            call_count += 1
+            if call_count == 1:
+                raise Exception("Transient error")
+            # From the second call on, hand back a response carrying ``content``
+            result = MagicMock()
+            result.content = "Success"
+            return result
+
+        sub_agent.agent.run = mock_run
+
+        # Act
+        result = await sub_agent.run("test message")
+
+        # Assert: correct result, two calls, and both retry fragments were logged
+        assert result == "Success"
+        assert call_count == 2  # 1 failure + 1 success
+        assert "Agent run failed on attempt 1/3" in caplog.text
+        assert "Retrying..." in caplog.text