"""Tests for agent retry logic on transient errors.""" from unittest.mock import MagicMock, patch import httpx import pytest from timmy.agents.base import SubAgent @pytest.fixture def sub_agent(): """Create a SubAgent with mocked agent.""" agent = SubAgent( agent_id="test-agent", name="TestAgent", role="test", system_prompt="You are a test agent.", model="llama3.2", ) # Mock the underlying agent.run method agent.agent = MagicMock() return agent @pytest.mark.asyncio async def test_run_retries_on_transient_error(sub_agent): """Mock agent.run to fail twice then succeed. Verify result is returned.""" # Arrange: fail twice, then succeed call_count = 0 def mock_run(*args, **kwargs): nonlocal call_count call_count += 1 if call_count <= 2: raise Exception("Ollama 500 error: XML parse error") # Return successful response result = MagicMock() result.content = "Success after retries" return result sub_agent.agent.run = mock_run # Act with patch("timmy.agents.base.asyncio.sleep") as mock_sleep: result = await sub_agent.run("test message") # Assert assert result == "Success after retries" assert call_count == 3 # 2 failures + 1 success # Verify exponential backoff: attempt 1 = 1s, attempt 2 = 2s assert mock_sleep.call_count == 2 mock_sleep.assert_any_call(1) mock_sleep.assert_any_call(2) @pytest.mark.asyncio async def test_run_exhausts_retries(sub_agent): """Mock agent.run to fail 3 times. Verify exception is raised.""" # Arrange: always fail sub_agent.agent.run.side_effect = Exception("Ollama 500 error: XML parse error") # Act & Assert with patch("timmy.agents.base.asyncio.sleep") as mock_sleep: with pytest.raises(Exception, match="Ollama 500 error: XML parse error"): await sub_agent.run("test message") # Should have been called 3 times (max retries) assert sub_agent.agent.run.call_count == 3 # Verify exponential backoff: attempt 1 = 1s, attempt 2 = 2s assert mock_sleep.call_count == 2 mock_sleep.assert_any_call(1) mock_sleep.assert_any_call(2) @pytest.mark.asyncio async def test_run_retries_on_connection_error(sub_agent): """ConnectError is transient (GPU contention) — retries with backoff (#70).""" sub_agent.agent.run.side_effect = httpx.ConnectError("Connection refused") with patch("timmy.agents.base.asyncio.sleep") as mock_sleep: with pytest.raises(httpx.ConnectError, match="Connection refused"): await sub_agent.run("test message") # Should have retried all 3 attempts assert sub_agent.agent.run.call_count == 3 # Contention backoff: 2**1=2, 2**2=4 assert mock_sleep.call_count == 2 mock_sleep.assert_any_call(2) mock_sleep.assert_any_call(4) @pytest.mark.asyncio async def test_run_retries_on_read_error(sub_agent): """ReadError is transient (GPU contention) — retries with backoff (#70).""" sub_agent.agent.run.side_effect = httpx.ReadError("Server disconnected") with patch("timmy.agents.base.asyncio.sleep") as mock_sleep: with pytest.raises(httpx.ReadError, match="Server disconnected"): await sub_agent.run("test message") # Should have retried all 3 attempts assert sub_agent.agent.run.call_count == 3 # Contention backoff: 2**1=2, 2**2=4 assert mock_sleep.call_count == 2 mock_sleep.assert_any_call(2) mock_sleep.assert_any_call(4) @pytest.mark.asyncio async def test_run_recovers_from_contention(sub_agent): """Simulate GPU contention: ReadError then success on retry (#70).""" call_count = 0 def mock_run(*args, **kwargs): nonlocal call_count call_count += 1 if call_count == 1: raise httpx.ReadError("Server disconnected") result = MagicMock() result.content = "Recovered after contention" return result sub_agent.agent.run = mock_run with patch("timmy.agents.base.asyncio.sleep") as mock_sleep: result = await sub_agent.run("test message") assert result == "Recovered after contention" assert call_count == 2 mock_sleep.assert_called_once_with(2) # 2**1 contention backoff @pytest.mark.asyncio async def test_run_logs_retry_attempts(sub_agent, caplog): """Mock agent.run to fail once then succeed. Verify logger.warning was called with retry info.""" import logging # Set logging level to capture warnings with caplog.at_level(logging.WARNING): # Arrange: fail once, then succeed call_count = 0 def mock_run(*args, **kwargs): nonlocal call_count call_count += 1 if call_count == 1: raise Exception("Transient error") # Return successful response result = MagicMock() result.content = "Success" return result sub_agent.agent.run = mock_run # Act with patch("timmy.agents.base.asyncio.sleep") as mock_sleep: result = await sub_agent.run("test message") # Assert assert result == "Success" assert call_count == 2 # 1 failure + 1 success assert "Agent run failed on attempt 1/3" in caplog.text assert "Retrying..." in caplog.text # Verify sleep was called with 1s for first attempt mock_sleep.assert_called_once_with(1)