diff --git a/src/timmy/agents/base.py b/src/timmy/agents/base.py
index cc0ccfb1..a4d64dac 100644
--- a/src/timmy/agents/base.py
+++ b/src/timmy/agents/base.py
@@ -10,6 +10,7 @@ SubAgent is the single seed class for ALL agents.
 Differentiation comes entirely from config (agents.yaml), not from Python subclasses.
 """
 
+import asyncio
 import logging
 from abc import ABC, abstractmethod
 from typing import Any
@@ -124,15 +125,37 @@ class BaseAgent(ABC):
         Returns:
             Agent response
         """
-        try:
-            result = self.agent.run(message, stream=False)
-            response = result.content if hasattr(result, "content") else str(result)
-        except (httpx.ConnectError, httpx.ReadError, ConnectionError) as exc:
-            logger.error("Ollama disconnected: %s", exc)
-            raise
-        except Exception as exc:
-            logger.error("Agent run failed: %s", exc)
-            raise
+        # Any failure other than a connection-level error is retried, up to
+        # max_retries attempts in total, with a one second pause in between.
+        max_retries = 3
+
+        for attempt in range(1, max_retries + 1):
+            try:
+                result = self.agent.run(message, stream=False)
+                response = result.content if hasattr(result, "content") else str(result)
+                break  # Success, exit the retry loop
+            except (httpx.ConnectError, httpx.ReadError, ConnectionError) as exc:
+                # Connection-level failures are re-raised immediately, never retried.
+                logger.error("Ollama disconnected: %s", exc)
+                raise
+            except Exception as exc:
+                if attempt < max_retries:
+                    logger.warning(
+                        "Agent run failed on attempt %d/%d: %s. Retrying...",
+                        attempt,
+                        max_retries,
+                        exc,
+                    )
+                    await asyncio.sleep(1)
+                else:
+                    logger.error(
+                        "Agent run failed after %d attempts: %s",
+                        max_retries,
+                        exc,
+                    )
+                    # Bare raise keeps the original traceback and avoids
+                    # chaining the exception to itself as its own cause.
+                    raise
 
         # Emit completion event
         if self.event_bus:
diff --git a/tests/timmy/test_agent_retry.py b/tests/timmy/test_agent_retry.py
new file mode 100644
index 00000000..c2511f3c
--- /dev/null
+++ b/tests/timmy/test_agent_retry.py
@@ -0,0 +1,132 @@
+"""Tests for agent retry logic on transient errors."""
+
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import httpx
+import pytest
+
+from timmy.agents.base import SubAgent
+
+
+@pytest.fixture(autouse=True)
+def no_retry_delay():
+    """Stub out the retry pause so the tests do not wait on real time."""
+    with patch("timmy.agents.base.asyncio.sleep", new_callable=AsyncMock) as fake_sleep:
+        yield fake_sleep
+
+
+@pytest.fixture
+def sub_agent():
+    """Create a SubAgent with mocked agent."""
+    agent = SubAgent(
+        agent_id="test-agent",
+        name="TestAgent",
+        role="test",
+        system_prompt="You are a test agent.",
+        model="llama3.2",
+    )
+    # Mock the underlying agent.run method
+    agent.agent = MagicMock()
+    return agent
+
+
+@pytest.mark.asyncio
+async def test_run_retries_on_transient_error(sub_agent):
+    """Mock agent.run to fail twice then succeed. Verify result is returned."""
+    # Arrange: fail twice, then succeed
+    call_count = 0
+
+    def mock_run(*args, **kwargs):
+        nonlocal call_count
+        call_count += 1
+        if call_count <= 2:
+            raise Exception("Ollama 500 error: XML parse error")
+        # Return successful response
+        result = MagicMock()
+        result.content = "Success after retries"
+        return result
+
+    sub_agent.agent.run = mock_run
+
+    # Act
+    result = await sub_agent.run("test message")
+
+    # Assert
+    assert result == "Success after retries"
+    assert call_count == 3  # 2 failures + 1 success
+
+
+@pytest.mark.asyncio
+async def test_run_exhausts_retries(sub_agent):
+    """Mock agent.run to fail 3 times. Verify exception is raised."""
+    # Arrange: always fail
+    sub_agent.agent.run.side_effect = Exception("Ollama 500 error: XML parse error")
+
+    # Act & Assert
+    with pytest.raises(Exception, match="Ollama 500 error: XML parse error") as exc_info:
+        await sub_agent.run("test message")
+
+    # Should have been called 3 times (max retries)
+    assert sub_agent.agent.run.call_count == 3
+    # The original error must propagate untouched, not chained to itself
+    assert exc_info.value.__cause__ is None
+
+
+@pytest.mark.asyncio
+async def test_run_no_retry_on_connection_error(sub_agent):
+    """Mock agent.run to raise httpx.ConnectError. Verify it raises immediately without retry."""
+    # Arrange: raise ConnectError
+    sub_agent.agent.run.side_effect = httpx.ConnectError("Connection refused")
+
+    # Act & Assert
+    with pytest.raises(httpx.ConnectError, match="Connection refused"):
+        await sub_agent.run("test message")
+
+    # Should have been called only once (no retry)
+    assert sub_agent.agent.run.call_count == 1
+
+
+@pytest.mark.asyncio
+async def test_run_no_retry_on_read_error(sub_agent):
+    """Mock agent.run to raise httpx.ReadError. Verify it raises immediately without retry."""
+    # Arrange: raise ReadError
+    sub_agent.agent.run.side_effect = httpx.ReadError("Server disconnected")
+
+    # Act & Assert
+    with pytest.raises(httpx.ReadError, match="Server disconnected"):
+        await sub_agent.run("test message")
+
+    # Should have been called only once (no retry)
+    assert sub_agent.agent.run.call_count == 1
+
+
+@pytest.mark.asyncio
+async def test_run_logs_retry_attempts(sub_agent, caplog):
+    """Mock agent.run to fail once then succeed. Verify logger.warning was called with retry info."""
+    import logging
+
+    # Set logging level to capture warnings
+    with caplog.at_level(logging.WARNING):
+        # Arrange: fail once, then succeed
+        call_count = 0
+
+        def mock_run(*args, **kwargs):
+            nonlocal call_count
+            call_count += 1
+            if call_count == 1:
+                raise Exception("Transient error")
+            # Return successful response
+            result = MagicMock()
+            result.content = "Success"
+            return result
+
+        sub_agent.agent.run = mock_run
+
+        # Act
+        result = await sub_agent.run("test message")
+
+        # Assert
+        assert result == "Success"
+        assert call_count == 2  # 1 failure + 1 success
+        assert "Agent run failed on attempt 1/3" in caplog.text
+        assert "Retrying..." in caplog.text