[loop-cycle-38] fix: add retry logic for Ollama 500 errors (#131) (#133)

Co-authored-by: hermes <hermes@timmy.local> Co-committed-by: hermes <hermes@timmy.local>
2026-03-15 09:38:21 -04:00
parent f4e5148825
commit 76df262563
2 changed files with 152 additions and 9 deletions
--- a/src/timmy/agents/base.py
+++ b/src/timmy/agents/base.py
@@ -10,6 +10,7 @@ SubAgent is the single seed class for ALL agents.  Differentiation
 comes entirely from config (agents.yaml), not from Python subclasses.
 """

+import asyncio
 import logging
 from abc import ABC, abstractmethod
 from typing import Any
@@ -124,15 +125,34 @@ class BaseAgent(ABC):
        Returns:
            Agent response
        """
-        try:
-            result = self.agent.run(message, stream=False)
-            response = result.content if hasattr(result, "content") else str(result)
-        except (httpx.ConnectError, httpx.ReadError, ConnectionError) as exc:
-            logger.error("Ollama disconnected: %s", exc)
-            raise
-        except Exception as exc:
-            logger.error("Agent run failed: %s", exc)
-            raise
+        max_retries = 3
+        last_exception = None
+
+        for attempt in range(1, max_retries + 1):
+            try:
+                result = self.agent.run(message, stream=False)
+                response = result.content if hasattr(result, "content") else str(result)
+                break  # Success, exit the retry loop
+            except (httpx.ConnectError, httpx.ReadError, ConnectionError) as exc:
+                logger.error("Ollama disconnected: %s", exc)
+                raise
+            except Exception as exc:
+                last_exception = exc
+                if attempt < max_retries:
+                    logger.warning(
+                        "Agent run failed on attempt %d/%d: %s. Retrying...",
+                        attempt,
+                        max_retries,
+                        exc,
+                    )
+                    await asyncio.sleep(1)
+                else:
+                    logger.error(
+                        "Agent run failed after %d attempts: %s",
+                        max_retries,
+                        exc,
+                    )
+                    raise last_exception from exc

        # Emit completion event
        if self.event_bus:
--- a/tests/timmy/test_agent_retry.py
+++ b/tests/timmy/test_agent_retry.py
@@ -0,0 +1,123 @@
+"""Tests for agent retry logic on transient errors."""
+
+from unittest.mock import MagicMock
+
+import httpx
+import pytest
+
+from timmy.agents.base import SubAgent
+
+
+@pytest.fixture
+def sub_agent(monkeypatch):
+    """Create a SubAgent with a mocked backing agent and an instant retry delay.
+
+    The retry loop in ``timmy.agents.base`` awaits ``asyncio.sleep(1)`` between
+    attempts.  Left in place, every retried call in this module costs a real
+    second of wall time, so the sleep is swapped for a coroutine that returns
+    immediately.  The retry path itself is still exercised.
+    """
+
+    async def _skip_sleep(*_args, **_kwargs):
+        """Stand-in for ``asyncio.sleep`` that returns without waiting."""
+
+    # ``timmy.agents.base.asyncio`` is the stdlib module object, so this
+    # replaces ``asyncio.sleep`` for the duration of the test; monkeypatch
+    # restores the original at teardown.
+    monkeypatch.setattr("timmy.agents.base.asyncio.sleep", _skip_sleep)
+
+    agent = SubAgent(
+        agent_id="test-agent",
+        name="TestAgent",
+        role="test",
+        system_prompt="You are a test agent.",
+        model="llama3.2",
+    )
+    # Replace the underlying agent so each test can script what agent.run does.
+    agent.agent = MagicMock()
+    return agent
+
+
+@pytest.mark.asyncio
+async def test_run_retries_on_transient_error(sub_agent):
+    """Two failing calls followed by a good one must yield the good response."""
+    attempts = []
+
+    def flaky_run(*args, **kwargs):
+        # Record every invocation; only the third one is allowed to succeed.
+        attempts.append((args, kwargs))
+        if len(attempts) < 3:
+            raise Exception("Ollama 500 error: XML parse error")
+        reply = MagicMock()
+        reply.content = "Success after retries"
+        return reply
+
+    sub_agent.agent.run = flaky_run
+
+    outcome = await sub_agent.run("test message")
+
+    assert outcome == "Success after retries"
+    # Two failed attempts plus the successful one.
+    assert len(attempts) == 3
+
+
+@pytest.mark.asyncio
+async def test_run_exhausts_retries(sub_agent):
+    """A run that fails on every attempt re-raises once all three tries are used."""
+    failing_run = sub_agent.agent.run
+    failing_run.side_effect = Exception("Ollama 500 error: XML parse error")
+
+    with pytest.raises(Exception, match="Ollama 500 error: XML parse error"):
+        await sub_agent.run("test message")
+
+    # One call per attempt, three attempts in total.
+    assert failing_run.call_count == 3
+
+
+@pytest.mark.asyncio
+async def test_run_no_retry_on_connection_error(sub_agent):
+    """An httpx.ConnectError from agent.run propagates after a single call."""
+    refused_run = sub_agent.agent.run
+    refused_run.side_effect = httpx.ConnectError("Connection refused")
+
+    with pytest.raises(httpx.ConnectError, match="Connection refused"):
+        await sub_agent.run("test message")
+
+    # Exactly one call: connection failures are not retried.
+    assert refused_run.call_count == 1
+
+
+@pytest.mark.asyncio
+async def test_run_no_retry_on_read_error(sub_agent):
+    """An httpx.ReadError from agent.run propagates after a single call."""
+    dropped_run = sub_agent.agent.run
+    dropped_run.side_effect = httpx.ReadError("Server disconnected")
+
+    with pytest.raises(httpx.ReadError, match="Server disconnected"):
+        await sub_agent.run("test message")
+
+    # Exactly one call: read failures are not retried.
+    assert dropped_run.call_count == 1
+
+
+@pytest.mark.asyncio
+async def test_run_logs_retry_attempts(sub_agent, caplog):
+    """One failure then success must leave a warning that names the attempt."""
+    import logging
+
+    attempts = []
+
+    def fail_once(*args, **kwargs):
+        # Only the very first invocation raises; later ones succeed.
+        attempts.append((args, kwargs))
+        if len(attempts) == 1:
+            raise Exception("Transient error")
+        reply = MagicMock()
+        reply.content = "Success"
+        return reply
+
+    sub_agent.agent.run = fail_once
+
+    # Capture WARNING and above while the agent runs.
+    with caplog.at_level(logging.WARNING):
+        outcome = await sub_agent.run("test message")
+
+        assert outcome == "Success"
+        # One failed attempt plus the successful one.
+        assert len(attempts) == 2
+        captured = caplog.text
+        assert "Agent run failed on attempt 1/3" in captured
+        assert "Retrying..." in captured