1
0

[loop-cycle-38] fix: add retry logic for Ollama 500 errors (#131) (#133)

Co-authored-by: hermes <hermes@timmy.local>
Co-committed-by: hermes <hermes@timmy.local>
This commit is contained in:
2026-03-15 09:38:21 -04:00
committed by rockachopa
parent f4e5148825
commit 76df262563
2 changed files with 152 additions and 9 deletions

View File

@@ -10,6 +10,7 @@ SubAgent is the single seed class for ALL agents. Differentiation
comes entirely from config (agents.yaml), not from Python subclasses.
"""
import asyncio
import logging
from abc import ABC, abstractmethod
from typing import Any
@@ -124,15 +125,34 @@ class BaseAgent(ABC):
Returns:
Agent response
"""
try:
result = self.agent.run(message, stream=False)
response = result.content if hasattr(result, "content") else str(result)
except (httpx.ConnectError, httpx.ReadError, ConnectionError) as exc:
logger.error("Ollama disconnected: %s", exc)
raise
except Exception as exc:
logger.error("Agent run failed: %s", exc)
raise
max_retries = 3
last_exception = None
for attempt in range(1, max_retries + 1):
try:
result = self.agent.run(message, stream=False)
response = result.content if hasattr(result, "content") else str(result)
break # Success, exit the retry loop
except (httpx.ConnectError, httpx.ReadError, ConnectionError) as exc:
logger.error("Ollama disconnected: %s", exc)
raise
except Exception as exc:
last_exception = exc
if attempt < max_retries:
logger.warning(
"Agent run failed on attempt %d/%d: %s. Retrying...",
attempt,
max_retries,
exc,
)
await asyncio.sleep(1)
else:
logger.error(
"Agent run failed after %d attempts: %s",
max_retries,
exc,
)
raise last_exception from exc
# Emit completion event
if self.event_bus:

View File

@@ -0,0 +1,123 @@
"""Tests for agent retry logic on transient errors."""
from unittest.mock import MagicMock
import httpx
import pytest
from timmy.agents.base import SubAgent
@pytest.fixture
def sub_agent():
    """Build a SubAgent for tests with its underlying ``agent`` swapped for a mock.

    The returned instance has ``agent`` replaced by a ``MagicMock`` so each
    test can script what ``agent.run`` returns or raises.
    """
    settings = {
        "agent_id": "test-agent",
        "name": "TestAgent",
        "role": "test",
        "system_prompt": "You are a test agent.",
        "model": "llama3.2",
    }
    instance = SubAgent(**settings)
    # Replace the underlying agent so tests control agent.run directly.
    instance.agent = MagicMock()
    return instance
@pytest.mark.asyncio
async def test_run_retries_on_transient_error(sub_agent):
    """Two failing calls followed by a success must return the successful content."""
    # Arrange: every invocation is recorded; the first two raise.
    invocations = []

    def flaky_run(*args, **kwargs):
        invocations.append((args, kwargs))
        if len(invocations) > 2:
            # Third call onward: hand back a response object with content.
            ok = MagicMock()
            ok.content = "Success after retries"
            return ok
        raise Exception("Ollama 500 error: XML parse error")

    sub_agent.agent.run = flaky_run

    # Act
    reply = await sub_agent.run("test message")

    # Assert
    assert reply == "Success after retries"
    assert len(invocations) == 3  # 2 failures + 1 success
@pytest.mark.asyncio
async def test_run_exhausts_retries(sub_agent):
    """A run that fails on every call must raise after exactly three calls."""
    # Arrange: the mocked run raises on every call.
    run_mock = sub_agent.agent.run
    run_mock.side_effect = Exception("Ollama 500 error: XML parse error")

    # Act & Assert: the error surfaces to the caller.
    with pytest.raises(Exception, match="Ollama 500 error: XML parse error"):
        await sub_agent.run("test message")

    # The underlying run was called once per allowed attempt.
    assert run_mock.call_count == 3
@pytest.mark.asyncio
async def test_run_no_retry_on_connection_error(sub_agent):
    """An httpx.ConnectError must propagate after a single call, with no retry."""
    # Arrange: the mocked run raises a connection error.
    run_mock = sub_agent.agent.run
    run_mock.side_effect = httpx.ConnectError("Connection refused")

    # Act & Assert: the same error type and message reach the caller.
    with pytest.raises(httpx.ConnectError, match="Connection refused"):
        await sub_agent.run("test message")

    # Exactly one call: no retry happened.
    assert run_mock.call_count == 1
@pytest.mark.asyncio
async def test_run_no_retry_on_read_error(sub_agent):
    """An httpx.ReadError must propagate after a single call, with no retry."""
    # Arrange: the mocked run raises a read error.
    run_mock = sub_agent.agent.run
    run_mock.side_effect = httpx.ReadError("Server disconnected")

    # Act & Assert: the same error type and message reach the caller.
    with pytest.raises(httpx.ReadError, match="Server disconnected"):
        await sub_agent.run("test message")

    # Exactly one call: no retry happened.
    assert run_mock.call_count == 1
@pytest.mark.asyncio
async def test_run_logs_retry_attempts(sub_agent, caplog):
    """One failure followed by a success must log a warning describing the retry."""
    import logging

    # Arrange: record each call; only the first one raises.
    seen = []

    def fail_once(*args, **kwargs):
        seen.append((args, kwargs))
        if len(seen) == 1:
            raise Exception("Transient error")
        ok = MagicMock()
        ok.content = "Success"
        return ok

    sub_agent.agent.run = fail_once

    # Act: capture WARNING-level (and above) records while the run executes.
    with caplog.at_level(logging.WARNING):
        reply = await sub_agent.run("test message")

    # Assert
    assert reply == "Success"
    assert len(seen) == 2  # 1 failure + 1 success
    captured = caplog.text
    assert "Agent run failed on attempt 1/3" in captured
    assert "Retrying..." in captured