# Timmy-time-dashboard/tests/timmy/test_agent_retry.py

"""Tests for agent retry logic on transient errors."""

from unittest.mock import MagicMock, call, patch

import httpx
import pytest

from timmy.agents.base import SubAgent


@pytest.fixture
def sub_agent():
    """Build a SubAgent whose inner ``agent`` attribute is a MagicMock."""
    settings = {
        "agent_id": "test-agent",
        "name": "TestAgent",
        "role": "test",
        "system_prompt": "You are a test agent.",
        "model": "llama3.2",
    }
    instance = SubAgent(**settings)
    # Swap out the wrapped agent so each test can script what ``agent.run`` does.
    instance.agent = MagicMock()
    return instance


@pytest.mark.asyncio
async def test_run_retries_on_transient_error(sub_agent):
    """Two transient failures followed by a success return the success content.

    Also pins the backoff schedule: the patched ``asyncio.sleep`` must be
    called with 1 and then 2, in that order.
    """
    # Arrange: fail twice, then succeed
    call_count = 0

    def mock_run(*args, **kwargs):
        nonlocal call_count
        call_count += 1
        if call_count <= 2:
            raise Exception("Ollama 500 error: XML parse error")
        # Return successful response
        result = MagicMock()
        result.content = "Success after retries"
        return result

    sub_agent.agent.run = mock_run

    # Act: sleep is patched so the backoff never actually waits
    with patch("timmy.agents.base.asyncio.sleep") as mock_sleep:
        result = await sub_agent.run("test message")

    # Assert
    assert result == "Success after retries"
    assert call_count == 3  # 2 failures + 1 success
    # Exponential backoff: attempt 1 = 1s, attempt 2 = 2s.
    # Compare the whole ordered list; assert_any_call would also accept
    # the delays in the wrong order.
    assert mock_sleep.call_args_list == [call(1), call(2)]


@pytest.mark.asyncio
async def test_run_exhausts_retries(sub_agent):
    """When agent.run fails on every attempt, the error reaches the caller.

    Expects exactly 3 calls to ``agent.run`` and two backoff sleeps of
    1 and then 2, in that order.
    """
    # Arrange: always fail
    sub_agent.agent.run.side_effect = Exception("Ollama 500 error: XML parse error")

    # Act & Assert: sleep is patched so the backoff never actually waits
    with patch("timmy.agents.base.asyncio.sleep") as mock_sleep:
        with pytest.raises(Exception, match="Ollama 500 error: XML parse error"):
            await sub_agent.run("test message")

    # Should have been called 3 times (max retries)
    assert sub_agent.agent.run.call_count == 3
    # Exponential backoff: attempt 1 = 1s, attempt 2 = 2s.
    # The ordered comparison also covers the call count, and unlike
    # assert_any_call it rejects a reversed schedule.
    assert mock_sleep.call_args_list == [call(1), call(2)]


@pytest.mark.asyncio
async def test_run_retries_on_connection_error(sub_agent):
    """ConnectError is transient (GPU contention) — retries with backoff (#70).

    Every attempt fails, so the ConnectError must propagate after 3 calls,
    with contention backoff sleeps of 2 and then 4, in that order.
    """
    sub_agent.agent.run.side_effect = httpx.ConnectError("Connection refused")

    # sleep is patched so the backoff never actually waits
    with patch("timmy.agents.base.asyncio.sleep") as mock_sleep:
        with pytest.raises(httpx.ConnectError, match="Connection refused"):
            await sub_agent.run("test message")

    # Should have retried all 3 attempts
    assert sub_agent.agent.run.call_count == 3
    # Contention backoff: 2**1=2, 2**2=4.
    # Ordered comparison: assert_any_call would not catch a 4-then-2 schedule.
    assert mock_sleep.call_args_list == [call(2), call(4)]


@pytest.mark.asyncio
async def test_run_retries_on_read_error(sub_agent):
    """ReadError is transient (GPU contention) — retries with backoff (#70).

    Every attempt fails, so the ReadError must propagate after 3 calls,
    with contention backoff sleeps of 2 and then 4, in that order.
    """
    sub_agent.agent.run.side_effect = httpx.ReadError("Server disconnected")

    # sleep is patched so the backoff never actually waits
    with patch("timmy.agents.base.asyncio.sleep") as mock_sleep:
        with pytest.raises(httpx.ReadError, match="Server disconnected"):
            await sub_agent.run("test message")

    # Should have retried all 3 attempts
    assert sub_agent.agent.run.call_count == 3
    # Contention backoff: 2**1=2, 2**2=4.
    # Ordered comparison: assert_any_call would not catch a 4-then-2 schedule.
    assert mock_sleep.call_args_list == [call(2), call(4)]


@pytest.mark.asyncio
async def test_run_recovers_from_contention(sub_agent):
    """A single ReadError followed by a good response is recovered from (#70)."""
    recovered = MagicMock()
    recovered.content = "Recovered after contention"
    # Scripted outcomes: the first call raises, the second returns the response.
    scripted_run = MagicMock(
        side_effect=[httpx.ReadError("Server disconnected"), recovered]
    )
    sub_agent.agent.run = scripted_run

    with patch("timmy.agents.base.asyncio.sleep") as backoff_sleep:
        outcome = await sub_agent.run("test message")

    assert outcome == "Recovered after contention"
    assert scripted_run.call_count == 2
    backoff_sleep.assert_called_once_with(2)  # 2**1 contention backoff


@pytest.mark.asyncio
async def test_run_logs_retry_attempts(sub_agent, caplog):
    """One failure then a success leaves the retry message in the captured log."""
    import logging

    # Arrange: record every invocation; only the first one raises.
    attempts = []

    def flaky_run(*args, **kwargs):
        attempts.append(args)
        if len(attempts) == 1:
            raise Exception("Transient error")
        response = MagicMock()
        response.content = "Success"
        return response

    sub_agent.agent.run = flaky_run

    # Act: capture WARNING-and-above records while sleep is patched out.
    with caplog.at_level(logging.WARNING):
        with patch("timmy.agents.base.asyncio.sleep") as fake_sleep:
            outcome = await sub_agent.run("test message")
        captured = caplog.text

    # Assert
    assert outcome == "Success"
    assert len(attempts) == 2  # 1 failure + 1 success
    assert "Agent run failed on attempt 1/3" in captured
    assert "Retrying..." in captured
    # A single backoff sleep of 1s after the first failed attempt
    fake_sleep.assert_called_once_with(1)