[loop-cycle-60] fix: retry with backoff on Ollama GPU contention (#70) (#238)

2026-03-15 14:28:47 -04:00
parent 7f656fcf22
commit f9911c002c
6 changed files with 178 additions and 51 deletions
--- a/tests/timmy/test_agent_retry.py
+++ b/tests/timmy/test_agent_retry.py
@@ -74,31 +74,61 @@ async def test_run_exhausts_retries(sub_agent):


@pytest.mark.asyncio
-async def test_run_no_retry_on_connection_error(sub_agent):
-    """Mock agent.run to raise httpx.ConnectError. Verify it raises immediately without retry."""
-    # Arrange: raise ConnectError
+async def test_run_retries_on_connection_error(sub_agent):
+    """ConnectError is transient (GPU contention) — retries with backoff (#70)."""
    sub_agent.agent.run.side_effect = httpx.ConnectError("Connection refused")

-    # Act & Assert
-    with pytest.raises(httpx.ConnectError, match="Connection refused"):
-        await sub_agent.run("test message")
+    with patch("timmy.agents.base.asyncio.sleep") as mock_sleep:
+        with pytest.raises(httpx.ConnectError, match="Connection refused"):
+            await sub_agent.run("test message")

-    # Should have been called only once (no retry)
-    assert sub_agent.agent.run.call_count == 1
+    # Should have made all 3 attempts (1 initial call + 2 retries)
+    assert sub_agent.agent.run.call_count == 3
+    # Contention backoff: 2**1=2, 2**2=4
+    assert mock_sleep.call_count == 2
+    mock_sleep.assert_any_call(2)
+    mock_sleep.assert_any_call(4)


@pytest.mark.asyncio
-async def test_run_no_retry_on_read_error(sub_agent):
-    """Mock agent.run to raise httpx.ReadError. Verify it raises immediately without retry."""
-    # Arrange: raise ReadError
+async def test_run_retries_on_read_error(sub_agent):
+    """ReadError is transient (GPU contention) — retries with backoff (#70)."""
    sub_agent.agent.run.side_effect = httpx.ReadError("Server disconnected")

-    # Act & Assert
-    with pytest.raises(httpx.ReadError, match="Server disconnected"):
-        await sub_agent.run("test message")
+    with patch("timmy.agents.base.asyncio.sleep") as mock_sleep:
+        with pytest.raises(httpx.ReadError, match="Server disconnected"):
+            await sub_agent.run("test message")

-    # Should have been called only once (no retry)
-    assert sub_agent.agent.run.call_count == 1
+    # Should have made all 3 attempts (1 initial call + 2 retries)
+    assert sub_agent.agent.run.call_count == 3
+    # Contention backoff: 2**1=2, 2**2=4
+    assert mock_sleep.call_count == 2
+    mock_sleep.assert_any_call(2)
+    mock_sleep.assert_any_call(4)
+
+
+@pytest.mark.asyncio
+async def test_run_recovers_from_contention(sub_agent):
+    """Simulate GPU contention: ReadError then success on retry (#70)."""
+    call_count = 0
+
+    def mock_run(*args, **kwargs):
+        nonlocal call_count
+        call_count += 1
+        if call_count == 1:
+            raise httpx.ReadError("Server disconnected")
+        result = MagicMock()
+        result.content = "Recovered after contention"
+        return result
+
+    sub_agent.agent.run = mock_run
+
+    with patch("timmy.agents.base.asyncio.sleep") as mock_sleep:
+        result = await sub_agent.run("test message")
+
+    assert result == "Recovered after contention"
+    assert call_count == 2
+    mock_sleep.assert_called_once_with(2)  # 2**1 contention backoff


@pytest.mark.asyncio
--- a/tests/timmy/test_gematria.py
+++ b/tests/timmy/test_gematria.py
@@ -20,7 +20,6 @@ from timmy.gematria import (
    reduce_number,
 )

-
 # ── Core cipher computation ──────────────────────────────────────────────────


--- a/tests/timmy/test_ollama_disconnect.py
+++ b/tests/timmy/test_ollama_disconnect.py
@@ -1,8 +1,8 @@
 """Test Ollama disconnection handling.

 Verifies that:
-1. BaseAgent.run() logs 'Ollama disconnected' when agent.run() raises connection errors
-2. BaseAgent.run() re-raises the error (not silently swallowed)
+1. BaseAgent.run() retries on transient errors (contention/disconnect) with backoff (#70)
+2. BaseAgent.run() re-raises the error after retries are exhausted
 3. session.chat() returns disconnect-specific message on connection errors
 4. session.chat_with_tools() returns _ErrorRunOutput with disconnect message on connection errors
 """
@@ -18,14 +18,15 @@ import pytest
 class TestBaseAgentDisconnect:
    """Test BaseAgent.run() disconnection handling."""

-    def test_base_agent_logs_on_connect_error(self, caplog):
-        """BaseAgent.run() logs 'Ollama disconnected' on httpx.ConnectError."""
-        caplog.set_level(logging.ERROR)
+    def test_base_agent_retries_and_logs_on_connect_error(self, caplog):
+        """BaseAgent.run() retries on ConnectError with backoff, then logs 'Ollama unreachable' (#70)."""
+        caplog.set_level(logging.WARNING)
        importlib.import_module("timmy.agents.base")

        with (
            patch("timmy.agents.base.Ollama") as mock_ollama,
            patch("timmy.agents.base.Agent") as mock_agent_class,
+            patch("timmy.agents.base.asyncio.sleep"),
        ):
            mock_ollama.return_value = MagicMock()
            mock_agent = MagicMock()
@@ -51,18 +52,24 @@ class TestBaseAgentDisconnect:

                asyncio.run(agent.run("test message"))

-            assert any("Ollama disconnected" in record.message for record in caplog.records), (
-                f"Expected 'Ollama disconnected' in logs, got: {[r.message for r in caplog.records]}"
+            # Should have made 3 attempts total (1 initial call + 2 retries)
+            assert mock_agent.run.call_count == 3
+            assert any("Ollama contention" in record.message for record in caplog.records), (
+                f"Expected 'Ollama contention' in logs, got: {[r.message for r in caplog.records]}"
+            )
+            assert any("Ollama unreachable" in record.message for record in caplog.records), (
+                f"Expected 'Ollama unreachable' in logs, got: {[r.message for r in caplog.records]}"
            )

-    def test_base_agent_logs_on_read_error(self, caplog):
-        """BaseAgent.run() logs 'Ollama disconnected' on httpx.ReadError."""
-        caplog.set_level(logging.ERROR)
+    def test_base_agent_retries_and_logs_on_read_error(self, caplog):
+        """BaseAgent.run() retries on ReadError with backoff and logs 'Ollama contention' (#70)."""
+        caplog.set_level(logging.WARNING)
        importlib.import_module("timmy.agents.base")

        with (
            patch("timmy.agents.base.Ollama") as mock_ollama,
            patch("timmy.agents.base.Agent") as mock_agent_class,
+            patch("timmy.agents.base.asyncio.sleep"),
        ):
            mock_ollama.return_value = MagicMock()
            mock_agent = MagicMock()
@@ -88,18 +95,20 @@ class TestBaseAgentDisconnect:

                asyncio.run(agent.run("test message"))

-            assert any("Ollama disconnected" in record.message for record in caplog.records), (
-                f"Expected 'Ollama disconnected' in logs, got: {[r.message for r in caplog.records]}"
+            assert mock_agent.run.call_count == 3
+            assert any("Ollama contention" in record.message for record in caplog.records), (
+                f"Expected 'Ollama contention' in logs, got: {[r.message for r in caplog.records]}"
            )

-    def test_base_agent_logs_on_connection_error(self, caplog):
-        """BaseAgent.run() logs 'Ollama disconnected' on ConnectionError."""
-        caplog.set_level(logging.ERROR)
+    def test_base_agent_retries_and_logs_on_connection_error(self, caplog):
+        """BaseAgent.run() retries on ConnectionError with backoff (#70)."""
+        caplog.set_level(logging.WARNING)
        importlib.import_module("timmy.agents.base")

        with (
            patch("timmy.agents.base.Ollama") as mock_ollama,
            patch("timmy.agents.base.Agent") as mock_agent_class,
+            patch("timmy.agents.base.asyncio.sleep"),
        ):
            mock_ollama.return_value = MagicMock()
            mock_agent = MagicMock()
@@ -125,17 +134,19 @@ class TestBaseAgentDisconnect:

                asyncio.run(agent.run("test message"))

-            assert any("Ollama disconnected" in record.message for record in caplog.records), (
-                f"Expected 'Ollama disconnected' in logs, got: {[r.message for r in caplog.records]}"
+            assert mock_agent.run.call_count == 3
+            assert any("Ollama unreachable" in record.message for record in caplog.records), (
+                f"Expected 'Ollama unreachable' in logs, got: {[r.message for r in caplog.records]}"
            )

-    def test_base_agent_re_raises_connection_error(self):
-        """BaseAgent.run() re-raises the connection error (not silently swallowed)."""
+    def test_base_agent_re_raises_connection_error_after_retries(self):
+        """BaseAgent.run() re-raises the connection error after exhausting retries (#70)."""
        importlib.import_module("timmy.agents.base")

        with (
            patch("timmy.agents.base.Ollama") as mock_ollama,
            patch("timmy.agents.base.Agent") as mock_agent_class,
+            patch("timmy.agents.base.asyncio.sleep"),
        ):
            mock_ollama.return_value = MagicMock()
            mock_agent = MagicMock()