[loop-cycle-60] fix: retry with backoff on Ollama GPU contention (#70) (#238)
All checks were successful
Tests / lint (push) Successful in 4s
Tests / test (push) Successful in 54s

This commit was merged in pull request #238.
This commit is contained in:
2026-03-15 14:28:47 -04:00
parent 7f656fcf22
commit f9911c002c
6 changed files with 178 additions and 51 deletions

View File

@@ -74,31 +74,61 @@ async def test_run_exhausts_retries(sub_agent):
@pytest.mark.asyncio
async def test_run_no_retry_on_connection_error(sub_agent):
"""Mock agent.run to raise httpx.ConnectError. Verify it raises immediately without retry."""
# Arrange: raise ConnectError
async def test_run_retries_on_connection_error(sub_agent):
"""ConnectError is transient (GPU contention) — retries with backoff (#70)."""
sub_agent.agent.run.side_effect = httpx.ConnectError("Connection refused")
# Act & Assert
with pytest.raises(httpx.ConnectError, match="Connection refused"):
await sub_agent.run("test message")
with patch("timmy.agents.base.asyncio.sleep") as mock_sleep:
with pytest.raises(httpx.ConnectError, match="Connection refused"):
await sub_agent.run("test message")
# Should have been called only once (no retry)
assert sub_agent.agent.run.call_count == 1
# Should have retried all 3 attempts
assert sub_agent.agent.run.call_count == 3
# Contention backoff: 2**1=2, 2**2=4
assert mock_sleep.call_count == 2
mock_sleep.assert_any_call(2)
mock_sleep.assert_any_call(4)
@pytest.mark.asyncio
async def test_run_no_retry_on_read_error(sub_agent):
"""Mock agent.run to raise httpx.ReadError. Verify it raises immediately without retry."""
# Arrange: raise ReadError
async def test_run_retries_on_read_error(sub_agent):
"""ReadError is transient (GPU contention) — retries with backoff (#70)."""
sub_agent.agent.run.side_effect = httpx.ReadError("Server disconnected")
# Act & Assert
with pytest.raises(httpx.ReadError, match="Server disconnected"):
await sub_agent.run("test message")
with patch("timmy.agents.base.asyncio.sleep") as mock_sleep:
with pytest.raises(httpx.ReadError, match="Server disconnected"):
await sub_agent.run("test message")
# Should have been called only once (no retry)
assert sub_agent.agent.run.call_count == 1
# Should have retried all 3 attempts
assert sub_agent.agent.run.call_count == 3
# Contention backoff: 2**1=2, 2**2=4
assert mock_sleep.call_count == 2
mock_sleep.assert_any_call(2)
mock_sleep.assert_any_call(4)
@pytest.mark.asyncio
async def test_run_recovers_from_contention(sub_agent):
    """One transient ReadError is retried and the second attempt succeeds (#70).

    The wrapped agent's ``run`` is replaced with a stub that fails on its
    first invocation and returns a response object on the next one.
    ``asyncio.sleep`` is patched in ``timmy.agents.base`` so the backoff
    delay is recorded instead of actually waited for.
    """
    attempts = []

    def flaky_run(*args, **kwargs):
        # Record every invocation so the total number of attempts can be checked.
        attempts.append((args, kwargs))
        if len(attempts) > 1:
            return MagicMock(content="Recovered after contention")
        # First attempt only: simulate the server dropping the connection.
        raise httpx.ReadError("Server disconnected")

    sub_agent.agent.run = flaky_run

    with patch("timmy.agents.base.asyncio.sleep") as sleep_mock:
        reply = await sub_agent.run("test message")

    assert reply == "Recovered after contention"
    # Exactly one failure followed by one success.
    assert len(attempts) == 2
    # A single backoff pause of 2**1 seconds between the two attempts.
    sleep_mock.assert_called_once_with(2)
@pytest.mark.asyncio

View File

@@ -20,7 +20,6 @@ from timmy.gematria import (
reduce_number,
)
# ── Core cipher computation ──────────────────────────────────────────────────

View File

@@ -1,8 +1,8 @@
"""Test Ollama disconnection handling.
Verifies that:
1. BaseAgent.run() logs 'Ollama disconnected' when agent.run() raises connection errors
2. BaseAgent.run() re-raises the error (not silently swallowed)
1. BaseAgent.run() retries on transient errors (contention/disconnect) with backoff (#70)
2. BaseAgent.run() re-raises the error after retries are exhausted
3. session.chat() returns disconnect-specific message on connection errors
4. session.chat_with_tools() returns _ErrorRunOutput with disconnect message on connection errors
"""
@@ -18,14 +18,15 @@ import pytest
class TestBaseAgentDisconnect:
"""Test BaseAgent.run() disconnection handling."""
def test_base_agent_logs_on_connect_error(self, caplog):
"""BaseAgent.run() logs 'Ollama disconnected' on httpx.ConnectError."""
caplog.set_level(logging.ERROR)
def test_base_agent_retries_and_logs_on_connect_error(self, caplog):
"""BaseAgent.run() retries on ConnectError with backoff, then logs 'Ollama unreachable' (#70)."""
caplog.set_level(logging.WARNING)
importlib.import_module("timmy.agents.base")
with (
patch("timmy.agents.base.Ollama") as mock_ollama,
patch("timmy.agents.base.Agent") as mock_agent_class,
patch("timmy.agents.base.asyncio.sleep"),
):
mock_ollama.return_value = MagicMock()
mock_agent = MagicMock()
@@ -51,18 +52,24 @@ class TestBaseAgentDisconnect:
asyncio.run(agent.run("test message"))
assert any("Ollama disconnected" in record.message for record in caplog.records), (
f"Expected 'Ollama disconnected' in logs, got: {[r.message for r in caplog.records]}"
# Should have retried 3 times total
assert mock_agent.run.call_count == 3
assert any("Ollama contention" in record.message for record in caplog.records), (
f"Expected 'Ollama contention' in logs, got: {[r.message for r in caplog.records]}"
)
assert any("Ollama unreachable" in record.message for record in caplog.records), (
f"Expected 'Ollama unreachable' in logs, got: {[r.message for r in caplog.records]}"
)
def test_base_agent_logs_on_read_error(self, caplog):
"""BaseAgent.run() logs 'Ollama disconnected' on httpx.ReadError."""
caplog.set_level(logging.ERROR)
def test_base_agent_retries_and_logs_on_read_error(self, caplog):
"""BaseAgent.run() retries on ReadError with backoff, then logs 'Ollama unreachable' (#70)."""
caplog.set_level(logging.WARNING)
importlib.import_module("timmy.agents.base")
with (
patch("timmy.agents.base.Ollama") as mock_ollama,
patch("timmy.agents.base.Agent") as mock_agent_class,
patch("timmy.agents.base.asyncio.sleep"),
):
mock_ollama.return_value = MagicMock()
mock_agent = MagicMock()
@@ -88,18 +95,20 @@ class TestBaseAgentDisconnect:
asyncio.run(agent.run("test message"))
assert any("Ollama disconnected" in record.message for record in caplog.records), (
f"Expected 'Ollama disconnected' in logs, got: {[r.message for r in caplog.records]}"
assert mock_agent.run.call_count == 3
assert any("Ollama contention" in record.message for record in caplog.records), (
f"Expected 'Ollama contention' in logs, got: {[r.message for r in caplog.records]}"
)
def test_base_agent_logs_on_connection_error(self, caplog):
"""BaseAgent.run() logs 'Ollama disconnected' on ConnectionError."""
caplog.set_level(logging.ERROR)
def test_base_agent_retries_and_logs_on_connection_error(self, caplog):
"""BaseAgent.run() retries on ConnectionError with backoff (#70)."""
caplog.set_level(logging.WARNING)
importlib.import_module("timmy.agents.base")
with (
patch("timmy.agents.base.Ollama") as mock_ollama,
patch("timmy.agents.base.Agent") as mock_agent_class,
patch("timmy.agents.base.asyncio.sleep"),
):
mock_ollama.return_value = MagicMock()
mock_agent = MagicMock()
@@ -125,17 +134,19 @@ class TestBaseAgentDisconnect:
asyncio.run(agent.run("test message"))
assert any("Ollama disconnected" in record.message for record in caplog.records), (
f"Expected 'Ollama disconnected' in logs, got: {[r.message for r in caplog.records]}"
assert mock_agent.run.call_count == 3
assert any("Ollama unreachable" in record.message for record in caplog.records), (
f"Expected 'Ollama unreachable' in logs, got: {[r.message for r in caplog.records]}"
)
def test_base_agent_re_raises_connection_error(self):
"""BaseAgent.run() re-raises the connection error (not silently swallowed)."""
def test_base_agent_re_raises_connection_error_after_retries(self):
"""BaseAgent.run() re-raises the connection error after exhausting retries (#70)."""
importlib.import_module("timmy.agents.base")
with (
patch("timmy.agents.base.Ollama") as mock_ollama,
patch("timmy.agents.base.Agent") as mock_agent_class,
patch("timmy.agents.base.asyncio.sleep"),
):
mock_ollama.return_value = MagicMock()
mock_agent = MagicMock()