[loop-cycle-60] fix: retry with backoff on Ollama GPU contention (#70) (#238)

This commit is contained in:
2026-03-15 14:28:47 -04:00
parent 7f656fcf22
commit f9911c002c
6 changed files with 178 additions and 51 deletions

View File

@@ -340,15 +340,47 @@ class TimmyWithMemory:
self.initial_context = self.memory.get_system_context()
def chat(self, message: str) -> str:
    """Send ``message`` to the agent and return its reply as text.

    User facts are extracted from the message once, before the first
    attempt. The agent call is then retried on transient Ollama errors
    (GPU contention, timeouts) with exponential backoff (#70).

    Args:
        message: The user's message.

    Returns:
        ``result.content`` when the agent result has that attribute,
        otherwise ``str(result)``.

    Raises:
        httpx.ConnectError, httpx.ReadError, httpx.ReadTimeout,
        httpx.ConnectTimeout, ConnectionError, TimeoutError: re-raised
        unchanged when the final attempt also fails.
    """
    import time

    # Check for user facts to extract
    self._extract_and_store_facts(message)

    # Retry with backoff — GPU contention causes ReadError/ReadTimeout
    max_retries = 3
    for attempt in range(1, max_retries + 1):
        try:
            result = self.agent.run(message, stream=False)
        except (
            httpx.ConnectError,
            httpx.ReadError,
            httpx.ReadTimeout,
            httpx.ConnectTimeout,
            ConnectionError,
            TimeoutError,
        ) as exc:
            if attempt == max_retries:
                logger.error(
                    "Ollama unreachable after %d attempts: %s",
                    max_retries,
                    exc,
                )
                # Bare raise keeps the original exception and traceback.
                raise
            # Waits are 2s then 4s; the 16s cap only matters if
            # max_retries is ever raised.
            wait = min(2**attempt, 16)
            logger.warning(
                "Ollama contention on attempt %d/%d: %s. Waiting %ds before retry...",
                attempt,
                max_retries,
                type(exc).__name__,
                wait,
            )
            time.sleep(wait)
        else:
            return result.content if hasattr(result, "content") else str(result)
def _extract_and_store_facts(self, message: str) -> None:
"""Extract user facts from message and store in memory."""

View File

@@ -122,20 +122,53 @@ class BaseAgent(ABC):
async def run(self, message: str) -> str:
"""Run the agent with a message.
Retries on transient failures (connection errors, timeouts) with
exponential backoff. GPU contention from concurrent Ollama
requests causes ReadError / ReadTimeout — these are transient
and should be retried, not raised immediately (#70).
Returns:
Agent response
"""
max_retries = 3
last_exception = None
# Transient errors that indicate Ollama contention or temporary
# unavailability — these deserve a retry with backoff.
_transient = (
httpx.ConnectError,
httpx.ReadError,
httpx.ReadTimeout,
httpx.ConnectTimeout,
ConnectionError,
TimeoutError,
)
for attempt in range(1, max_retries + 1):
try:
result = self.agent.run(message, stream=False)
response = result.content if hasattr(result, "content") else str(result)
break # Success, exit the retry loop
except (httpx.ConnectError, httpx.ReadError, ConnectionError) as exc:
logger.error("Ollama disconnected: %s", exc)
raise
except _transient as exc:
last_exception = exc
if attempt < max_retries:
# Contention backoff — longer waits because the GPU
# needs time to finish the other request.
wait = min(2**attempt, 16)
logger.warning(
"Ollama contention on attempt %d/%d: %s. Waiting %ds before retry...",
attempt,
max_retries,
type(exc).__name__,
wait,
)
await asyncio.sleep(wait)
else:
logger.error(
"Ollama unreachable after %d attempts: %s",
max_retries,
exc,
)
raise last_exception from exc
except Exception as exc:
last_exception = exc
if attempt < max_retries:

View File

@@ -41,10 +41,32 @@ _SUMERIAN: dict[str, int] = {c: v * 6 for c, v in _SIMPLE.items()}
# Aleph=1..Tet=9, Yod=10..Tsade=90, Qoph=100..Tav=400
# Standard mapping for the 22 Hebrew letters extended to 26 Latin chars
_HEBREW: dict[str, int] = {
"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "H": 8, "I": 9,
"J": 10, "K": 20, "L": 30, "M": 40, "N": 50, "O": 60, "P": 70, "Q": 80,
"R": 90, "S": 100, "T": 200, "U": 300, "V": 400, "W": 500, "X": 600,
"Y": 700, "Z": 800,
"A": 1,
"B": 2,
"C": 3,
"D": 4,
"E": 5,
"F": 6,
"G": 7,
"H": 8,
"I": 9,
"J": 10,
"K": 20,
"L": 30,
"M": 40,
"N": 50,
"O": 60,
"P": 70,
"Q": 80,
"R": 90,
"S": 100,
"T": 200,
"U": 300,
"V": 400,
"W": 500,
"X": 600,
"Y": 700,
"Z": 800,
}
CIPHERS: dict[str, dict[str, int]] = {
@@ -317,9 +339,9 @@ def _format_number_analysis(n: int) -> str:
lines.append(f" Digit sum: {info['digit_sum']}")
if info.get("master_number"):
lines.append(f" ★ Master Number")
lines.append(" ★ Master Number")
if info.get("angel_number"):
lines.append(f" ★ Angel Number (repeating digits)")
lines.append(" ★ Angel Number (repeating digits)")
if info.get("significance"):
lines.append(f"\n Significance: {info['significance']}")

View File

@@ -74,31 +74,61 @@ async def test_run_exhausts_retries(sub_agent):
@pytest.mark.asyncio
async def test_run_retries_on_connection_error(sub_agent):
    """ConnectError is transient (GPU contention) — retries with backoff (#70)."""
    # Arrange: every attempt fails with ConnectError
    sub_agent.agent.run.side_effect = httpx.ConnectError("Connection refused")

    # Act & Assert: sleep is patched so the backoff does not slow the test
    with patch("timmy.agents.base.asyncio.sleep") as mock_sleep:
        with pytest.raises(httpx.ConnectError, match="Connection refused"):
            await sub_agent.run("test message")

    # Should have retried all 3 attempts
    assert sub_agent.agent.run.call_count == 3
    # Contention backoff: 2**1=2, 2**2=4
    assert mock_sleep.call_count == 2
    mock_sleep.assert_any_call(2)
    mock_sleep.assert_any_call(4)
@pytest.mark.asyncio
async def test_run_retries_on_read_error(sub_agent):
    """ReadError is transient (GPU contention) — retries with backoff (#70)."""
    # Arrange: every attempt fails with ReadError
    sub_agent.agent.run.side_effect = httpx.ReadError("Server disconnected")

    # Act & Assert: sleep is patched so the backoff does not slow the test
    with patch("timmy.agents.base.asyncio.sleep") as mock_sleep:
        with pytest.raises(httpx.ReadError, match="Server disconnected"):
            await sub_agent.run("test message")

    # Should have retried all 3 attempts
    assert sub_agent.agent.run.call_count == 3
    # Contention backoff: 2**1=2, 2**2=4
    assert mock_sleep.call_count == 2
    mock_sleep.assert_any_call(2)
    mock_sleep.assert_any_call(4)
@pytest.mark.asyncio
async def test_run_recovers_from_contention(sub_agent):
    """A single ReadError (GPU contention) is followed by a successful retry (#70)."""
    recovered = MagicMock()
    recovered.content = "Recovered after contention"

    # First call raises, second call returns the recovered result.
    flaky_run = MagicMock(
        side_effect=[httpx.ReadError("Server disconnected"), recovered]
    )
    sub_agent.agent.run = flaky_run

    with patch("timmy.agents.base.asyncio.sleep") as mock_sleep:
        reply = await sub_agent.run("test message")

    assert reply == "Recovered after contention"
    assert flaky_run.call_count == 2
    mock_sleep.assert_called_once_with(2)  # 2**1 contention backoff
@pytest.mark.asyncio

View File

@@ -20,7 +20,6 @@ from timmy.gematria import (
reduce_number,
)
# ── Core cipher computation ──────────────────────────────────────────────────

View File

@@ -1,8 +1,8 @@
"""Test Ollama disconnection handling.
Verifies that:
1. BaseAgent.run() logs 'Ollama disconnected' when agent.run() raises connection errors
2. BaseAgent.run() re-raises the error (not silently swallowed)
1. BaseAgent.run() retries on transient errors (contention/disconnect) with backoff (#70)
2. BaseAgent.run() re-raises the error after retries are exhausted
3. session.chat() returns disconnect-specific message on connection errors
4. session.chat_with_tools() returns _ErrorRunOutput with disconnect message on connection errors
"""
@@ -18,14 +18,15 @@ import pytest
class TestBaseAgentDisconnect:
"""Test BaseAgent.run() disconnection handling."""
def test_base_agent_logs_on_connect_error(self, caplog):
"""BaseAgent.run() logs 'Ollama disconnected' on httpx.ConnectError."""
caplog.set_level(logging.ERROR)
def test_base_agent_retries_and_logs_on_connect_error(self, caplog):
"""BaseAgent.run() retries on ConnectError with backoff, then logs 'Ollama unreachable' (#70)."""
caplog.set_level(logging.WARNING)
importlib.import_module("timmy.agents.base")
with (
patch("timmy.agents.base.Ollama") as mock_ollama,
patch("timmy.agents.base.Agent") as mock_agent_class,
patch("timmy.agents.base.asyncio.sleep"),
):
mock_ollama.return_value = MagicMock()
mock_agent = MagicMock()
@@ -51,18 +52,24 @@ class TestBaseAgentDisconnect:
asyncio.run(agent.run("test message"))
assert any("Ollama disconnected" in record.message for record in caplog.records), (
f"Expected 'Ollama disconnected' in logs, got: {[r.message for r in caplog.records]}"
# Should have retried 3 times total
assert mock_agent.run.call_count == 3
assert any("Ollama contention" in record.message for record in caplog.records), (
f"Expected 'Ollama contention' in logs, got: {[r.message for r in caplog.records]}"
)
assert any("Ollama unreachable" in record.message for record in caplog.records), (
f"Expected 'Ollama unreachable' in logs, got: {[r.message for r in caplog.records]}"
)
def test_base_agent_logs_on_read_error(self, caplog):
"""BaseAgent.run() logs 'Ollama disconnected' on httpx.ReadError."""
caplog.set_level(logging.ERROR)
def test_base_agent_retries_and_logs_on_read_error(self, caplog):
"""BaseAgent.run() retries on ReadError with backoff, then logs 'Ollama unreachable' (#70)."""
caplog.set_level(logging.WARNING)
importlib.import_module("timmy.agents.base")
with (
patch("timmy.agents.base.Ollama") as mock_ollama,
patch("timmy.agents.base.Agent") as mock_agent_class,
patch("timmy.agents.base.asyncio.sleep"),
):
mock_ollama.return_value = MagicMock()
mock_agent = MagicMock()
@@ -88,18 +95,20 @@ class TestBaseAgentDisconnect:
asyncio.run(agent.run("test message"))
assert any("Ollama disconnected" in record.message for record in caplog.records), (
f"Expected 'Ollama disconnected' in logs, got: {[r.message for r in caplog.records]}"
assert mock_agent.run.call_count == 3
assert any("Ollama contention" in record.message for record in caplog.records), (
f"Expected 'Ollama contention' in logs, got: {[r.message for r in caplog.records]}"
)
def test_base_agent_logs_on_connection_error(self, caplog):
"""BaseAgent.run() logs 'Ollama disconnected' on ConnectionError."""
caplog.set_level(logging.ERROR)
def test_base_agent_retries_and_logs_on_connection_error(self, caplog):
"""BaseAgent.run() retries on ConnectionError with backoff (#70)."""
caplog.set_level(logging.WARNING)
importlib.import_module("timmy.agents.base")
with (
patch("timmy.agents.base.Ollama") as mock_ollama,
patch("timmy.agents.base.Agent") as mock_agent_class,
patch("timmy.agents.base.asyncio.sleep"),
):
mock_ollama.return_value = MagicMock()
mock_agent = MagicMock()
@@ -125,17 +134,19 @@ class TestBaseAgentDisconnect:
asyncio.run(agent.run("test message"))
assert any("Ollama disconnected" in record.message for record in caplog.records), (
f"Expected 'Ollama disconnected' in logs, got: {[r.message for r in caplog.records]}"
assert mock_agent.run.call_count == 3
assert any("Ollama unreachable" in record.message for record in caplog.records), (
f"Expected 'Ollama unreachable' in logs, got: {[r.message for r in caplog.records]}"
)
def test_base_agent_re_raises_connection_error(self):
"""BaseAgent.run() re-raises the connection error (not silently swallowed)."""
def test_base_agent_re_raises_connection_error_after_retries(self):
"""BaseAgent.run() re-raises the connection error after exhausting retries (#70)."""
importlib.import_module("timmy.agents.base")
with (
patch("timmy.agents.base.Ollama") as mock_ollama,
patch("timmy.agents.base.Agent") as mock_agent_class,
patch("timmy.agents.base.asyncio.sleep"),
):
mock_ollama.return_value = MagicMock()
mock_agent = MagicMock()