diff --git a/src/timmy/agent.py b/src/timmy/agent.py index eeb02ef..a1d4d5e 100644 --- a/src/timmy/agent.py +++ b/src/timmy/agent.py @@ -340,15 +340,47 @@ class TimmyWithMemory: self.initial_context = self.memory.get_system_context() def chat(self, message: str) -> str: - """Simple chat interface that tracks in memory.""" + """Simple chat interface that tracks in memory. + + Retries on transient Ollama errors (GPU contention, timeouts) + with exponential backoff (#70). + """ + import time + # Check for user facts to extract self._extract_and_store_facts(message) - # Run agent - result = self.agent.run(message, stream=False) - response_text = result.content if hasattr(result, "content") else str(result) - - return response_text + # Retry with backoff — GPU contention causes ReadError/ReadTimeout + max_retries = 3 + for attempt in range(1, max_retries + 1): + try: + result = self.agent.run(message, stream=False) + return result.content if hasattr(result, "content") else str(result) + except ( + httpx.ConnectError, + httpx.ReadError, + httpx.ReadTimeout, + httpx.ConnectTimeout, + ConnectionError, + TimeoutError, + ) as exc: + if attempt < max_retries: + wait = min(2**attempt, 16) + logger.warning( + "Ollama contention on attempt %d/%d: %s. Waiting %ds before retry...", + attempt, + max_retries, + type(exc).__name__, + wait, + ) + time.sleep(wait) + else: + logger.error( + "Ollama unreachable after %d attempts: %s", + max_retries, + exc, + ) + raise def _extract_and_store_facts(self, message: str) -> None: """Extract user facts from message and store in memory.""" diff --git a/src/timmy/agents/base.py b/src/timmy/agents/base.py index 0ea9ea8..132b2b1 100644 --- a/src/timmy/agents/base.py +++ b/src/timmy/agents/base.py @@ -122,20 +122,53 @@ class BaseAgent(ABC): async def run(self, message: str) -> str: """Run the agent with a message. + Retries on transient failures (connection errors, timeouts) with + exponential backoff. GPU contention from concurrent Ollama + requests causes ReadError / ReadTimeout — these are transient + and should be retried, not raised immediately (#70). + Returns: Agent response """ max_retries = 3 last_exception = None + # Transient errors that indicate Ollama contention or temporary + # unavailability — these deserve a retry with backoff. + _transient = ( + httpx.ConnectError, + httpx.ReadError, + httpx.ReadTimeout, + httpx.ConnectTimeout, + ConnectionError, + TimeoutError, + ) for attempt in range(1, max_retries + 1): try: result = self.agent.run(message, stream=False) response = result.content if hasattr(result, "content") else str(result) break # Success, exit the retry loop - except (httpx.ConnectError, httpx.ReadError, ConnectionError) as exc: - logger.error("Ollama disconnected: %s", exc) - raise + except _transient as exc: + last_exception = exc + if attempt < max_retries: + # Contention backoff — longer waits because the GPU + # needs time to finish the other request. + wait = min(2**attempt, 16) + logger.warning( + "Ollama contention on attempt %d/%d: %s. Waiting %ds before retry...", + attempt, + max_retries, + type(exc).__name__, + wait, + ) + await asyncio.sleep(wait) + else: + logger.error( + "Ollama unreachable after %d attempts: %s", + max_retries, + exc, + ) + raise last_exception from exc except Exception as exc: last_exception = exc if attempt < max_retries: diff --git a/src/timmy/gematria.py b/src/timmy/gematria.py index cda4c8d..2ab6096 100644 --- a/src/timmy/gematria.py +++ b/src/timmy/gematria.py @@ -41,10 +41,32 @@ _SUMERIAN: dict[str, int] = {c: v * 6 for c, v in _SIMPLE.items()} # Aleph=1..Tet=9, Yod=10..Tsade=90, Qoph=100..Tav=400 # Standard mapping for the 22 Hebrew letters extended to 26 Latin chars _HEBREW: dict[str, int] = { - "A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "H": 8, "I": 9, - "J": 10, "K": 20, "L": 30, "M": 40, "N": 50, "O": 60, "P": 70, "Q": 80, - "R": 90, "S": 100, "T": 200, "U": 300, "V": 400, "W": 500, "X": 600, - "Y": 700, "Z": 800, + "A": 1, + "B": 2, + "C": 3, + "D": 4, + "E": 5, + "F": 6, + "G": 7, + "H": 8, + "I": 9, + "J": 10, + "K": 20, + "L": 30, + "M": 40, + "N": 50, + "O": 60, + "P": 70, + "Q": 80, + "R": 90, + "S": 100, + "T": 200, + "U": 300, + "V": 400, + "W": 500, + "X": 600, + "Y": 700, + "Z": 800, } CIPHERS: dict[str, dict[str, int]] = { @@ -317,9 +339,9 @@ def _format_number_analysis(n: int) -> str: lines.append(f" Digit sum: {info['digit_sum']}") if info.get("master_number"): - lines.append(f" ★ Master Number") + lines.append(" ★ Master Number") if info.get("angel_number"): - lines.append(f" ★ Angel Number (repeating digits)") + lines.append(" ★ Angel Number (repeating digits)") if info.get("significance"): lines.append(f"\n Significance: {info['significance']}") diff --git a/tests/timmy/test_agent_retry.py b/tests/timmy/test_agent_retry.py index dfa5d1e..12a7db1 100644 --- a/tests/timmy/test_agent_retry.py +++ b/tests/timmy/test_agent_retry.py @@ -74,31 +74,61 @@ async def test_run_exhausts_retries(sub_agent): @pytest.mark.asyncio -async def test_run_no_retry_on_connection_error(sub_agent): - """Mock agent.run to raise httpx.ConnectError. Verify it raises immediately without retry.""" - # Arrange: raise ConnectError +async def test_run_retries_on_connection_error(sub_agent): + """ConnectError is transient (GPU contention) — retries with backoff (#70).""" sub_agent.agent.run.side_effect = httpx.ConnectError("Connection refused") - # Act & Assert - with pytest.raises(httpx.ConnectError, match="Connection refused"): - await sub_agent.run("test message") + with patch("timmy.agents.base.asyncio.sleep") as mock_sleep: + with pytest.raises(httpx.ConnectError, match="Connection refused"): + await sub_agent.run("test message") - # Should have been called only once (no retry) - assert sub_agent.agent.run.call_count == 1 + # Should have retried all 3 attempts + assert sub_agent.agent.run.call_count == 3 + # Contention backoff: 2**1=2, 2**2=4 + assert mock_sleep.call_count == 2 + mock_sleep.assert_any_call(2) + mock_sleep.assert_any_call(4) @pytest.mark.asyncio -async def test_run_no_retry_on_read_error(sub_agent): - """Mock agent.run to raise httpx.ReadError. Verify it raises immediately without retry.""" - # Arrange: raise ReadError +async def test_run_retries_on_read_error(sub_agent): + """ReadError is transient (GPU contention) — retries with backoff (#70).""" sub_agent.agent.run.side_effect = httpx.ReadError("Server disconnected") - # Act & Assert - with pytest.raises(httpx.ReadError, match="Server disconnected"): - await sub_agent.run("test message") + with patch("timmy.agents.base.asyncio.sleep") as mock_sleep: + with pytest.raises(httpx.ReadError, match="Server disconnected"): + await sub_agent.run("test message") - # Should have been called only once (no retry) - assert sub_agent.agent.run.call_count == 1 + # Should have retried all 3 attempts + assert sub_agent.agent.run.call_count == 3 + # Contention backoff: 2**1=2, 2**2=4 + assert mock_sleep.call_count == 2 + mock_sleep.assert_any_call(2) + mock_sleep.assert_any_call(4) + + +@pytest.mark.asyncio +async def test_run_recovers_from_contention(sub_agent): + """Simulate GPU contention: ReadError then success on retry (#70).""" + call_count = 0 + + def mock_run(*args, **kwargs): + nonlocal call_count + call_count += 1 + if call_count == 1: + raise httpx.ReadError("Server disconnected") + result = MagicMock() + result.content = "Recovered after contention" + return result + + sub_agent.agent.run = mock_run + + with patch("timmy.agents.base.asyncio.sleep") as mock_sleep: + result = await sub_agent.run("test message") + + assert result == "Recovered after contention" + assert call_count == 2 + mock_sleep.assert_called_once_with(2) # 2**1 contention backoff @pytest.mark.asyncio diff --git a/tests/timmy/test_gematria.py b/tests/timmy/test_gematria.py index a20fe79..c30de46 100644 --- a/tests/timmy/test_gematria.py +++ b/tests/timmy/test_gematria.py @@ -20,7 +20,6 @@ from timmy.gematria import ( reduce_number, ) - # ── Core cipher computation ────────────────────────────────────────────────── diff --git a/tests/timmy/test_ollama_disconnect.py b/tests/timmy/test_ollama_disconnect.py index c0ccd14..0ff3bdd 100644 --- a/tests/timmy/test_ollama_disconnect.py +++ b/tests/timmy/test_ollama_disconnect.py @@ -1,8 +1,8 @@ """Test Ollama disconnection handling. Verifies that: -1. BaseAgent.run() logs 'Ollama disconnected' when agent.run() raises connection errors -2. BaseAgent.run() re-raises the error (not silently swallowed) +1. BaseAgent.run() retries on transient errors (contention/disconnect) with backoff (#70) +2. BaseAgent.run() re-raises the error after retries are exhausted 3. session.chat() returns disconnect-specific message on connection errors 4. session.chat_with_tools() returns _ErrorRunOutput with disconnect message on connection errors """ @@ -18,14 +18,15 @@ import pytest class TestBaseAgentDisconnect: """Test BaseAgent.run() disconnection handling.""" - def test_base_agent_logs_on_connect_error(self, caplog): - """BaseAgent.run() logs 'Ollama disconnected' on httpx.ConnectError.""" - caplog.set_level(logging.ERROR) + def test_base_agent_retries_and_logs_on_connect_error(self, caplog): + """BaseAgent.run() retries on ConnectError with backoff, then logs 'Ollama unreachable' (#70).""" + caplog.set_level(logging.WARNING) importlib.import_module("timmy.agents.base") with ( patch("timmy.agents.base.Ollama") as mock_ollama, patch("timmy.agents.base.Agent") as mock_agent_class, + patch("timmy.agents.base.asyncio.sleep"), ): mock_ollama.return_value = MagicMock() mock_agent = MagicMock() @@ -51,18 +52,24 @@ class TestBaseAgentDisconnect: asyncio.run(agent.run("test message")) - assert any("Ollama disconnected" in record.message for record in caplog.records), ( - f"Expected 'Ollama disconnected' in logs, got: {[r.message for r in caplog.records]}" + # Should have retried 3 times total + assert mock_agent.run.call_count == 3 + assert any("Ollama contention" in record.message for record in caplog.records), ( + f"Expected 'Ollama contention' in logs, got: {[r.message for r in caplog.records]}" + ) + assert any("Ollama unreachable" in record.message for record in caplog.records), ( + f"Expected 'Ollama unreachable' in logs, got: {[r.message for r in caplog.records]}" ) - def test_base_agent_logs_on_read_error(self, caplog): - """BaseAgent.run() logs 'Ollama disconnected' on httpx.ReadError.""" - caplog.set_level(logging.ERROR) + def test_base_agent_retries_and_logs_on_read_error(self, caplog): + """BaseAgent.run() retries on ReadError with backoff, then logs 'Ollama unreachable' (#70).""" + caplog.set_level(logging.WARNING) importlib.import_module("timmy.agents.base") with ( patch("timmy.agents.base.Ollama") as mock_ollama, patch("timmy.agents.base.Agent") as mock_agent_class, + patch("timmy.agents.base.asyncio.sleep"), ): mock_ollama.return_value = MagicMock() mock_agent = MagicMock() @@ -88,18 +95,20 @@ class TestBaseAgentDisconnect: asyncio.run(agent.run("test message")) - assert any("Ollama disconnected" in record.message for record in caplog.records), ( - f"Expected 'Ollama disconnected' in logs, got: {[r.message for r in caplog.records]}" + assert mock_agent.run.call_count == 3 + assert any("Ollama contention" in record.message for record in caplog.records), ( + f"Expected 'Ollama contention' in logs, got: {[r.message for r in caplog.records]}" ) - def test_base_agent_logs_on_connection_error(self, caplog): - """BaseAgent.run() logs 'Ollama disconnected' on ConnectionError.""" - caplog.set_level(logging.ERROR) + def test_base_agent_retries_and_logs_on_connection_error(self, caplog): + """BaseAgent.run() retries on ConnectionError with backoff (#70).""" + caplog.set_level(logging.WARNING) importlib.import_module("timmy.agents.base") with ( patch("timmy.agents.base.Ollama") as mock_ollama, patch("timmy.agents.base.Agent") as mock_agent_class, + patch("timmy.agents.base.asyncio.sleep"), ): mock_ollama.return_value = MagicMock() mock_agent = MagicMock() @@ -125,17 +134,19 @@ class TestBaseAgentDisconnect: asyncio.run(agent.run("test message")) - assert any("Ollama disconnected" in record.message for record in caplog.records), ( - f"Expected 'Ollama disconnected' in logs, got: {[r.message for r in caplog.records]}" + assert mock_agent.run.call_count == 3 + assert any("Ollama unreachable" in record.message for record in caplog.records), ( + f"Expected 'Ollama unreachable' in logs, got: {[r.message for r in caplog.records]}" ) - def test_base_agent_re_raises_connection_error(self): - """BaseAgent.run() re-raises the connection error (not silently swallowed).""" + def test_base_agent_re_raises_connection_error_after_retries(self): + """BaseAgent.run() re-raises the connection error after exhausting retries (#70).""" importlib.import_module("timmy.agents.base") with ( patch("timmy.agents.base.Ollama") as mock_ollama, patch("timmy.agents.base.Agent") as mock_agent_class, + patch("timmy.agents.base.asyncio.sleep"), ): mock_ollama.return_value = MagicMock() mock_agent = MagicMock()