From 96e7961a0e2a96e2e7d05a3d1dbfd01f8aaf53c5 Mon Sep 17 00:00:00 2001 From: rockachopa Date: Sun, 15 Mar 2026 19:36:52 -0400 Subject: [PATCH] fix: make confidence visible to users when below 0.7 threshold (#259) Co-authored-by: rockachopa Co-committed-by: rockachopa --- src/timmy/session.py | 18 +++++ tests/timmy/test_session.py | 140 ++++++++++++++++++++++++++++++++++++ 2 files changed, 158 insertions(+) diff --git a/src/timmy/session.py b/src/timmy/session.py index ed5d93d7..e5d2c744 100644 --- a/src/timmy/session.py +++ b/src/timmy/session.py @@ -110,6 +110,10 @@ async def chat(message: str, session_id: str | None = None) -> str: confidence = estimate_confidence(response_text) logger.debug("Response confidence: %.2f", confidence) + # Make confidence visible to user when below threshold (SOUL.md requirement) + if confidence is not None and confidence < 0.7: + response_text += f"\n\n[confidence: {confidence:.0%}]" + # Record Timmy response after getting it session_logger.record_message("timmy", response_text, confidence=confidence) @@ -148,6 +152,13 @@ async def chat_with_tools(message: str, session_id: str | None = None): ) confidence = estimate_confidence(response_text) if response_text else None logger.debug("Response confidence: %.2f", confidence) + + # Make confidence visible to user when below threshold (SOUL.md requirement) + if confidence is not None and confidence < 0.7: + response_text += f"\n\n[confidence: {confidence:.0%}]" + # Update the run_output content to reflect the modified response + run_output.content = response_text + session_logger.record_message("timmy", response_text, confidence=confidence) session_logger.flush() return run_output @@ -187,6 +198,13 @@ async def continue_chat(run_output, session_id: str | None = None): response_text = result.content if hasattr(result, "content") and result.content else "" confidence = estimate_confidence(response_text) if response_text else None logger.debug("Response confidence: %.2f", confidence) + + # Make confidence visible to user when below threshold (SOUL.md requirement) + if confidence is not None and confidence < 0.7: + response_text += f"\n\n[confidence: {confidence:.0%}]" + # Update the result content to reflect the modified response + result.content = response_text + session_logger.record_message("timmy", response_text, confidence=confidence) session_logger.flush() return result diff --git a/tests/timmy/test_session.py b/tests/timmy/test_session.py index 0b9f4cf2..a20a6240 100644 --- a/tests/timmy/test_session.py +++ b/tests/timmy/test_session.py @@ -297,3 +297,143 @@ async def test_continue_chat_passes_confidence_to_record_message(): assert len(calls) >= 1 # should have timmy response _, kwargs = calls[-1] assert kwargs.get("confidence") == 0.91 + + +# --------------------------------------------------------------------------- +# Low confidence visibility (SOUL.md requirement) +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_chat_shows_confidence_when_low(): + """chat() should append confidence indicator when confidence < 0.7.""" + mock_agent = MagicMock() + mock_agent.arun = AsyncMock(return_value=MagicMock(content="I think maybe this might work.")) + + with ( + patch("timmy.session._get_agent", return_value=mock_agent), + patch("timmy.session.estimate_confidence", return_value=0.45), + patch("timmy.session.get_session_logger") as mock_get_logger, + ): + mock_logger = MagicMock() + mock_get_logger.return_value = mock_logger + + from timmy.session import chat + + result = await chat("test message") + + assert "[confidence: 45%]" in result + + +@pytest.mark.asyncio +async def test_chat_hides_confidence_when_high(): + """chat() should NOT append confidence indicator when confidence >= 0.7.""" + mock_agent = MagicMock() + mock_agent.arun = AsyncMock(return_value=MagicMock(content="This is definitely correct.")) + + with ( + patch("timmy.session._get_agent", return_value=mock_agent), + patch("timmy.session.estimate_confidence", return_value=0.85), + patch("timmy.session.get_session_logger") as mock_get_logger, + ): + mock_logger = MagicMock() + mock_get_logger.return_value = mock_logger + + from timmy.session import chat + + result = await chat("test message") + + assert "[confidence:" not in result + assert result == "This is definitely correct." + + +@pytest.mark.asyncio +async def test_chat_with_tools_shows_confidence_when_low(): + """chat_with_tools() should include confidence indicator in response when low.""" + mock_agent = MagicMock() + mock_agent.arun = AsyncMock(return_value=MagicMock(content="I'm not sure about this.")) + + with ( + patch("timmy.session._get_agent", return_value=mock_agent), + patch("timmy.session.estimate_confidence", return_value=0.55), + patch("timmy.session.get_session_logger") as mock_get_logger, + ): + mock_logger = MagicMock() + mock_get_logger.return_value = mock_logger + + from timmy.session import chat_with_tools + + result = await chat_with_tools("test message") + + assert "[confidence: 55%]" in result.content + + +@pytest.mark.asyncio +async def test_chat_with_tools_hides_confidence_when_high(): + """chat_with_tools() should NOT include confidence indicator when high.""" + mock_agent = MagicMock() + mock_agent.arun = AsyncMock(return_value=MagicMock(content="The answer is definitely 42.")) + + with ( + patch("timmy.session._get_agent", return_value=mock_agent), + patch("timmy.session.estimate_confidence", return_value=0.92), + patch("timmy.session.get_session_logger") as mock_get_logger, + ): + mock_logger = MagicMock() + mock_get_logger.return_value = mock_logger + + from timmy.session import chat_with_tools + + result = await chat_with_tools("test message") + + assert "[confidence:" not in result.content + assert result.content == "The answer is definitely 42." + + +@pytest.mark.asyncio +async def test_continue_chat_shows_confidence_when_low(): + """continue_chat() should include confidence indicator when low.""" + mock_agent = MagicMock() + mock_result = MagicMock(content="Perhaps this is the right approach.") + mock_agent.acontinue_run = AsyncMock(return_value=mock_result) + + mock_run_output = MagicMock() + + with ( + patch("timmy.session._get_agent", return_value=mock_agent), + patch("timmy.session.estimate_confidence", return_value=0.35), + patch("timmy.session.get_session_logger") as mock_get_logger, + ): + mock_logger = MagicMock() + mock_get_logger.return_value = mock_logger + + from timmy.session import continue_chat + + result = await continue_chat(mock_run_output) + + assert "[confidence: 35%]" in result.content + + +@pytest.mark.asyncio +async def test_continue_chat_hides_confidence_when_high(): + """continue_chat() should NOT include confidence indicator when high.""" + mock_agent = MagicMock() + mock_result = MagicMock(content="This is absolutely the correct solution.") + mock_agent.acontinue_run = AsyncMock(return_value=mock_result) + + mock_run_output = MagicMock() + + with ( + patch("timmy.session._get_agent", return_value=mock_agent), + patch("timmy.session.estimate_confidence", return_value=0.88), + patch("timmy.session.get_session_logger") as mock_get_logger, + ): + mock_logger = MagicMock() + mock_get_logger.return_value = mock_logger + + from timmy.session import continue_chat + + result = await continue_chat(mock_run_output) + + assert "[confidence:" not in result.content + assert result.content == "This is absolutely the correct solution."