forked from Rockachopa/Timmy-time-dashboard
fix: make confidence visible to users when below 0.7 threshold (#259)
Co-authored-by: rockachopa <alexpaynex@gmail.com> Co-committed-by: rockachopa <alexpaynex@gmail.com>
This commit is contained in:
@@ -110,6 +110,10 @@ async def chat(message: str, session_id: str | None = None) -> str:
|
|||||||
confidence = estimate_confidence(response_text)
|
confidence = estimate_confidence(response_text)
|
||||||
logger.debug("Response confidence: %.2f", confidence)
|
logger.debug("Response confidence: %.2f", confidence)
|
||||||
|
|
||||||
|
# Make confidence visible to user when below threshold (SOUL.md requirement)
|
||||||
|
if confidence is not None and confidence < 0.7:
|
||||||
|
response_text += f"\n\n[confidence: {confidence:.0%}]"
|
||||||
|
|
||||||
# Record Timmy response after getting it
|
# Record Timmy response after getting it
|
||||||
session_logger.record_message("timmy", response_text, confidence=confidence)
|
session_logger.record_message("timmy", response_text, confidence=confidence)
|
||||||
|
|
||||||
@@ -148,6 +152,13 @@ async def chat_with_tools(message: str, session_id: str | None = None):
|
|||||||
)
|
)
|
||||||
confidence = estimate_confidence(response_text) if response_text else None
|
confidence = estimate_confidence(response_text) if response_text else None
|
||||||
logger.debug("Response confidence: %.2f", confidence)
|
logger.debug("Response confidence: %.2f", confidence)
|
||||||
|
|
||||||
|
# Make confidence visible to user when below threshold (SOUL.md requirement)
|
||||||
|
if confidence is not None and confidence < 0.7:
|
||||||
|
response_text += f"\n\n[confidence: {confidence:.0%}]"
|
||||||
|
# Update the run_output content to reflect the modified response
|
||||||
|
run_output.content = response_text
|
||||||
|
|
||||||
session_logger.record_message("timmy", response_text, confidence=confidence)
|
session_logger.record_message("timmy", response_text, confidence=confidence)
|
||||||
session_logger.flush()
|
session_logger.flush()
|
||||||
return run_output
|
return run_output
|
||||||
@@ -187,6 +198,13 @@ async def continue_chat(run_output, session_id: str | None = None):
|
|||||||
response_text = result.content if hasattr(result, "content") and result.content else ""
|
response_text = result.content if hasattr(result, "content") and result.content else ""
|
||||||
confidence = estimate_confidence(response_text) if response_text else None
|
confidence = estimate_confidence(response_text) if response_text else None
|
||||||
logger.debug("Response confidence: %.2f", confidence)
|
logger.debug("Response confidence: %.2f", confidence)
|
||||||
|
|
||||||
|
# Make confidence visible to user when below threshold (SOUL.md requirement)
|
||||||
|
if confidence is not None and confidence < 0.7:
|
||||||
|
response_text += f"\n\n[confidence: {confidence:.0%}]"
|
||||||
|
# Update the result content to reflect the modified response
|
||||||
|
result.content = response_text
|
||||||
|
|
||||||
session_logger.record_message("timmy", response_text, confidence=confidence)
|
session_logger.record_message("timmy", response_text, confidence=confidence)
|
||||||
session_logger.flush()
|
session_logger.flush()
|
||||||
return result
|
return result
|
||||||
|
|||||||
@@ -297,3 +297,143 @@ async def test_continue_chat_passes_confidence_to_record_message():
|
|||||||
assert len(calls) >= 1 # should have timmy response
|
assert len(calls) >= 1 # should have timmy response
|
||||||
_, kwargs = calls[-1]
|
_, kwargs = calls[-1]
|
||||||
assert kwargs.get("confidence") == 0.91
|
assert kwargs.get("confidence") == 0.91
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Low confidence visibility (SOUL.md requirement)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_chat_shows_confidence_when_low():
    """Check that chat() tags a reply with its confidence when it is below 0.7.

    The agent, the confidence estimator (forced to 0.45) and the session
    logger are all replaced with mocks, so only the formatting of the
    returned text is exercised.
    """
    # Stub agent whose async run yields a hedged-sounding reply.
    reply = MagicMock(content="I think maybe this might work.")
    stub_agent = MagicMock()
    stub_agent.arun = AsyncMock(return_value=reply)

    agent_patch = patch("timmy.session._get_agent", return_value=stub_agent)
    confidence_patch = patch("timmy.session.estimate_confidence", return_value=0.45)
    logger_patch = patch("timmy.session.get_session_logger")

    with agent_patch, confidence_patch, logger_patch as logger_factory:
        logger_factory.return_value = MagicMock()

        from timmy.session import chat

        result = await chat("test message")

        # 0.45 is rendered as a whole-number percentage in the indicator.
        assert "[confidence: 45%]" in result
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_chat_hides_confidence_when_high():
    """Check that chat() returns the reply untouched when confidence is >= 0.7.

    The confidence estimator is forced to 0.85, so no indicator may be
    appended and the returned text must equal the agent's reply exactly.
    """
    expected_text = "This is definitely correct."
    stub_agent = MagicMock()
    stub_agent.arun = AsyncMock(return_value=MagicMock(content=expected_text))

    agent_patch = patch("timmy.session._get_agent", return_value=stub_agent)
    confidence_patch = patch("timmy.session.estimate_confidence", return_value=0.85)
    logger_patch = patch("timmy.session.get_session_logger")

    with agent_patch, confidence_patch, logger_patch as logger_factory:
        logger_factory.return_value = MagicMock()

        from timmy.session import chat

        result = await chat("test message")

        assert "[confidence:" not in result
        assert result == expected_text
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_chat_with_tools_shows_confidence_when_low():
    """Check that chat_with_tools() exposes low confidence on the returned run.

    With the estimator forced to 0.55, the indicator must appear in the
    ``content`` attribute of the object returned by chat_with_tools().
    """
    run_stub = MagicMock(content="I'm not sure about this.")
    stub_agent = MagicMock()
    stub_agent.arun = AsyncMock(return_value=run_stub)

    agent_patch = patch("timmy.session._get_agent", return_value=stub_agent)
    confidence_patch = patch("timmy.session.estimate_confidence", return_value=0.55)
    logger_patch = patch("timmy.session.get_session_logger")

    with agent_patch, confidence_patch, logger_patch as logger_factory:
        logger_factory.return_value = MagicMock()

        from timmy.session import chat_with_tools

        result = await chat_with_tools("test message")

        assert "[confidence: 55%]" in result.content
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_chat_with_tools_hides_confidence_when_high():
    """Check that chat_with_tools() leaves content untouched at high confidence.

    The estimator is forced to 0.92, so the returned object's ``content``
    must carry no indicator and must equal the agent's original text.
    """
    expected_text = "The answer is definitely 42."
    stub_agent = MagicMock()
    stub_agent.arun = AsyncMock(return_value=MagicMock(content=expected_text))

    agent_patch = patch("timmy.session._get_agent", return_value=stub_agent)
    confidence_patch = patch("timmy.session.estimate_confidence", return_value=0.92)
    logger_patch = patch("timmy.session.get_session_logger")

    with agent_patch, confidence_patch, logger_patch as logger_factory:
        logger_factory.return_value = MagicMock()

        from timmy.session import chat_with_tools

        result = await chat_with_tools("test message")

        assert "[confidence:" not in result.content
        assert result.content == expected_text
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_continue_chat_shows_confidence_when_low():
    """Check that continue_chat() exposes low confidence on the returned result.

    The agent's ``acontinue_run`` is mocked and the estimator is forced to
    0.35; the indicator must appear in the returned object's ``content``.
    """
    continued = MagicMock(content="Perhaps this is the right approach.")
    stub_agent = MagicMock()
    stub_agent.acontinue_run = AsyncMock(return_value=continued)

    # Opaque stand-in for the paused run that is being resumed.
    paused_run = MagicMock()

    agent_patch = patch("timmy.session._get_agent", return_value=stub_agent)
    confidence_patch = patch("timmy.session.estimate_confidence", return_value=0.35)
    logger_patch = patch("timmy.session.get_session_logger")

    with agent_patch, confidence_patch, logger_patch as logger_factory:
        logger_factory.return_value = MagicMock()

        from timmy.session import continue_chat

        result = await continue_chat(paused_run)

        assert "[confidence: 35%]" in result.content
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_continue_chat_hides_confidence_when_high():
    """Check that continue_chat() leaves content untouched at high confidence.

    The estimator is forced to 0.88, so the returned object's ``content``
    must carry no indicator and must equal the mocked continuation text.
    """
    expected_text = "This is absolutely the correct solution."
    stub_agent = MagicMock()
    stub_agent.acontinue_run = AsyncMock(return_value=MagicMock(content=expected_text))

    # Opaque stand-in for the paused run that is being resumed.
    paused_run = MagicMock()

    agent_patch = patch("timmy.session._get_agent", return_value=stub_agent)
    confidence_patch = patch("timmy.session.estimate_confidence", return_value=0.88)
    logger_patch = patch("timmy.session.get_session_logger")

    with agent_patch, confidence_patch, logger_patch as logger_factory:
        logger_factory.return_value = MagicMock()

        from timmy.session import continue_chat

        result = await continue_chat(paused_run)

        assert "[confidence:" not in result.content
        assert result.content == expected_text
|
||||||
|
|||||||
Reference in New Issue
Block a user