"""Tests for confidence estimation in src/timmy/confidence.py."""
|
|
|
|
from timmy.confidence import (
|
|
CERTAINTY_WORDS,
|
|
HEDGING_WORDS,
|
|
estimate_confidence,
|
|
)
|
|
|
|
|
|
class TestEstimateConfidence:
    """Behavioral checks for ``estimate_confidence`` and its phrase lists."""

    def test_empty_string_returns_zero(self):
        """An empty response scores exactly 0.0."""
        score = estimate_confidence("")
        assert score == 0.0

    def test_whitespace_only_returns_zero(self):
        """A response made only of whitespace scores exactly 0.0."""
        score = estimate_confidence(" ")
        assert score == 0.0

    def test_normal_factual_response(self):
        """A plain factual statement scores between 0.5 and 1.0 inclusive."""
        score = estimate_confidence("Paris is the capital of France.")
        assert 0.5 <= score <= 1.0
        # Six words is too long for the short-response boost, so the score
        # is expected to sit at the base level.
        assert score >= 0.5

    def test_i_dont_know_gives_very_low_confidence(self):
        """Saying "I don't know" outright scores at most 0.2."""
        assert estimate_confidence("I don't know the answer to that.") <= 0.2

    def test_i_am_not_sure_gives_very_low_confidence(self):
        """Saying "I am not sure" scores at most 0.2."""
        assert estimate_confidence("I am not sure about this.") <= 0.2

    def test_hedging_words_reduce_confidence(self):
        """Prefixing a statement with "I think" lowers its score."""
        plain = estimate_confidence("This is the answer.")
        with_hedge = estimate_confidence("I think this is the answer.")
        assert plain > with_hedge

    def test_maybe_reduces_confidence(self):
        """Prefixing a statement with "Maybe" lowers its score."""
        plain = estimate_confidence("It will rain tomorrow.")
        with_hedge = estimate_confidence("Maybe it will rain tomorrow.")
        assert plain > with_hedge

    def test_perhaps_reduces_confidence(self):
        """Prefixing a statement with "Perhaps" lowers its score."""
        plain = estimate_confidence("The solution is correct.")
        with_hedge = estimate_confidence("Perhaps the solution is correct.")
        assert plain > with_hedge

    def test_certainty_words_increase_confidence(self):
        """Adding certainty phrases to a sentence raises its score."""
        # Both sentences are kept long so the short-response boost does not
        # confound the comparison.
        plain_text = "This is a longer sentence with more words."
        certain_text = "I definitely know this is a longer sentence with more words."
        plain = estimate_confidence(plain_text)
        with_certainty = estimate_confidence(certain_text)
        assert plain < with_certainty

    def test_definitely_increases_confidence(self):
        """Inserting "definitely" into a statement raises its score."""
        plain = estimate_confidence("This will work.")
        with_certainty = estimate_confidence("This will definitely work.")
        assert plain < with_certainty

    def test_question_reduces_confidence(self):
        """Ending a statement with a question mark lowers its score."""
        stated = estimate_confidence("The value is 10.")
        asked = estimate_confidence("The value is 10?")
        assert stated > asked

    def test_multiple_hedging_words_compound(self):
        """A sentence stacked with several hedges scores below 0.4."""
        stacked_hedges = "I think maybe this could be the answer, but I'm not sure."
        assert estimate_confidence(stacked_hedges) < 0.4

    def test_output_always_in_valid_range(self):
        """Scores stay inside [0.0, 1.0] even for extreme inputs."""
        extreme_texts = (
            # Saturated with hedging phrases.
            "I think maybe perhaps possibly I believe this might could be approximately right.",
            # Saturated with certainty phrases.
            "I know definitely certainly absolutely without doubt the answer is specifically exactly correct.",
        )
        for text in extreme_texts:
            score = estimate_confidence(text)
            assert 0.0 <= score <= 1.0

    def test_hedging_words_list_populated(self):
        """HEDGING_WORDS contains the core hedging phrases."""
        required_phrases = ("i think", "maybe", "perhaps", "not sure", "possibly")
        for phrase in required_phrases:
            assert phrase in HEDGING_WORDS

    def test_certainty_words_list_populated(self):
        """CERTAINTY_WORDS contains the core certainty phrases."""
        required_phrases = ("i know", "definitely", "certainly", "the answer is")
        for phrase in required_phrases:
            assert phrase in CERTAINTY_WORDS

    def test_certainty_and_hedging_cancel(self):
        """A sentence mixing certainty and hedging lands in a middle band."""
        score = estimate_confidence("I definitely think this is correct.")
        # Expected close to the 0.5 base, with hedging weighing slightly more.
        assert 0.3 <= score <= 0.7

    def test_i_have_no_idea_gives_very_low_confidence(self):
        """Saying "I have no idea" scores at most 0.2."""
        assert estimate_confidence("I have no idea what you're talking about.") <= 0.2

    def test_short_response_gets_boost(self):
        """A very short factual answer scores above 0.5."""
        score = estimate_confidence("42")
        # The short-response boost should lift it above the base level.
        assert score > 0.5
|