forked from Rockachopa/Timmy-time-dashboard
@@ -37,6 +37,7 @@ class RunResult:
|
||||
"""Minimal Agno-compatible run result — carries the model's response text."""
|
||||
|
||||
content: str
|
||||
confidence: float | None = None
|
||||
|
||||
|
||||
def is_apple_silicon() -> bool:
|
||||
|
||||
128
src/timmy/confidence.py
Normal file
128
src/timmy/confidence.py
Normal file
@@ -0,0 +1,128 @@
|
||||
"""Confidence estimation for Timmy's responses.
|
||||
|
||||
Implements SOUL.md requirement: "When I am uncertain, I must say so in
|
||||
proportion to my uncertainty."
|
||||
|
||||
This module provides heuristics to estimate confidence based on linguistic
|
||||
signals in the response text. It measures uncertainty without modifying
|
||||
the response content.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
# Hedging words that indicate uncertainty.
# Entries are lowercase and are matched as whole words/phrases (not as raw
# substrings), so derived forms that should count must be listed explicitly.
HEDGING_WORDS = [
    "i think",
    "maybe",
    "perhaps",
    "not sure",
    "might",
    "could be",
    "possibly",
    "i believe",
    "approximately",
    "roughly",
    "probably",
    "likely",
    "seems",
    "appears",
    "suggests",
    "i guess",
    "i suppose",
    "sort of",
    "kind of",
    "somewhat",
    "fairly",
    "relatively",
    "i'm not certain",
    "i am not certain",
    "uncertain",
    "unclear",
    # Previously picked up only by accident via substring matching of
    # "uncertain" / "likely"; listed explicitly now that matching is whole-word.
    "uncertainty",
    "unlikely",
]

# Certainty words that indicate confidence (lowercase, whole-word matched).
CERTAINTY_WORDS = [
    "i know",
    "definitely",
    "certainly",
    "the answer is",
    "specifically",
    "exactly",
    "absolutely",
    "without doubt",
    "i am certain",
    "i'm certain",
    "it is true that",
    "fact is",
    "in fact",
    "indeed",
    "undoubtedly",
    "clearly",
    "obviously",
    "conclusively",
]

# Very low confidence indicators (direct admissions of ignorance).
# These are regular expressions applied to the lowercased text.
LOW_CONFIDENCE_PATTERNS = [
    r"i\s+(?:don't|do not)\s+know",
    # "i am" and the contraction "i'm" are alternatives for the whole subject;
    # the contraction must not be preceded by a separate "i ".
    r"(?:i\s+am|i'm)\s+(?:not\s+sure|unsure)",
    r"i\s+have\s+no\s+(?:idea|clue)",
    r"i\s+cannot\s+(?:say|tell|answer)",
    r"i\s+can't\s+(?:say|tell|answer)",
]


def _count_phrases(text: str, phrases: list[str]) -> int:
    """Return how many of *phrases* occur in *text* as whole words/phrases.

    Each phrase counts at most once. Words inside a phrase may be separated
    by any run of whitespace, and the phrase must not be embedded in a larger
    word (so "might" does not match "almighty").
    """
    hits = 0
    for phrase in phrases:
        words = phrase.split()
        if not words:
            continue
        body = r"\s+".join(re.escape(word) for word in words)
        # The re module caches compiled patterns, so rebuilding the same
        # pattern string on every call stays cheap.
        if re.search(rf"(?<!\w){body}(?!\w)", text):
            hits += 1
    return hits


def estimate_confidence(text: str) -> float:
    """Estimate confidence level of a response based on linguistic signals.

    Starts from a neutral 0.5 (or 0.15 when the text directly admits
    ignorance), subtracts 0.1 for each distinct hedging phrase present, adds
    0.1 for each distinct certainty phrase present, adds 0.1 for answers of
    five words or fewer whose score is still above 0.3, and subtracts 0.15
    when the text contains a question mark.

    Phrases are matched case-insensitively as whole words. Typographic
    apostrophes are treated like ASCII ones so that contractions such as
    "don’t" are recognised.

    Args:
        text: The response text to analyze.

    Returns:
        A float between 0.0 (very uncertain) and 1.0 (very confident).
        Empty or whitespace-only text yields 0.0.
    """
    if not text or not text.strip():
        return 0.0

    # Normalise the curly apostrophe so contraction-based signals still match.
    text_lower = text.lower().strip().replace("\u2019", "'")
    confidence = 0.5  # Start with neutral confidence

    # Direct admission of not knowing overrides the neutral starting point.
    if any(
        re.search(rf"(?<!\w)(?:{pattern})(?!\w)", text_lower)
        for pattern in LOW_CONFIDENCE_PATTERNS
    ):
        confidence = 0.15

    hedging_count = _count_phrases(text_lower, HEDGING_WORDS)
    certainty_count = _count_phrases(text_lower, CERTAINTY_WORDS)

    # Each hedging phrase reduces confidence by 0.1,
    # each certainty phrase increases it by 0.1.
    confidence -= hedging_count * 0.1
    confidence += certainty_count * 0.1

    # Short factual answers get a small boost
    word_count = len(text.split())
    if word_count <= 5 and confidence > 0.3:
        confidence += 0.1

    # Questions in response indicate uncertainty
    if "?" in text:
        confidence -= 0.15

    # Clamp to valid range
    return max(0.0, min(1.0, confidence))
|
||||
@@ -38,21 +38,23 @@ class SessionLogger:
|
||||
# In-memory buffer
|
||||
self._buffer: list[dict] = []
|
||||
|
||||
def record_message(self, role: str, content: str) -> None:
|
||||
def record_message(self, role: str, content: str, confidence: float | None = None) -> None:
|
||||
"""Record a user message.
|
||||
|
||||
Args:
|
||||
role: "user" or "timmy"
|
||||
content: The message content
|
||||
confidence: Optional confidence score (0.0 to 1.0)
|
||||
"""
|
||||
self._buffer.append(
|
||||
{
|
||||
"type": "message",
|
||||
"role": role,
|
||||
"content": content,
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
)
|
||||
entry = {
|
||||
"type": "message",
|
||||
"role": role,
|
||||
"content": content,
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
if confidence is not None:
|
||||
entry["confidence"] = confidence
|
||||
self._buffer.append(entry)
|
||||
|
||||
def record_tool_call(self, tool_name: str, args: dict, result: str) -> None:
|
||||
"""Record a tool call.
|
||||
|
||||
128
tests/timmy/test_confidence.py
Normal file
128
tests/timmy/test_confidence.py
Normal file
@@ -0,0 +1,128 @@
|
||||
"""Tests for confidence estimation in src/timmy/confidence.py."""
|
||||
|
||||
from timmy.confidence import (
|
||||
CERTAINTY_WORDS,
|
||||
HEDGING_WORDS,
|
||||
estimate_confidence,
|
||||
)
|
||||
|
||||
|
||||
class TestEstimateConfidence:
    """Behavioural checks for the estimate_confidence heuristic."""

    def test_empty_string_returns_zero(self):
        """An empty response carries no confidence at all."""
        score = estimate_confidence("")
        assert score == 0.0

    def test_whitespace_only_returns_zero(self):
        """A response made only of whitespace is treated like an empty one."""
        score = estimate_confidence(" ")
        assert score == 0.0

    def test_normal_factual_response(self):
        """A plain factual sentence scores at or above the neutral base."""
        # Six words, so the short-response boost does not apply here.
        score = estimate_confidence("Paris is the capital of France.")
        assert score >= 0.5
        assert score <= 1.0

    def test_i_dont_know_gives_very_low_confidence(self):
        """Saying "I don't know" pushes the score to the very low band."""
        assert estimate_confidence("I don't know the answer to that.") <= 0.2

    def test_i_am_not_sure_gives_very_low_confidence(self):
        """Saying "I am not sure" pushes the score to the very low band."""
        assert estimate_confidence("I am not sure about this.") <= 0.2

    def test_hedging_words_reduce_confidence(self):
        """Prefixing a statement with "I think" lowers its score."""
        plain_score = estimate_confidence("This is the answer.")
        hedged_score = estimate_confidence("I think this is the answer.")
        assert plain_score > hedged_score

    def test_maybe_reduces_confidence(self):
        """Prefixing a statement with "Maybe" lowers its score."""
        plain_score = estimate_confidence("It will rain tomorrow.")
        hedged_score = estimate_confidence("Maybe it will rain tomorrow.")
        assert plain_score > hedged_score

    def test_perhaps_reduces_confidence(self):
        """Prefixing a statement with "Perhaps" lowers its score."""
        plain_score = estimate_confidence("The solution is correct.")
        hedged_score = estimate_confidence("Perhaps the solution is correct.")
        assert plain_score > hedged_score

    def test_certainty_words_increase_confidence(self):
        """Certainty phrasing raises the score above the plain sentence."""
        # Both sentences are long enough to stay clear of the short-answer boost.
        plain_score = estimate_confidence("This is a longer sentence with more words.")
        assured_score = estimate_confidence(
            "I definitely know this is a longer sentence with more words."
        )
        assert plain_score < assured_score

    def test_definitely_increases_confidence(self):
        """Inserting "definitely" raises the score."""
        plain_score = estimate_confidence("This will work.")
        assured_score = estimate_confidence("This will definitely work.")
        assert plain_score < assured_score

    def test_question_reduces_confidence(self):
        """Ending with a question mark lowers the score."""
        statement_score = estimate_confidence("The value is 10.")
        question_score = estimate_confidence("The value is 10?")
        assert statement_score > question_score

    def test_multiple_hedging_words_compound(self):
        """Several hedges in one sentence stack up to a low score."""
        score = estimate_confidence(
            "I think maybe this could be the answer, but I'm not sure."
        )
        assert score < 0.4

    def test_output_always_in_valid_range(self):
        """Extreme inputs in either direction stay within [0.0, 1.0]."""
        # Many hedging phrases packed into a single sentence.
        hedge_heavy = (
            "I think maybe perhaps possibly I believe this might could be approximately right."
        )
        hedge_score = estimate_confidence(hedge_heavy)
        assert 0.0 <= hedge_score <= 1.0

        # Many certainty phrases packed into a single sentence.
        certainty_heavy = "I know definitely certainly absolutely without doubt the answer is specifically exactly correct."
        certainty_score = estimate_confidence(certainty_heavy)
        assert 0.0 <= certainty_score <= 1.0

    def test_hedging_words_list_populated(self):
        """HEDGING_WORDS exposes the core hedging phrases."""
        for phrase in ("i think", "maybe", "perhaps", "not sure", "possibly"):
            assert phrase in HEDGING_WORDS

    def test_certainty_words_list_populated(self):
        """CERTAINTY_WORDS exposes the core certainty phrases."""
        for phrase in ("i know", "definitely", "certainly", "the answer is"):
            assert phrase in CERTAINTY_WORDS

    def test_certainty_and_hedging_cancel(self):
        """A sentence mixing certainty and hedging lands near the neutral base."""
        score = estimate_confidence("I definitely think this is correct.")
        # Expected to sit around 0.5, within a tolerance band.
        assert score >= 0.3
        assert score <= 0.7

    def test_i_have_no_idea_gives_very_low_confidence(self):
        """Saying "I have no idea" pushes the score to the very low band."""
        assert estimate_confidence("I have no idea what you're talking about.") <= 0.2

    def test_short_response_gets_boost(self):
        """A very short answer is lifted above the neutral base."""
        score = estimate_confidence("42")
        assert score > 0.5
|
||||
Reference in New Issue
Block a user