Files
hermes-agent/tests/agent/test_fact_calibration.py
Alexander Whitestone f3fd5142ac
Some checks failed
Forge CI / smoke-and-build (pull_request) Failing after 53s
Fix #252: Automatic fact trust calibration from usage feedback
fact_feedback tool existed but was never called automatically. Trust
scores never changed after initial assignment. Facts lived forever
regardless of accuracy.

Changes:
- MemoryProvider: add get_prefetched_fact_ids() for feedback loop
- HolographicMemoryProvider: track fact IDs returned by prefetch()
- MemoryManager: auto_calibrate_feedback() detects corrections and
  applies helpful/unhelpful feedback automatically
- Correction detection: regex patterns for 'no', 'wrong', 'actually',
  'i said', 'correction:', 'undo', etc.
- MemoryManager: get_pruning_candidates() for below-threshold facts
- Wired into run_agent.py: calibration runs after prefetch, before tool loop

Trust mechanics:
- Successful interaction: trust += 0.05 per fact (helpful)
- Correction detected: trust -= 0.10 per fact (unhelpful)
- Trust clamped to [0.0, 1.0]
- Facts below threshold (default 0.15) are pruning candidates

Tests: 23 new tests, all passing. 139 total memory tests green.

Refs: Timmy_Foundation/hermes-agent#252
2026-04-13 18:22:58 -04:00

253 lines
8.4 KiB
Python

"""Tests for automatic fact trust calibration (Issue #252)."""
import json
import pytest
from agent.memory_manager import MemoryManager, _detect_correction
from plugins.memory.holographic import HolographicMemoryProvider
def _make_holographic_provider(db_path=":memory:"):
    """Build a HolographicMemoryProvider over an in-memory SQLite DB.

    The HRR dimension is deliberately tiny so the suite stays fast.
    """
    config = {
        "db_path": db_path,
        "default_trust": 0.5,
        "min_trust_threshold": 0.3,
        "hrr_dim": 64,  # small vectors keep tests quick
    }
    provider = HolographicMemoryProvider(config=config)
    provider.initialize(session_id="test")
    return provider
class TestDetectCorrection:
    """Pattern matching for the user-correction detector."""

    # Messages that must be recognized as corrections.
    CORRECTIONS = [
        "No, that's wrong",
        "Actually, it's Python 3.12",
        "That's not right",
        "I said the config is in YAML",
        "Correction: the port is 8080",
        "Nope, wrong file",
        "Not quite what I meant",
        "Undo that last change",
        "that is not correct",
        "what i meant was different",
    ]

    # Ordinary messages that must NOT trigger correction handling.
    NON_CORRECTIONS = [
        "",
        "Hello",
        "What's the weather today?",
        "I need you to build a new feature. " * 10,
        "yes that's correct",
    ]

    @pytest.mark.parametrize("msg", CORRECTIONS)
    def test_correction_detected(self, msg):
        assert _detect_correction(msg) is True

    @pytest.mark.parametrize("msg", NON_CORRECTIONS)
    def test_not_a_correction(self, msg):
        assert _detect_correction(msg) is False
class TestAutoCalibrateFeedback:
    """Auto-calibration integration: trust scores respond to usage feedback.

    Shared boilerplate (provider/manager wiring, storing a fact, and
    re-reading it via the list action) is factored into private helpers so
    each test states only its scenario.
    """

    # ---- helpers -----------------------------------------------------

    def _setup(self):
        """Return a (manager, provider) pair wired together."""
        provider = _make_holographic_provider()
        manager = MemoryManager()
        manager.add_provider(provider)
        return manager, provider

    def _store_fact(self, manager, content):
        """Add *content* via the fact_store tool and return its fact_id."""
        result = manager.handle_tool_call(
            "fact_store",
            {"action": "add", "content": content},
        )
        return json.loads(result)["fact_id"]

    def _fetch_fact(self, manager, fact_id):
        """List all facts (min_trust=0.0 disables filtering) and return the
        entry whose id is *fact_id*. Raises StopIteration if absent."""
        result = manager.handle_tool_call(
            "fact_store",
            {"action": "list", "min_trust": 0.0},
        )
        facts = json.loads(result)["facts"]
        return next(f for f in facts if f["fact_id"] == fact_id)

    # ---- tests -------------------------------------------------------

    def test_correction_marks_unhelpful(self):
        """A detected correction applies unhelpful feedback (-0.10)."""
        manager, provider = self._setup()
        fact_id = self._store_fact(manager, "The project uses Flask framework")
        # Simulate: this fact was prefetched into context.
        provider._last_prefetch_ids = [fact_id]
        # User corrects the agent.
        manager.auto_calibrate_feedback("No, it uses FastAPI")
        target = self._fetch_fact(manager, fact_id)
        assert target["trust_score"] < 0.5  # dropped from default 0.5
        assert target["trust_score"] == pytest.approx(0.4, abs=0.01)  # 0.5 - 0.1

    def test_successful_interaction_gains_trust(self):
        """A non-correction message applies helpful feedback (+0.05)."""
        manager, provider = self._setup()
        fact_id = self._store_fact(manager, "The project uses Django framework")
        provider._last_prefetch_ids = [fact_id]
        # A normal follow-up question counts as implicit success.
        manager.auto_calibrate_feedback("What version of Django?")
        target = self._fetch_fact(manager, fact_id)
        assert target["trust_score"] > 0.5  # rose from default 0.5
        assert target["trust_score"] == pytest.approx(0.55, abs=0.01)  # 0.5 + 0.05

    def test_no_prefetch_no_calibration(self):
        """Calibration is a no-op when no facts were prefetched."""
        manager, provider = self._setup()
        fact_id = self._store_fact(manager, "The database is PostgreSQL")
        # Nothing was prefetched, so even a correction has no target.
        provider._last_prefetch_ids = []
        manager.auto_calibrate_feedback("No, it's MySQL")
        target = self._fetch_fact(manager, fact_id)
        assert target["trust_score"] == 0.5  # unchanged

    def test_multiple_corrections_drives_trust_low(self):
        """Repeated corrections drive trust toward the floor."""
        manager, provider = self._setup()
        fact_id = self._store_fact(manager, "The server runs on port 3000")
        provider._last_prefetch_ids = [fact_id]
        for _ in range(5):
            manager.auto_calibrate_feedback("Wrong, it's port 8080")
        target = self._fetch_fact(manager, fact_id)
        # 0.5 - 5*0.1 is ~0.0 (possibly clamped); assert a loose upper bound
        # rather than exact equality to allow for float accumulation error.
        assert target["trust_score"] < 0.2

    def test_trust_floor_at_zero(self):
        """Trust clamps at 0.0 and never goes negative."""
        manager, provider = self._setup()
        fact_id = self._store_fact(manager, "Test fact for floor")
        provider._last_prefetch_ids = [fact_id]
        # 10 corrections would reach -0.5 without clamping.
        for _ in range(10):
            manager.auto_calibrate_feedback("Wrong!")
        target = self._fetch_fact(manager, fact_id)
        assert target["trust_score"] == 0.0

    def test_trust_ceiling_at_one(self):
        """Trust clamps at 1.0 and never exceeds it."""
        manager, provider = self._setup()
        fact_id = self._store_fact(manager, "Test fact for ceiling")
        provider._last_prefetch_ids = [fact_id]
        # 20 successes would reach 1.5 without clamping.
        for _ in range(20):
            manager.auto_calibrate_feedback("Thanks, what else?")
        target = self._fetch_fact(manager, fact_id)
        assert target["trust_score"] == 1.0

    def test_get_pruning_candidates(self):
        """Facts driven below the threshold show up as pruning candidates."""
        manager, provider = self._setup()
        fact_id = self._store_fact(manager, "Bad fact to be pruned")
        provider._last_prefetch_ids = [fact_id]
        # Drive trust below the 0.15 pruning threshold via corrections.
        for _ in range(5):
            manager.auto_calibrate_feedback("Wrong!")
        candidates = manager.get_pruning_candidates(threshold=0.15)
        assert any(c["fact_id"] == fact_id for c in candidates)

    def test_prefetch_tracks_fact_ids(self):
        """Verify prefetch populates _last_prefetch_ids."""
        provider = _make_holographic_provider()
        # Add facts directly on the provider.
        provider.handle_tool_call("fact_store", {
            "action": "add",
            "content": "Alexander uses Python for development",
        })
        provider.handle_tool_call("fact_store", {
            "action": "add",
            "content": "Alexander prefers dark mode editors",
        })
        # Prefetch should find them and record their IDs.
        result = provider.prefetch("Alexander")
        assert "Holographic Memory" in result
        assert len(provider._last_prefetch_ids) > 0
        # An empty query clears the tracked IDs.
        provider.prefetch("")
        assert provider._last_prefetch_ids == []