Some checks failed
Contributor Attribution Check / check-attribution (pull_request) Failing after 37s
Docker Build and Publish / build-and-push (pull_request) Has been skipped
Supply Chain Audit / Scan PR for supply chain risks (pull_request) Successful in 43s
Tests / e2e (pull_request) Successful in 1m57s
Tests / test (pull_request) Failing after 18m57s
Marathon sessions show tool fixation: the agent latches onto one tool and calls it repeatedly. Observed streaks of 8-25 identical calls.

New agent/tool_fixation_detector.py:
- ToolFixationDetector: tracks consecutive tool calls
- record(tool_name): returns a nudge prompt when the threshold is reached
- Default threshold: 5 consecutive calls (configurable via the TOOL_FIXATION_THRESHOLD env var)
- Nudge prompt explains the fixation and suggests alternatives:
  1. Read the error carefully
  2. Try a different tool
  3. Ask the user for clarification
  4. Check whether the task is complete
- get_streak_info(): current streak state
- format_report(): human-readable fixation events
- Singleton via get_fixation_detector()

Config:
- TOOL_FIXATION_THRESHOLD (default: 5)
- TOOL_FIXATION_WINDOW (default: 10)

Tests: tests/test_tool_fixation_detector.py (9 tests)

Closes #886
77 lines
2.3 KiB
Python
77 lines
2.3 KiB
Python
"""Tests for tool fixation detection."""
|
|
|
|
import pytest
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
|
|
from agent.tool_fixation_detector import ToolFixationDetector, get_fixation_detector
|
|
|
|
|
|
class TestFixationDetection:
    """Tests for ``ToolFixationDetector`` and ``get_fixation_detector``.

    Each test builds its own detector with an explicit ``threshold`` so the
    assertions do not depend on environment-driven defaults.
    """

    def test_no_fixation_below_threshold(self):
        """``record`` returns ``None`` while the streak is below the threshold."""
        d = ToolFixationDetector(threshold=5)
        for _ in range(4):
            assert d.record("execute_code") is None

    def test_fixation_at_threshold(self):
        """The call that reaches the threshold returns a nudge naming the tool."""
        d = ToolFixationDetector(threshold=3)
        d.record("execute_code")
        d.record("execute_code")
        nudge = d.record("execute_code")
        assert nudge is not None
        assert "execute_code" in nudge
        assert "3 times" in nudge

    def test_fixation_above_threshold(self):
        """Calls past the threshold keep returning a nudge."""
        d = ToolFixationDetector(threshold=3)
        d.record("execute_code")
        d.record("execute_code")
        d.record("execute_code")  # threshold hit
        nudge = d.record("execute_code")  # still nudging
        assert nudge is not None

    def test_streak_resets_on_different_tool(self):
        """Recording a different tool restarts the streak at that tool."""
        d = ToolFixationDetector(threshold=3)
        d.record("execute_code")
        d.record("execute_code")
        d.record("terminal")  # breaks streak
        assert d._streak_count == 1
        assert d._current_streak == "terminal"

    def test_nudges_sent_counter(self):
        """``nudges_sent`` counts every nudge returned by ``record``."""
        d = ToolFixationDetector(threshold=2)
        d.record("a")
        d.record("a")  # nudge 1
        d.record("a")  # nudge 2
        assert d.nudges_sent == 2

    def test_events_recorded(self):
        """Reaching the threshold appends one event with tool name and streak length."""
        d = ToolFixationDetector(threshold=2)
        d.record("x")
        d.record("x")
        assert len(d.events) == 1
        assert d.events[0].tool_name == "x"
        assert d.events[0].streak_length == 2

    def test_report(self):
        """``format_report`` mentions the tool that triggered a fixation event."""
        # A distinctive tool name is used so the assertion cannot be satisfied
        # by incidental text: a single letter such as "x" also occurs in the
        # word "fixation", which would make the check pass vacuously.
        tool = "web_search"
        d = ToolFixationDetector(threshold=2)
        d.record(tool)
        d.record(tool)
        report = d.format_report()
        assert tool in report

    def test_reset(self):
        """``reset`` clears the current streak count and streak tool name."""
        d = ToolFixationDetector(threshold=2)
        d.record("x")
        d.record("x")
        d.reset()
        assert d._streak_count == 0
        assert d._current_streak == ""

    def test_singleton(self):
        """``get_fixation_detector`` returns the same instance on repeated calls."""
        d1 = get_fixation_detector()
        d2 = get_fixation_detector()
        assert d1 is d2