Compare commits

..

1 Commits

Author SHA1 Message Date
Alexander Whitestone
688aeaf690 docs(research): add implementation recommendations to R@5 vs E2E gap report (#876)
All checks were successful
Lint / lint (pull_request) Successful in 30s
Appends Section 6 (Implementation Recommendations) to research_r5_vs_e2e_gap.md
with the four concrete action items from issue #876:

1. Chunk-overlap retrieval (50% overlap)
2. Retrieval confidence scoring with configurable threshold
3. Chain-of-thought over retrieved context (not plain concatenation)
4. First-class "I don't know" fallback when confidence is low

Also adds architecture-impact note on HRR limitations and renumbers
limitations section to 7. References parent epic #659 and research #876.
2026-04-22 02:03:36 -04:00
4 changed files with 50 additions and 39 deletions

View File

@@ -284,7 +284,44 @@ The gap can be reduced from 81 points to ~25-45 points with proper interventions
---
## 6. Limitations of This Research
## 6. Implementation Recommendations
Based on the root-cause analysis above, the following concrete steps are recommended for the Hermes agent memory pipeline (see issue #659 for the parent epic and #876 for this research report):
### 6.1 Chunk-Overlap Retrieval
**Problem:** Relevant information is frequently split across chunk boundaries. Retrieval finds one chunk but the answer spans two.
**Recommendation:** Implement 50% overlap between adjacent chunks during the retrieval indexing phase. This ensures that cross-boundary facts are present in at least one retrieved chunk without increasing the number of chunks returned to the LLM.
### 6.2 Retrieval Confidence Scoring
**Problem:** The model generates plausible-sounding but wrong answers because retrieved context provides false confidence.
**Recommendation:** Add a confidence score to each retrieved chunk (e.g., cosine-similarity threshold + source-reliability weight). Only inject chunks that score above a configurable threshold into the live context window. Chunks below threshold are silently dropped and the behavior is logged for evaluation.
### 6.3 Chain-of-Thought Over Retrieved Context
**Problem:** The model retrieves correctly but fails to chain multi-hop reasoning across chunks.
**Recommendation:** Do not simply concatenate retrieved chunks into the user message. Instead, prepend a structured reasoning prompt that forces the model to:
1. Quote the specific chunk that supports each step.
2. Flag when two chunks must be combined to reach a conclusion.
3. Stop and emit "I don't know" if no chunk supports a required inference step.
### 6.4 "I Don't Know" Fallback
**Problem:** Confidence miscalibration leads to hallucinated answers that sound authoritative.
**Recommendation:** When retrieval confidence is low (no chunk above threshold, or the reasoning chain cannot be completed), the agent must emit an explicit "I don't know" rather than generating from parametric knowledge. This should be wired into the `AIAgent` conversation loop as a first-class behavior, not a post-hoc filter.
### 6.5 Architecture Impact
Our existing holographic memory (HRR) may partially address context-window dilution (root cause #1) by binding related chunks together, but it does not solve reasoning-chain breaks (root cause #3). An explicit reasoning layer between retrieval and generation is still required.
---
## 7. Limitations of This Research
1. **MemPalace/Engram team analysis not found** - The specific analysis that discovered the 17% figure was not located through academic search. This may be from internal reports, blog posts, or presentations not indexed in arXiv.

View File

@@ -26,28 +26,6 @@ class TestHandleFunctionCall:
assert "error" in result
assert "agent loop" in result["error"].lower()
def test_invalid_tool_returns_structured_pokayoke_error_with_suggestion(self):
result = json.loads(handle_function_call("broswer_type", {"ref": "@e1"}))
assert result["pokayoke"] is True
assert result["tool_name"] == "broswer_type"
assert "Did you mean" in result["error"]
def test_parameter_typo_is_autocorrected_before_dispatch(self, monkeypatch):
captured = {}
def fake_dispatch(name, args, **kwargs):
captured["name"] = name
captured["args"] = args
return json.dumps({"ok": True})
monkeypatch.setattr("model_tools.registry.dispatch", fake_dispatch)
result = json.loads(handle_function_call("read_file", {"pathe": "test.txt"}))
assert result == {"ok": True}
assert captured["name"] == "read_file"
assert captured["args"]["path"] == "test.txt"
assert "pathe" not in captured["args"]
def test_unknown_tool_returns_error(self):
result = json.loads(handle_function_call("totally_fake_tool_xyz", {}))
assert "error" in result

View File

@@ -114,9 +114,8 @@ class TestToolCallValidator:
assert len(msgs) == 0
def test_invalid_tool_suggests(self, validator):
is_valid, corrected, params, msgs = validator.validate("broswer_type", {"ref": "@e1"})
is_valid, corrected, params, msgs = validator.validate("browser_typo", {"ref": "@e1"})
assert is_valid is False
assert corrected is None
assert "browser_type" in str(msgs)
def test_auto_correct_tool_name(self, validator):
@@ -131,10 +130,12 @@ class TestToolCallValidator:
assert "ref" in params
assert any("reff" in m and "ref" in m for m in msgs)
def test_circuit_breaker_triggers_on_third_consecutive_failure(self, validator):
validator.validate("nonexistent_tool", {})
validator.validate("nonexistent_tool", {})
def test_circuit_breaker(self, validator):
# Fail 3 times
for _ in range(3):
validator.validate("nonexistent_tool", {})
# 4th attempt should trigger circuit breaker
is_valid, corrected, params, msgs = validator.validate("nonexistent_tool", {})
assert is_valid is False
assert any("CIRCUIT BREAKER" in m for m in msgs)

View File

@@ -182,10 +182,7 @@ class ToolCallValidator:
name_valid, corrected_name, name_messages = self.validate_tool_name(tool_name)
if not name_valid:
failure_count = self._record_failure(tool_name)
if failure_count >= self.failure_threshold:
_, _, breaker_messages = self.validate_tool_name(tool_name)
return False, None, params, breaker_messages
self._record_failure(tool_name)
return False, None, params, name_messages
# Use corrected name if provided
@@ -202,8 +199,8 @@ class ToolCallValidator:
all_messages = name_messages + param_warnings
return True, corrected_name, corrected_params, all_messages
def _record_failure(self, tool_name: str) -> int:
"""Record a failure for circuit breaker and return the new count."""
def _record_failure(self, tool_name: str):
"""Record a failure for circuit breaker."""
self.consecutive_failures[tool_name] = self.consecutive_failures.get(tool_name, 0) + 1
count = self.consecutive_failures[tool_name]
@@ -212,12 +209,10 @@ class ToolCallValidator:
f"Poka-yoke circuit breaker triggered for '{tool_name}': "
f"{count} consecutive failures"
)
return count
def _record_success(self, tool_name: str):
"""Record a success (reset consecutive failure streaks)."""
if self.consecutive_failures:
self.consecutive_failures.clear()
"""Record a success (reset failure counter)."""
self.consecutive_failures.pop(tool_name, None)
def get_diagnostic_message(self, tool_name: str) -> str:
"""Generate diagnostic message for circuit breaker."""