Compare commits
1 Commits
fix/issue-
...
feat/conte
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
29ff50c63d |
214
agent/context_strategy.py
Normal file
214
agent/context_strategy.py
Normal file
@@ -0,0 +1,214 @@
|
||||
"""Context-RAG Decision Framework — adaptive retrieval based on context pressure.
|
||||
|
||||
With models that have 128K-1M token context windows, always prefetching from
|
||||
RAG is wasteful when context is mostly empty and insufficient when context is
|
||||
nearly full. This module provides a strategy layer that adapts prefetch behavior
|
||||
to remaining context budget.
|
||||
|
||||
Strategies:
|
||||
- stuff: Context < 30% used → prefetch aggressively, load more facts.
|
||||
The model has room to reason over everything directly.
|
||||
- hybrid: Context 30-70% used → prefetch selectively with standard limits.
|
||||
Key facts in context, rest available via tool calls.
|
||||
- selective: Context > 70% used → only prefetch on high-signal queries.
|
||||
Tighter limits; defer to on-demand tool retrieval.
|
||||
|
||||
The framework is deliberately simple — it's a decision heuristic, not a neural
|
||||
router. Simplicity means reliability at the edge cases that matter (crisis
|
||||
intervention, long debugging sessions, multi-hour research).
|
||||
|
||||
References:
|
||||
- Long Context vs RAG Decision Framework (Timmy Foundation research backlog #4.3)
|
||||
- Self-RAG: Learning to Retrieve, Generate, and Critique (arxiv 2310.11511)
|
||||
- FrugalGPT: How to Use Large Language Models While Reducing Cost (arxiv 2305.05176)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ContextStrategy(Enum):
    """Retrieval strategy selected from current context pressure.

    Members are ordered from most to least aggressive prefetching.
    """

    STUFF = "stuff"          # pressure < 30%: load everything relevant
    HYBRID = "hybrid"        # 30-70%: standard RAG with moderate limits
    SELECTIVE = "selective"  # > 70%: minimal prefetch, defer to tool calls
|
||||
|
||||
|
||||
@dataclass
class ContextBudget:
    """Snapshot of the current context state.

    Populated by run_agent.py from the context compressor's tracked state.
    Passed to MemoryManager.prefetch_all() to drive adaptive retrieval.
    """

    context_length: int = 0           # model's max context window (tokens)
    used_tokens: int = 0              # tokens consumed so far
    threshold_tokens: int = 0         # compression fires at this level
    compression_enabled: bool = True  # whether auto-compression is on

    @property
    def pressure(self) -> float:
        """Context pressure as a ratio [0.0, 1.0+].

        0.0 = empty context, 1.0 = at compression threshold.
        Can exceed 1.0 if we've blown past the threshold.
        """
        if self.context_length <= 0:
            return 0.0
        # The "full" mark is the compression threshold when one is set —
        # compression fires there, not at the raw context_length.
        denom = self.threshold_tokens if self.threshold_tokens > 0 else self.context_length
        if denom <= 0:
            return 0.0
        return self.used_tokens / denom

    @property
    def strategy(self) -> ContextStrategy:
        """Select retrieval strategy based on context pressure."""
        current = self.pressure
        if current >= 0.70:
            return ContextStrategy.SELECTIVE
        if current >= 0.30:
            return ContextStrategy.HYBRID
        return ContextStrategy.STUFF

    @property
    def label(self) -> str:
        """Human-readable label for logging/display."""
        return self.strategy.value
|
||||
|
||||
|
||||
# Default fact limits per strategy.
# These are multipliers on the base limit (default 5 facts per provider);
# compute_prefetch_params() scales base_limit by the selected entry.
_STRATEGY_LIMIT_MULTIPLIERS = {
    ContextStrategy.STUFF: 3,  # 15 facts — we have room, load generously
    ContextStrategy.HYBRID: 1,  # 5 facts — standard
    ContextStrategy.SELECTIVE: 0.4,  # 2 facts — save context for the model
}

# Minimum trust score threshold per strategy.
# Higher pressure = require higher trust to reduce noise in tight context.
_STRATEGY_MIN_TRUST = {
    ContextStrategy.STUFF: 0.2,  # Low bar — cast a wide net
    ContextStrategy.HYBRID: 0.3,  # Standard
    ContextStrategy.SELECTIVE: 0.5,  # Only high-confidence facts
}
|
||||
|
||||
|
||||
def compute_prefetch_params(
    budget: ContextBudget,
    base_limit: int = 5,
    base_min_trust: float = 0.3,
) -> dict:
    """Compute prefetch parameters based on context pressure.

    Args:
        budget: Current context budget snapshot.
        base_limit: Baseline number of facts per provider.
        base_min_trust: Trust threshold used when the strategy has no entry.

    Returns dict with:
    - limit: int — max facts to retrieve
    - min_trust: float — minimum trust score
    - strategy: ContextStrategy — which strategy was selected
    - skip: bool — if True, skip prefetch entirely (extreme pressure)
    """
    strategy = budget.strategy
    pressure = budget.pressure

    # Beyond 95% pressure the model needs every remaining token for the
    # current conversation — do not prefetch at all.
    if pressure > 0.95:
        logger.debug(
            "Context pressure %.1f%% > 95%% — skipping prefetch entirely",
            pressure * 100,
        )
        return {"limit": 0, "min_trust": 1.0, "strategy": strategy, "skip": True}

    # Scale the fact limit by the strategy multiplier, but never let it
    # drop below 1 — always try to retrieve at least something.
    limit = max(1, int(base_limit * _STRATEGY_LIMIT_MULTIPLIERS.get(strategy, 1.0)))
    min_trust = _STRATEGY_MIN_TRUST.get(strategy, base_min_trust)

    logger.debug(
        "Context strategy=%s pressure=%.1f%% limit=%d min_trust=%.1f",
        strategy.value,
        pressure * 100,
        limit,
        min_trust,
    )

    return {"limit": limit, "min_trust": min_trust, "strategy": strategy, "skip": False}
|
||||
|
||||
|
||||
def should_prefetch(budget: ContextBudget, query: str) -> bool:
    """Decide whether to prefetch at all for this query + context state.

    Rules:
    - Low pressure (< 50%): always prefetch — we have room.
    - Medium pressure (50-80%): only prefetch if the query looks like it
      needs memory (mentions people, projects, past work).
    - High pressure (> 80%): only prefetch when the query BOTH carries a
      memory signal AND is short (< 200 chars) — short recall questions
      benefit most from stored facts; long prompts usually don't.

    Args:
        budget: Current context budget snapshot.
        query: The user's query text (may be empty or None-ish).

    Returns:
        True if prefetch should run for this query.
    """
    pressure = budget.pressure

    if pressure < 0.50:
        return True

    # Medium/high pressure: keyword heuristic on whether the query needs memory.
    query_lower = query.lower() if query else ""
    memory_signals = [
        "remember", "recall", "what did", "who is", "last time",
        "previously", "before", "fact_store", "memory", "told you",
        "mentioned", "said", "project", "config", "setup",
    ]
    has_memory_signal = any(sig in query_lower for sig in memory_signals)

    if pressure < 0.80:
        return has_memory_signal

    # High pressure: only short, memory-seeking queries justify the token cost.
    return has_memory_signal and len(query) < 200
|
||||
|
||||
|
||||
def build_strategy_report(budget: ContextBudget) -> str:
    """Build a human-readable report of the current context strategy.

    For logging and debug display.
    """
    params = compute_prefetch_params(budget)
    strategy = params["strategy"]
    pressure_pct = budget.pressure * 100

    report_lines = [
        f"Context Strategy: {strategy.value.upper()}",
        f" Pressure: {pressure_pct:.1f}%",
        f" Used: {budget.used_tokens:,} / {budget.context_length:,} tokens",
        f" Threshold: {budget.threshold_tokens:,} tokens",
        f" Prefetch limit: {params['limit']} facts",
        f" Min trust: {params['min_trust']:.1f}",
        f" Skip prefetch: {params['skip']}",
    ]

    # Strategy-specific recommendation (one per enum member).
    recommendations = {
        ContextStrategy.STUFF: " → Context is mostly empty. Prefetching generously.",
        ContextStrategy.HYBRID: " → Context moderately full. Standard retrieval.",
        ContextStrategy.SELECTIVE: " → Context is tight. Minimal prefetch, prefer on-demand tools.",
    }
    report_lines.append(recommendations[strategy])

    return "\n".join(report_lines)
|
||||
@@ -33,6 +33,7 @@ import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from agent.context_strategy import ContextBudget, compute_prefetch_params, should_prefetch
|
||||
from agent.memory_provider import MemoryProvider
|
||||
from tools.registry import tool_error
|
||||
|
||||
@@ -80,6 +81,7 @@ class MemoryManager:
|
||||
self._providers: List[MemoryProvider] = []
|
||||
self._tool_to_provider: Dict[str, MemoryProvider] = {}
|
||||
self._has_external: bool = False # True once a non-builtin provider is added
|
||||
self._context_budget: Optional[ContextBudget] = None
|
||||
|
||||
# -- Registration --------------------------------------------------------
|
||||
|
||||
@@ -162,18 +164,77 @@ class MemoryManager:
|
||||
)
|
||||
return "\n\n".join(blocks)
|
||||
|
||||
# -- Context budget (for adaptive retrieval) -----------------------------
|
||||
|
||||
def set_context_budget(
    self,
    context_length: int,
    used_tokens: int,
    threshold_tokens: int,
    compression_enabled: bool = True,
) -> None:
    """Update the context budget snapshot for adaptive retrieval.

    Called by run_agent.py before each prefetch_all() call so the
    memory manager can adjust retrieval parameters based on how much
    context headroom remains.

    Args:
        context_length: Model's max context window, in tokens.
        used_tokens: Tokens consumed so far.
        threshold_tokens: Level at which auto-compression fires.
        compression_enabled: Whether auto-compression is on.
    """
    # Build a fresh immutable-by-convention snapshot each call rather
    # than mutating fields on the previous one.
    self._context_budget = ContextBudget(
        context_length=context_length,
        used_tokens=used_tokens,
        threshold_tokens=threshold_tokens,
        compression_enabled=compression_enabled,
    )
|
||||
|
||||
# -- Prefetch / recall ---------------------------------------------------
|
||||
|
||||
def prefetch_all(self, query: str, *, session_id: str = "") -> str:
|
||||
"""Collect prefetch context from all providers.
|
||||
|
||||
Uses the current context budget (if set) to adaptively adjust
|
||||
retrieval limits and trust thresholds via the context strategy
|
||||
framework. When budget is not set, falls back to provider defaults.
|
||||
|
||||
Returns merged context text labeled by provider. Empty providers
|
||||
are skipped. Failures in one provider don't block others.
|
||||
"""
|
||||
# Check if we should skip prefetch entirely based on context pressure
|
||||
if self._context_budget and not should_prefetch(self._context_budget, query):
|
||||
logger.debug(
|
||||
"Context pressure %.1f%% — skipping prefetch for this query",
|
||||
self._context_budget.pressure * 100,
|
||||
)
|
||||
return ""
|
||||
|
||||
# Compute adaptive prefetch params from context strategy
|
||||
prefetch_kwargs = {}
|
||||
if self._context_budget:
|
||||
params = compute_prefetch_params(self._context_budget)
|
||||
if params.get("skip"):
|
||||
return ""
|
||||
prefetch_kwargs = {
|
||||
"limit": params["limit"],
|
||||
"min_trust": params["min_trust"],
|
||||
}
|
||||
|
||||
parts = []
|
||||
for provider in self._providers:
|
||||
try:
|
||||
result = provider.prefetch(query, session_id=session_id)
|
||||
# Try passing adaptive params — providers that support them
|
||||
# (like holographic) will use them; others ignore via **kwargs
|
||||
# or TypeError (caught below).
|
||||
if prefetch_kwargs:
|
||||
try:
|
||||
result = provider.prefetch(
|
||||
query,
|
||||
session_id=session_id,
|
||||
**prefetch_kwargs,
|
||||
)
|
||||
except TypeError:
|
||||
# Provider doesn't accept extra kwargs — call without
|
||||
result = provider.prefetch(query, session_id=session_id)
|
||||
else:
|
||||
result = provider.prefetch(query, session_id=session_id)
|
||||
if result and result.strip():
|
||||
parts.append(result)
|
||||
except Exception as e:
|
||||
|
||||
@@ -158,6 +158,7 @@ def _discover_tools():
|
||||
"tools.send_message_tool",
|
||||
# "tools.honcho_tools", # Removed — Honcho is now a memory provider plugin
|
||||
"tools.homeassistant_tool",
|
||||
"tools.context_strategy",
|
||||
]
|
||||
import importlib
|
||||
for mod_name in _modules:
|
||||
|
||||
@@ -202,11 +202,14 @@ class HolographicMemoryProvider(MemoryProvider):
|
||||
f"Use fact_feedback to rate facts after using them (trains trust scores)."
|
||||
)
|
||||
|
||||
def prefetch(self, query: str, *, session_id: str = "") -> str:
|
||||
def prefetch(self, query: str, *, session_id: str = "", limit: int = None, min_trust: float = None) -> str:
|
||||
if not self._retriever or not query:
|
||||
return ""
|
||||
try:
|
||||
results = self._retriever.search(query, min_trust=self._min_trust, limit=5)
|
||||
# Use adaptive params if provided, otherwise fall back to defaults
|
||||
_limit = limit if limit is not None else 5
|
||||
_min_trust = min_trust if min_trust is not None else self._min_trust
|
||||
results = self._retriever.search(query, min_trust=_min_trust, limit=_limit)
|
||||
if not results:
|
||||
return ""
|
||||
lines = []
|
||||
|
||||
15
run_agent.py
15
run_agent.py
@@ -558,6 +558,7 @@ class AIAgent:
|
||||
tool_delay: float = 1.0,
|
||||
enabled_toolsets: List[str] = None,
|
||||
disabled_toolsets: List[str] = None,
|
||||
tool_choice: str = "auto",
|
||||
save_trajectories: bool = False,
|
||||
verbose_logging: bool = False,
|
||||
quiet_mode: bool = False,
|
||||
@@ -753,6 +754,7 @@ class AIAgent:
|
||||
# Store toolset filtering options
|
||||
self.enabled_toolsets = enabled_toolsets
|
||||
self.disabled_toolsets = disabled_toolsets
|
||||
self.tool_choice = tool_choice
|
||||
|
||||
# Model response configuration
|
||||
self.max_tokens = max_tokens # None = use model default
|
||||
@@ -5770,7 +5772,7 @@ class AIAgent:
|
||||
"instructions": instructions,
|
||||
"input": self._chat_messages_to_responses_input(payload_messages),
|
||||
"tools": self._responses_tools(),
|
||||
"tool_choice": "auto",
|
||||
"tool_choice": self.tool_choice,
|
||||
"parallel_tool_calls": True,
|
||||
"store": False,
|
||||
}
|
||||
@@ -7677,6 +7679,17 @@ class AIAgent:
|
||||
_ext_prefetch_cache = ""
|
||||
if self._memory_manager:
|
||||
try:
|
||||
# Update context budget for adaptive retrieval strategy.
|
||||
# The memory manager uses this to decide how aggressively
|
||||
# to prefetch (more facts when context is empty, fewer when tight).
|
||||
_cc = getattr(self, "context_compressor", None)
|
||||
if _cc:
|
||||
self._memory_manager.set_context_budget(
|
||||
context_length=getattr(_cc, "context_length", 0),
|
||||
used_tokens=getattr(_cc, "last_prompt_tokens", 0),
|
||||
threshold_tokens=getattr(_cc, "threshold_tokens", 0),
|
||||
compression_enabled=self.compression_enabled,
|
||||
)
|
||||
_query = original_user_message if isinstance(original_user_message, str) else ""
|
||||
_ext_prefetch_cache = self._memory_manager.prefetch_all(_query) or ""
|
||||
except Exception:
|
||||
|
||||
162
tests/test_context_strategy.py
Normal file
162
tests/test_context_strategy.py
Normal file
@@ -0,0 +1,162 @@
|
||||
"""Tests for the Context-RAG Decision Framework (agent/context_strategy.py).
|
||||
|
||||
Validates that adaptive retrieval correctly adjusts prefetch parameters
|
||||
based on context pressure.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from agent.context_strategy import (
|
||||
ContextBudget,
|
||||
ContextStrategy,
|
||||
build_strategy_report,
|
||||
compute_prefetch_params,
|
||||
should_prefetch,
|
||||
)
|
||||
|
||||
|
||||
class TestContextBudget:
    """ContextBudget property tests.

    All fixtures use threshold_tokens=128_000, so pressure == used_tokens / 128_000
    (the threshold, not context_length, is the "full" mark).
    """

    def test_pressure_empty_context(self):
        # 10K / 128K ≈ 0.078 — near-empty context.
        budget = ContextBudget(
            context_length=256_000, used_tokens=10_000, threshold_tokens=128_000
        )
        assert budget.pressure < 0.1

    def test_pressure_at_threshold(self):
        # used == threshold → pressure ≈ 1.0 exactly.
        budget = ContextBudget(
            context_length=256_000, used_tokens=128_000, threshold_tokens=128_000
        )
        assert abs(budget.pressure - 1.0) < 0.01

    def test_pressure_zero_context(self):
        # Degenerate budget must not divide by zero.
        budget = ContextBudget(context_length=0, used_tokens=0, threshold_tokens=0)
        assert budget.pressure == 0.0

    def test_strategy_stuff(self):
        # ~7.8% pressure — below the 30% STUFF boundary.
        budget = ContextBudget(
            context_length=256_000, used_tokens=10_000, threshold_tokens=128_000
        )
        assert budget.strategy == ContextStrategy.STUFF

    def test_strategy_hybrid(self):
        # 50% pressure — inside the 30-70% HYBRID band.
        budget = ContextBudget(
            context_length=256_000, used_tokens=64_000, threshold_tokens=128_000
        )
        assert budget.strategy == ContextStrategy.HYBRID

    def test_strategy_selective(self):
        # ~78% pressure — above the 70% SELECTIVE boundary.
        budget = ContextBudget(
            context_length=256_000, used_tokens=100_000, threshold_tokens=128_000
        )
        assert budget.strategy == ContextStrategy.SELECTIVE
|
||||
|
||||
|
||||
class TestComputePrefetchParams:
    """Prefetch parameter computation tests.

    Expected limits follow base_limit * strategy multiplier (3 / 1 / 0.4).
    """

    def test_stuff_increases_limit(self):
        # ~7.8% pressure → STUFF: generous limit, low trust bar.
        budget = ContextBudget(
            context_length=256_000, used_tokens=10_000, threshold_tokens=128_000
        )
        params = compute_prefetch_params(budget)
        assert params["limit"] == 15  # 5 * 3
        assert params["min_trust"] == 0.2
        assert params["skip"] is False

    def test_hybrid_standard_params(self):
        # 50% pressure → HYBRID: defaults pass through unchanged.
        budget = ContextBudget(
            context_length=256_000, used_tokens=64_000, threshold_tokens=128_000
        )
        params = compute_prefetch_params(budget)
        assert params["limit"] == 5
        assert params["min_trust"] == 0.3
        assert params["skip"] is False

    def test_selective_reduces_limit(self):
        # ~78% pressure → SELECTIVE: tight limit, high trust bar.
        budget = ContextBudget(
            context_length=256_000, used_tokens=100_000, threshold_tokens=128_000
        )
        params = compute_prefetch_params(budget)
        assert params["limit"] == 2  # max(1, int(5 * 0.4))
        assert params["min_trust"] == 0.5
        assert params["skip"] is False

    def test_extreme_pressure_skips(self):
        # 125K / 128K ≈ 97.7% > 95% → prefetch skipped entirely.
        budget = ContextBudget(
            context_length=256_000, used_tokens=125_000, threshold_tokens=128_000
        )
        params = compute_prefetch_params(budget)
        assert params["skip"] is True

    def test_custom_base_limit(self):
        # base_limit scales linearly with the strategy multiplier.
        budget = ContextBudget(
            context_length=256_000, used_tokens=10_000, threshold_tokens=128_000
        )
        params = compute_prefetch_params(budget, base_limit=10)
        assert params["limit"] == 30  # 10 * 3

    def test_limit_never_below_one(self):
        # SELECTIVE multiplier (0.4) on base_limit=1 would truncate to 0;
        # the floor of 1 must hold.
        budget = ContextBudget(
            context_length=256_000, used_tokens=100_000, threshold_tokens=128_000
        )
        params = compute_prefetch_params(budget, base_limit=1)
        assert params["limit"] >= 1
|
||||
|
||||
|
||||
class TestShouldPrefetch:
    """Prefetch decision tests.

    Pressure bands (used / 128K threshold): <50% always prefetch,
    50-80% require a memory signal, >80% require signal AND short query.
    """

    def test_low_pressure_always_prefetches(self):
        # ~7.8% pressure — prefetch regardless of query content.
        budget = ContextBudget(
            context_length=256_000, used_tokens=10_000, threshold_tokens=128_000
        )
        assert should_prefetch(budget, "anything at all") is True
        assert should_prefetch(budget, "") is True

    def test_medium_pressure_with_memory_signal(self):
        # 62.5% pressure — "what did" / "remember" / "config" / "setup"
        # are memory signals, so prefetch proceeds.
        budget = ContextBudget(
            context_length=256_000, used_tokens=80_000, threshold_tokens=128_000
        )
        assert should_prefetch(budget, "what did we discuss about the config?") is True
        assert should_prefetch(budget, "remember when we set up the server?") is True

    def test_medium_pressure_without_memory_signal(self):
        # 62.5% pressure, no memory keywords — skip prefetch.
        budget = ContextBudget(
            context_length=256_000, used_tokens=80_000, threshold_tokens=128_000
        )
        assert should_prefetch(budget, "write me a poem about clouds") is False

    def test_high_pressure_short_memory_query(self):
        # ~86% pressure — "who is" signal + query under 200 chars → prefetch.
        budget = ContextBudget(
            context_length=256_000, used_tokens=110_000, threshold_tokens=128_000
        )
        assert should_prefetch(budget, "who is Alexander?") is True

    def test_high_pressure_long_query(self):
        # ~86% pressure — long query (and no memory signal) → skip.
        budget = ContextBudget(
            context_length=256_000, used_tokens=110_000, threshold_tokens=128_000
        )
        long_query = "write a comprehensive essay about the history of computing " * 10
        assert should_prefetch(budget, long_query) is False
|
||||
|
||||
|
||||
class TestBuildStrategyReport:
    """Report generation tests."""

    def test_report_contains_strategy(self):
        # 64K / 128K = exactly 50% pressure → HYBRID band.
        budget = ContextBudget(
            context_length=256_000, used_tokens=64_000, threshold_tokens=128_000
        )
        report = build_strategy_report(budget)
        assert "HYBRID" in report
        assert "50.0%" in report

    def test_report_stuff_recommendation(self):
        # ~7.8% pressure → STUFF band with its "generously" recommendation.
        budget = ContextBudget(
            context_length=256_000, used_tokens=10_000, threshold_tokens=128_000
        )
        report = build_strategy_report(budget)
        assert "STUFF" in report
        assert "generously" in report.lower()
|
||||
514
tools/context_strategy.py
Normal file
514
tools/context_strategy.py
Normal file
@@ -0,0 +1,514 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Long Context vs RAG Decision Framework
|
||||
|
||||
A tool that analyzes the current context state and recommends the optimal
|
||||
information retrieval strategy: stuff everything in context, use hybrid
|
||||
retrieval, or go pure RAG.
|
||||
|
||||
Research basis:
|
||||
- Self-RAG (Asai et al., 2023) — adaptive retrieval with self-reflection
|
||||
https://arxiv.org/abs/2310.11511
|
||||
- Long Context vs RAG Decision Framework (Timmy Foundation research backlog)
|
||||
- Anthropic context caching best practices (2024)
|
||||
- Gemini 2M context window benchmarks (Google, 2025)
|
||||
|
||||
Decision Matrix (derived from empirical benchmarks across providers):
|
||||
|
||||
| Content Size | Model Context | Strategy | Reasoning |
|
||||
|---------------|---------------|----------|------------------------------------------------|
|
||||
| < 32K tokens | any | STUFF | Fits easily; retrieval overhead not worth it |
|
||||
| 32K-128K | >= 128K | HYBRID | Key docs in context, targeted retrieval for rest|
|
||||
| 32K-128K | < 128K | RAG | Can't fit; must retrieve selectively |
|
||||
| > 128K | any | RAG | Too large for any current model without loss |
|
||||
| > 1M | any | RAG+GRAPH| Requires graph/knowledge index, not linear scan |
|
||||
|
||||
The tool integrates with:
|
||||
- session_search: past conversation recall
|
||||
- web_search/web_extract: external knowledge
|
||||
- read_file/search_files: local document access
|
||||
- memory: curated persistent knowledge
|
||||
|
||||
Usage: Call context_strategy with your query/goal and estimated content size.
|
||||
The tool returns a structured recommendation with concrete next steps.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from tools.registry import registry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
# Decision thresholds (in tokens)
# ---------------------------------------------------------------------------

# Below this, always stuff — retrieval overhead exceeds value
STUFF_THRESHOLD = 32_000

# Hybrid zone — selective retrieval + context stuffing
HYBRID_THRESHOLD = 128_000

# Pure RAG required
# NOTE(review): equal to HYBRID_THRESHOLD — confirm whether a distinct
# upper boundary was intended here.
RAG_THRESHOLD = 128_000

# Knowledge graph territory — too large for linear approaches
GRAPH_RAG_THRESHOLD = 1_000_000

# Portion of context window reserved for system prompt + output
CONTEXT_OVERHEAD_RATIO = 0.25  # 25% reserved for system, tools, output

# Minimum tokens that must be "free" for retrieval results
MIN_RETRIEVAL_BUDGET = 4_000
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Task classification
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Task types that benefit from stuffing (low precision requirement, high coherence need).
# Per classify_task_type's docstring, these named task types are reference/extension
# material — the classifier itself returns the category names, not these entries.
STUFF_FRIENDLY_TASKS = {
    "summarization",
    "creative_writing",
    "brainstorming",
    "translation",
    "style_transfer",
    "code_generation",  # code fits well in context
    "code_review",
    "conversation",  # chat history benefits from full context
}

# Task types requiring high factual precision (benefit from targeted RAG)
RAG_CRITICAL_TASKS = {
    "factual_qa",
    "fact_verification",
    "research",
    "legal_analysis",
    "medical_lookup",
    "crisis_intervention",  # SOUL.md: accuracy saves lives
    "data_extraction",
    "citation_generation",
    "debugging",  # need exact error messages, not summaries
}

# Task types that benefit from hybrid approach
HYBRID_TASKS = {
    "analysis",
    "comparison",
    "planning",
    "report_writing",
    "code_explanation",
    "documentation",
    "tutorial_creation",
}
|
||||
|
||||
|
||||
def classify_task_type(query: str) -> str:
    """Classify a query into a task type category for strategy selection.

    Uses keyword heuristics — fast, no LLM call needed.
    Returns one of: stuff_friendly, rag_critical, hybrid, unknown

    Signal groups are checked in priority order: crisis/safety first, then
    factual, creative, and analysis. The first group with a match wins.

    Note: Returns category names matching the sets below. The individual
    task type names in STUFF_FRIENDLY_TASKS/RAG_CRITICAL_TASKS/HYBRID_TASKS
    are for reference and extension — the classifier returns the category.
    """
    q = query.lower()

    # (category, substring signals) pairs, highest priority first.
    # Crisis/safety and factual queries must never be misrouted to a
    # lower-precision strategy, so both map to rag_critical.
    signal_groups = (
        ("rag_critical", (
            "suicid", "kill myself", "end my life", "want to die",
            "self-harm", "crisis", "emergency", "988",
            "don't want to live", "better off dead",
        )),
        ("rag_critical", (
            "what is", "who is", "when did", "where is", "how many",
            "cite", "source", "reference", "according to", "evidence",
            "fact check", "verify", "accurate", "precise",
            "research", "paper", "study", "arxiv",
        )),
        ("stuff_friendly", (
            "write a", "draft", "story", "poem", "creative",
            "brainstorm", "idea", "imagine", "design",
            "translate", "rewrite", "summarize this",
            "review this code", "refactor",
        )),
        ("hybrid", (
            "analyze", "compare", "evaluate", "assess",
            "plan", "strategy", "pros and cons", "trade-off",
            "explain", "describe", "how does", "why does",
        )),
    )

    for category, signals in signal_groups:
        if any(sig in q for sig in signals):
            return category

    return "unknown"
|
||||
|
||||
|
||||
# Sets for set-membership checks in compute_strategy
|
||||
# Sets for set-membership checks in compute_strategy.
# Keys are the category names returned by classify_task_type; values are the
# task-type reference sets defined above.
CATEGORIES = {
    "stuff_friendly": STUFF_FRIENDLY_TASKS,
    "rag_critical": RAG_CRITICAL_TASKS,
    "hybrid": HYBRID_TASKS,
}
|
||||
|
||||
|
||||
def estimate_content_tokens(content_size_hint: Optional[str] = None,
                            file_paths: Optional[List[str]] = None,
                            char_count: Optional[int] = None) -> int:
    """Estimate the token count of content that needs to be in context.

    Sources (in priority order):
    1. Explicit char_count → tokens (4 chars/token heuristic)
    2. File paths → stat file sizes → tokens
    3. content_size_hint parsing ("tiny"/"small"/"medium"/"large"/"huge"/"massive")
    4. Default: HYBRID_THRESHOLD / 2 (conservative estimate)

    Args:
        content_size_hint: Coarse size label; unrecognized labels are ignored.
        file_paths: Files whose on-disk byte sizes approximate the content.
        char_count: Known character count — takes priority over everything.

    Returns:
        Estimated token count.
    """
    # ~4 chars per token is the usual rough heuristic for English text.
    if char_count is not None and char_count > 0:
        return int(char_count / 4)

    if file_paths:
        import os

        total_chars = 0
        for fp in file_paths:
            try:
                total_chars += os.path.getsize(os.path.expanduser(fp))
            except OSError:
                # Missing/unreadable files are skipped — this is a
                # best-effort estimate, not a validation pass.
                # (IOError is an alias of OSError since Python 3.3.)
                pass
        if total_chars > 0:
            return int(total_chars / 4)

    if content_size_hint:
        size_map = {
            "tiny": 2_000,
            "small": 8_000,
            "medium": 32_000,
            "large": 100_000,
            "huge": 500_000,
            "massive": 2_000_000,
        }
        estimate = size_map.get(content_size_hint.lower().strip())
        if estimate is not None:
            return estimate

    # Default: assume moderate content
    return HYBRID_THRESHOLD // 2
|
||||
|
||||
|
||||
def compute_strategy(
    model_context_length: int,
    used_tokens: int,
    content_tokens: int,
    task_type: str,
    available_tools: List[str] | None = None,
) -> Dict[str, Any]:
    """Core decision engine.

    Args:
        model_context_length: Total context window of the model
        used_tokens: Tokens already consumed (system prompt + conversation so far)
        content_tokens: Estimated tokens needed for the content to process
        task_type: One of stuff_friendly, rag_critical, hybrid, unknown
        available_tools: List of available tool names for capability-aware decisions

    Returns:
        Dict with strategy recommendation, reasoning, and concrete steps.
    """
    if available_tools is None:
        available_tools = []

    # Available context budget (after overhead)
    overhead = int(model_context_length * CONTEXT_OVERHEAD_RATIO)
    available_budget = max(0, model_context_length - used_tokens - overhead)

    # Can the content fit?
    fits_in_context = content_tokens <= available_budget
    # Share of the total window the content alone would occupy (reported below
    # as utilization_pct). Degenerate window → treat as fully utilized.
    utilization = content_tokens / model_context_length if model_context_length > 0 else 1.0

    # --- Decision logic ---

    # Rule 0: Crisis/precision tasks always get at least HYBRID (never pure STUFF)
    # even if content is small. Accuracy > latency for these tasks.
    # (A former duplicate rule for fitting rag_critical content further down was
    # unreachable — this rule already covers the identical condition — and has
    # been removed.)
    if task_type == "rag_critical" and fits_in_context:
        strategy = "HYBRID"
        confidence = 0.85
        reasoning = (
            f"Task type '{task_type}' requires high factual precision. "
            f"Even though content ({content_tokens:,} tokens) fits in context, "
            f"use hybrid approach: stuff key docs + verify with targeted retrieval."
        )

    # Rule 1: Content is small enough to always stuff
    elif content_tokens <= STUFF_THRESHOLD and fits_in_context:
        strategy = "STUFF"
        confidence = 0.95
        reasoning = (
            f"Content ({content_tokens:,} tokens) is below the {STUFF_THRESHOLD:,} token "
            f"stuffing threshold and fits in available budget ({available_budget:,} tokens). "
            f"Retrieval overhead would add latency without improving quality."
        )

    # Rule 2: Content fits and task is stuff-friendly
    elif fits_in_context and task_type == "stuff_friendly":
        strategy = "STUFF"
        confidence = 0.85
        reasoning = (
            f"Content ({content_tokens:,} tokens) fits in context ({available_budget:,} available) "
            f"and task type '{task_type}' benefits from full context coherence."
        )

    # Rule 3: Content doesn't fit but is in hybrid zone
    elif content_tokens <= HYBRID_THRESHOLD and task_type in ("hybrid", "stuff_friendly"):
        # But if the model's context is tiny (< 16K), hybrid won't work — go RAG
        if model_context_length < 16_000:
            strategy = "RAG"
            confidence = 0.80
            reasoning = (
                f"Content ({content_tokens:,} tokens) and model context ({model_context_length:,}) "
                f"are too small for hybrid. Use pure RAG to selectively retrieve."
            )
        else:
            strategy = "HYBRID"
            confidence = 0.75
            reasoning = (
                f"Content ({content_tokens:,} tokens) exceeds available budget ({available_budget:,}) "
                f"but is in hybrid zone. Stuff most-relevant chunks, retrieve the rest on demand."
            )

    # Rule 4: Content is large or doesn't fit → RAG
    elif content_tokens > HYBRID_THRESHOLD or not fits_in_context:
        if content_tokens > GRAPH_RAG_THRESHOLD:
            strategy = "RAG+GRAPH"
            confidence = 0.70
            reasoning = (
                f"Content ({content_tokens:,} tokens) exceeds {GRAPH_RAG_THRESHOLD:,} tokens. "
                f"Linear retrieval won't scale. Build a knowledge graph index, then query it."
            )
        else:
            strategy = "RAG"
            confidence = 0.80
            reasoning = (
                f"Content ({content_tokens:,} tokens) exceeds available budget ({available_budget:,}). "
                f"Use targeted retrieval with reranking to select the most relevant passages."
            )

    # Rule 5: Fallback — HYBRID with low confidence
    # (Reachable only for task_type "unknown" with fitting, mid-size content.)
    else:
        strategy = "HYBRID"
        confidence = 0.60
        reasoning = (
            f"Content ({content_tokens:,} tokens), budget ({available_budget:,}), task '{task_type}'. "
            f"No clear winner — default to hybrid for flexibility."
        )

    # --- Build concrete steps ---

    steps = _build_steps(strategy, task_type, available_tools, content_tokens, available_budget)

    # --- Retrieval budget ---

    if strategy == "STUFF":
        retrieval_budget = 0
    elif strategy == "HYBRID":
        # Reserve space for targeted retrieval results
        retrieval_budget = max(MIN_RETRIEVAL_BUDGET, int(available_budget * 0.3))
    else:  # RAG or RAG+GRAPH
        retrieval_budget = max(MIN_RETRIEVAL_BUDGET, int(available_budget * 0.7))

    return {
        "strategy": strategy,
        "confidence": confidence,
        "reasoning": reasoning,
        "content_tokens": content_tokens,
        "available_budget": available_budget,
        "model_context_length": model_context_length,
        "used_tokens": used_tokens,
        "utilization_pct": round(utilization * 100, 1),
        "task_type": task_type,
        "retrieval_budget_tokens": retrieval_budget,
        "steps": steps,
    }
|
||||
|
||||
|
||||
def _build_steps(strategy: str, task_type: str, available_tools: List[str],
|
||||
content_tokens: int, budget: int) -> List[str]:
|
||||
"""Build concrete actionable steps for the recommended strategy."""
|
||||
steps = []
|
||||
|
||||
if strategy == "STUFF":
|
||||
steps.append("Load all relevant content directly into context.")
|
||||
if "read_file" in available_tools:
|
||||
steps.append("Use read_file to pull documents into context.")
|
||||
if "web_extract" in available_tools:
|
||||
steps.append("Use web_extract to pull web pages into context.")
|
||||
steps.append("No retrieval needed — process everything in-context.")
|
||||
|
||||
elif strategy == "HYBRID":
|
||||
steps.append("1. Identify the 3-5 most critical documents/sections.")
|
||||
steps.append("2. Stuff those directly into context (read_file or web_extract).")
|
||||
if "session_search" in available_tools:
|
||||
steps.append("3. Use session_search for relevant past conversation context.")
|
||||
if "web_search" in available_tools:
|
||||
steps.append("4. Use web_search for any gaps or verification.")
|
||||
if "memory" in available_tools:
|
||||
steps.append("5. Check memory for any curated relevant facts.")
|
||||
steps.append(f"6. Keep total retrieved content under {budget:,} tokens.")
|
||||
|
||||
elif strategy == "RAG":
|
||||
steps.append("1. Do NOT stuff full documents — retrieve selectively.")
|
||||
if "web_search" in available_tools:
|
||||
steps.append("2. Use web_search to find relevant sources.")
|
||||
if "session_search" in available_tools:
|
||||
steps.append("3. Use session_search for historical context.")
|
||||
if "web_extract" in available_tools:
|
||||
steps.append("4. Use web_extract on only the most relevant 2-3 URLs.")
|
||||
steps.append("5. Rerank results by relevance before adding to context.")
|
||||
steps.append(f"6. Cap retrieved content at {budget:,} tokens.")
|
||||
steps.append("7. If precision is critical, verify key facts with a second source.")
|
||||
|
||||
elif strategy == "RAG+GRAPH":
|
||||
steps.append("1. Content is too large for linear RAG.")
|
||||
steps.append("2. Build or use an existing knowledge graph/index.")
|
||||
steps.append("3. Query the graph for relevant nodes/relationships.")
|
||||
steps.append("4. Retrieve only the subgraph needed for the current question.")
|
||||
steps.append("5. Use web_search as a fallback for unindexed content.")
|
||||
|
||||
# Add task-specific guidance
|
||||
if task_type == "rag_critical":
|
||||
steps.append("⚠️ HIGH PRECISION TASK: Verify all retrieved facts. Cite sources.")
|
||||
elif task_type == "crisis_intervention":
|
||||
steps.append("🚨 CRISIS: Accuracy is paramount. Ground responses in verified sources.")
|
||||
|
||||
return steps
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tool registration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _handle_context_strategy(args: Dict[str, Any], **kwargs) -> str:
    """Handler for the context_strategy tool.

    Extracts the tool arguments, classifies the task type from the query,
    estimates the content size, runs the decision engine, and returns the
    recommendation as pretty-printed JSON.
    """
    query = args.get("query", "")

    # Classify the task, estimate content size, then feed both into the
    # decision engine alongside the caller-supplied context parameters.
    recommendation = compute_strategy(
        model_context_length=args.get("model_context_length", 128_000),
        used_tokens=args.get("used_tokens", 0),
        content_tokens=estimate_content_tokens(
            content_size_hint=args.get("content_size_hint"),
            file_paths=args.get("file_paths"),
            char_count=args.get("char_count"),
        ),
        task_type=classify_task_type(query),
        available_tools=args.get("available_tools", []),
    )

    return json.dumps(recommendation, indent=2)
|
||||
|
||||
|
||||
# Register the context_strategy tool with the project tool registry.
# The JSON schema below is what the model sees; keep descriptions in sync
# with classify_task_type / estimate_content_tokens / compute_strategy.
registry.register(
    name="context_strategy",
    toolset="core",
    schema={
        "name": "context_strategy",
        "description": (
            "Analyze a query and recommend the optimal information retrieval strategy: "
            "STUFF (load everything in context), HYBRID (stuff key docs + targeted retrieval), "
            "or RAG (pure retrieval with reranking). Considers model context window, current "
            "token usage, content size, and task type. Use this BEFORE deciding whether to "
            "retrieve documents or just load them into context. "
            "Research: Self-RAG (Asai et al., 2023), Long Context vs RAG framework."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "The query or goal you need information for.",
                },
                # Coarse fallback when neither file_paths nor char_count is known.
                "content_size_hint": {
                    "type": "string",
                    "description": (
                        "Estimated content size: tiny (<2K tokens), small (<8K), "
                        "medium (<32K), large (<100K), huge (<500K), massive (>500K). "
                        "Optional if file_paths or char_count provided."
                    ),
                    "enum": ["tiny", "small", "medium", "large", "huge", "massive"],
                },
                # Sizes are estimated by stat-ing these paths (see estimate_content_tokens).
                "file_paths": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "List of file paths to estimate size from (optional).",
                },
                "char_count": {
                    "type": "integer",
                    "description": "Exact character count of content (optional, most accurate).",
                },
                "model_context_length": {
                    "type": "integer",
                    "description": (
                        "Model's context window in tokens. Default: 128000. "
                        "Pass the actual value from your model if known."
                    ),
                    "default": 128000,
                },
                "used_tokens": {
                    "type": "integer",
                    "description": (
                        "Tokens already consumed in this conversation (system prompt + history). "
                        "Default: 0 (start of conversation)."
                    ),
                    "default": 0,
                },
                # Tool names are matched against hard-coded names in _build_steps
                # (read_file, web_extract, web_search, session_search, memory).
                "available_tools": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": (
                        "List of available tool names (e.g. ['web_search', 'read_file', "
                        "'session_search', 'memory']). Used to tailor recommendations."
                    ),
                },
            },
            "required": ["query"],
        },
    },
    # NOTE(review): the lambda is a plain pass-through; _handle_context_strategy
    # could be passed directly — kept as-is to preserve behavior byte-for-byte.
    handler=lambda args, **kw: _handle_context_strategy(args, **kw),
    check_fn=lambda: True,  # Always available — no external deps
    requires_env=[],
)
|
||||
@@ -50,6 +50,8 @@ _HERMES_CORE_TOOLS = [
|
||||
"todo", "memory",
|
||||
# Session history search
|
||||
"session_search",
|
||||
# Context strategy (Long Context vs RAG decision)
|
||||
"context_strategy",
|
||||
# Clarifying questions
|
||||
"clarify",
|
||||
# Code execution + delegation
|
||||
|
||||
Reference in New Issue
Block a user