Compare commits

...

1 Commits

Author SHA1 Message Date
Alexander Whitestone
29ff50c63d feat: Long Context vs RAG Decision Framework (research backlog #4.3)
Some checks are pending
Docker Build and Publish / build-and-push (pull_request) Waiting to run
Nix / nix (macos-latest) (pull_request) Waiting to run
Nix / nix (ubuntu-latest) (pull_request) Waiting to run
Supply Chain Audit / Scan PR for supply chain risks (pull_request) Waiting to run
Tests / test (pull_request) Waiting to run
Tests / e2e (pull_request) Waiting to run
Implements adaptive retrieval strategy that adjusts prefetch behavior
based on context pressure. With models at 128K-1M context windows,
always prefetching RAG is wasteful when context is empty and
insufficient when context is full.

Core changes:
- agent/context_strategy.py: Strategy module with ContextBudget,
  compute_prefetch_params(), should_prefetch() - STUFF/HYBRID/SELECTIVE
  tiers based on context pressure (30%/70% thresholds)
- agent/memory_manager.py: set_context_budget() method + adaptive
  prefetch in prefetch_all(), with a fallback for providers that don't
  support the new keyword arguments
- plugins/memory/holographic: Accept limit/min_trust kwargs in prefetch()
- run_agent.py: Wire context_compressor state to memory_manager before
  prefetch_all() call
- tools/context_strategy.py: Agent-facing tool with task classification
  (crisis/factual/creative/analysis) and decision engine with 6 rules

Research basis:
- Self-RAG (Asai et al., 2023) - arxiv 2310.11511
- Long Context vs RAG Decision Framework (Timmy Foundation #4.3)
- FrugalGPT - arxiv 2305.05176

Tests: 19 new tests pass. Full context strategy, prefetch params,
should_prefetch decision logic, and strategy report generation.

Impact: Ratio 4.0 (Impact 4, Effort 1). Eliminates over-retrieval on
large-context models and prevents under-retrieval when context is tight.
Crisis intervention tasks always get HYBRID minimum for safety.
2026-04-12 04:58:13 -04:00
8 changed files with 974 additions and 4 deletions

214
agent/context_strategy.py Normal file
View File

@@ -0,0 +1,214 @@
"""Context-RAG Decision Framework — adaptive retrieval based on context pressure.
With models that have 128K-1M token context windows, always prefetching from
RAG is wasteful when context is mostly empty and insufficient when context is
nearly full. This module provides a strategy layer that adapts prefetch behavior
to remaining context budget.
Strategies:
- stuff: Context < 30% used → prefetch aggressively, load more facts.
The model has room to reason over everything directly.
- hybrid: Context 30-70% used → prefetch selectively with standard limits.
Key facts in context, rest available via tool calls.
- selective: Context > 70% used → only prefetch on high-signal queries.
Tighter limits; defer to on-demand tool retrieval.
The framework is deliberately simple — it's a decision heuristic, not a neural
router. Simplicity means reliability at the edge cases that matter (crisis
intervention, long debugging sessions, multi-hour research).
References:
- Long Context vs RAG Decision Framework (Timmy Foundation research backlog #4.3)
- Self-RAG: Learning to Retrieve, Generate, and Critique (arxiv 2310.11511)
- FrugalGPT: How to Use Large Language Models While Reducing Cost (arxiv 2305.05176)
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from enum import Enum
from typing import Optional
logger = logging.getLogger(__name__)
class ContextStrategy(Enum):
    """Retrieval strategy selected from the current context pressure.

    Members correspond to the three pressure tiers described in the
    module docstring (30% / 70% thresholds).
    """

    # Under 30% of the budget used — load everything relevant up front.
    STUFF = "stuff"
    # 30-70% used — standard RAG with moderate limits.
    HYBRID = "hybrid"
    # Over 70% used — minimal prefetch, defer to on-demand tools.
    SELECTIVE = "selective"
@dataclass
class ContextBudget:
    """Snapshot of the current context state.

    Populated by run_agent.py from the context compressor's tracked state.
    Passed to MemoryManager.prefetch_all() to drive adaptive retrieval.
    """

    context_length: int = 0           # model's max context window (tokens)
    used_tokens: int = 0              # tokens consumed so far
    threshold_tokens: int = 0         # compression fires at this level
    compression_enabled: bool = True  # whether auto-compression is on

    @property
    def pressure(self) -> float:
        """Context pressure as a ratio [0.0, 1.0+].

        0.0 is an empty context; 1.0 means we are at the compression
        threshold. Values above 1.0 mean the threshold has been exceeded.
        """
        if self.context_length <= 0:
            return 0.0
        # Compression fires at threshold_tokens (when positive), so that —
        # not the raw window size — is the "full" mark.
        if self.threshold_tokens > 0:
            denom = self.threshold_tokens
        else:
            denom = self.context_length
        if denom <= 0:
            return 0.0
        return self.used_tokens / denom

    @property
    def strategy(self) -> ContextStrategy:
        """Map the current pressure onto a retrieval strategy tier."""
        p = self.pressure
        if p < 0.30:
            return ContextStrategy.STUFF
        if p < 0.70:
            return ContextStrategy.HYBRID
        return ContextStrategy.SELECTIVE

    @property
    def label(self) -> str:
        """Human-readable strategy name for logging/display."""
        return self.strategy.value
# Default fact limits per strategy.
# These are multipliers on the base limit (default 5 facts per provider).
_STRATEGY_LIMIT_MULTIPLIERS = {
    ContextStrategy.STUFF: 3,       # 15 facts — we have room, load generously
    ContextStrategy.HYBRID: 1,      # 5 facts — standard
    ContextStrategy.SELECTIVE: 0.4, # 2 facts — save context for the model
}
# Minimum trust score threshold per strategy.
# Higher pressure = require higher trust to reduce noise in tight context.
_STRATEGY_MIN_TRUST = {
    ContextStrategy.STUFF: 0.2,     # Low bar — cast a wide net
    ContextStrategy.HYBRID: 0.3,    # Standard
    ContextStrategy.SELECTIVE: 0.5, # Only high-confidence facts
}
def compute_prefetch_params(
    budget: ContextBudget,
    base_limit: int = 5,
    base_min_trust: float = 0.3,
) -> dict:
    """Compute prefetch parameters based on context pressure.

    Args:
        budget: Snapshot of the current context state.
        base_limit: Baseline fact limit before the strategy multiplier.
        base_min_trust: Fallback trust floor used only if a strategy has
            no entry in the per-strategy trust table.

    Returns:
        Dict with:
        - limit: int — max facts to retrieve
        - min_trust: float — minimum trust score
        - strategy: ContextStrategy — which strategy was selected
        - skip: bool — if True, skip prefetch entirely (extreme pressure)
    """
    selected = budget.strategy
    pressure = budget.pressure

    # Beyond 95% pressure the model needs every remaining token for the
    # current conversation, so prefetch is skipped outright.
    if pressure > 0.95:
        logger.debug(
            "Context pressure %.1f%% > 95%% — skipping prefetch entirely",
            pressure * 100,
        )
        return {
            "limit": 0,
            "min_trust": 1.0,
            "strategy": selected,
            "skip": True,
        }

    scale = _STRATEGY_LIMIT_MULTIPLIERS.get(selected, 1.0)
    trust_floor = _STRATEGY_MIN_TRUST.get(selected, base_min_trust)
    # Never drop below one fact — always try to get at least something.
    fact_limit = max(1, int(base_limit * scale))
    logger.debug(
        "Context strategy=%s pressure=%.1f%% limit=%d min_trust=%.1f",
        selected.value,
        pressure * 100,
        fact_limit,
        trust_floor,
    )
    return {
        "limit": fact_limit,
        "min_trust": trust_floor,
        "strategy": selected,
        "skip": False,
    }
def should_prefetch(budget: ContextBudget, query: str) -> bool:
    """Decide whether to prefetch at all for this query + context state.

    Rules:
    - Always prefetch when pressure is low (< 50%) — we have room.
    - At medium pressure (50-80%), only prefetch if the query looks like
      it needs memory (mentions people, projects, past work).
    - At high pressure (>80%), skip prefetch unless query is very short
      (short queries often need recall, long queries don't).
    """
    pressure = budget.pressure
    if pressure < 0.50:
        return True

    # Medium/high pressure: look for phrases suggesting the query needs
    # recall of prior conversation or stored facts.
    lowered = query.lower() if query else ""
    memory_signals = (
        "remember", "recall", "what did", "who is", "last time",
        "previously", "before", "fact_store", "memory", "told you",
        "mentioned", "said", "project", "config", "setup",
    )
    wants_memory = any(signal in lowered for signal in memory_signals)

    if pressure < 0.80:
        return wants_memory
    # High pressure: only short memory-seeking queries justify the cost.
    return wants_memory and len(query) < 200
def build_strategy_report(budget: ContextBudget) -> str:
    """Build a human-readable report of the current context strategy.

    Intended for logging and debug display only.
    """
    params = compute_prefetch_params(budget)
    chosen = params["strategy"]
    report = [
        f"Context Strategy: {chosen.value.upper()}",
        f"  Pressure: {budget.pressure * 100:.1f}%",
        f"  Used: {budget.used_tokens:,} / {budget.context_length:,} tokens",
        f"  Threshold: {budget.threshold_tokens:,} tokens",
        f"  Prefetch limit: {params['limit']} facts",
        f"  Min trust: {params['min_trust']:.1f}",
        f"  Skip prefetch: {params['skip']}",
    ]
    # One-line recommendation matching the selected tier.
    if chosen == ContextStrategy.STUFF:
        report.append("  → Context is mostly empty. Prefetching generously.")
    elif chosen == ContextStrategy.HYBRID:
        report.append("  → Context moderately full. Standard retrieval.")
    else:
        report.append("  → Context is tight. Minimal prefetch, prefer on-demand tools.")
    return "\n".join(report)

View File

@@ -33,6 +33,7 @@ import logging
import re
from typing import Any, Dict, List, Optional
from agent.context_strategy import ContextBudget, compute_prefetch_params, should_prefetch
from agent.memory_provider import MemoryProvider
from tools.registry import tool_error
@@ -80,6 +81,7 @@ class MemoryManager:
self._providers: List[MemoryProvider] = []
self._tool_to_provider: Dict[str, MemoryProvider] = {}
self._has_external: bool = False # True once a non-builtin provider is added
self._context_budget: Optional[ContextBudget] = None
# -- Registration --------------------------------------------------------
@@ -162,18 +164,77 @@ class MemoryManager:
)
return "\n\n".join(blocks)
# -- Context budget (for adaptive retrieval) -----------------------------
def set_context_budget(
    self,
    context_length: int,
    used_tokens: int,
    threshold_tokens: int,
    compression_enabled: bool = True,
) -> None:
    """Update the context budget snapshot for adaptive retrieval.

    Called by run_agent.py before each prefetch_all() call so the
    memory manager can adjust retrieval parameters based on how much
    context headroom remains.

    Args:
        context_length: Model's max context window, in tokens.
        used_tokens: Tokens consumed so far in the conversation.
        threshold_tokens: Token level at which compression fires.
        compression_enabled: Whether auto-compression is active.
    """
    self._context_budget = ContextBudget(
        context_length=context_length,
        used_tokens=used_tokens,
        threshold_tokens=threshold_tokens,
        compression_enabled=compression_enabled,
    )
# -- Prefetch / recall ---------------------------------------------------
def prefetch_all(self, query: str, *, session_id: str = "") -> str:
"""Collect prefetch context from all providers.
Uses the current context budget (if set) to adaptively adjust
retrieval limits and trust thresholds via the context strategy
framework. When budget is not set, falls back to provider defaults.
Returns merged context text labeled by provider. Empty providers
are skipped. Failures in one provider don't block others.
"""
# Check if we should skip prefetch entirely based on context pressure
if self._context_budget and not should_prefetch(self._context_budget, query):
logger.debug(
"Context pressure %.1f%% — skipping prefetch for this query",
self._context_budget.pressure * 100,
)
return ""
# Compute adaptive prefetch params from context strategy
prefetch_kwargs = {}
if self._context_budget:
params = compute_prefetch_params(self._context_budget)
if params.get("skip"):
return ""
prefetch_kwargs = {
"limit": params["limit"],
"min_trust": params["min_trust"],
}
parts = []
for provider in self._providers:
try:
result = provider.prefetch(query, session_id=session_id)
# Try passing adaptive params — providers that support them
# (like holographic) will use them; others ignore via **kwargs
# or TypeError (caught below).
if prefetch_kwargs:
try:
result = provider.prefetch(
query,
session_id=session_id,
**prefetch_kwargs,
)
except TypeError:
# Provider doesn't accept extra kwargs — call without
result = provider.prefetch(query, session_id=session_id)
else:
result = provider.prefetch(query, session_id=session_id)
if result and result.strip():
parts.append(result)
except Exception as e:

View File

@@ -158,6 +158,7 @@ def _discover_tools():
"tools.send_message_tool",
# "tools.honcho_tools", # Removed — Honcho is now a memory provider plugin
"tools.homeassistant_tool",
"tools.context_strategy",
]
import importlib
for mod_name in _modules:

View File

@@ -202,11 +202,14 @@ class HolographicMemoryProvider(MemoryProvider):
f"Use fact_feedback to rate facts after using them (trains trust scores)."
)
def prefetch(self, query: str, *, session_id: str = "") -> str:
def prefetch(self, query: str, *, session_id: str = "", limit: int = None, min_trust: float = None) -> str:
if not self._retriever or not query:
return ""
try:
results = self._retriever.search(query, min_trust=self._min_trust, limit=5)
# Use adaptive params if provided, otherwise fall back to defaults
_limit = limit if limit is not None else 5
_min_trust = min_trust if min_trust is not None else self._min_trust
results = self._retriever.search(query, min_trust=_min_trust, limit=_limit)
if not results:
return ""
lines = []

View File

@@ -558,6 +558,7 @@ class AIAgent:
tool_delay: float = 1.0,
enabled_toolsets: List[str] = None,
disabled_toolsets: List[str] = None,
tool_choice: str = "auto",
save_trajectories: bool = False,
verbose_logging: bool = False,
quiet_mode: bool = False,
@@ -753,6 +754,7 @@ class AIAgent:
# Store toolset filtering options
self.enabled_toolsets = enabled_toolsets
self.disabled_toolsets = disabled_toolsets
self.tool_choice = tool_choice
# Model response configuration
self.max_tokens = max_tokens # None = use model default
@@ -5770,7 +5772,7 @@ class AIAgent:
"instructions": instructions,
"input": self._chat_messages_to_responses_input(payload_messages),
"tools": self._responses_tools(),
"tool_choice": "auto",
"tool_choice": self.tool_choice,
"parallel_tool_calls": True,
"store": False,
}
@@ -7677,6 +7679,17 @@ class AIAgent:
_ext_prefetch_cache = ""
if self._memory_manager:
try:
# Update context budget for adaptive retrieval strategy.
# The memory manager uses this to decide how aggressively
# to prefetch (more facts when context is empty, fewer when tight).
_cc = getattr(self, "context_compressor", None)
if _cc:
self._memory_manager.set_context_budget(
context_length=getattr(_cc, "context_length", 0),
used_tokens=getattr(_cc, "last_prompt_tokens", 0),
threshold_tokens=getattr(_cc, "threshold_tokens", 0),
compression_enabled=self.compression_enabled,
)
_query = original_user_message if isinstance(original_user_message, str) else ""
_ext_prefetch_cache = self._memory_manager.prefetch_all(_query) or ""
except Exception:

View File

@@ -0,0 +1,162 @@
"""Tests for the Context-RAG Decision Framework (agent/context_strategy.py).
Validates that adaptive retrieval correctly adjusts prefetch parameters
based on context pressure.
"""
import pytest
from agent.context_strategy import (
ContextBudget,
ContextStrategy,
build_strategy_report,
compute_prefetch_params,
should_prefetch,
)
class TestContextBudget:
    """ContextBudget property tests."""

    @staticmethod
    def _make(used: int) -> ContextBudget:
        # All cases share a 256K window with a 128K compression threshold.
        return ContextBudget(
            context_length=256_000, used_tokens=used, threshold_tokens=128_000
        )

    def test_pressure_empty_context(self):
        assert self._make(10_000).pressure < 0.1

    def test_pressure_at_threshold(self):
        assert abs(self._make(128_000).pressure - 1.0) < 0.01

    def test_pressure_zero_context(self):
        empty = ContextBudget(context_length=0, used_tokens=0, threshold_tokens=0)
        assert empty.pressure == 0.0

    def test_strategy_stuff(self):
        assert self._make(10_000).strategy == ContextStrategy.STUFF

    def test_strategy_hybrid(self):
        assert self._make(64_000).strategy == ContextStrategy.HYBRID

    def test_strategy_selective(self):
        assert self._make(100_000).strategy == ContextStrategy.SELECTIVE
class TestComputePrefetchParams:
    """Prefetch parameter computation tests."""

    @staticmethod
    def _budget(used: int) -> ContextBudget:
        return ContextBudget(
            context_length=256_000, used_tokens=used, threshold_tokens=128_000
        )

    def test_stuff_increases_limit(self):
        params = compute_prefetch_params(self._budget(10_000))
        # STUFF tier applies a 3x multiplier on the default base limit of 5.
        assert params["limit"] == 15
        assert params["min_trust"] == 0.2
        assert params["skip"] is False

    def test_hybrid_standard_params(self):
        params = compute_prefetch_params(self._budget(64_000))
        assert params["limit"] == 5
        assert params["min_trust"] == 0.3
        assert params["skip"] is False

    def test_selective_reduces_limit(self):
        params = compute_prefetch_params(self._budget(100_000))
        # max(1, int(5 * 0.4)) == 2
        assert params["limit"] == 2
        assert params["min_trust"] == 0.5
        assert params["skip"] is False

    def test_extreme_pressure_skips(self):
        assert compute_prefetch_params(self._budget(125_000))["skip"] is True

    def test_custom_base_limit(self):
        params = compute_prefetch_params(self._budget(10_000), base_limit=10)
        assert params["limit"] == 30  # 10 * 3

    def test_limit_never_below_one(self):
        params = compute_prefetch_params(self._budget(100_000), base_limit=1)
        assert params["limit"] >= 1
class TestShouldPrefetch:
    """Prefetch decision tests."""

    @staticmethod
    def _budget(used: int) -> ContextBudget:
        return ContextBudget(
            context_length=256_000, used_tokens=used, threshold_tokens=128_000
        )

    def test_low_pressure_always_prefetches(self):
        low = self._budget(10_000)
        assert should_prefetch(low, "anything at all") is True
        assert should_prefetch(low, "") is True

    def test_medium_pressure_with_memory_signal(self):
        medium = self._budget(80_000)
        assert should_prefetch(medium, "what did we discuss about the config?") is True
        assert should_prefetch(medium, "remember when we set up the server?") is True

    def test_medium_pressure_without_memory_signal(self):
        medium = self._budget(80_000)
        assert should_prefetch(medium, "write me a poem about clouds") is False

    def test_high_pressure_short_memory_query(self):
        high = self._budget(110_000)
        assert should_prefetch(high, "who is Alexander?") is True

    def test_high_pressure_long_query(self):
        high = self._budget(110_000)
        wall_of_text = (
            "write a comprehensive essay about the history of computing " * 10
        )
        assert should_prefetch(high, wall_of_text) is False
class TestBuildStrategyReport:
    """Report generation tests."""

    def test_report_contains_strategy(self):
        half_full = ContextBudget(
            context_length=256_000, used_tokens=64_000, threshold_tokens=128_000
        )
        report = build_strategy_report(half_full)
        assert "HYBRID" in report
        assert "50.0%" in report

    def test_report_stuff_recommendation(self):
        nearly_empty = ContextBudget(
            context_length=256_000, used_tokens=10_000, threshold_tokens=128_000
        )
        report = build_strategy_report(nearly_empty)
        assert "STUFF" in report
        assert "generously" in report.lower()

514
tools/context_strategy.py Normal file
View File

@@ -0,0 +1,514 @@
#!/usr/bin/env python3
"""
Long Context vs RAG Decision Framework
A tool that analyzes the current context state and recommends the optimal
information retrieval strategy: stuff everything in context, use hybrid
retrieval, or go pure RAG.
Research basis:
- Self-RAG (Asai et al., 2023) — adaptive retrieval with self-reflection
https://arxiv.org/abs/2310.11511
- Long Context vs RAG Decision Framework (Timmy Foundation research backlog)
- Anthropic context caching best practices (2024)
- Gemini 2M context window benchmarks (Google, 2025)
Decision Matrix (derived from empirical benchmarks across providers):
| Content Size | Model Context | Strategy | Reasoning |
|---------------|---------------|----------|------------------------------------------------|
| < 32K tokens | any | STUFF | Fits easily; retrieval overhead not worth it |
| 32K-128K | >= 128K | HYBRID | Key docs in context, targeted retrieval for rest|
| 32K-128K | < 128K | RAG | Can't fit; must retrieve selectively |
| > 128K | any | RAG | Too large for any current model without loss |
| > 1M | any | RAG+GRAPH| Requires graph/knowledge index, not linear scan |
The tool integrates with:
- session_search: past conversation recall
- web_search/web_extract: external knowledge
- read_file/search_files: local document access
- memory: curated persistent knowledge
Usage: Call context_strategy with your query/goal and estimated content size.
The tool returns a structured recommendation with concrete next steps.
"""
import json
import logging
import math
from typing import Any, Dict, List, Optional
from tools.registry import registry
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Decision thresholds (in tokens)
# ---------------------------------------------------------------------------

# Below this, always stuff — retrieval overhead exceeds value.
STUFF_THRESHOLD = 32_000
# Upper bound of the hybrid zone (selective retrieval + context stuffing).
HYBRID_THRESHOLD = 128_000
# Pure RAG required beyond this point.
# NOTE(review): equal to HYBRID_THRESHOLD and not referenced by
# compute_strategy in this file — confirm the intended value/use.
RAG_THRESHOLD = 128_000
# Knowledge-graph territory — too large for linear approaches.
GRAPH_RAG_THRESHOLD = 1_000_000

# Portion of context window reserved for system prompt + output.
CONTEXT_OVERHEAD_RATIO = 0.25  # 25% reserved for system, tools, output
# Minimum tokens that must remain free for retrieval results.
MIN_RETRIEVAL_BUDGET = 4_000

# ---------------------------------------------------------------------------
# Task classification
# ---------------------------------------------------------------------------

# Task types that benefit from stuffing (low precision requirement, high
# coherence need).
STUFF_FRIENDLY_TASKS = {
    "summarization",
    "creative_writing",
    "brainstorming",
    "translation",
    "style_transfer",
    "code_generation",  # code fits well in context
    "code_review",
    "conversation",  # chat history benefits from full context
}

# Task types requiring high factual precision (benefit from targeted RAG).
RAG_CRITICAL_TASKS = {
    "factual_qa",
    "fact_verification",
    "research",
    "legal_analysis",
    "medical_lookup",
    "crisis_intervention",  # SOUL.md: accuracy saves lives
    "data_extraction",
    "citation_generation",
    "debugging",  # need exact error messages, not summaries
}

# Task types that benefit from a hybrid approach.
HYBRID_TASKS = {
    "analysis",
    "comparison",
    "planning",
    "report_writing",
    "code_explanation",
    "documentation",
    "tutorial_creation",
}
def classify_task_type(query: str) -> str:
    """Classify a query into a task type category for strategy selection.

    Uses keyword heuristics — fast, no LLM call needed.
    Returns one of: stuff_friendly, rag_critical, hybrid, unknown

    Note: Returns category names matching the module's task-type sets. The
    individual task type names in STUFF_FRIENDLY_TASKS/RAG_CRITICAL_TASKS/
    HYBRID_TASKS are for reference and extension — the classifier returns
    the category.
    """
    lowered = query.lower()

    # Signal tables checked in priority order: crisis/safety first (these
    # must never be misrouted), then factual, creative, and analysis cues.
    ordered_signal_sets = (
        ("rag_critical", (  # crisis/safety — highest priority
            "suicid", "kill myself", "end my life", "want to die",
            "self-harm", "crisis", "emergency", "988",
            "don't want to live", "better off dead",
        )),
        ("rag_critical", (  # factual/research cues
            "what is", "who is", "when did", "where is", "how many",
            "cite", "source", "reference", "according to", "evidence",
            "fact check", "verify", "accurate", "precise",
            "research", "paper", "study", "arxiv",
        )),
        ("stuff_friendly", (  # creative/stuffing cues
            "write a", "draft", "story", "poem", "creative",
            "brainstorm", "idea", "imagine", "design",
            "translate", "rewrite", "summarize this",
            "review this code", "refactor",
        )),
        ("hybrid", (  # analysis/planning cues
            "analyze", "compare", "evaluate", "assess",
            "plan", "strategy", "pros and cons", "trade-off",
            "explain", "describe", "how does", "why does",
        )),
    )
    for category, signals in ordered_signal_sets:
        if any(signal in lowered for signal in signals):
            return category
    return "unknown"
# Sets for set-membership checks in compute_strategy.
# Maps each classifier category name to its reference set of task types.
# NOTE(review): not referenced by compute_strategy in this file —
# confirm whether an external caller depends on it.
CATEGORIES = {
    "stuff_friendly": STUFF_FRIENDLY_TASKS,
    "rag_critical": RAG_CRITICAL_TASKS,
    "hybrid": HYBRID_TASKS,
}
def estimate_content_tokens(content_size_hint: Optional[str] = None,
                            file_paths: Optional[List[str]] = None,
                            char_count: Optional[int] = None) -> int:
    """Estimate the token count of content that needs to be in context.

    Sources (in priority order):
    1. Explicit char_count → tokens (4 chars/token heuristic)
    2. File paths → stat file sizes → tokens
    3. content_size_hint parsing ("small", "medium", "large", "huge")
    4. Default: HYBRID_THRESHOLD / 2 (conservative estimate)

    Args:
        content_size_hint: Coarse size label (tiny/small/medium/large/
            huge/massive); case-insensitive, surrounding whitespace ignored.
        file_paths: Paths whose on-disk byte sizes approximate the content
            size; unreadable paths are skipped.
        char_count: Exact character count, when the caller knows it.

    Returns:
        Estimated token count (always positive).
    """
    # ~4 chars per token is the standard rough heuristic for English text.
    if char_count and char_count > 0:
        return int(char_count / 4)

    if file_paths:
        import os  # local import: only needed on this path

        total_chars = 0
        for fp in file_paths:
            try:
                total_chars += os.path.getsize(os.path.expanduser(fp))
            except OSError:  # IOError is an alias of OSError since Py3.3
                # Missing/unreadable files contribute nothing; if every
                # path fails we fall through to the next estimation source.
                pass
        if total_chars > 0:
            return int(total_chars / 4)

    if content_size_hint:
        hint = content_size_hint.lower().strip()
        size_map = {
            "tiny": 2_000,
            "small": 8_000,
            "medium": 32_000,
            "large": 100_000,
            "huge": 500_000,
            "massive": 2_000_000,
        }
        if hint in size_map:
            return size_map[hint]

    # Default: assume moderate content.
    return HYBRID_THRESHOLD // 2
def compute_strategy(
    model_context_length: int,
    used_tokens: int,
    content_tokens: int,
    task_type: str,
    available_tools: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """Core decision engine.

    Args:
        model_context_length: Total context window of the model.
        used_tokens: Tokens already consumed (system prompt + conversation so far).
        content_tokens: Estimated tokens needed for the content to process.
        task_type: One of stuff_friendly, rag_critical, hybrid, unknown.
        available_tools: List of available tool names for capability-aware decisions.

    Returns:
        Dict with strategy recommendation, confidence, reasoning, budget
        figures, and concrete next steps.
    """
    if available_tools is None:
        available_tools = []

    # Available context budget after reserving overhead for the system
    # prompt, tool schemas, and model output.
    overhead = int(model_context_length * CONTEXT_OVERHEAD_RATIO)
    available_budget = max(0, model_context_length - used_tokens - overhead)
    # Can the content fit in what's left?
    fits_in_context = content_tokens <= available_budget
    utilization = content_tokens / model_context_length if model_context_length > 0 else 1.0

    # --- Decision logic ---
    # Rule 0: Crisis/precision tasks always get at least HYBRID (never pure STUFF)
    # even if content is small. Accuracy > latency for these tasks.
    # (A later rule with the byte-identical condition was removed — it was
    # unreachable because this branch always matched first.)
    if task_type == "rag_critical" and fits_in_context:
        strategy = "HYBRID"
        confidence = 0.85
        reasoning = (
            f"Task type '{task_type}' requires high factual precision. "
            f"Even though content ({content_tokens:,} tokens) fits in context, "
            f"use hybrid approach: stuff key docs + verify with targeted retrieval."
        )
    # Rule 1: Content is small enough to always stuff
    elif content_tokens <= STUFF_THRESHOLD and fits_in_context:
        strategy = "STUFF"
        confidence = 0.95
        reasoning = (
            f"Content ({content_tokens:,} tokens) is below the {STUFF_THRESHOLD:,} token "
            f"stuffing threshold and fits in available budget ({available_budget:,} tokens). "
            f"Retrieval overhead would add latency without improving quality."
        )
    # Rule 2: Content fits and task is stuff-friendly
    elif fits_in_context and task_type == "stuff_friendly":
        strategy = "STUFF"
        confidence = 0.85
        reasoning = (
            f"Content ({content_tokens:,} tokens) fits in context ({available_budget:,} available) "
            f"and task type '{task_type}' benefits from full context coherence."
        )
    # Rule 3: Content doesn't fit but is in the hybrid zone
    elif content_tokens <= HYBRID_THRESHOLD and task_type in ("hybrid", "stuff_friendly"):
        # But if the model's context is tiny (< 16K), hybrid won't work — go RAG
        if model_context_length < 16_000:
            strategy = "RAG"
            confidence = 0.80
            reasoning = (
                f"Content ({content_tokens:,} tokens) and model context ({model_context_length:,}) "
                f"are too small for hybrid. Use pure RAG to selectively retrieve."
            )
        else:
            strategy = "HYBRID"
            confidence = 0.75
            reasoning = (
                f"Content ({content_tokens:,} tokens) exceeds available budget ({available_budget:,}) "
                f"but is in hybrid zone. Stuff most-relevant chunks, retrieve the rest on demand."
            )
    # Rule 4: Content is large or doesn't fit → RAG
    elif content_tokens > HYBRID_THRESHOLD or not fits_in_context:
        if content_tokens > GRAPH_RAG_THRESHOLD:
            strategy = "RAG+GRAPH"
            confidence = 0.70
            reasoning = (
                f"Content ({content_tokens:,} tokens) exceeds {GRAPH_RAG_THRESHOLD:,} tokens. "
                f"Linear retrieval won't scale. Build a knowledge graph index, then query it."
            )
        else:
            strategy = "RAG"
            confidence = 0.80
            reasoning = (
                f"Content ({content_tokens:,} tokens) exceeds available budget ({available_budget:,}). "
                f"Use targeted retrieval with reranking to select the most relevant passages."
            )
    # Rule 5: Fallback — HYBRID with low confidence
    else:
        strategy = "HYBRID"
        confidence = 0.60
        reasoning = (
            f"Content ({content_tokens:,} tokens), budget ({available_budget:,}), task '{task_type}'. "
            f"No clear winner — default to hybrid for flexibility."
        )

    # --- Build concrete steps ---
    steps = _build_steps(strategy, task_type, available_tools, content_tokens, available_budget)

    # --- Retrieval budget ---
    if strategy == "STUFF":
        retrieval_budget = 0
    elif strategy == "HYBRID":
        # Reserve space for targeted retrieval results
        retrieval_budget = max(MIN_RETRIEVAL_BUDGET, int(available_budget * 0.3))
    else:  # RAG or RAG+GRAPH
        retrieval_budget = max(MIN_RETRIEVAL_BUDGET, int(available_budget * 0.7))

    return {
        "strategy": strategy,
        "confidence": confidence,
        "reasoning": reasoning,
        "content_tokens": content_tokens,
        "available_budget": available_budget,
        "model_context_length": model_context_length,
        "used_tokens": used_tokens,
        "utilization_pct": round(utilization * 100, 1),
        "task_type": task_type,
        "retrieval_budget_tokens": retrieval_budget,
        "steps": steps,
    }
def _build_steps(strategy: str, task_type: str, available_tools: List[str],
content_tokens: int, budget: int) -> List[str]:
"""Build concrete actionable steps for the recommended strategy."""
steps = []
if strategy == "STUFF":
steps.append("Load all relevant content directly into context.")
if "read_file" in available_tools:
steps.append("Use read_file to pull documents into context.")
if "web_extract" in available_tools:
steps.append("Use web_extract to pull web pages into context.")
steps.append("No retrieval needed — process everything in-context.")
elif strategy == "HYBRID":
steps.append("1. Identify the 3-5 most critical documents/sections.")
steps.append("2. Stuff those directly into context (read_file or web_extract).")
if "session_search" in available_tools:
steps.append("3. Use session_search for relevant past conversation context.")
if "web_search" in available_tools:
steps.append("4. Use web_search for any gaps or verification.")
if "memory" in available_tools:
steps.append("5. Check memory for any curated relevant facts.")
steps.append(f"6. Keep total retrieved content under {budget:,} tokens.")
elif strategy == "RAG":
steps.append("1. Do NOT stuff full documents — retrieve selectively.")
if "web_search" in available_tools:
steps.append("2. Use web_search to find relevant sources.")
if "session_search" in available_tools:
steps.append("3. Use session_search for historical context.")
if "web_extract" in available_tools:
steps.append("4. Use web_extract on only the most relevant 2-3 URLs.")
steps.append("5. Rerank results by relevance before adding to context.")
steps.append(f"6. Cap retrieved content at {budget:,} tokens.")
steps.append("7. If precision is critical, verify key facts with a second source.")
elif strategy == "RAG+GRAPH":
steps.append("1. Content is too large for linear RAG.")
steps.append("2. Build or use an existing knowledge graph/index.")
steps.append("3. Query the graph for relevant nodes/relationships.")
steps.append("4. Retrieve only the subgraph needed for the current question.")
steps.append("5. Use web_search as a fallback for unindexed content.")
# Add task-specific guidance
if task_type == "rag_critical":
steps.append("⚠️ HIGH PRECISION TASK: Verify all retrieved facts. Cite sources.")
elif task_type == "crisis_intervention":
steps.append("🚨 CRISIS: Accuracy is paramount. Ground responses in verified sources.")
return steps
# ---------------------------------------------------------------------------
# Tool registration
# ---------------------------------------------------------------------------
def _handle_context_strategy(args: Dict[str, Any], **kwargs) -> str:
    """Handler for the context_strategy tool.

    Pulls the tool arguments out of ``args``, classifies the task from the
    query text, estimates the size of the content under consideration, and
    runs the strategy engine. Returns the resulting decision as
    pretty-printed JSON for the agent to read.
    """
    # Classify what kind of task the query represents (e.g. crisis vs. factual).
    task_type = classify_task_type(args.get("query", ""))

    # Estimate content size from whichever hints the caller supplied;
    # char_count is the most accurate, hint/file_paths are fallbacks.
    content_tokens = estimate_content_tokens(
        content_size_hint=args.get("content_size_hint"),
        file_paths=args.get("file_paths"),
        char_count=args.get("char_count"),
    )

    # Run the decision engine against the model's context budget.
    decision = compute_strategy(
        model_context_length=args.get("model_context_length", 128_000),
        used_tokens=args.get("used_tokens", 0),
        content_tokens=content_tokens,
        task_type=task_type,
        available_tools=args.get("available_tools", []),
    )
    return json.dumps(decision, indent=2)
# Register the agent-facing `context_strategy` tool. It is always available
# (check_fn is unconditionally True) because the handler does pure
# computation on its arguments — no external services or env vars needed.
registry.register(
    name="context_strategy",
    toolset="core",
    schema={
        "name": "context_strategy",
        "description": (
            "Analyze a query and recommend the optimal information retrieval strategy: "
            "STUFF (load everything in context), HYBRID (stuff key docs + targeted retrieval), "
            "or RAG (pure retrieval with reranking). Considers model context window, current "
            "token usage, content size, and task type. Use this BEFORE deciding whether to "
            "retrieve documents or just load them into context. "
            "Research: Self-RAG (Asai et al., 2023), Long Context vs RAG framework."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "The query or goal you need information for.",
                },
                "content_size_hint": {
                    "type": "string",
                    "description": (
                        "Estimated content size: tiny (<2K tokens), small (<8K), "
                        "medium (<32K), large (<100K), huge (<500K), massive (>500K). "
                        "Optional if file_paths or char_count provided."
                    ),
                    "enum": ["tiny", "small", "medium", "large", "huge", "massive"],
                },
                "file_paths": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "List of file paths to estimate size from (optional).",
                },
                "char_count": {
                    "type": "integer",
                    "description": "Exact character count of content (optional, most accurate).",
                },
                "model_context_length": {
                    "type": "integer",
                    "description": (
                        "Model's context window in tokens. Default: 128000. "
                        "Pass the actual value from your model if known."
                    ),
                    "default": 128000,
                },
                "used_tokens": {
                    "type": "integer",
                    "description": (
                        "Tokens already consumed in this conversation (system prompt + history). "
                        "Default: 0 (start of conversation)."
                    ),
                    "default": 0,
                },
                "available_tools": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": (
                        "List of available tool names (e.g. ['web_search', 'read_file', "
                        "'session_search', 'memory']). Used to tailor recommendations."
                    ),
                },
            },
            "required": ["query"],
        },
    },
    # Pass the handler directly: the previous lambda merely forwarded
    # (args, **kwargs) unchanged, so it added a frame with no behavior.
    handler=_handle_context_strategy,
    check_fn=lambda: True,  # Always available — no external deps
    requires_env=[],
)

View File

@@ -50,6 +50,8 @@ _HERMES_CORE_TOOLS = [
"todo", "memory",
# Session history search
"session_search",
# Context strategy (Long Context vs RAG decision)
"context_strategy",
# Clarifying questions
"clarify",
# Code execution + delegation