Compare commits
1 Commits
queue/327-
...
burn/conte
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3c762dd73f |
@@ -93,6 +93,14 @@ class ContextCompressor:
|
||||
self.threshold_tokens = int(self.context_length * threshold_percent)
|
||||
self.compression_count = 0
|
||||
|
||||
# Poka-yoke #296: Hard safety triggers at 85% and 95% of model limit.
|
||||
# These are MANDATORY — they fire regardless of the configurable threshold.
|
||||
# At 85%: force compression (even if threshold_percent is higher).
|
||||
# At 95%: block tool calls and force compression before any API call.
|
||||
self.hard_warn_tokens = int(self.context_length * 0.85)
|
||||
self.hard_block_tokens = int(self.context_length * 0.95)
|
||||
self._overflow_events: list = [] # timestamps of overflow triggers
|
||||
|
||||
# Derive token budgets: ratio is relative to the threshold, not total context
|
||||
target_tokens = int(self.threshold_tokens * self.summary_target_ratio)
|
||||
self.tail_token_budget = target_tokens
|
||||
@@ -138,6 +146,73 @@ class ContextCompressor:
|
||||
rough_estimate = estimate_messages_tokens_rough(messages)
|
||||
return rough_estimate >= self.threshold_tokens
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Poka-yoke #296: Hard context overflow prevention
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def should_hard_compress(self, current_tokens: int = None) -> bool:
    """Report whether usage has reached the mandatory 85% compression trigger.

    The check is independent of the configurable ``threshold_tokens``: it
    compares against ``self.hard_warn_tokens`` only.

    Args:
        current_tokens: Token count to test. When ``None``, the sum of
            ``self.last_prompt_tokens`` and ``self.last_completion_tokens``
            is used instead.

    Returns:
        ``True`` when the count is at or above ``self.hard_warn_tokens``,
        otherwise ``False``.

    Side effects:
        Every positive result appends a ``("hard_compress", timestamp)``
        tuple to ``self._overflow_events``.
    """
    if current_tokens is None:
        current_tokens = self.last_prompt_tokens + self.last_completion_tokens

    limit = self.hard_warn_tokens
    # A non-positive limit means the context length is unknown (0). Without
    # this guard every positive token count would fire the trigger, and
    # callers that report the percentage would divide by a zero
    # context_length.
    if limit <= 0 or current_tokens < limit:
        return False

    self._overflow_events.append(("hard_compress", time.time()))
    return True
|
||||
|
||||
def should_block_tools(self, current_tokens: int = None) -> bool:
    """Report whether usage has reached the 95% tool-blocking trigger.

    Callers use a positive result to strip tools from the next API call so
    the model can only produce a final response.

    Args:
        current_tokens: Token count to test. When ``None``, the sum of
            ``self.last_prompt_tokens`` and ``self.last_completion_tokens``
            is used instead.

    Returns:
        ``True`` when the count is at or above ``self.hard_block_tokens``,
        otherwise ``False``.

    Side effects:
        Every positive result appends a ``("block_tools", timestamp)``
        tuple to ``self._overflow_events``.
    """
    if current_tokens is None:
        current_tokens = self.last_prompt_tokens + self.last_completion_tokens

    limit = self.hard_block_tokens
    # A non-positive limit means the context length is unknown (0). Without
    # this guard every positive token count would block tools, and callers
    # that report the percentage would divide by a zero context_length.
    if limit <= 0 or current_tokens < limit:
        return False

    self._overflow_events.append(("block_tools", time.time()))
    return True
|
||||
|
||||
def get_detailed_status(self) -> Dict[str, Any]:
    """Assemble the full context-usage snapshot shown by /context-status.

    Returns:
        A dict with the last prompt/completion/total token counts, the
        context length, usage and threshold percentages, the hard warn and
        hard block limits, the head-room left before each limit (never
        negative), the compression count, total and last-hour overflow
        event counts, and the first 200 characters of the previous summary
        (``None`` when there is no summary).
    """
    used = self.last_prompt_tokens + self.last_completion_tokens

    # Percentages fall back to 0 when the context length is unknown (falsy).
    if self.context_length:
        usage_pct = min(100, (used / self.context_length * 100))
        threshold_pct = self.threshold_tokens / self.context_length * 100
    else:
        usage_pct = 0
        threshold_pct = 0

    # Overflow events whose timestamp is less than one hour old.
    now = time.time()
    recent_overflows = 0
    for _, stamp in self._overflow_events:
        if now - stamp < 3600:
            recent_overflows += 1

    summary_preview = None
    if self._previous_summary:
        summary_preview = self._previous_summary[:200]

    snapshot: Dict[str, Any] = {}
    snapshot["last_prompt_tokens"] = self.last_prompt_tokens
    snapshot["last_completion_tokens"] = self.last_completion_tokens
    snapshot["last_total_tokens"] = used
    snapshot["context_length"] = self.context_length
    snapshot["usage_percent"] = usage_pct
    snapshot["threshold_tokens"] = self.threshold_tokens
    snapshot["threshold_percent"] = threshold_pct
    snapshot["hard_warn_tokens"] = self.hard_warn_tokens
    snapshot["hard_warn_percent"] = 85.0
    snapshot["hard_block_tokens"] = self.hard_block_tokens
    snapshot["hard_block_percent"] = 95.0
    snapshot["remaining_before_threshold"] = max(0, self.threshold_tokens - used)
    snapshot["remaining_before_hard_warn"] = max(0, self.hard_warn_tokens - used)
    snapshot["remaining_before_hard_block"] = max(0, self.hard_block_tokens - used)
    snapshot["compression_count"] = self.compression_count
    snapshot["overflow_events_total"] = len(self._overflow_events)
    snapshot["overflow_events_recent"] = recent_overflows
    snapshot["previous_summary"] = summary_preview
    return snapshot
|
||||
|
||||
def get_status(self) -> Dict[str, Any]:
|
||||
"""Get current compression status for display/logging."""
|
||||
return {
|
||||
|
||||
48
cli.py
48
cli.py
@@ -4466,6 +4466,8 @@ class HermesCLI:
|
||||
self._handle_reasoning_command(cmd_original)
|
||||
elif canonical == "compress":
|
||||
self._manual_compress()
|
||||
elif canonical == "context-status":
|
||||
self._show_context_status()
|
||||
elif canonical == "usage":
|
||||
self._show_usage()
|
||||
elif canonical == "insights":
|
||||
@@ -5282,6 +5284,52 @@ class HermesCLI:
|
||||
except Exception as e:
|
||||
print(f" ❌ Compression failed: {e}")
|
||||
|
||||
def _show_context_status(self):
|
||||
"""Show detailed context window status, compression history, and overflow events."""
|
||||
if not self.agent:
|
||||
print("(._.) No active agent -- send a message first.")
|
||||
return
|
||||
|
||||
compressor = getattr(self.agent, "context_compressor", None)
|
||||
if not compressor:
|
||||
print("(._.) Context compressor not available.")
|
||||
return
|
||||
|
||||
status = compressor.get_detailed_status()
|
||||
|
||||
print()
|
||||
print(" Context Window Status")
|
||||
print(" " + "=" * 50)
|
||||
print(f" Context limit: {status['context_length']:>12,} tokens")
|
||||
print(f" Last prompt: {status['last_prompt_tokens']:>12,} tokens")
|
||||
print(f" Last completion: {status['last_completion_tokens']:>12,} tokens")
|
||||
print(f" Last total: {status['last_total_tokens']:>12,} tokens")
|
||||
print(f" Usage: {status['usage_percent']:>11.1f}%")
|
||||
|
||||
pct = status["usage_percent"]
|
||||
bar_width = 40
|
||||
filled = int(bar_width * pct / 100)
|
||||
bar = chr(9608) * filled + chr(9617) * (bar_width - filled)
|
||||
print(f" [{bar}] {pct:.1f}%")
|
||||
print()
|
||||
print(f" Compression threshold: {status['threshold_tokens']:>8,} tokens ({status['threshold_percent']:.0f}%)")
|
||||
print(f" Hard warn (85%): {status['hard_warn_tokens']:>8,} tokens")
|
||||
print(f" Hard block (95%): {status['hard_block_tokens']:>8,} tokens")
|
||||
print()
|
||||
print(f" Remaining before threshold: {status['remaining_before_threshold']:>8,} tokens")
|
||||
print(f" Remaining before hard warn: {status['remaining_before_hard_warn']:>8,} tokens")
|
||||
print(f" Remaining before hard block: {status['remaining_before_hard_block']:>8,} tokens")
|
||||
print()
|
||||
print(f" Compressions this session: {status['compression_count']}")
|
||||
print(f" Overflow events (total): {status['overflow_events_total']}")
|
||||
print(f" Overflow events (last hour): {status['overflow_events_recent']}")
|
||||
if status["previous_summary"]:
|
||||
print()
|
||||
print(f" Last summary: {status['previous_summary']}")
|
||||
msg_count = len(self.conversation_history) if self.conversation_history else 0
|
||||
print(f" Messages in history: {msg_count}")
|
||||
print()
|
||||
|
||||
def _show_usage(self):
|
||||
"""Show cumulative token usage for the current session."""
|
||||
if not self.agent:
|
||||
|
||||
@@ -60,6 +60,8 @@ COMMAND_REGISTRY: list[CommandDef] = [
|
||||
CommandDef("branch", "Branch the current session (explore a different path)", "Session",
|
||||
aliases=("fork",), args_hint="[name]"),
|
||||
CommandDef("compress", "Manually compress conversation context", "Session"),
|
||||
CommandDef("context-status", "Show context window usage, compression history, and overflow events", "Session",
|
||||
aliases=("ctx", "cs"), args_hint=""),
|
||||
CommandDef("rollback", "List or restore filesystem checkpoints", "Session",
|
||||
args_hint="[number]"),
|
||||
CommandDef("stop", "Kill all running background processes", "Session"),
|
||||
|
||||
76
run_agent.py
76
run_agent.py
@@ -7514,7 +7514,54 @@ class AIAgent:
|
||||
|
||||
while retry_count < max_retries:
|
||||
try:
|
||||
# ── Poka-yoke #296: 95% tool-blocking trigger ──────────
|
||||
# At 95% of the model's context limit, strip tools from the
|
||||
# API call to force a final-response-only mode. The model
|
||||
# cannot make tool calls — it must summarize and exit.
|
||||
# Compression happens before the next turn restores tools.
|
||||
_tools_blocked = False
|
||||
if self.compression_enabled and hasattr(self, 'context_compressor'):
|
||||
_cc = self.context_compressor
|
||||
_tok_check = _cc.last_prompt_tokens + _cc.last_completion_tokens
|
||||
if _cc.should_block_tools(_tok_check):
|
||||
_tools_blocked = True
|
||||
logger.warning(
|
||||
"[CONTEXT OVERFLOW] Tool-block trigger at %s tokens (%.0f%% of %s limit). "
|
||||
"Stripping tools — model must provide final response.",
|
||||
f"{_tok_check:,}",
|
||||
_tok_check / _cc.context_length * 100,
|
||||
f"{_cc.context_length:,}",
|
||||
)
|
||||
if not self.quiet_mode:
|
||||
self._safe_print(
|
||||
f"🛑 Context at {_tok_check:,} tokens "
|
||||
f"({_tok_check / _cc.context_length * 100:.0f}% of "
|
||||
f"{_cc.context_length:,} limit) — "
|
||||
f"tools BLOCKED. Forcing final response."
|
||||
)
|
||||
|
||||
api_kwargs = self._build_api_kwargs(api_messages)
|
||||
if _tools_blocked:
|
||||
# Strip tools from all API modes
|
||||
api_kwargs.pop("tools", None)
|
||||
api_kwargs.pop("tool_choice", None)
|
||||
api_kwargs.pop("parallel_tool_calls", None)
|
||||
# Inject a user message forcing the model to summarize
|
||||
api_messages.append({
|
||||
"role": "user",
|
||||
"content": (
|
||||
"[CONTEXT LIMIT REACHED: You have exceeded 95% of the model's context window. "
|
||||
"All tools are disabled. You MUST provide your final response now — "
|
||||
"summarize what was accomplished and any critical next steps. "
|
||||
"Do NOT attempt to call any tools.]"
|
||||
),
|
||||
})
|
||||
# Rebuild kwargs with the injected message
|
||||
api_kwargs = self._build_api_kwargs(api_messages)
|
||||
api_kwargs.pop("tools", None)
|
||||
api_kwargs.pop("tool_choice", None)
|
||||
api_kwargs.pop("parallel_tool_calls", None)
|
||||
|
||||
if self.api_mode == "codex_responses":
|
||||
api_kwargs = self._preflight_codex_api_kwargs(api_kwargs, allow_stream=False)
|
||||
|
||||
@@ -9031,6 +9078,35 @@ class AIAgent:
|
||||
# _flush_messages_to_session_db writes compressed messages
|
||||
# to the new session (see preflight compression comment).
|
||||
conversation_history = None
|
||||
|
||||
# ── Poka-yoke #296: Hard context overflow prevention ──────────
|
||||
# Mandatory compression at 85% of model limit. Fires even when
|
||||
# the configurable threshold hasn't been reached (e.g. threshold
|
||||
# is 50% but agent skipped voluntary compression). Logs as
|
||||
# "context_overflow" separate from normal compression.
|
||||
elif self.compression_enabled and _compressor.should_hard_compress(_real_tokens):
|
||||
logger.warning(
|
||||
"[CONTEXT OVERFLOW] Hard trigger at %s tokens (%.0f%% of %s limit). "
|
||||
"Forcing compression. threshold was at %s tokens (%.0f%%).",
|
||||
f"{_real_tokens:,}",
|
||||
_real_tokens / _compressor.context_length * 100,
|
||||
f"{_compressor.context_length:,}",
|
||||
f"{_compressor.threshold_tokens:,}",
|
||||
_compressor.threshold_percent * 100,
|
||||
)
|
||||
if not self.quiet_mode:
|
||||
self._safe_print(
|
||||
f"⚠️ Context at {_real_tokens:,} tokens "
|
||||
f"({_real_tokens / _compressor.context_length * 100:.0f}% of "
|
||||
f"{_compressor.context_length:,} limit) — "
|
||||
f"mandatory compression triggered."
|
||||
)
|
||||
messages, active_system_prompt = self._compress_context(
|
||||
messages, system_message,
|
||||
approx_tokens=_real_tokens,
|
||||
task_id=effective_task_id,
|
||||
)
|
||||
conversation_history = None
|
||||
|
||||
# Save session log incrementally (so progress is visible even if interrupted)
|
||||
self._session_messages = messages
|
||||
|
||||
221
tests/test_context_overflow_poka_yoke.py
Normal file
221
tests/test_context_overflow_poka_yoke.py
Normal file
@@ -0,0 +1,221 @@
|
||||
"""Tests for poka-yoke #296: mandatory context overflow prevention."""
|
||||
|
||||
import time
|
||||
import pytest
|
||||
from unittest.mock import MagicMock, patch, PropertyMock
|
||||
|
||||
from agent.context_compressor import ContextCompressor
|
||||
|
||||
|
||||
@pytest.fixture
def compressor():
    """Provide a ContextCompressor built while the model limit lookup returns 100K."""
    length_lookup = patch(
        "agent.context_compressor.get_model_context_length",
        return_value=100_000,
    )
    with length_lookup:
        instance = ContextCompressor(
            model="test-model",
            threshold_percent=0.50,
            quiet_mode=True,
        )
        return instance
|
||||
|
||||
|
||||
class TestHardLimitProperties:
    """Check the token limits a compressor derives at construction time."""

    def test_hard_warn_at_85_percent(self, compressor):
        # 85% of the fixture's 100K window.
        expected = 85_000
        assert compressor.hard_warn_tokens == expected

    def test_hard_block_at_95_percent(self, compressor):
        # 95% of the fixture's 100K window.
        expected = 95_000
        assert compressor.hard_block_tokens == expected

    def test_threshold_at_configured_percent(self, compressor):
        # The fixture configures threshold_percent=0.50.
        expected = 50_000
        assert compressor.threshold_tokens == expected

    def test_custom_threshold_does_not_affect_hard_limits(self):
        length_lookup = patch(
            "agent.context_compressor.get_model_context_length",
            return_value=200_000,
        )
        with length_lookup:
            comp = ContextCompressor(
                model="test-model",
                threshold_percent=0.90,
                quiet_mode=True,
            )
            # The hard limits stay pinned to 85% / 95% of the model limit ...
            assert comp.hard_warn_tokens == 170_000
            assert comp.hard_block_tokens == 190_000
            # ... while the soft threshold tracks the configured percentage.
            assert comp.threshold_tokens == 180_000
|
||||
|
||||
|
||||
class TestShouldHardCompress:
    """Cover should_hard_compress(), the 85% mandatory-compression check."""

    def test_triggers_at_85_percent(self, compressor):
        compressor.last_prompt_tokens, compressor.last_completion_tokens = 85_000, 0
        fired = compressor.should_hard_compress()
        assert fired is True

    def test_triggers_above_85_percent(self, compressor):
        fired = compressor.should_hard_compress(90_000)
        assert fired is True

    def test_no_trigger_below_85_percent(self, compressor):
        fired = compressor.should_hard_compress(84_999)
        assert fired is False

    def test_no_trigger_at_zero(self, compressor):
        fired = compressor.should_hard_compress(0)
        assert fired is False

    def test_no_trigger_when_no_tokens(self, compressor):
        compressor.last_prompt_tokens, compressor.last_completion_tokens = 0, 0
        fired = compressor.should_hard_compress()
        assert fired is False

    def test_records_overflow_event(self, compressor):
        compressor.should_hard_compress(85_000)
        events = compressor._overflow_events
        assert len(events) == 1
        assert events[0][0] == "hard_compress"

    def test_uses_last_tokens_when_none_provided(self, compressor):
        compressor.last_prompt_tokens, compressor.last_completion_tokens = 86_000, 0
        fired = compressor.should_hard_compress()
        assert fired is True
|
||||
|
||||
|
||||
class TestShouldBlockTools:
    """Cover should_block_tools(), the 95% tool-blocking check."""

    def test_triggers_at_95_percent(self, compressor):
        compressor.last_prompt_tokens, compressor.last_completion_tokens = 95_000, 0
        blocked = compressor.should_block_tools()
        assert blocked is True

    def test_triggers_above_95_percent(self, compressor):
        blocked = compressor.should_block_tools(99_000)
        assert blocked is True

    def test_no_trigger_below_95_percent(self, compressor):
        blocked = compressor.should_block_tools(94_999)
        assert blocked is False

    def test_no_trigger_at_zero(self, compressor):
        blocked = compressor.should_block_tools(0)
        assert blocked is False

    def test_records_overflow_event(self, compressor):
        compressor.should_block_tools(95_000)
        events = compressor._overflow_events
        assert len(events) == 1
        assert events[0][0] == "block_tools"

    def test_both_triggers_fire_at_95(self, compressor):
        # 95K is past the 85% limit as well, so both checks must report True.
        at_block_limit = 95_000
        assert compressor.should_hard_compress(at_block_limit) is True
        assert compressor.should_block_tools(at_block_limit) is True
|
||||
|
||||
|
||||
class TestGetDetailedStatus:
    """Exercise the dictionary returned by get_detailed_status()."""

    def test_returns_all_fields(self, compressor):
        expected_keys = (
            "last_prompt_tokens",
            "last_completion_tokens",
            "last_total_tokens",
            "context_length",
            "usage_percent",
            "threshold_tokens",
            "threshold_percent",
            "hard_warn_tokens",
            "hard_warn_percent",
            "hard_block_tokens",
            "hard_block_percent",
            "remaining_before_threshold",
            "remaining_before_hard_warn",
            "remaining_before_hard_block",
            "compression_count",
            "overflow_events_total",
            "overflow_events_recent",
            "previous_summary",
        )
        status = compressor.get_detailed_status()
        for key in expected_keys:
            assert key in status, f"Missing key: {key}"

    def test_usage_percent_correct(self, compressor):
        compressor.last_prompt_tokens, compressor.last_completion_tokens = 50_000, 10_000
        status = compressor.get_detailed_status()
        assert status["usage_percent"] == 60.0
        assert status["last_total_tokens"] == 60_000

    def test_remaining_calculations(self, compressor):
        # 45K used in total.
        compressor.last_prompt_tokens, compressor.last_completion_tokens = 40_000, 5_000
        status = compressor.get_detailed_status()
        # threshold 50K - 45K used
        assert status["remaining_before_threshold"] == 5_000
        # hard warn 85K - 45K used
        assert status["remaining_before_hard_warn"] == 40_000
        # hard block 95K - 45K used
        assert status["remaining_before_hard_block"] == 50_000

    def test_overflow_event_counts(self, compressor):
        # One event from each trigger; both are recent.
        compressor.should_hard_compress(85_000)
        compressor.should_block_tools(95_000)
        status = compressor.get_detailed_status()
        assert status["overflow_events_total"] == 2
        assert status["overflow_events_recent"] == 2

    def test_compression_count(self, compressor):
        compressor.compression_count = 3
        assert compressor.get_detailed_status()["compression_count"] == 3

    def test_previous_summary(self, compressor):
        summary = "The user was working on a Python project..."
        compressor._previous_summary = summary
        status = compressor.get_detailed_status()
        assert status["previous_summary"] == "The user was working on a Python project..."

    def test_previous_summary_truncated(self, compressor):
        compressor._previous_summary = "A" * 500
        preview = compressor.get_detailed_status()["previous_summary"]
        assert len(preview) == 200

    def test_no_previous_summary(self, compressor):
        assert compressor.get_detailed_status()["previous_summary"] is None
|
||||
|
||||
|
||||
class TestHardCompressVsNormalCompress:
    """Check that the hard trigger and the configurable threshold are independent."""

    def test_hard_compress_fires_below_threshold(self):
        """With the threshold at 90%, the 85% hard trigger still fires at 85K."""
        length_lookup = patch(
            "agent.context_compressor.get_model_context_length",
            return_value=100_000,
        )
        with length_lookup:
            comp = ContextCompressor(
                model="test-model",
                threshold_percent=0.90,  # soft threshold lands on 90K
                quiet_mode=True,
            )
            at_hard_limit = 85_000
            # Below the 90K soft threshold, so the normal check stays quiet ...
            assert comp.should_compress(at_hard_limit) is False
            # ... while the 85K hard trigger reports True.
            assert comp.should_hard_compress(at_hard_limit) is True

    def test_normal_compress_does_not_trigger_hard(self):
        """Reaching the normal threshold must not record an overflow event."""
        length_lookup = patch(
            "agent.context_compressor.get_model_context_length",
            return_value=100_000,
        )
        with length_lookup:
            comp = ContextCompressor(
                model="test-model",
                threshold_percent=0.50,
                quiet_mode=True,
            )
            assert comp.should_compress(50_000) is True
            # The normal check leaves the overflow log empty.
            assert len(comp._overflow_events) == 0
|
||||
|
||||
|
||||
class TestSmallContextModel:
    """Check the derived limits for a 4K and a 128K context window."""

    def test_4k_model_limits(self):
        length_lookup = patch(
            "agent.context_compressor.get_model_context_length",
            return_value=4_096,
        )
        with length_lookup:
            comp = ContextCompressor(
                model="gpt-3.5-turbo",
                threshold_percent=0.50,
                quiet_mode=True,
            )
            assert comp.context_length == 4_096
            assert comp.threshold_tokens == 2_048
            # int() truncates: 4096 * 0.85 = 3481.6 and 4096 * 0.95 = 3891.2
            assert comp.hard_warn_tokens == 3_481
            assert comp.hard_block_tokens == 3_891

    def test_128k_model_limits(self):
        length_lookup = patch(
            "agent.context_compressor.get_model_context_length",
            return_value=128_000,
        )
        with length_lookup:
            comp = ContextCompressor(
                model="claude-sonnet-4",
                threshold_percent=0.50,
                quiet_mode=True,
            )
            assert comp.hard_warn_tokens == 108_800
            assert comp.hard_block_tokens == 121_600
|
||||
Reference in New Issue
Block a user