refactor(gateway): remove broken 1.4x hygiene multiplier entirely

The previous commit capped the 1.4x at 95% of context, but the multiplier
itself is unnecessary and confusing:

  85% threshold × 1.4 = 119% of context → never fires
  95% warn      × 1.4 = 133% of context → never warns

The 85% hygiene threshold already provides ample headroom over the agent's
own 50% compressor. Even if rough estimates overestimate by 50%, hygiene
would fire at ~57% actual usage — safe and harmless.

Remove the multiplier entirely. Both actual and estimated token paths
now use the same 85% / 95% thresholds. Update tests and comments.
This commit is contained in:
Teknium
2026-03-22 15:21:18 -07:00
parent b2b4a9ee7d
commit b799bca7a3
2 changed files with 55 additions and 76 deletions

View File

@@ -212,71 +212,59 @@ class TestSessionHygieneWarnThreshold:
assert post_compress_tokens < warn_threshold
class TestEstimatedTokenThreshold:
    """Verify that hygiene thresholds are always below the model's context
    limit — for both actual and estimated token counts.

    Regression: a previous 1.4x multiplier on rough estimates pushed the
    threshold to 85% * 1.4 = 119% of context, which exceeded the model's
    limit and prevented hygiene from ever firing for ~200K models (GLM-5).
    The fix removed the multiplier entirely — the 85% threshold already
    provides ample headroom over the agent's 50% compressor.
    """

    def test_threshold_below_context_for_200k_model(self):
        """Hygiene threshold must always be below model context."""
        context_length = 200_000
        threshold = int(context_length * 0.85)
        assert threshold < context_length

    def test_threshold_below_context_for_128k_model(self):
        """Same invariant for smaller (128K) context models."""
        context_length = 128_000
        threshold = int(context_length * 0.85)
        assert threshold < context_length

    def test_no_multiplier_means_same_threshold_for_estimated_and_actual(self):
        """Without the 1.4x, estimated and actual token paths use the same threshold."""
        context_length = 200_000
        threshold_pct = 0.85
        threshold = int(context_length * threshold_pct)
        # Both paths should use 170K — no inflation
        assert threshold == 170_000

    def test_warn_threshold_below_context(self):
        """Warn threshold (95%) must be below context length."""
        for ctx in (128_000, 200_000, 1_000_000):
            warn = int(ctx * 0.95)
            assert warn < ctx

    def test_overestimate_fires_early_but_safely(self):
        """If rough estimate is 50% inflated, hygiene fires at ~57% actual usage.

        That's between the agent's 50% threshold and the model's limit —
        safe and harmless.
        """
        context_length = 200_000
        threshold = int(context_length * 0.85)  # 170K
        # If actual tokens = 113K, rough estimate = 113K * 1.5 = 170K
        # Hygiene fires when estimate hits 170K, actual is ~113K = 57% of ctx
        actual_when_fires = threshold / 1.5
        assert actual_when_fires > context_length * 0.50, (
            "Early fire should still be above agent's 50% threshold"
        )
        assert actual_when_fires < context_length, (
            "Early fire must be well below model limit"
        )
class TestTokenEstimation: