refactor(gateway): remove broken 1.4x hygiene multiplier entirely

The previous commit capped the 1.4x at 95% of context, but the multiplier
itself is unnecessary and confusing:

  85% threshold × 1.4 = 119% of context → never fires
  95% warn      × 1.4 = 133% of context → never warns

The 85% hygiene threshold already provides ample headroom over the agent's
own 50% compressor. Even if rough estimates overestimate by 50%, hygiene
would fire at ~57% actual usage — safe and harmless.

Remove the multiplier entirely. Both actual and estimated token paths
now use the same 85% / 95% thresholds. Update tests and comments.
This commit is contained in:
Teknium
2026-03-22 15:21:18 -07:00
parent b2b4a9ee7d
commit b799bca7a3
2 changed files with 55 additions and 76 deletions

View File

@@ -212,71 +212,59 @@ class TestSessionHygieneWarnThreshold:
assert post_compress_tokens < warn_threshold
class TestEstimatedTokenThreshold:
    """Verify that hygiene thresholds are always below the model's context
    limit — for both actual and estimated token counts.

    Regression: a previous 1.4x multiplier on rough estimates pushed the
    threshold to 85% * 1.4 = 119% of context, which exceeded the model's
    limit and prevented hygiene from ever firing for ~200K models (GLM-5).
    The fix removed the multiplier entirely — the 85% threshold already
    provides ample headroom over the agent's 50% compressor.
    """

    def test_threshold_below_context_for_200k_model(self):
        """Hygiene threshold must always be below model context."""
        context_length = 200_000
        threshold = int(context_length * 0.85)
        assert threshold < context_length

    def test_threshold_below_context_for_128k_model(self):
        """Same invariant for smaller (128K) context models."""
        context_length = 128_000
        threshold = int(context_length * 0.85)
        assert threshold < context_length

    def test_no_multiplier_means_same_threshold_for_estimated_and_actual(self):
        """Without the 1.4x, estimated and actual token paths use the same threshold."""
        context_length = 200_000
        threshold_pct = 0.85
        threshold = int(context_length * threshold_pct)
        # Both paths should use 170K — no inflation
        assert threshold == 170_000

    def test_warn_threshold_below_context(self):
        """Warn threshold (95%) must be below context length."""
        for ctx in (128_000, 200_000, 1_000_000):
            warn = int(ctx * 0.95)
            assert warn < ctx

    def test_overestimate_fires_early_but_safely(self):
        """If rough estimate is 50% inflated, hygiene fires at ~57% actual usage.

        That's between the agent's 50% threshold and the model's limit —
        safe and harmless.
        """
        context_length = 200_000
        threshold = int(context_length * 0.85)  # 170K
        # If actual tokens = 113K, rough estimate = 113K * 1.5 = 170K
        # Hygiene fires when estimate hits 170K, actual is ~113K = 57% of ctx
        actual_when_fires = threshold / 1.5
        assert actual_when_fires > context_length * 0.50, (
            "Early fire should still be above agent's 50% threshold"
        )
        assert actual_when_fires < context_length, (
            "Early fire must be well below model limit"
        )
class TestTokenEstimation: