Adds agent/retry_utils.py with jittered_backoff() — exponential backoff with additive jitter to prevent thundering-herd retry spikes when multiple gateway sessions hit the same rate-limited provider.

Replaces fixed exponential backoff at 4 call sites:
- run_agent.py: None-choices retry path (5s base, 120s cap)
- run_agent.py: API error retry path (2s base, 60s cap)
- trajectory_compressor.py: sync + async summarization retries

Thread-safe jitter counter with overflow guards ensures unique seeds across concurrent retries. Trimmed from original PR to keep only wired-in functionality.

Co-authored-by: martinp09 <martinp09@users.noreply.github.com>
58 lines
1.8 KiB
Python
58 lines
1.8 KiB
Python
"""Retry utilities — jittered backoff for decorrelated retries.

Replaces fixed exponential backoff with jittered delays to prevent
thundering-herd retry spikes when multiple sessions hit the same
rate-limited provider concurrently.
"""
|
|
|
|
import random
|
|
import threading
|
|
import time
|
|
|
|
# Process-wide call counter that is mixed into every jitter seed, so two
# calls that observe the same clock reading still draw different jitter.
# The lock makes the increment-and-read atomic when retries run concurrently
# (e.g. multiple gateway sessions retrying simultaneously).
_jitter_counter = 0
_jitter_lock = threading.Lock()


def jittered_backoff(
    attempt: int,
    *,
    base_delay: float = 5.0,
    max_delay: float = 120.0,
    jitter_ratio: float = 0.5,
) -> float:
    """Compute an exponential backoff delay with additive random jitter.

    Args:
        attempt: 1-based retry attempt number. Values below 1 are treated
            like attempt 1.
        base_delay: Delay in seconds for attempt 1; doubles with every
            further attempt. A value <= 0 makes the un-jittered delay fall
            back to ``max_delay``.
        max_delay: Cap in seconds applied to the exponential part *before*
            jitter is added, so the result may exceed it by up to
            ``jitter_ratio * max_delay``.
        jitter_ratio: Fraction of the capped delay used as the jitter range;
            0.5 means jitter is uniform in [0, 0.5 * delay]. Negative values
            are treated as 0 (no jitter).

    Returns:
        ``min(base_delay * 2**(attempt - 1), max_delay) + jitter`` in
        seconds, never less than 0 so it is always safe to pass to a sleep
        call.

    The jitter decorrelates concurrent retries so multiple sessions hitting
    the same provider don't all retry at the same instant.
    """
    global _jitter_counter
    with _jitter_lock:
        _jitter_counter += 1
        tick = _jitter_counter

    exponent = max(0, attempt - 1)
    if exponent >= 63 or base_delay <= 0:
        # Skip the power computation for very large attempt counts (it would
        # be capped anyway) and for a non-positive base delay.
        delay = max_delay
    else:
        delay = min(base_delay * (2 ** exponent), max_delay)

    # A negative ratio (or a negative cap) would otherwise produce negative
    # jitter and shrink the delay below the backoff value, possibly below 0.
    jitter_span = max(0.0, jitter_ratio * delay)

    # Seed from the clock XOR the scaled call counter: the counter keeps
    # seeds distinct even when the clock is too coarse to tell calls apart.
    seed = (time.time_ns() ^ (tick * 0x9E3779B9)) & 0xFFFFFFFF
    jitter = random.Random(seed).uniform(0.0, jitter_span)

    # time.sleep() rejects negative values, so never hand one back.
    return max(0.0, delay + jitter)
|