Deep scan with vulture, pyflakes, and manual cross-referencing identified: - 41 dead functions/methods (zero callers in production) - 7 production-dead functions (only test callers, tests deleted) - 5 dead constants/variables - ~35 unused imports across agent/, hermes_cli/, tools/, gateway/ Categories of dead code removed: - Refactoring leftovers: _set_default_model, _setup_copilot_reasoning_selection, rebuild_lookups, clear_session_context, get_logs_dir, clear_session - Unused API surface: search_models_dev, get_pricing, skills_categories, get_read_files_summary, clear_read_tracker, menu_labels, get_spinner_list - Dead compatibility wrappers: schedule_cronjob, list_cronjobs, remove_cronjob - Stale debug helpers: get_debug_session_info copies in 4 tool files (centralized version in debug_helpers.py already exists) - Dead gateway methods: send_emote, send_notice (matrix), send_reaction (bluebubbles), _normalize_inbound_text (feishu), fetch_room_history (matrix), _start_typing_indicator (signal), parse_feishu_post_content - Dead constants: NOUS_API_BASE_URL, SKILLS_TOOL_DESCRIPTION, FILE_TOOLS, VALID_ASPECT_RATIOS, MEMORY_DIR - Unused UI code: _interactive_provider_selection, _interactive_model_selection (superseded by prompt_toolkit picker) Test suite verified: 609 tests covering affected files all pass. Tests for removed functions deleted. Tests using removed utilities (clear_read_tracker, MEMORY_DIR) updated to use internal APIs directly.
247 lines
8.3 KiB
Python
247 lines
8.3 KiB
Python
"""Rate limit tracking for inference API responses.
|
|
|
|
Captures x-ratelimit-* headers from provider responses and provides
|
|
formatted display for the /usage slash command. Currently supports
|
|
the Nous Portal header format (also used by OpenRouter and OpenAI-compatible
|
|
APIs that follow the same convention).
|
|
|
|
Header schema (12 headers total):
|
|
x-ratelimit-limit-requests RPM cap
|
|
x-ratelimit-limit-requests-1h RPH cap
|
|
x-ratelimit-limit-tokens TPM cap
|
|
x-ratelimit-limit-tokens-1h TPH cap
|
|
x-ratelimit-remaining-requests requests left in minute window
|
|
x-ratelimit-remaining-requests-1h requests left in hour window
|
|
x-ratelimit-remaining-tokens tokens left in minute window
|
|
x-ratelimit-remaining-tokens-1h tokens left in hour window
|
|
x-ratelimit-reset-requests seconds until minute request window resets
|
|
x-ratelimit-reset-requests-1h seconds until hour request window resets
|
|
x-ratelimit-reset-tokens seconds until minute token window resets
|
|
x-ratelimit-reset-tokens-1h seconds until hour token window resets
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from typing import Any, Mapping, Optional
|
|
|
|
|
|
@dataclass
|
|
class RateLimitBucket:
|
|
"""One rate-limit window (e.g. requests per minute)."""
|
|
|
|
limit: int = 0
|
|
remaining: int = 0
|
|
reset_seconds: float = 0.0
|
|
captured_at: float = 0.0 # time.time() when this was captured
|
|
|
|
@property
|
|
def used(self) -> int:
|
|
return max(0, self.limit - self.remaining)
|
|
|
|
@property
|
|
def usage_pct(self) -> float:
|
|
if self.limit <= 0:
|
|
return 0.0
|
|
return (self.used / self.limit) * 100.0
|
|
|
|
@property
|
|
def remaining_seconds_now(self) -> float:
|
|
"""Estimated seconds remaining until reset, adjusted for elapsed time."""
|
|
elapsed = time.time() - self.captured_at
|
|
return max(0.0, self.reset_seconds - elapsed)
|
|
|
|
|
|
@dataclass
|
|
class RateLimitState:
|
|
"""Full rate-limit state parsed from response headers."""
|
|
|
|
requests_min: RateLimitBucket = field(default_factory=RateLimitBucket)
|
|
requests_hour: RateLimitBucket = field(default_factory=RateLimitBucket)
|
|
tokens_min: RateLimitBucket = field(default_factory=RateLimitBucket)
|
|
tokens_hour: RateLimitBucket = field(default_factory=RateLimitBucket)
|
|
captured_at: float = 0.0 # when the headers were captured
|
|
provider: str = ""
|
|
|
|
@property
|
|
def has_data(self) -> bool:
|
|
return self.captured_at > 0
|
|
|
|
@property
|
|
def age_seconds(self) -> float:
|
|
if not self.has_data:
|
|
return float("inf")
|
|
return time.time() - self.captured_at
|
|
|
|
|
|
def _safe_int(value: Any, default: int = 0) -> int:
|
|
try:
|
|
return int(float(value))
|
|
except (TypeError, ValueError):
|
|
return default
|
|
|
|
|
|
def _safe_float(value: Any, default: float = 0.0) -> float:
|
|
try:
|
|
return float(value)
|
|
except (TypeError, ValueError):
|
|
return default
|
|
|
|
|
|
def parse_rate_limit_headers(
|
|
headers: Mapping[str, str],
|
|
provider: str = "",
|
|
) -> Optional[RateLimitState]:
|
|
"""Parse x-ratelimit-* headers into a RateLimitState.
|
|
|
|
Returns None if no rate limit headers are present.
|
|
"""
|
|
# Normalize to lowercase so lookups work regardless of how the server
|
|
# capitalises headers (HTTP header names are case-insensitive per RFC 7230).
|
|
lowered = {k.lower(): v for k, v in headers.items()}
|
|
|
|
# Quick check: at least one rate limit header must exist
|
|
has_any = any(k.startswith("x-ratelimit-") for k in lowered)
|
|
if not has_any:
|
|
return None
|
|
|
|
now = time.time()
|
|
|
|
def _bucket(resource: str, suffix: str = "") -> RateLimitBucket:
|
|
# e.g. resource="requests", suffix="" -> per-minute
|
|
# resource="tokens", suffix="-1h" -> per-hour
|
|
tag = f"{resource}{suffix}"
|
|
return RateLimitBucket(
|
|
limit=_safe_int(lowered.get(f"x-ratelimit-limit-{tag}")),
|
|
remaining=_safe_int(lowered.get(f"x-ratelimit-remaining-{tag}")),
|
|
reset_seconds=_safe_float(lowered.get(f"x-ratelimit-reset-{tag}")),
|
|
captured_at=now,
|
|
)
|
|
|
|
return RateLimitState(
|
|
requests_min=_bucket("requests"),
|
|
requests_hour=_bucket("requests", "-1h"),
|
|
tokens_min=_bucket("tokens"),
|
|
tokens_hour=_bucket("tokens", "-1h"),
|
|
captured_at=now,
|
|
provider=provider,
|
|
)
|
|
|
|
|
|
# ── Formatting ──────────────────────────────────────────────────────────
|
|
|
|
|
|
def _fmt_count(n: int) -> str:
|
|
"""Human-friendly number: 7999856 -> '8.0M', 33599 -> '33.6K', 799 -> '799'."""
|
|
if n >= 1_000_000:
|
|
return f"{n / 1_000_000:.1f}M"
|
|
if n >= 10_000:
|
|
return f"{n / 1_000:.1f}K"
|
|
if n >= 1_000:
|
|
return f"{n / 1_000:.1f}K"
|
|
return str(n)
|
|
|
|
|
|
def _fmt_seconds(seconds: float) -> str:
|
|
"""Seconds -> human-friendly duration: '58s', '2m 14s', '58m 57s', '1h 2m'."""
|
|
s = max(0, int(seconds))
|
|
if s < 60:
|
|
return f"{s}s"
|
|
if s < 3600:
|
|
m, sec = divmod(s, 60)
|
|
return f"{m}m {sec}s" if sec else f"{m}m"
|
|
h, remainder = divmod(s, 3600)
|
|
m = remainder // 60
|
|
return f"{h}h {m}m" if m else f"{h}h"
|
|
|
|
|
|
def _bar(pct: float, width: int = 20) -> str:
|
|
"""ASCII progress bar: [████████░░░░░░░░░░░░] 40%."""
|
|
filled = int(pct / 100.0 * width)
|
|
filled = max(0, min(width, filled))
|
|
empty = width - filled
|
|
return f"[{'█' * filled}{'░' * empty}]"
|
|
|
|
|
|
def _bucket_line(label: str, bucket: RateLimitBucket, label_width: int = 14) -> str:
|
|
"""Format one bucket as a single line."""
|
|
if bucket.limit <= 0:
|
|
return f" {label:<{label_width}} (no data)"
|
|
|
|
pct = bucket.usage_pct
|
|
used = _fmt_count(bucket.used)
|
|
limit = _fmt_count(bucket.limit)
|
|
remaining = _fmt_count(bucket.remaining)
|
|
reset = _fmt_seconds(bucket.remaining_seconds_now)
|
|
|
|
bar = _bar(pct)
|
|
return f" {label:<{label_width}} {bar} {pct:5.1f}% {used}/{limit} used ({remaining} left, resets in {reset})"
|
|
|
|
|
|
def format_rate_limit_display(state: RateLimitState) -> str:
|
|
"""Format rate limit state for terminal/chat display."""
|
|
if not state.has_data:
|
|
return "No rate limit data yet — make an API request first."
|
|
|
|
age = state.age_seconds
|
|
if age < 5:
|
|
freshness = "just now"
|
|
elif age < 60:
|
|
freshness = f"{int(age)}s ago"
|
|
else:
|
|
freshness = f"{_fmt_seconds(age)} ago"
|
|
|
|
provider_label = state.provider.title() if state.provider else "Provider"
|
|
|
|
lines = [
|
|
f"{provider_label} Rate Limits (captured {freshness}):",
|
|
"",
|
|
_bucket_line("Requests/min", state.requests_min),
|
|
_bucket_line("Requests/hr", state.requests_hour),
|
|
"",
|
|
_bucket_line("Tokens/min", state.tokens_min),
|
|
_bucket_line("Tokens/hr", state.tokens_hour),
|
|
]
|
|
|
|
# Add warnings if any bucket is getting hot
|
|
warnings = []
|
|
for label, bucket in [
|
|
("requests/min", state.requests_min),
|
|
("requests/hr", state.requests_hour),
|
|
("tokens/min", state.tokens_min),
|
|
("tokens/hr", state.tokens_hour),
|
|
]:
|
|
if bucket.limit > 0 and bucket.usage_pct >= 80:
|
|
reset = _fmt_seconds(bucket.remaining_seconds_now)
|
|
warnings.append(f" ⚠ {label} at {bucket.usage_pct:.0f}% — resets in {reset}")
|
|
|
|
if warnings:
|
|
lines.append("")
|
|
lines.extend(warnings)
|
|
|
|
return "\n".join(lines)
|
|
|
|
|
|
def format_rate_limit_compact(state: RateLimitState) -> str:
|
|
"""One-line compact summary for status bars / gateway messages."""
|
|
if not state.has_data:
|
|
return "No rate limit data."
|
|
|
|
rm = state.requests_min
|
|
tm = state.tokens_min
|
|
rh = state.requests_hour
|
|
th = state.tokens_hour
|
|
|
|
parts = []
|
|
if rm.limit > 0:
|
|
parts.append(f"RPM: {rm.remaining}/{rm.limit}")
|
|
if rh.limit > 0:
|
|
parts.append(f"RPH: {_fmt_count(rh.remaining)}/{_fmt_count(rh.limit)} (resets {_fmt_seconds(rh.remaining_seconds_now)})")
|
|
if tm.limit > 0:
|
|
parts.append(f"TPM: {_fmt_count(tm.remaining)}/{_fmt_count(tm.limit)}")
|
|
if th.limit > 0:
|
|
parts.append(f"TPH: {_fmt_count(th.remaining)}/{_fmt_count(th.limit)} (resets {_fmt_seconds(th.remaining_seconds_now)})")
|
|
|
|
return " | ".join(parts)
|