Compare commits

1 commit: security/f...feat/sover

| Author | SHA1 | Date |
|---|---|---|
|  | 3b09b7b49d |  |
@@ -63,6 +63,11 @@ _API_KEY_PROVIDER_AUX_MODELS: Dict[str, str] = {
     "opencode-zen": "gemini-3-flash",
     "opencode-go": "glm-5",
     "kilocode": "google/gemini-3-flash-preview",
+    # Uniwizard backends
+    "gemini": "gemini-2.5-flash",
+    "groq": "llama-3.3-70b-versatile",
+    "grok": "grok-3-mini-fast",
+    "openrouter": "openai/gpt-4.1-mini",
 }
 
 # OpenRouter app attribution headers
@@ -75,6 +75,22 @@ class CostResult:
     notes: tuple[str, ...] = ()
 
 
+@dataclass(frozen=True)
+class CostBreakdown:
+    input_usd: Optional[Decimal]
+    output_usd: Optional[Decimal]
+    cache_read_usd: Optional[Decimal]
+    cache_write_usd: Optional[Decimal]
+    request_usd: Optional[Decimal]
+    total_usd: Optional[Decimal]
+    status: CostStatus
+    source: CostSource
+    label: str
+    fetched_at: Optional[datetime] = None
+    pricing_version: Optional[str] = None
+    notes: tuple[str, ...] = ()
+
+
 _UTC_NOW = lambda: datetime.now(timezone.utc)
 
 
@@ -93,6 +109,25 @@ _OFFICIAL_DOCS_PRICING: Dict[tuple[str, str], PricingEntry] = {
         source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching",
         pricing_version="anthropic-prompt-caching-2026-03-16",
     ),
+    # Aliases for short model names (Anthropic API resolves these to dated versions)
+    ("anthropic", "claude-opus-4-6"): PricingEntry(
+        input_cost_per_million=Decimal("15.00"),
+        output_cost_per_million=Decimal("75.00"),
+        cache_read_cost_per_million=Decimal("1.50"),
+        cache_write_cost_per_million=Decimal("18.75"),
+        source="official_docs_snapshot",
+        source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching",
+        pricing_version="anthropic-prompt-caching-2026-03-16",
+    ),
+    ("anthropic", "claude-opus-4.6"): PricingEntry(
+        input_cost_per_million=Decimal("15.00"),
+        output_cost_per_million=Decimal("75.00"),
+        cache_read_cost_per_million=Decimal("1.50"),
+        cache_write_cost_per_million=Decimal("18.75"),
+        source="official_docs_snapshot",
+        source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching",
+        pricing_version="anthropic-prompt-caching-2026-03-16",
+    ),
     (
         "anthropic",
         "claude-sonnet-4-20250514",
@@ -105,6 +140,24 @@ _OFFICIAL_DOCS_PRICING: Dict[tuple[str, str], PricingEntry] = {
         source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching",
         pricing_version="anthropic-prompt-caching-2026-03-16",
     ),
+    ("anthropic", "claude-sonnet-4-5"): PricingEntry(
+        input_cost_per_million=Decimal("3.00"),
+        output_cost_per_million=Decimal("15.00"),
+        cache_read_cost_per_million=Decimal("0.30"),
+        cache_write_cost_per_million=Decimal("3.75"),
+        source="official_docs_snapshot",
+        source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching",
+        pricing_version="anthropic-prompt-caching-2026-03-16",
+    ),
+    ("anthropic", "claude-sonnet-4.5"): PricingEntry(
+        input_cost_per_million=Decimal("3.00"),
+        output_cost_per_million=Decimal("15.00"),
+        cache_read_cost_per_million=Decimal("0.30"),
+        cache_write_cost_per_million=Decimal("3.75"),
+        source="official_docs_snapshot",
+        source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching",
+        pricing_version="anthropic-prompt-caching-2026-03-16",
+    ),
     # OpenAI
     (
         "openai",
@@ -654,3 +707,80 @@ def format_token_count_compact(value: int) -> str:
         return f"{sign}{text}{suffix}"
 
     return f"{value:,}"
+
+
+def estimate_usage_cost_breakdown(
+    model_name: str,
+    usage: CanonicalUsage,
+    *,
+    provider: Optional[str] = None,
+    base_url: Optional[str] = None,
+    api_key: Optional[str] = None,
+) -> CostBreakdown:
+    """Estimate per-bucket cost breakdown for a usage record.
+
+    Returns the same status/source semantics as estimate_usage_cost(), but splits
+    the total into input/cache/output/request components when pricing data is
+    available. For subscription-included routes (e.g. openai-codex), all
+    components are reported as zero-cost instead of unknown.
+    """
+    cost_result = estimate_usage_cost(
+        model_name,
+        usage,
+        provider=provider,
+        base_url=base_url,
+        api_key=api_key,
+    )
+    route = resolve_billing_route(model_name, provider=provider, base_url=base_url)
+    entry = get_pricing_entry(model_name, provider=provider, base_url=base_url, api_key=api_key)
+    if not entry and route.billing_mode == "subscription_included":
+        entry = PricingEntry(
+            input_cost_per_million=_ZERO,
+            output_cost_per_million=_ZERO,
+            cache_read_cost_per_million=_ZERO,
+            cache_write_cost_per_million=_ZERO,
+            request_cost=_ZERO,
+            source="none",
+            pricing_version="included-route",
+        )
+
+    if not entry:
+        return CostBreakdown(
+            input_usd=None,
+            output_usd=None,
+            cache_read_usd=None,
+            cache_write_usd=None,
+            request_usd=None,
+            total_usd=cost_result.amount_usd,
+            status=cost_result.status,
+            source=cost_result.source,
+            label=cost_result.label,
+            fetched_at=cost_result.fetched_at,
+            pricing_version=cost_result.pricing_version,
+            notes=cost_result.notes,
+        )
+
+    def _component(tokens: int, rate: Optional[Decimal]) -> Optional[Decimal]:
+        if rate is None:
+            return None
+        return (Decimal(tokens or 0) * rate) / _ONE_MILLION
+
+    request_usd = None
+    if entry.request_cost is not None:
+        request_usd = Decimal(usage.request_count or 0) * entry.request_cost
+
+    return CostBreakdown(
+        input_usd=_component(usage.input_tokens, entry.input_cost_per_million),
+        output_usd=_component(usage.output_tokens, entry.output_cost_per_million),
+        cache_read_usd=_component(usage.cache_read_tokens, entry.cache_read_cost_per_million),
+        cache_write_usd=_component(usage.cache_write_tokens, entry.cache_write_cost_per_million),
+        request_usd=request_usd,
+        total_usd=cost_result.amount_usd,
+        status=cost_result.status,
+        source=cost_result.source,
+        label=cost_result.label,
+        fetched_at=cost_result.fetched_at,
+        pricing_version=cost_result.pricing_version,
+        notes=cost_result.notes,
+    )
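
Note: a minimal usage sketch of the new breakdown API, not part of the diff. The import path agent.costing is an assumption (the file name is not shown in this capture); the function signature, CanonicalUsage fields, and pricing figures are taken from the hunks above.

    # Sketch only: module path is assumed, not shown in this diff.
    from agent.costing import CanonicalUsage, estimate_usage_cost_breakdown

    usage = CanonicalUsage(
        input_tokens=12_000,
        output_tokens=800,
        cache_read_tokens=4_000,
        cache_write_tokens=1_000,
    )
    breakdown = estimate_usage_cost_breakdown(
        "claude-sonnet-4-5",
        usage,
        provider="anthropic",
    )
    # With the snapshot pricing above (3.00 / 15.00 / 0.30 / 3.75 USD per million):
    #   input_usd       = 12_000 * 3.00 / 1e6  = 0.036
    #   output_usd      =    800 * 15.00 / 1e6 = 0.012
    #   cache_read_usd  =  4_000 * 0.30 / 1e6  = 0.0012
    #   cache_write_usd =  1_000 * 3.75 / 1e6  = 0.00375
    print(breakdown.total_usd, breakdown.status, breakdown.source)
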
cli.py (76 changed lines)

@@ -4563,7 +4563,30 @@ class HermesCLI:
             print("(._.) No API calls made yet in this session.")
             return
 
-        # Current context window state
+        def _fmt_money(amount):
+            return "n/a" if amount is None else f"${float(amount):.4f}"
+
+        def _fmt_limit(remaining, limit):
+            if remaining is None and limit is None:
+                return "n/a"
+            if remaining is None:
+                return f"? / {limit:,}"
+            if limit is None:
+                return f"{remaining:,} / ?"
+            return f"{remaining:,} / {limit:,}"
+
+        def _fmt_reset(seconds):
+            if seconds is None:
+                return "n/a"
+            seconds = int(seconds)
+            if seconds < 60:
+                return f"{seconds}s"
+            minutes, secs = divmod(seconds, 60)
+            if minutes < 60:
+                return f"{minutes}m {secs}s"
+            hours, minutes = divmod(minutes, 60)
+            return f"{hours}h {minutes}m"
+
         compressor = agent.context_compressor
         last_prompt = compressor.last_prompt_tokens
         ctx_len = compressor.context_length
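
Note: the three helpers are pure formatters; representative outputs, traced by hand from the code above (not from test output):

    _fmt_money(None)     -> "n/a"
    _fmt_money(0.0123)   -> "$0.0123"
    _fmt_limit(48, 60)   -> "48 / 60"
    _fmt_limit(None, 60) -> "? / 60"
    _fmt_reset(33)       -> "33s"
    _fmt_reset(90)       -> "1m 30s"
    _fmt_reset(5400)     -> "1h 30m"
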
@@ -4571,14 +4594,21 @@ class HermesCLI:
         compressions = compressor.compression_count
 
         msg_count = len(self.conversation_history)
+        usage = CanonicalUsage(
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+            cache_read_tokens=cache_read_tokens,
+            cache_write_tokens=cache_write_tokens,
+        )
         cost_result = estimate_usage_cost(
             agent.model,
-            CanonicalUsage(
-                input_tokens=input_tokens,
-                output_tokens=output_tokens,
-                cache_read_tokens=cache_read_tokens,
-                cache_write_tokens=cache_write_tokens,
-            ),
+            usage,
+            provider=getattr(agent, "provider", None),
+            base_url=getattr(agent, "base_url", None),
+        )
+        cost_breakdown = estimate_usage_cost_breakdown(
+            agent.model,
+            usage,
             provider=getattr(agent, "provider", None),
             base_url=getattr(agent, "base_url", None),
         )
@@ -4605,6 +4635,38 @@ class HermesCLI:
             print(f" Total cost: {'included':>10}")
         else:
             print(f" Total cost: {'n/a':>10}")
+        print(f" Cost input: {_fmt_money(cost_breakdown.input_usd):>10}")
+        print(f" Cost cache read: {_fmt_money(cost_breakdown.cache_read_usd):>10}")
+        print(f" Cost cache write: {_fmt_money(cost_breakdown.cache_write_usd):>10}")
+        print(f" Cost output: {_fmt_money(cost_breakdown.output_usd):>10}")
+        if cost_breakdown.request_usd is not None:
+            print(f" Cost requests: {_fmt_money(cost_breakdown.request_usd):>10}")
+
+        rate_limits = getattr(agent, "session_openai_rate_limits", None) or {}
+        last_request_id = getattr(agent, "session_last_request_id", None)
+        rate_limit_events = getattr(agent, "session_rate_limit_events", 0) or 0
+        if last_request_id:
+            print(f" Last request id: {last_request_id:>10}")
+        if rate_limits:
+            status_code = rate_limits.get("status_code")
+            if status_code is not None:
+                print(f" Last HTTP status: {status_code:>10}")
+            req_remaining = rate_limits.get("remaining_requests")
+            req_limit = rate_limits.get("limit_requests")
+            req_reset = rate_limits.get("reset_requests_seconds")
+            if req_remaining is not None or req_limit is not None:
+                print(f" Req limit: {_fmt_limit(req_remaining, req_limit):>14} reset {_fmt_reset(req_reset)}")
+            tok_remaining = rate_limits.get("remaining_tokens")
+            tok_limit = rate_limits.get("limit_tokens")
+            tok_reset = rate_limits.get("reset_tokens_seconds")
+            if tok_remaining is not None or tok_limit is not None:
+                print(f" Token limit: {_fmt_limit(tok_remaining, tok_limit):>14} reset {_fmt_reset(tok_reset)}")
+            retry_after = rate_limits.get("retry_after_seconds")
+            if retry_after is not None:
+                print(f" Retry after: {_fmt_reset(retry_after):>10}")
+        if rate_limit_events:
+            print(f" Rate limit hits: {rate_limit_events:>10,}")
+
         print(f" {'─' * 40}")
         print(f" Current context: {last_prompt:,} / {ctx_len:,} ({pct:.0f}%)")
         print(f" Messages: {msg_count}")
@@ -220,6 +220,39 @@ PROVIDER_REGISTRY: Dict[str, ProviderConfig] = {
         api_key_env_vars=("HF_TOKEN",),
         base_url_env_var="HF_BASE_URL",
     ),
+    # ── Uniwizard backends (added 2026-03-30) ─────────────────────────
+    "gemini": ProviderConfig(
+        id="gemini",
+        name="Google Gemini",
+        auth_type="api_key",
+        inference_base_url="https://generativelanguage.googleapis.com/v1beta/openai",
+        api_key_env_vars=("GEMINI_API_KEY",),
+        base_url_env_var="GEMINI_BASE_URL",
+    ),
+    "groq": ProviderConfig(
+        id="groq",
+        name="Groq",
+        auth_type="api_key",
+        inference_base_url="https://api.groq.com/openai/v1",
+        api_key_env_vars=("GROQ_API_KEY",),
+        base_url_env_var="GROQ_BASE_URL",
+    ),
+    "grok": ProviderConfig(
+        id="grok",
+        name="xAI Grok",
+        auth_type="api_key",
+        inference_base_url="https://api.x.ai/v1",
+        api_key_env_vars=("XAI_API_KEY", "GROK_API_KEY"),
+        base_url_env_var="XAI_BASE_URL",
+    ),
+    "openrouter": ProviderConfig(
+        id="openrouter",
+        name="OpenRouter",
+        auth_type="api_key",
+        inference_base_url="https://openrouter.ai/api/v1",
+        api_key_env_vars=("OPENROUTER_API_KEY",),
+        base_url_env_var="OPENROUTER_BASE_URL",
+    ),
 }
 
 
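
Note: for "grok" the tuple order means XAI_API_KEY takes precedence over GROK_API_KEY. A sketch of how such a tuple is typically consumed; the resolver below is hypothetical, since the consuming code is not part of this diff:

    import os
    from typing import Optional, Sequence

    def resolve_api_key(env_vars: Sequence[str]) -> Optional[str]:
        # First non-empty environment variable wins.
        for name in env_vars:
            value = os.environ.get(name)
            if value:
                return value
        return None

    resolve_api_key(("XAI_API_KEY", "GROK_API_KEY"))
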
@@ -13,7 +13,8 @@ license = { text = "MIT" }
 dependencies = [
     # Core — pinned to known-good ranges to limit supply chain attack surface
     "openai>=2.21.0,<3",
-    "anthropic>=0.39.0,<1",\n    "google-genai>=1.2.0,<2",
+    "anthropic>=0.39.0,<1",
+    "google-genai>=1.2.0,<2",
     "python-dotenv>=1.2.1,<2",
     "fire>=0.7.1,<1",
     "httpx>=0.28.1,<1",
run_agent.py (137 changed lines)

@@ -3472,6 +3472,79 @@ class AIAgent:
         http_client = getattr(client, "_client", None)
         return bool(getattr(http_client, "is_closed", False))
 
+    def _coerce_rate_limit_int(self, value: Any) -> Optional[int]:
+        try:
+            if value is None or value == "":
+                return None
+            return int(float(str(value).strip()))
+        except Exception:
+            return None
+
+    def _parse_rate_limit_reset_seconds(self, value: Any) -> Optional[int]:
+        if value is None:
+            return None
+        text = str(value).strip().lower()
+        if not text:
+            return None
+        try:
+            return int(round(float(text)))
+        except Exception:
+            pass
+        total = 0.0
+        matches = re.findall(r"(\d+(?:\.\d+)?)(ms|s|m|h)", text)
+        if not matches:
+            return None
+        for number, unit in matches:
+            value_f = float(number)
+            if unit == "ms":
+                total += value_f / 1000.0
+            elif unit == "s":
+                total += value_f
+            elif unit == "m":
+                total += value_f * 60.0
+            elif unit == "h":
+                total += value_f * 3600.0
+        return int(round(total))
+
+    def _capture_openai_http_response(self, response: Any) -> None:
+        if self.api_mode == "anthropic_messages":
+            return
+        headers = getattr(response, "headers", None)
+        if not headers:
+            return
+        lowered = {str(k).lower(): str(v) for k, v in headers.items()}
+        telemetry = dict(getattr(self, "session_openai_rate_limits", {}) or {})
+
+        def _put(key: str, value: Any) -> None:
+            if value is not None:
+                telemetry[key] = value
+
+        _put("status_code", getattr(response, "status_code", None))
+        _put("limit_requests", self._coerce_rate_limit_int(lowered.get("x-ratelimit-limit-requests")))
+        _put("remaining_requests", self._coerce_rate_limit_int(lowered.get("x-ratelimit-remaining-requests")))
+        _put("limit_tokens", self._coerce_rate_limit_int(lowered.get("x-ratelimit-limit-tokens")))
+        _put("remaining_tokens", self._coerce_rate_limit_int(lowered.get("x-ratelimit-remaining-tokens")))
+        _put("reset_requests_seconds", self._parse_rate_limit_reset_seconds(lowered.get("x-ratelimit-reset-requests")))
+        _put("reset_tokens_seconds", self._parse_rate_limit_reset_seconds(lowered.get("x-ratelimit-reset-tokens")))
+
+        retry_after_seconds = None
+        retry_after_ms = self._coerce_rate_limit_int(lowered.get("retry-after-ms"))
+        if retry_after_ms is not None:
+            retry_after_seconds = max(0, int(round(retry_after_ms / 1000.0)))
+        if retry_after_seconds is None:
+            retry_after_seconds = self._parse_rate_limit_reset_seconds(lowered.get("retry-after"))
+        _put("retry_after_seconds", retry_after_seconds)
+        _put("observed_at", int(time.time()))
+
+        request_id = lowered.get("x-request-id") or lowered.get("openai-request-id")
+        if request_id:
+            self.session_last_request_id = request_id
+            _put("request_id", request_id)
+
+        self.session_openai_rate_limits = telemetry
+        if getattr(response, "status_code", None) == 429:
+            self.session_rate_limit_events = (getattr(self, "session_rate_limit_events", 0) or 0) + 1
+
     def _create_openai_client(self, client_kwargs: dict, *, reason: str, shared: bool) -> Any:
         if self.provider == "copilot-acp" or str(client_kwargs.get("base_url", "")).startswith("acp://copilot"):
             from agent.copilot_acp_client import CopilotACPClient
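
Note: _parse_rate_limit_reset_seconds accepts either a bare number of seconds or a compound duration in the form the regex above matches. Representative values, traced by hand from the code (not from test output):

    "12"    -> 12      (plain-number path)
    "1m30s" -> 90      (60 + 30)
    "250ms" -> 0       (0.25 s rounds to 0)
    "2h"    -> 7200
    "soon"  -> None    (no numeric match)
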
@@ -3485,6 +3558,23 @@ class AIAgent:
             )
             return client
         client = OpenAI(**client_kwargs)
+        http_client = getattr(client, "_client", None)
+        if http_client is not None and not getattr(http_client, "_hermes_response_telemetry_installed", False):
+            original_send = http_client.send
+
+            def _send_with_telemetry(request, *args, **kwargs):
+                response = original_send(request, *args, **kwargs)
+                try:
+                    self._capture_openai_http_response(response)
+                except Exception as exc:
+                    logger.debug("OpenAI response telemetry capture failed: %s", exc)
+                return response
+
+            http_client.send = _send_with_telemetry
+            try:
+                setattr(http_client, "_hermes_response_telemetry_installed", True)
+            except Exception:
+                pass
         logger.info(
             "OpenAI client created (%s, shared=%s) %s",
             reason,
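
Note on the wrapper: it patches the SDK's private httpx client (client._client) rather than subclassing, and the _hermes_response_telemetry_installed sentinel keeps the patch idempotent when _create_openai_client runs more than once for shared and per-call clients. Capture errors are demoted to debug logs, so telemetry can never fail a request.
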
@@ -7466,6 +7556,53 @@ class AIAgent:
             if hasattr(self, '_incomplete_scratchpad_retries'):
                 self._incomplete_scratchpad_retries = 0
 
+            # ── Uniwizard: Semantic refusal detection ──────────────────
+            # Catches 200 OK responses where the model REFUSED the request.
+            # No existing LLM gateway does this. This is novel.
+            if (assistant_message.content
+                    and not assistant_message.tool_calls
+                    and self._fallback_index < len(self._fallback_chain)):
+                _refusal_text = (assistant_message.content or "").strip()
+                _REFUSAL_PATTERNS = (
+                    "I can't help with",
+                    "I cannot help with",
+                    "I'm not able to",
+                    "I am not able to",
+                    "I must decline",
+                    "I'm unable to",
+                    "I am unable to",
+                    "against my guidelines",
+                    "against my policy",
+                    "I can't assist with",
+                    "I cannot assist with",
+                    "I apologize, but I can't",
+                    "I'm sorry, but I can't",
+                    "I'm sorry, but I cannot",
+                    "not something I can help",
+                    "I don't think I should",
+                    "I can't fulfill that",
+                    "I cannot fulfill that",
+                    "I'm not comfortable",
+                    "I can't provide",
+                    "I cannot provide",
+                )
+                _refusal_lower = _refusal_text.lower()
+                _is_refusal = any(p.lower() in _refusal_lower for p in _REFUSAL_PATTERNS)
+                if _is_refusal:
+                    _fb_target = self._fallback_chain[self._fallback_index]
+                    self._emit_status(
+                        f"🚫 Semantic refusal detected from {self.provider}/{self.model}. "
+                        f"Rerouting to {_fb_target.get('model', '?')} via {_fb_target.get('provider', '?')}..."
+                    )
+                    logging.warning(
+                        "Refusal detected from %s/%s: %.80s...",
+                        self.provider, self.model, _refusal_text,
+                    )
+                    if self._try_activate_fallback():
+                        retry_count = 0
+                        continue
+            # ── End refusal detection ──────────────────────────────────
+
             if self.api_mode == "codex_responses" and finish_reason == "incomplete":
                 if not hasattr(self, "_codex_incomplete_retries"):
                     self._codex_incomplete_retries = 0
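
Note: the refusal check is plain case-insensitive substring matching, gated so it can only fire when the reply has no tool calls and a fallback target remains. A standalone sketch of the matcher (pattern list abbreviated):

    _PATTERNS = ("i can't help with", "i must decline", "i'm unable to")  # excerpt, pre-lowered

    def is_refusal(text: str) -> bool:
        lowered = text.strip().lower()
        return any(p in lowered for p in _PATTERNS)

    is_refusal("I'm sorry, I must decline that request.")        # True: reroutes to fallback
    is_refusal("Here's the refactored function you asked for.")  # False: normal flow

A reply that merely quotes a refusal phrase would also match; the tool-call and fallback-index guards above are the only protection against that false positive.
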
@@ -144,6 +144,42 @@ class TestCLIUsageReport:
         assert "0.064" in output
         assert "Session duration:" in output
         assert "Compressions:" in output
+        assert "Cost input:" in output
+        assert "Cost output:" in output
+
+    def test_show_usage_displays_rate_limit_telemetry(self, capsys):
+        cli_obj = _attach_agent(
+            _make_cli(model="openai/gpt-5.4"),
+            prompt_tokens=10_000,
+            completion_tokens=500,
+            total_tokens=10_500,
+            api_calls=3,
+            context_tokens=10_500,
+            context_length=200_000,
+        )
+        cli_obj.agent.provider = "openai-codex"
+        cli_obj.agent.session_openai_rate_limits = {
+            "status_code": 200,
+            "limit_requests": 60,
+            "remaining_requests": 48,
+            "reset_requests_seconds": 33,
+            "limit_tokens": 2000000,
+            "remaining_tokens": 1750000,
+            "reset_tokens_seconds": 90,
+            "retry_after_seconds": 5,
+        }
+        cli_obj.agent.session_last_request_id = "req_123"
+        cli_obj.agent.session_rate_limit_events = 2
+        cli_obj.verbose = False
+
+        cli_obj._show_usage()
+        output = capsys.readouterr().out
+
+        assert "Last request id:" in output
+        assert "Req limit:" in output
+        assert "Token limit:" in output
+        assert "Retry after:" in output
+        assert "Rate limit hits:" in output
+
     def test_show_usage_marks_unknown_pricing(self, capsys):
         cli_obj = _attach_agent(