Compare commits

1 commit: security/f...feat/sover

| Author | SHA1 | Date |
|---|---|---|
|  | 3b09b7b49d |  |
@@ -63,6 +63,11 @@ _API_KEY_PROVIDER_AUX_MODELS: Dict[str, str] = {
     "opencode-zen": "gemini-3-flash",
     "opencode-go": "glm-5",
     "kilocode": "google/gemini-3-flash-preview",
+    # Uniwizard backends
+    "gemini": "gemini-2.5-flash",
+    "groq": "llama-3.3-70b-versatile",
+    "grok": "grok-3-mini-fast",
+    "openrouter": "openai/gpt-4.1-mini",
 }
 
 # OpenRouter app attribution headers
@@ -75,6 +75,22 @@ class CostResult:
     notes: tuple[str, ...] = ()
 
 
+@dataclass(frozen=True)
+class CostBreakdown:
+    input_usd: Optional[Decimal]
+    output_usd: Optional[Decimal]
+    cache_read_usd: Optional[Decimal]
+    cache_write_usd: Optional[Decimal]
+    request_usd: Optional[Decimal]
+    total_usd: Optional[Decimal]
+    status: CostStatus
+    source: CostSource
+    label: str
+    fetched_at: Optional[datetime] = None
+    pricing_version: Optional[str] = None
+    notes: tuple[str, ...] = ()
+
+
 _UTC_NOW = lambda: datetime.now(timezone.utc)
 
 
@@ -93,6 +109,25 @@ _OFFICIAL_DOCS_PRICING: Dict[tuple[str, str], PricingEntry] = {
         source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching",
         pricing_version="anthropic-prompt-caching-2026-03-16",
     ),
+    # Aliases for short model names (Anthropic API resolves these to dated versions)
+    ("anthropic", "claude-opus-4-6"): PricingEntry(
+        input_cost_per_million=Decimal("15.00"),
+        output_cost_per_million=Decimal("75.00"),
+        cache_read_cost_per_million=Decimal("1.50"),
+        cache_write_cost_per_million=Decimal("18.75"),
+        source="official_docs_snapshot",
+        source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching",
+        pricing_version="anthropic-prompt-caching-2026-03-16",
+    ),
+    ("anthropic", "claude-opus-4.6"): PricingEntry(
+        input_cost_per_million=Decimal("15.00"),
+        output_cost_per_million=Decimal("75.00"),
+        cache_read_cost_per_million=Decimal("1.50"),
+        cache_write_cost_per_million=Decimal("18.75"),
+        source="official_docs_snapshot",
+        source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching",
+        pricing_version="anthropic-prompt-caching-2026-03-16",
+    ),
     (
         "anthropic",
         "claude-sonnet-4-20250514",
@@ -105,6 +140,24 @@ _OFFICIAL_DOCS_PRICING: Dict[tuple[str, str], PricingEntry] = {
         source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching",
         pricing_version="anthropic-prompt-caching-2026-03-16",
     ),
+    ("anthropic", "claude-sonnet-4-5"): PricingEntry(
+        input_cost_per_million=Decimal("3.00"),
+        output_cost_per_million=Decimal("15.00"),
+        cache_read_cost_per_million=Decimal("0.30"),
+        cache_write_cost_per_million=Decimal("3.75"),
+        source="official_docs_snapshot",
+        source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching",
+        pricing_version="anthropic-prompt-caching-2026-03-16",
+    ),
+    ("anthropic", "claude-sonnet-4.5"): PricingEntry(
+        input_cost_per_million=Decimal("3.00"),
+        output_cost_per_million=Decimal("15.00"),
+        cache_read_cost_per_million=Decimal("0.30"),
+        cache_write_cost_per_million=Decimal("3.75"),
+        source="official_docs_snapshot",
+        source_url="https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching",
+        pricing_version="anthropic-prompt-caching-2026-03-16",
+    ),
     # OpenAI
     (
         "openai",
@@ -654,3 +707,80 @@ def format_token_count_compact(value: int) -> str:
         return f"{sign}{text}{suffix}"
 
     return f"{value:,}"
+
+
+def estimate_usage_cost_breakdown(
+    model_name: str,
+    usage: CanonicalUsage,
+    *,
+    provider: Optional[str] = None,
+    base_url: Optional[str] = None,
+    api_key: Optional[str] = None,
+) -> CostBreakdown:
+    """Estimate per-bucket cost breakdown for a usage record.
+
+    Returns the same status/source semantics as estimate_usage_cost(), but splits
+    the total into input/cache/output/request components when pricing data is
+    available. For subscription-included routes (e.g. openai-codex), all
+    components are reported as zero-cost instead of unknown.
+    """
+    cost_result = estimate_usage_cost(
+        model_name,
+        usage,
+        provider=provider,
+        base_url=base_url,
+        api_key=api_key,
+    )
+    route = resolve_billing_route(model_name, provider=provider, base_url=base_url)
+    entry = get_pricing_entry(model_name, provider=provider, base_url=base_url, api_key=api_key)
+    if not entry and route.billing_mode == "subscription_included":
+        entry = PricingEntry(
+            input_cost_per_million=_ZERO,
+            output_cost_per_million=_ZERO,
+            cache_read_cost_per_million=_ZERO,
+            cache_write_cost_per_million=_ZERO,
+            request_cost=_ZERO,
+            source="none",
+            pricing_version="included-route",
+        )
+
+    if not entry:
+        return CostBreakdown(
+            input_usd=None,
+            output_usd=None,
+            cache_read_usd=None,
+            cache_write_usd=None,
+            request_usd=None,
+            total_usd=cost_result.amount_usd,
+            status=cost_result.status,
+            source=cost_result.source,
+            label=cost_result.label,
+            fetched_at=cost_result.fetched_at,
+            pricing_version=cost_result.pricing_version,
+            notes=cost_result.notes,
+        )
+
+    def _component(tokens: int, rate: Optional[Decimal]) -> Optional[Decimal]:
+        if rate is None:
+            return None
+        return (Decimal(tokens or 0) * rate) / _ONE_MILLION
+
+    request_usd = None
+    if entry.request_cost is not None:
+        request_usd = Decimal(usage.request_count or 0) * entry.request_cost
+
+    return CostBreakdown(
+        input_usd=_component(usage.input_tokens, entry.input_cost_per_million),
+        output_usd=_component(usage.output_tokens, entry.output_cost_per_million),
+        cache_read_usd=_component(usage.cache_read_tokens, entry.cache_read_cost_per_million),
+        cache_write_usd=_component(usage.cache_write_tokens, entry.cache_write_cost_per_million),
+        request_usd=request_usd,
+        total_usd=cost_result.amount_usd,
+        status=cost_result.status,
+        source=cost_result.source,
+        label=cost_result.label,
+        fetched_at=cost_result.fetched_at,
+        pricing_version=cost_result.pricing_version,
+        notes=cost_result.notes,
+    )
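
Note: a minimal usage sketch of the new breakdown API, not part of the diff. The import path agent.costing is an assumption (the file name is not shown in this capture); the function signature, CanonicalUsage fields, and pricing figures are taken from the hunks above.

    # Sketch only: module path is assumed, not shown in this diff.
    from agent.costing import CanonicalUsage, estimate_usage_cost_breakdown

    usage = CanonicalUsage(
        input_tokens=12_000,
        output_tokens=800,
        cache_read_tokens=4_000,
        cache_write_tokens=1_000,
    )
    breakdown = estimate_usage_cost_breakdown(
        "claude-sonnet-4-5",
        usage,
        provider="anthropic",
    )
    # With the snapshot pricing above (3.00 / 15.00 / 0.30 / 3.75 USD per million):
    #   input_usd       = 12_000 * 3.00 / 1e6  = 0.036
    #   output_usd      =    800 * 15.00 / 1e6 = 0.012
    #   cache_read_usd  =  4_000 * 0.30 / 1e6  = 0.0012
    #   cache_write_usd =  1_000 * 3.75 / 1e6  = 0.00375
    print(breakdown.total_usd, breakdown.status, breakdown.source)
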
cli.py (76 changed lines)

@@ -4563,7 +4563,30 @@ class HermesCLI:
             print("(._.) No API calls made yet in this session.")
             return
 
-        # Current context window state
+        def _fmt_money(amount):
+            return "n/a" if amount is None else f"${float(amount):.4f}"
+
+        def _fmt_limit(remaining, limit):
+            if remaining is None and limit is None:
+                return "n/a"
+            if remaining is None:
+                return f"? / {limit:,}"
+            if limit is None:
+                return f"{remaining:,} / ?"
+            return f"{remaining:,} / {limit:,}"
+
+        def _fmt_reset(seconds):
+            if seconds is None:
+                return "n/a"
+            seconds = int(seconds)
+            if seconds < 60:
+                return f"{seconds}s"
+            minutes, secs = divmod(seconds, 60)
+            if minutes < 60:
+                return f"{minutes}m {secs}s"
+            hours, minutes = divmod(minutes, 60)
+            return f"{hours}h {minutes}m"
+
         compressor = agent.context_compressor
         last_prompt = compressor.last_prompt_tokens
         ctx_len = compressor.context_length
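
Note: the three helpers are pure formatters; representative outputs, traced by hand from the code above (not from test output):

    _fmt_money(None)     -> "n/a"
    _fmt_money(0.0123)   -> "$0.0123"
    _fmt_limit(48, 60)   -> "48 / 60"
    _fmt_limit(None, 60) -> "? / 60"
    _fmt_reset(33)       -> "33s"
    _fmt_reset(90)       -> "1m 30s"
    _fmt_reset(5400)     -> "1h 30m"
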
@@ -4571,14 +4594,21 @@ class HermesCLI:
         compressions = compressor.compression_count
 
         msg_count = len(self.conversation_history)
+        usage = CanonicalUsage(
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+            cache_read_tokens=cache_read_tokens,
+            cache_write_tokens=cache_write_tokens,
+        )
         cost_result = estimate_usage_cost(
             agent.model,
-            CanonicalUsage(
-                input_tokens=input_tokens,
-                output_tokens=output_tokens,
-                cache_read_tokens=cache_read_tokens,
-                cache_write_tokens=cache_write_tokens,
-            ),
+            usage,
+            provider=getattr(agent, "provider", None),
+            base_url=getattr(agent, "base_url", None),
+        )
+        cost_breakdown = estimate_usage_cost_breakdown(
+            agent.model,
+            usage,
             provider=getattr(agent, "provider", None),
             base_url=getattr(agent, "base_url", None),
         )
@@ -4605,6 +4635,38 @@ class HermesCLI:
             print(f" Total cost: {'included':>10}")
         else:
             print(f" Total cost: {'n/a':>10}")
+        print(f" Cost input: {_fmt_money(cost_breakdown.input_usd):>10}")
+        print(f" Cost cache read: {_fmt_money(cost_breakdown.cache_read_usd):>10}")
+        print(f" Cost cache write: {_fmt_money(cost_breakdown.cache_write_usd):>10}")
+        print(f" Cost output: {_fmt_money(cost_breakdown.output_usd):>10}")
+        if cost_breakdown.request_usd is not None:
+            print(f" Cost requests: {_fmt_money(cost_breakdown.request_usd):>10}")
+
+        rate_limits = getattr(agent, "session_openai_rate_limits", None) or {}
+        last_request_id = getattr(agent, "session_last_request_id", None)
+        rate_limit_events = getattr(agent, "session_rate_limit_events", 0) or 0
+        if last_request_id:
+            print(f" Last request id: {last_request_id:>10}")
+        if rate_limits:
+            status_code = rate_limits.get("status_code")
+            if status_code is not None:
+                print(f" Last HTTP status: {status_code:>10}")
+            req_remaining = rate_limits.get("remaining_requests")
+            req_limit = rate_limits.get("limit_requests")
+            req_reset = rate_limits.get("reset_requests_seconds")
+            if req_remaining is not None or req_limit is not None:
+                print(f" Req limit: {_fmt_limit(req_remaining, req_limit):>14} reset {_fmt_reset(req_reset)}")
+            tok_remaining = rate_limits.get("remaining_tokens")
+            tok_limit = rate_limits.get("limit_tokens")
+            tok_reset = rate_limits.get("reset_tokens_seconds")
+            if tok_remaining is not None or tok_limit is not None:
+                print(f" Token limit: {_fmt_limit(tok_remaining, tok_limit):>14} reset {_fmt_reset(tok_reset)}")
+            retry_after = rate_limits.get("retry_after_seconds")
+            if retry_after is not None:
+                print(f" Retry after: {_fmt_reset(retry_after):>10}")
+        if rate_limit_events:
+            print(f" Rate limit hits: {rate_limit_events:>10,}")
+
         print(f" {'─' * 40}")
         print(f" Current context: {last_prompt:,} / {ctx_len:,} ({pct:.0f}%)")
         print(f" Messages: {msg_count}")
@@ -220,6 +220,39 @@ PROVIDER_REGISTRY: Dict[str, ProviderConfig] = {
         api_key_env_vars=("HF_TOKEN",),
         base_url_env_var="HF_BASE_URL",
     ),
+    # ── Uniwizard backends (added 2026-03-30) ─────────────────────────
+    "gemini": ProviderConfig(
+        id="gemini",
+        name="Google Gemini",
+        auth_type="api_key",
+        inference_base_url="https://generativelanguage.googleapis.com/v1beta/openai",
+        api_key_env_vars=("GEMINI_API_KEY",),
+        base_url_env_var="GEMINI_BASE_URL",
+    ),
+    "groq": ProviderConfig(
+        id="groq",
+        name="Groq",
+        auth_type="api_key",
+        inference_base_url="https://api.groq.com/openai/v1",
+        api_key_env_vars=("GROQ_API_KEY",),
+        base_url_env_var="GROQ_BASE_URL",
+    ),
+    "grok": ProviderConfig(
+        id="grok",
+        name="xAI Grok",
+        auth_type="api_key",
+        inference_base_url="https://api.x.ai/v1",
+        api_key_env_vars=("XAI_API_KEY", "GROK_API_KEY"),
+        base_url_env_var="XAI_BASE_URL",
+    ),
+    "openrouter": ProviderConfig(
+        id="openrouter",
+        name="OpenRouter",
+        auth_type="api_key",
+        inference_base_url="https://openrouter.ai/api/v1",
+        api_key_env_vars=("OPENROUTER_API_KEY",),
+        base_url_env_var="OPENROUTER_BASE_URL",
+    ),
 }
 
 
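
Note: for "grok" the tuple order means XAI_API_KEY takes precedence over GROK_API_KEY. A sketch of how such a tuple is typically consumed; the resolver below is hypothetical, since the consuming code is not part of this diff:

    import os
    from typing import Optional, Sequence

    def resolve_api_key(env_vars: Sequence[str]) -> Optional[str]:
        # First non-empty environment variable wins.
        for name in env_vars:
            value = os.environ.get(name)
            if value:
                return value
        return None

    resolve_api_key(("XAI_API_KEY", "GROK_API_KEY"))
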
@@ -13,7 +13,8 @@ license = { text = "MIT" }
 dependencies = [
     # Core — pinned to known-good ranges to limit supply chain attack surface
     "openai>=2.21.0,<3",
-    "anthropic>=0.39.0,<1",\n    "google-genai>=1.2.0,<2",
+    "anthropic>=0.39.0,<1",
+    "google-genai>=1.2.0,<2",
     "python-dotenv>=1.2.1,<2",
     "fire>=0.7.1,<1",
     "httpx>=0.28.1,<1",
run_agent.py (137 changed lines)

@@ -3472,6 +3472,79 @@ class AIAgent:
         http_client = getattr(client, "_client", None)
         return bool(getattr(http_client, "is_closed", False))
 
+    def _coerce_rate_limit_int(self, value: Any) -> Optional[int]:
+        try:
+            if value is None or value == "":
+                return None
+            return int(float(str(value).strip()))
+        except Exception:
+            return None
+
+    def _parse_rate_limit_reset_seconds(self, value: Any) -> Optional[int]:
+        if value is None:
+            return None
+        text = str(value).strip().lower()
+        if not text:
+            return None
+        try:
+            return int(round(float(text)))
+        except Exception:
+            pass
+        total = 0.0
+        matches = re.findall(r"(\d+(?:\.\d+)?)(ms|s|m|h)", text)
+        if not matches:
+            return None
+        for number, unit in matches:
+            value_f = float(number)
+            if unit == "ms":
+                total += value_f / 1000.0
+            elif unit == "s":
+                total += value_f
+            elif unit == "m":
+                total += value_f * 60.0
+            elif unit == "h":
+                total += value_f * 3600.0
+        return int(round(total))
+
+    def _capture_openai_http_response(self, response: Any) -> None:
+        if self.api_mode == "anthropic_messages":
+            return
+        headers = getattr(response, "headers", None)
+        if not headers:
+            return
+        lowered = {str(k).lower(): str(v) for k, v in headers.items()}
+        telemetry = dict(getattr(self, "session_openai_rate_limits", {}) or {})
+
+        def _put(key: str, value: Any) -> None:
+            if value is not None:
+                telemetry[key] = value
+
+        _put("status_code", getattr(response, "status_code", None))
+        _put("limit_requests", self._coerce_rate_limit_int(lowered.get("x-ratelimit-limit-requests")))
+        _put("remaining_requests", self._coerce_rate_limit_int(lowered.get("x-ratelimit-remaining-requests")))
+        _put("limit_tokens", self._coerce_rate_limit_int(lowered.get("x-ratelimit-limit-tokens")))
+        _put("remaining_tokens", self._coerce_rate_limit_int(lowered.get("x-ratelimit-remaining-tokens")))
+        _put("reset_requests_seconds", self._parse_rate_limit_reset_seconds(lowered.get("x-ratelimit-reset-requests")))
+        _put("reset_tokens_seconds", self._parse_rate_limit_reset_seconds(lowered.get("x-ratelimit-reset-tokens")))
+
+        retry_after_seconds = None
+        retry_after_ms = self._coerce_rate_limit_int(lowered.get("retry-after-ms"))
+        if retry_after_ms is not None:
+            retry_after_seconds = max(0, int(round(retry_after_ms / 1000.0)))
+        if retry_after_seconds is None:
+            retry_after_seconds = self._parse_rate_limit_reset_seconds(lowered.get("retry-after"))
+        _put("retry_after_seconds", retry_after_seconds)
+        _put("observed_at", int(time.time()))
+
+        request_id = lowered.get("x-request-id") or lowered.get("openai-request-id")
+        if request_id:
+            self.session_last_request_id = request_id
+            _put("request_id", request_id)
+
+        self.session_openai_rate_limits = telemetry
+        if getattr(response, "status_code", None) == 429:
+            self.session_rate_limit_events = (getattr(self, "session_rate_limit_events", 0) or 0) + 1
+
     def _create_openai_client(self, client_kwargs: dict, *, reason: str, shared: bool) -> Any:
         if self.provider == "copilot-acp" or str(client_kwargs.get("base_url", "")).startswith("acp://copilot"):
             from agent.copilot_acp_client import CopilotACPClient
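
Note: _parse_rate_limit_reset_seconds accepts either a bare number of seconds or a compound duration in the form the regex above matches. Representative values, traced by hand from the code (not from test output):

    "12"    -> 12      (plain-number path)
    "1m30s" -> 90      (60 + 30)
    "250ms" -> 0       (0.25 s rounds to 0)
    "2h"    -> 7200
    "soon"  -> None    (no numeric match)
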
@@ -3485,6 +3558,23 @@ class AIAgent:
             )
             return client
         client = OpenAI(**client_kwargs)
+        http_client = getattr(client, "_client", None)
+        if http_client is not None and not getattr(http_client, "_hermes_response_telemetry_installed", False):
+            original_send = http_client.send
+
+            def _send_with_telemetry(request, *args, **kwargs):
+                response = original_send(request, *args, **kwargs)
+                try:
+                    self._capture_openai_http_response(response)
+                except Exception as exc:
+                    logger.debug("OpenAI response telemetry capture failed: %s", exc)
+                return response
+
+            http_client.send = _send_with_telemetry
+            try:
+                setattr(http_client, "_hermes_response_telemetry_installed", True)
+            except Exception:
+                pass
         logger.info(
             "OpenAI client created (%s, shared=%s) %s",
             reason,
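
Note on the wrapper: it patches the SDK's private httpx client (client._client) rather than subclassing, and the _hermes_response_telemetry_installed sentinel keeps the patch idempotent when _create_openai_client runs more than once for shared and per-call clients. Capture errors are demoted to debug logs, so telemetry can never fail a request.
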
@@ -7466,6 +7556,53 @@ class AIAgent:
             if hasattr(self, '_incomplete_scratchpad_retries'):
                 self._incomplete_scratchpad_retries = 0
 
+            # ── Uniwizard: Semantic refusal detection ──────────────────
+            # Catches 200 OK responses where the model REFUSED the request.
+            # No existing LLM gateway does this. This is novel.
+            if (assistant_message.content
+                    and not assistant_message.tool_calls
+                    and self._fallback_index < len(self._fallback_chain)):
+                _refusal_text = (assistant_message.content or "").strip()
+                _REFUSAL_PATTERNS = (
+                    "I can't help with",
+                    "I cannot help with",
+                    "I'm not able to",
+                    "I am not able to",
+                    "I must decline",
+                    "I'm unable to",
+                    "I am unable to",
+                    "against my guidelines",
+                    "against my policy",
+                    "I can't assist with",
+                    "I cannot assist with",
+                    "I apologize, but I can't",
+                    "I'm sorry, but I can't",
+                    "I'm sorry, but I cannot",
+                    "not something I can help",
+                    "I don't think I should",
+                    "I can't fulfill that",
+                    "I cannot fulfill that",
+                    "I'm not comfortable",
+                    "I can't provide",
+                    "I cannot provide",
+                )
+                _refusal_lower = _refusal_text.lower()
+                _is_refusal = any(p.lower() in _refusal_lower for p in _REFUSAL_PATTERNS)
+                if _is_refusal:
+                    _fb_target = self._fallback_chain[self._fallback_index]
+                    self._emit_status(
+                        f"🚫 Semantic refusal detected from {self.provider}/{self.model}. "
+                        f"Rerouting to {_fb_target.get('model', '?')} via {_fb_target.get('provider', '?')}..."
+                    )
+                    logging.warning(
+                        "Refusal detected from %s/%s: %.80s...",
+                        self.provider, self.model, _refusal_text,
+                    )
+                    if self._try_activate_fallback():
+                        retry_count = 0
+                        continue
+            # ── End refusal detection ──────────────────────────────────
+
             if self.api_mode == "codex_responses" and finish_reason == "incomplete":
                 if not hasattr(self, "_codex_incomplete_retries"):
                     self._codex_incomplete_retries = 0
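
Note: the refusal check is plain case-insensitive substring matching, gated so it can only fire when the reply has no tool calls and a fallback target remains. A standalone sketch of the matcher (pattern list abbreviated):

    _PATTERNS = ("i can't help with", "i must decline", "i'm unable to")  # excerpt, pre-lowered

    def is_refusal(text: str) -> bool:
        lowered = text.strip().lower()
        return any(p in lowered for p in _PATTERNS)

    is_refusal("I'm sorry, I must decline that request.")        # True: reroutes to fallback
    is_refusal("Here's the refactored function you asked for.")  # False: normal flow

A reply that merely quotes a refusal phrase would also match; the tool-call and fallback-index guards above are the only protection against that false positive.
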
@@ -144,6 +144,42 @@ class TestCLIUsageReport:
         assert "0.064" in output
         assert "Session duration:" in output
         assert "Compressions:" in output
+        assert "Cost input:" in output
+        assert "Cost output:" in output
+
+    def test_show_usage_displays_rate_limit_telemetry(self, capsys):
+        cli_obj = _attach_agent(
+            _make_cli(model="openai/gpt-5.4"),
+            prompt_tokens=10_000,
+            completion_tokens=500,
+            total_tokens=10_500,
+            api_calls=3,
+            context_tokens=10_500,
+            context_length=200_000,
+        )
+        cli_obj.agent.provider = "openai-codex"
+        cli_obj.agent.session_openai_rate_limits = {
+            "status_code": 200,
+            "limit_requests": 60,
+            "remaining_requests": 48,
+            "reset_requests_seconds": 33,
+            "limit_tokens": 2000000,
+            "remaining_tokens": 1750000,
+            "reset_tokens_seconds": 90,
+            "retry_after_seconds": 5,
+        }
+        cli_obj.agent.session_last_request_id = "req_123"
+        cli_obj.agent.session_rate_limit_events = 2
+        cli_obj.verbose = False
+
+        cli_obj._show_usage()
+        output = capsys.readouterr().out
+
+        assert "Last request id:" in output
+        assert "Req limit:" in output
+        assert "Token limit:" in output
+        assert "Retry after:" in output
+        assert "Rate limit hits:" in output
+
     def test_show_usage_marks_unknown_pricing(self, capsys):
         cli_obj = _attach_agent(