Automated salvage commit — agent session ended (exit 124). Work in progress, may need continuation.
80 lines
2.9 KiB
Python
80 lines
2.9 KiB
Python
"""Circuit-breaker and health tracking for the Cascade LLM Router.

Standalone functions that mutate Provider state in place.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import time
|
|
from datetime import UTC, datetime
|
|
|
|
from infrastructure.router.models import CircuitState, Provider, ProviderStatus, RouterConfig
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def record_success(provider: Provider, latency_ms: float, config: RouterConfig) -> None:
    """Account for one successful request against ``provider``.

    Bumps the request/success counters, adds ``latency_ms`` to the running
    latency total, stamps the last-request time (UTC, ISO-8601) and clears
    the consecutive-failure streak.

    While the breaker is HALF_OPEN each success is also counted as a probe;
    once ``config.circuit_breaker_half_open_max_calls`` probes have
    succeeded the breaker is closed via ``close_circuit``.

    Finally the health status is refreshed from ``metrics.error_rate``:
    below 0.1 -> HEALTHY, below 0.3 -> DEGRADED, otherwise left untouched.
    """
    stats = provider.metrics
    stats.total_requests += 1
    stats.successful_requests += 1
    stats.total_latency_ms += latency_ms
    stats.last_request_time = datetime.now(UTC).isoformat()
    stats.consecutive_failures = 0

    # A success during the probe window counts toward closing the breaker.
    if provider.circuit_state == CircuitState.HALF_OPEN:
        provider.half_open_calls += 1
        enough_probes = provider.half_open_calls >= config.circuit_breaker_half_open_max_calls
        if enough_probes:
            close_circuit(provider)

    # Re-derive health from the error rate; a rate of 0.3 or more keeps
    # whatever status the provider already had.
    current_error_rate = stats.error_rate
    if current_error_rate < 0.1:
        provider.status = ProviderStatus.HEALTHY
    elif current_error_rate < 0.3:
        provider.status = ProviderStatus.DEGRADED
|
|
|
|
|
|
def record_failure(provider: Provider, config: RouterConfig) -> None:
    """Record a failed request and trip the circuit breaker when warranted.

    Updates the request/failure counters, stamps the last-error time (UTC,
    ISO-8601) and extends the consecutive-failure streak.

    The breaker is (re)opened via ``open_circuit`` when either:

    * the provider was HALF_OPEN -- a failed probe means the fault is still
      present, so the breaker goes straight back to OPEN regardless of the
      streak length; or
    * the streak reached ``config.circuit_breaker_failure_threshold``.

    When the breaker is opened the status stays at the UNHEALTHY value set
    by ``open_circuit``; it is not re-derived from the error rate, so an
    OPEN breaker is never reported as merely DEGRADED.

    Otherwise the status follows ``metrics.error_rate``: above 0.5 ->
    UNHEALTHY, above 0.3 -> DEGRADED, else left untouched.
    """
    metrics = provider.metrics
    metrics.total_requests += 1
    metrics.failed_requests += 1
    metrics.last_error_time = datetime.now(UTC).isoformat()
    metrics.consecutive_failures += 1

    # record_success zeroes the streak, so a success-then-failure sequence
    # inside the probe window would otherwise never reach the threshold.
    probe_failed = provider.circuit_state == CircuitState.HALF_OPEN
    threshold_reached = (
        metrics.consecutive_failures >= config.circuit_breaker_failure_threshold
    )
    if probe_failed or threshold_reached:
        open_circuit(provider)
        # Keep the UNHEALTHY status set by open_circuit; do not let the
        # error-rate bands below downgrade it to DEGRADED.
        return

    # Breaker not tripped: reflect the error rate in the health status.
    error_rate = metrics.error_rate
    if error_rate > 0.5:
        provider.status = ProviderStatus.UNHEALTHY
    elif error_rate > 0.3:
        provider.status = ProviderStatus.DEGRADED
|
|
|
|
|
|
def open_circuit(provider: Provider) -> None:
    """Open the circuit breaker for a provider.

    Marks the breaker OPEN, records the opening time from ``time.time()``
    (the value ``can_close_circuit`` measures the recovery timeout
    against), clears the half-open probe counter, flags the provider
    UNHEALTHY and logs a warning.
    """
    provider.circuit_state = CircuitState.OPEN
    provider.circuit_opened_at = time.time()
    # Drop probe successes left over from an interrupted half-open window so
    # they cannot count toward closing the breaker in a later window.
    provider.half_open_calls = 0
    provider.status = ProviderStatus.UNHEALTHY
    logger.warning("Circuit breaker OPEN for %s", provider.name)
|
|
|
|
|
|
def can_close_circuit(provider: Provider, config: RouterConfig) -> bool:
    """Tell whether the recovery timeout has elapsed since the breaker opened.

    Returns ``False`` when ``provider.circuit_opened_at`` is ``None`` (no
    opening time recorded). Otherwise returns ``True`` once at least
    ``config.circuit_breaker_recovery_timeout`` seconds of ``time.time()``
    have passed since that timestamp.
    """
    opened_at = provider.circuit_opened_at
    if opened_at is not None:
        waited = time.time() - opened_at
        return waited >= config.circuit_breaker_recovery_timeout
    return False
|
|
|
|
|
|
def close_circuit(provider: Provider) -> None:
    """Return a provider's circuit breaker to the CLOSED state.

    Clears the failure streak, the half-open probe counter and the recorded
    opening time, marks the breaker CLOSED and the provider HEALTHY, then
    logs the transition at INFO level.
    """
    # Wipe the bookkeeping that led to (or accumulated during) the outage.
    provider.metrics.consecutive_failures = 0
    provider.half_open_calls = 0
    provider.circuit_opened_at = None

    # Flip the breaker and health flags back to normal operation.
    provider.circuit_state = CircuitState.CLOSED
    provider.status = ProviderStatus.HEALTHY

    logger.info("Circuit breaker CLOSED for %s", provider.name)
|