Files
Timmy-time-dashboard/src/infrastructure/router/health.py
Alexander Whitestone ab4b2f938d WIP: Claude Code progress on #1342
Automated salvage commit — agent session ended (exit 124).
Work in progress, may need continuation.
2026-03-23 23:03:51 -04:00

80 lines
2.9 KiB
Python

"""Circuit-breaker and health tracking for the Cascade LLM Router.
Standalone functions that mutate Provider state in place.
"""
from __future__ import annotations
import logging
import time
from datetime import UTC, datetime
from infrastructure.router.models import CircuitState, Provider, ProviderStatus, RouterConfig
logger = logging.getLogger(__name__)
def record_success(provider: Provider, latency_ms: float, config: RouterConfig) -> None:
    """Register a request that completed successfully against ``provider``.

    Increments the total and successful request counters, adds ``latency_ms``
    to the accumulated latency, stamps the last-request time with the current
    UTC time in ISO format, and resets the consecutive-failure streak.

    While the circuit is half-open, each success is counted; once the count
    reaches ``config.circuit_breaker_half_open_max_calls`` the circuit is
    closed.  Finally the provider status is refreshed from the error rate.

    Args:
        provider: Provider whose state is mutated in place.
        latency_ms: Latency of the finished request, added to the running total.
        config: Supplies ``circuit_breaker_half_open_max_calls``.
    """
    stats = provider.metrics
    stats.total_requests += 1
    stats.successful_requests += 1
    stats.total_latency_ms += latency_ms
    stats.last_request_time = datetime.now(UTC).isoformat()
    stats.consecutive_failures = 0

    # A success while half-open counts toward closing the circuit.
    if provider.circuit_state == CircuitState.HALF_OPEN:
        provider.half_open_calls += 1
        enough_successes = (
            provider.half_open_calls >= config.circuit_breaker_half_open_max_calls
        )
        if enough_successes:
            close_circuit(provider)

    # Evaluated after any close above, so the error rate has the final say.
    # At an error rate of 0.3 or more the status is deliberately left as is.
    rate = stats.error_rate
    if rate < 0.1:
        provider.status = ProviderStatus.HEALTHY
    elif rate < 0.3:
        provider.status = ProviderStatus.DEGRADED
def record_failure(provider: Provider, config: RouterConfig) -> None:
    """Record a failed request and update circuit/health state.

    Increments the total and failed request counters, stamps the last-error
    time with the current UTC time in ISO format, and extends the
    consecutive-failure streak.

    If the streak reaches ``config.circuit_breaker_failure_threshold`` the
    circuit is opened, which marks the provider UNHEALTHY, and nothing else
    is changed.  Otherwise the status is derived from the error rate:
    above 0.5 is UNHEALTHY, above 0.3 is DEGRADED, and anything lower leaves
    the current status untouched.

    Args:
        provider: Provider whose state is mutated in place.
        config: Supplies ``circuit_breaker_failure_threshold``.
    """
    metrics = provider.metrics
    metrics.total_requests += 1
    metrics.failed_requests += 1
    metrics.last_error_time = datetime.now(UTC).isoformat()
    metrics.consecutive_failures += 1

    if metrics.consecutive_failures >= config.circuit_breaker_failure_threshold:
        open_circuit(provider)
        # open_circuit already set UNHEALTHY.  Stop here so the error-rate
        # bands below cannot downgrade an open circuit to DEGRADED (which
        # previously happened whenever 0.3 < error_rate <= 0.5).
        return

    # Mutually exclusive bands, most severe first.
    error_rate = metrics.error_rate
    if error_rate > 0.5:
        provider.status = ProviderStatus.UNHEALTHY
    elif error_rate > 0.3:
        provider.status = ProviderStatus.DEGRADED
def open_circuit(provider: Provider) -> None:
    """Trip the circuit breaker for ``provider``.

    Sets the circuit state to OPEN, records the current ``time.time()`` value
    as the moment it opened, marks the provider UNHEALTHY, and logs a warning.

    Args:
        provider: Provider whose state is mutated in place.
    """
    opened_at = time.time()
    provider.circuit_state = CircuitState.OPEN
    provider.circuit_opened_at = opened_at
    provider.status = ProviderStatus.UNHEALTHY
    logger.warning("Circuit breaker OPEN for %s", provider.name)
def can_close_circuit(provider: Provider, config: RouterConfig) -> bool:
    """Tell whether the recovery timeout has elapsed since the circuit opened.

    Args:
        provider: Provider whose ``circuit_opened_at`` timestamp is inspected.
        config: Supplies ``circuit_breaker_recovery_timeout``, compared against
            the difference between ``time.time()`` and the open timestamp.

    Returns:
        ``False`` when no open timestamp is recorded; otherwise ``True`` once
        at least the recovery timeout has passed since the circuit opened.
    """
    opened_at = provider.circuit_opened_at
    # Short-circuits on a missing timestamp, so config is not consulted then.
    return (
        opened_at is not None
        and time.time() - opened_at >= config.circuit_breaker_recovery_timeout
    )
def close_circuit(provider: Provider) -> None:
    """Reset ``provider`` to a closed circuit and a healthy status.

    Clears the open timestamp, the half-open call counter and the
    consecutive-failure streak, then sets the circuit state to CLOSED and the
    status to HEALTHY, and logs the transition.

    Args:
        provider: Provider whose state is mutated in place.
    """
    # Clear the bookkeeping left over from the open / half-open phases.
    provider.circuit_opened_at = None
    provider.half_open_calls = 0
    provider.metrics.consecutive_failures = 0

    provider.circuit_state = CircuitState.CLOSED
    provider.status = ProviderStatus.HEALTHY
    logger.info("Circuit breaker CLOSED for %s", provider.name)