fix(network): scope connectivity cooldown by destination

This commit is contained in:
pixelpaws
2026-04-20 15:20:57 +08:00
parent 5a7f4dc88b
commit 7ab271c752
3 changed files with 154 additions and 42 deletions

View File

@@ -6,6 +6,7 @@ import asyncio
import errno
import logging
import socket
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import Any
@@ -49,68 +50,118 @@ class ConnectivityGuard:
if hasattr(self, "_initialized"):
return
self._initialized = True
self.online = True
self.failure_count = 0
self.cooldown_until: datetime | None = None
self._default_destination = "__global__"
self._destination_states: dict[str, _DestinationState] = {
self._default_destination: _DestinationState()
}
self.base_backoff_seconds = 30
self.max_backoff_seconds = 300
self.failure_threshold = 3
@property
def online(self) -> bool:
    """Return the online flag stored on the default destination's state.

    The state object is obtained via ``_state_for_destination(None)``.
    """
    default_state = self._state_for_destination(None)
    return default_state.online


@online.setter
def online(self, value: bool) -> None:
    """Store ``value`` as the online flag of the default destination's state."""
    default_state = self._state_for_destination(None)
    default_state.online = value
@property
def failure_count(self) -> int:
    """Return the failure counter stored on the default destination's state.

    The state object is obtained via ``_state_for_destination(None)``.
    """
    default_state = self._state_for_destination(None)
    return default_state.failure_count


@failure_count.setter
def failure_count(self, value: int) -> None:
    """Overwrite the failure counter of the default destination's state."""
    default_state = self._state_for_destination(None)
    default_state.failure_count = value
@property
def cooldown_until(self) -> datetime | None:
return self._state_for_destination(None).cooldown_until
@cooldown_until.setter
def cooldown_until(self, value: datetime | None) -> None:
self._state_for_destination(None).cooldown_until = value
def _now(self) -> datetime:
return datetime.now()
def in_cooldown(self) -> bool:
if self.cooldown_until is None:
def _normalize_destination(self, destination: str | None) -> str:
if destination is None or not destination.strip():
return self._default_destination
return destination.lower().strip()
def _state_for_destination(self, destination: str | None) -> "_DestinationState":
destination_key = self._normalize_destination(destination)
if destination_key not in self._destination_states:
self._destination_states[destination_key] = _DestinationState()
return self._destination_states[destination_key]
def in_cooldown(self, destination: str | None = None) -> bool:
state = self._state_for_destination(destination)
if state.cooldown_until is None:
return False
return self._now() < self.cooldown_until
return self._now() < state.cooldown_until
def cooldown_remaining_seconds(self) -> float:
if self.cooldown_until is None:
def cooldown_remaining_seconds(self, destination: str | None = None) -> float:
state = self._state_for_destination(destination)
if state.cooldown_until is None:
return 0.0
return max(0.0, (self.cooldown_until - self._now()).total_seconds())
return max(0.0, (state.cooldown_until - self._now()).total_seconds())
def should_block_request(self) -> bool:
return self.in_cooldown()
def should_block_request(self, destination: str | None = None) -> bool:
return self.in_cooldown(destination)
def register_success(self) -> None:
was_offline = (not self.online) or self.cooldown_until is not None
self.online = True
self.failure_count = 0
self.cooldown_until = None
def register_success(self, destination: str | None = None) -> None:
destination_key = self._normalize_destination(destination)
state = self._state_for_destination(destination_key)
was_offline = (not state.online) or state.cooldown_until is not None
state.online = True
state.failure_count = 0
state.cooldown_until = None
if was_offline:
logger.info("Connectivity restored; requests resumed.")
logger.info(
"Connectivity restored for destination '%s'; requests resumed.",
destination_key,
)
def register_network_failure(self, exc: Exception) -> None:
self.online = False
self.failure_count += 1
def register_network_failure(
self, exc: Exception, destination: str | None = None
) -> None:
destination_key = self._normalize_destination(destination)
state = self._state_for_destination(destination_key)
state.online = False
state.failure_count += 1
if self.failure_count < self.failure_threshold:
if state.failure_count < self.failure_threshold:
logger.debug(
"Network failure tracked (%d/%d): %s",
self.failure_count,
"Network failure tracked for destination '%s' (%d/%d): %s",
destination_key,
state.failure_count,
self.failure_threshold,
exc,
)
return
retry_step = self.failure_count - self.failure_threshold
retry_step = state.failure_count - self.failure_threshold
backoff = min(
self.max_backoff_seconds,
self.base_backoff_seconds * (2**retry_step),
)
should_log_warning = not self.in_cooldown()
self.cooldown_until = self._now() + timedelta(seconds=backoff)
should_log_warning = not self.in_cooldown(destination_key)
state.cooldown_until = self._now() + timedelta(seconds=backoff)
if should_log_warning:
logger.warning(
"Connectivity offline; enter cooldown for %ss after %d network failures.",
"Connectivity offline for destination '%s'; enter cooldown for %ss after %d network failures.",
destination_key,
int(backoff),
self.failure_count,
state.failure_count,
)
else:
logger.debug(
"Cooldown still active; failure_count=%d, backoff=%ss.",
self.failure_count,
"Cooldown still active for destination '%s'; failure_count=%d, backoff=%ss.",
destination_key,
state.failure_count,
int(backoff),
)
@@ -145,3 +196,9 @@ class ConnectivityGuard:
return False
@dataclass
class _DestinationState:
    """Connectivity bookkeeping kept separately for each destination key."""

    # Set to False when a network failure is registered; back to True on success.
    online: bool = True
    # Number of failures registered since the last success.
    failure_count: int = 0
    # Deadline compared against the current time to decide whether the
    # destination is in cooldown; None when no cooldown has been set.
    cooldown_until: datetime | None = None