From 15dfaed4627fc82fd60ae89e5974e852c2feadd9 Mon Sep 17 00:00:00 2001 From: Will Miao Date: Fri, 22 May 2026 07:05:06 +0800 Subject: [PATCH] fix(api): treat transient server errors (524/5xx) as non-fatal in model updates (#935) Teach CivitaiClient.get_model_versions() to recognise Cloudflare 524, generic 5xx, and connection-level errors as transient failures and return None instead of raising RuntimeError, so a single upstream glitch does not block the entire batch update or produce a scary traceback. Also downgrade the generic except Exception log level in ModelUpdateService._refresh_single_model() from error (with exc_info) to warning (message only), since the full traceback is already logged upstream in CivitaiClient. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus --- py/services/civitai_client.py | 30 +++++++++++++++++++++++++++++ py/services/model_update_service.py | 3 +-- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/py/services/civitai_client.py b/py/services/civitai_client.py index 65178cd9..7452f69d 100644 --- a/py/services/civitai_client.py +++ b/py/services/civitai_client.py @@ -201,6 +201,29 @@ class CivitaiClient: return _from_value(payload) + @staticmethod + def _is_transient_server_error(message: str) -> bool: + """Return True when the message indicates a transient upstream failure. + + Recognises Cloudflare 524, generic 5xx, and connectivity-level flakiness + that should not be treated as a permanent failure. + """ + normalized = message.lower() + if "status 5" in normalized or "status 524" in normalized: + return True + if any( + keyword in normalized + for keyword in ( + "connection refused", + "connection reset", + "temporary failure", + "name resolution", + "connection closed", + ) + ): + return True + return False + async def get_model_versions(self, model_id: str) -> Optional[Dict]: """Get all versions of a model with local availability info""" try: @@ -223,6 +246,13 @@ class CivitaiClient: logger.info("Civitai request skipped: %s", OFFLINE_FRIENDLY_MESSAGE) return None if message: + if self._is_transient_server_error(message): + logger.info( + "Transient server error for model %s: %s", + model_id, + message, + ) + return None raise RuntimeError(message) return None except RateLimitError: diff --git a/py/services/model_update_service.py b/py/services/model_update_service.py index 0bfcfd23..0965e229 100644 --- a/py/services/model_update_service.py +++ b/py/services/model_update_service.py @@ -1000,12 +1000,11 @@ class ModelUpdateService: fallback_error_message = str(exc) or "resource not found" mark_model_as_ignored = True except Exception as exc: # pragma: no cover - defensive log - logger.error( + logger.warning( "Failed to fetch versions for model %s (%s): %s", model_id, model_type, exc, - exc_info=True, ) fallback_error_message = str(exc) if response is not None: