fix(civitai): retry transient server errors and cache version info to reduce 504 timeouts

CivitaiClient._make_request now makes up to 3 attempts on 5xx/524/network errors, with exponential backoff between attempts (1s, then 2s), before giving up to the fallback provider chain. get_model_version_info gains an in-memory OrderedDict cache (LRU, max 500 entries; only successful results are cached) so duplicate lookups of the same version ID within a single import/scan flow return instantly without a redundant API call. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
2026-06-09 20:39:25 -03:00 · 2026-05-26 16:09:08 +08:00
parent 26be187d42
commit 7416080cfb
1 changed files with 72 additions and 15 deletions
--- a/py/services/civitai_client.py
+++ b/py/services/civitai_client.py
@@ -2,6 +2,7 @@ import asyncio
 import copy
 import logging
 import os
 from collections import OrderedDict
 from typing import Any, Optional, Dict, Tuple, List, Sequence
 from .connectivity_guard import (
    OFFLINE_FRIENDLY_MESSAGE,
@@ -45,6 +46,14 @@ class CivitaiClient:
        self._initialized = True
        self.base_url = "https://civitai.red/api/v1"
        # In-memory cache to avoid redundant get_model_version_info calls
        # within the same import/scan flow. Only successful results are cached.
        # Uses OrderedDict with LRU eviction once the cache exceeds
        # self._MAX_CACHE_ENTRIES, to prevent unbounded growth in
        # long-running server processes.
        self._version_info_cache: OrderedDict[
            str, Tuple[Optional[Dict], Optional[str]]
        ] = OrderedDict()
        self._MAX_CACHE_ENTRIES = 500
    def _build_image_info_url(self, image_id: str) -> str:
        """Return the images endpoint URL that looks up ``image_id``.

        The URL is rooted at ``self.base_url`` and always carries the
        ``nsfw=X`` query parameter.
        """
        query_string = f"imageId={image_id}&nsfw=X"
        return f"{self.base_url}/images?{query_string}"
@@ -57,8 +66,11 @@ class CivitaiClient:
        use_auth: bool = False,
        **kwargs,
    ) -> Tuple[bool, Dict | str]:
-        """Wrapper around downloader.make_request that surfaces rate limits."""
+        """Wrapper around downloader.make_request that surfaces rate limits,
        with retry for transient server errors (5xx, Cloudflare 524, network flakiness)."""
        max_retries = 3
        for attempt in range(max_retries):
            downloader = await get_downloader()
            success, result = await downloader.make_request(
                method,
@@ -66,13 +78,45 @@ class CivitaiClient:
                use_auth=use_auth,
                **kwargs,
            )
-        if not success and isinstance(result, RateLimitError):
+            if success:
                return True, result
            if isinstance(result, RateLimitError):
                if result.provider is None:
                    result.provider = "civitai_api"
                raise result
-        if not success and is_offline_cooldown_error(result):
+
            if is_offline_cooldown_error(result):
                return False, OFFLINE_FRIENDLY_MESSAGE
-        return success, result
+
            # Transient server error — retry with exponential backoff
            if self._is_transient_server_error(str(result)):
                if attempt < max_retries - 1:
                    wait = 2**attempt  # 1s, then 2s (no sleep after the final attempt)
                    logger.info(
                        "Transient error on %s %s, retrying in %ds "
                        "(attempt %d/%d): %s",
                        method,
                        url,
                        wait,
                        attempt + 1,
                        max_retries,
                        result,
                    )
                    await asyncio.sleep(wait)
                    continue
                logger.warning(
                    "All %d retries exhausted for %s %s: %s",
                    max_retries,
                    method,
                    url,
                    result,
                )
                return False, result
            return False, result
        return False, "Unexpected error in _make_request"
    @staticmethod
    def _remove_comfy_metadata(model_version: Optional[Dict]) -> None:
@@ -512,6 +556,14 @@ class CivitaiClient:
                - The model version data or None if not found
                - An error message if there was an error, or None on success
        """
        # In-memory cache avoids redundant API calls within the same
        # import/scan flow (e.g. _resolve_base_model_from_checkpoint
        # followed by _resolve_and_populate_checkpoint with the same id).
        if version_id in self._version_info_cache:
            logger.debug("Cache hit for model version info: %s", version_id)
            self._version_info_cache.move_to_end(version_id)  # LRU bump
            return self._version_info_cache[version_id]
        try:
            url = f"{self.base_url}/model-versions/{version_id}"
@@ -521,6 +573,11 @@ class CivitaiClient:
            if success:
                logger.debug("Successfully fetched model version info for: %s", version_id)
                self._remove_comfy_metadata(result)
                self._version_info_cache[version_id] = (result, None)
                self._version_info_cache.move_to_end(version_id)
                # Evict oldest entry when over capacity
                if len(self._version_info_cache) > self._MAX_CACHE_ENTRIES:
                    self._version_info_cache.popitem(last=False)
                return result, None
            # Handle specific error cases