fix(civitai): retry transient server errors and cache version info to reduce 504 timeouts

CivitaiClient._make_request now makes up to 3 attempts (i.e. at most 2 retries) on 5xx/524/network errors, with exponential backoff between attempts (1s, then 2s), before giving up and handing the request over to the fallback provider chain.

get_model_version_info gains an in-memory OrderedDict cache (LRU, max 500 entries) so duplicate lookups of the same version ID within a single import/scan flow return instantly without a redundant API call.

Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
Will Miao
2026-05-26 16:09:08 +08:00
parent 26be187d42
commit 7416080cfb

View File

@@ -2,6 +2,7 @@ import asyncio
import copy import copy
import logging import logging
import os import os
from collections import OrderedDict
from typing import Any, Optional, Dict, Tuple, List, Sequence from typing import Any, Optional, Dict, Tuple, List, Sequence
from .connectivity_guard import ( from .connectivity_guard import (
OFFLINE_FRIENDLY_MESSAGE, OFFLINE_FRIENDLY_MESSAGE,
@@ -45,6 +46,14 @@ class CivitaiClient:
self._initialized = True self._initialized = True
self.base_url = "https://civitai.red/api/v1" self.base_url = "https://civitai.red/api/v1"
# In-memory cache to avoid redundant get_model_version_info calls
# within the same import/scan flow. Only successful results are cached.
# Uses OrderedDict with LRU eviction at MAX_CACHE_ENTRIES to prevent
# unbounded growth in long-running server processes.
self._version_info_cache: OrderedDict[
str, Tuple[Optional[Dict], Optional[str]]
] = OrderedDict()
self._MAX_CACHE_ENTRIES = 500
def _build_image_info_url(self, image_id: str) -> str: def _build_image_info_url(self, image_id: str) -> str:
return f"{self.base_url}/images?imageId={image_id}&nsfw=X" return f"{self.base_url}/images?imageId={image_id}&nsfw=X"
@@ -57,8 +66,11 @@ class CivitaiClient:
use_auth: bool = False, use_auth: bool = False,
**kwargs, **kwargs,
) -> Tuple[bool, Dict | str]: ) -> Tuple[bool, Dict | str]:
"""Wrapper around downloader.make_request that surfaces rate limits.""" """Wrapper around downloader.make_request that surfaces rate limits,
with retry for transient server errors (5xx, Cloudflare 524, network flakiness)."""
max_retries = 3
for attempt in range(max_retries):
downloader = await get_downloader() downloader = await get_downloader()
success, result = await downloader.make_request( success, result = await downloader.make_request(
method, method,
@@ -66,13 +78,45 @@ class CivitaiClient:
use_auth=use_auth, use_auth=use_auth,
**kwargs, **kwargs,
) )
if not success and isinstance(result, RateLimitError): if success:
return True, result
if isinstance(result, RateLimitError):
if result.provider is None: if result.provider is None:
result.provider = "civitai_api" result.provider = "civitai_api"
raise result raise result
if not success and is_offline_cooldown_error(result):
if is_offline_cooldown_error(result):
return False, OFFLINE_FRIENDLY_MESSAGE return False, OFFLINE_FRIENDLY_MESSAGE
return success, result
# Transient server error — retry with exponential backoff
if self._is_transient_server_error(str(result)):
if attempt < max_retries - 1:
wait = 2**attempt # 1s, 2s (the final attempt returns without sleeping)
logger.info(
"Transient error on %s %s, retrying in %ds "
"(attempt %d/%d): %s",
method,
url,
wait,
attempt + 1,
max_retries,
result,
)
await asyncio.sleep(wait)
continue
logger.warning(
"All %d retries exhausted for %s %s: %s",
max_retries,
method,
url,
result,
)
return False, result
return False, result
return False, "Unexpected error in _make_request"
@staticmethod @staticmethod
def _remove_comfy_metadata(model_version: Optional[Dict]) -> None: def _remove_comfy_metadata(model_version: Optional[Dict]) -> None:
@@ -512,6 +556,14 @@ class CivitaiClient:
- The model version data or None if not found - The model version data or None if not found
- An error message if there was an error, or None on success - An error message if there was an error, or None on success
""" """
# In-memory cache avoids redundant API calls within the same
# import/scan flow (e.g. _resolve_base_model_from_checkpoint
# followed by _resolve_and_populate_checkpoint with the same id).
if version_id in self._version_info_cache:
logger.debug("Cache hit for model version info: %s", version_id)
self._version_info_cache.move_to_end(version_id) # LRU bump
return self._version_info_cache[version_id]
try: try:
url = f"{self.base_url}/model-versions/{version_id}" url = f"{self.base_url}/model-versions/{version_id}"
@@ -521,6 +573,11 @@ class CivitaiClient:
if success: if success:
logger.debug("Successfully fetched model version info for: %s", version_id) logger.debug("Successfully fetched model version info for: %s", version_id)
self._remove_comfy_metadata(result) self._remove_comfy_metadata(result)
self._version_info_cache[version_id] = (result, None)
self._version_info_cache.move_to_end(version_id)
# Evict oldest entry when over capacity
if len(self._version_info_cache) > self._MAX_CACHE_ENTRIES:
self._version_info_cache.popitem(last=False)
return result, None return result, None
# Handle specific error cases # Handle specific error cases