fix(rate-limit): continue to next provider on CivArchive 429 to prevent bulk refresh from freezing (#983)

When CivArchive returned HTTP 429 with a large retry_after, the bulk
metadata refresh would block for hours because:

1. FallbackMetadataProvider raised RateLimitError instead of continuing
   to the next provider (e.g., SQLite archive was never reached).

2. _RateLimitRetryHelper attempted long-rate-limit 429s up to 3 times
   (the default retry limit) — all futile, since an hourly cap would
   not have reset between attempts.

3. The batch loop had no awareness of persistent rate-limiting,
   causing 192+ models to each hammer the same rate-limited endpoint.

Changes:
- FallbackMetadataProvider: all 6 methods now continue to next provider
  on RateLimitError instead of raising (model_metadata_provider.py)
- fetch_and_update_model: deleted-model path also continues on
  RateLimitError so sqlite provider gets a chance (metadata_sync_service.py)
- _RateLimitRetryHelper: when retry_after >= 120s, only 1 attempt is
  made (0 retries) — retrying is futile for rate-limit windows of two
  minutes or longer, such as hourly caps
- BulkMetadataRefreshUseCase: tracks consecutive rate-limit failures
  and aborts early after 3 (bulk_metadata_refresh_use_case.py)

Tests: updated test_fallback_respects_retry_limit for the new
continue-to-next-provider behavior; added tests for large/small
retry_after thresholds.
This commit is contained in:
Will Miao
2026-06-16 13:05:37 +08:00
parent 518a4dd5ee
commit 7a76fc72d0
5 changed files with 130 additions and 32 deletions

View File

@@ -65,7 +65,14 @@ class _RateLimitRetryHelper:
return await func(*args, **kwargs)
except RateLimitError as exc:
attempt += 1
if attempt >= self._retry_limit:
# Determine effective retry limit based on rate-limit magnitude
effective_retry_limit = self._retry_limit # default: 3
if exc.retry_after is not None and exc.retry_after >= 120.0:
# Long rate-limit window (>=2 min) — retries are futile
effective_retry_limit = 1 # total 1 attempt = 0 retries
if attempt >= effective_retry_limit:
exc.provider = exc.provider or label
raise
@@ -478,8 +485,12 @@ class FallbackMetadataProvider(ModelMetadataProvider):
if result:
return result, error
except RateLimitError as exc:
exc.provider = exc.provider or label
raise exc
logger.warning(
"Provider %s is rate-limited (retry_after=%.0fs); skipping to next provider",
label,
exc.retry_after or 0,
)
continue
except Exception as e:
logger.debug("Provider %s failed for get_model_by_hash: %s", label, e)
continue
@@ -497,16 +508,12 @@ class FallbackMetadataProvider(ModelMetadataProvider):
if result:
return result
except RateLimitError as exc:
if not_found_confirmed:
logger.debug(
"Suppressing rate limit from %s for model %s: "
"already confirmed as not found by another provider",
label,
model_id,
)
return None
exc.provider = exc.provider or label
raise exc
logger.warning(
"Provider %s is rate-limited (retry_after=%.0fs); skipping to next provider",
label,
exc.retry_after or 0,
)
continue
except ResourceNotFoundError:
not_found_confirmed = True
logger.debug(
@@ -532,8 +539,12 @@ class FallbackMetadataProvider(ModelMetadataProvider):
if result:
return result
except RateLimitError as exc:
exc.provider = exc.provider or label
raise exc
logger.warning(
"Provider %s is rate-limited (retry_after=%.0fs); skipping to next provider",
label,
exc.retry_after or 0,
)
continue
except Exception as e:
logger.debug("Provider %s failed for get_model_version: %s", label, e)
continue
@@ -550,8 +561,12 @@ class FallbackMetadataProvider(ModelMetadataProvider):
if result:
return result, error
except RateLimitError as exc:
exc.provider = exc.provider or label
raise exc
logger.warning(
"Provider %s is rate-limited (retry_after=%.0fs); skipping to next provider",
label,
exc.retry_after or 0,
)
continue
except Exception as e:
logger.debug("Provider %s failed for get_model_version_info: %s", label, e)
continue
@@ -572,8 +587,12 @@ class FallbackMetadataProvider(ModelMetadataProvider):
except NotImplementedError:
continue
except RateLimitError as exc:
exc.provider = exc.provider or label
raise exc
logger.warning(
"Provider %s is rate-limited (retry_after=%.0fs); skipping to next provider",
label,
exc.retry_after or 0,
)
continue
except Exception as e:
logger.debug(
"Provider %s failed for get_model_versions_by_hashes: %s",
@@ -594,8 +613,12 @@ class FallbackMetadataProvider(ModelMetadataProvider):
if result is not None:
return result
except RateLimitError as exc:
exc.provider = exc.provider or label
raise exc
logger.warning(
"Provider %s is rate-limited (retry_after=%.0fs); skipping to next provider",
label,
exc.retry_after or 0,
)
continue
except Exception as e:
logger.debug("Provider %s failed for get_user_models: %s", label, e)
continue