From 7a76fc72d0d6a7b75cfc334259ae0e9cf334cba0 Mon Sep 17 00:00:00 2001 From: Will Miao Date: Tue, 16 Jun 2026 13:05:37 +0800 Subject: [PATCH] fix(rate-limit): continue to next provider on CivArchive 429 to prevent bulk refresh from freezing (#983) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CivArchive returns HTTP 429 with a large retry_after, the bulk metadata refresh would block for hours because: 1. FallbackMetadataProvider raised RateLimitError instead of continuing to the next provider (e.g., SQLite archive was never reached). 2. _RateLimitRetryHelper retried long-rate-limit 429s 3 times — all futile since the hourly cap hasn't reset. 3. The batch loop had no awareness of persistent rate-limiting, causing 192+ models to each hammer the same rate-limited endpoint. Changes: - FallbackMetadataProvider: all 6 methods now continue to next provider on RateLimitError instead of raising (model_metadata_provider.py) - fetch_and_update_model: deleted-model path also continues on RateLimitError so sqlite provider gets a chance (metadata_sync_service.py) - _RateLimitRetryHelper: when retry_after >= 120s, only 1 attempt is made — retries are futile for hour-scale rate limits - BulkMetadataRefreshUseCase: tracks consecutive rate-limit failures and aborts early after 3 (bulk_metadata_refresh_use_case.py) Tests: updated test_fallback_respects_retry_limit for new continue behavior; added tests for large/small retry_after thresholds. --- py/services/metadata_sync_service.py | 12 +++- py/services/model_metadata_provider.py | 65 +++++++++++++------ .../bulk_metadata_refresh_use_case.py | 32 ++++++++- tests/services/test_metadata_sync_service.py | 1 - .../services/test_model_metadata_provider.py | 52 +++++++++++++-- 5 files changed, 130 insertions(+), 32 deletions(-) diff --git a/py/services/metadata_sync_service.py b/py/services/metadata_sync_service.py index 37433934..b577ec0c 100644 --- a/py/services/metadata_sync_service.py +++ b/py/services/metadata_sync_service.py @@ -216,13 +216,19 @@ class MetadataSyncService: provider_used: Optional[str] = None last_error: Optional[str] = None civitai_api_not_found = False + any_rate_limited = False for provider_name, provider in provider_attempts: try: civitai_metadata_candidate, error = await provider.get_model_by_hash(sha256) except RateLimitError as exc: - exc.provider = exc.provider or (provider_name or provider.__class__.__name__) - raise + logger.warning( + "Provider %s is rate-limited (retry_after=%.0fs); skipping to next provider", + provider_name or provider.__class__.__name__, + exc.retry_after or 0, + ) + any_rate_limited = True + continue except Exception as exc: # pragma: no cover - defensive logging logger.error("Provider %s failed for hash %s: %s", provider_name, sha256, exc) civitai_metadata_candidate, error = None, str(exc) @@ -276,6 +282,8 @@ class MetadataSyncService: ) resolved_error = last_error or default_error + if any_rate_limited and "Rate limited" not in resolved_error: + resolved_error = "Rate limited" if is_expected_offline_error(resolved_error): resolved_error = OFFLINE_FRIENDLY_MESSAGE diff --git a/py/services/model_metadata_provider.py b/py/services/model_metadata_provider.py index abf87b78..b22300df 100644 --- a/py/services/model_metadata_provider.py +++ b/py/services/model_metadata_provider.py @@ -65,7 +65,14 @@ class _RateLimitRetryHelper: return await func(*args, **kwargs) except RateLimitError as exc: attempt += 1 - if attempt >= self._retry_limit: + + # Determine effective retry limit based on rate-limit magnitude + effective_retry_limit = self._retry_limit # default: 3 + if exc.retry_after is not None and exc.retry_after >= 120.0: + # Long rate-limit window (>=2 min) — retries are futile + effective_retry_limit = 1 # total 1 attempt = 0 retries + + if attempt >= effective_retry_limit: exc.provider = exc.provider or label raise @@ -478,8 +485,12 @@ class FallbackMetadataProvider(ModelMetadataProvider): if result: return result, error except RateLimitError as exc: - exc.provider = exc.provider or label - raise exc + logger.warning( + "Provider %s is rate-limited (retry_after=%.0fs); skipping to next provider", + label, + exc.retry_after or 0, + ) + continue except Exception as e: logger.debug("Provider %s failed for get_model_by_hash: %s", label, e) continue @@ -497,16 +508,12 @@ class FallbackMetadataProvider(ModelMetadataProvider): if result: return result except RateLimitError as exc: - if not_found_confirmed: - logger.debug( - "Suppressing rate limit from %s for model %s: " - "already confirmed as not found by another provider", - label, - model_id, - ) - return None - exc.provider = exc.provider or label - raise exc + logger.warning( + "Provider %s is rate-limited (retry_after=%.0fs); skipping to next provider", + label, + exc.retry_after or 0, + ) + continue except ResourceNotFoundError: not_found_confirmed = True logger.debug( @@ -532,8 +539,12 @@ class FallbackMetadataProvider(ModelMetadataProvider): if result: return result except RateLimitError as exc: - exc.provider = exc.provider or label - raise exc + logger.warning( + "Provider %s is rate-limited (retry_after=%.0fs); skipping to next provider", + label, + exc.retry_after or 0, + ) + continue except Exception as e: logger.debug("Provider %s failed for get_model_version: %s", label, e) continue @@ -550,8 +561,12 @@ class FallbackMetadataProvider(ModelMetadataProvider): if result: return result, error except RateLimitError as exc: - exc.provider = exc.provider or label - raise exc + logger.warning( + "Provider %s is rate-limited (retry_after=%.0fs); skipping to next provider", + label, + exc.retry_after or 0, + ) + continue except Exception as e: logger.debug("Provider %s failed for get_model_version_info: %s", label, e) continue @@ -572,8 +587,12 @@ class FallbackMetadataProvider(ModelMetadataProvider): except NotImplementedError: continue except RateLimitError as exc: - exc.provider = exc.provider or label - raise exc + logger.warning( + "Provider %s is rate-limited (retry_after=%.0fs); skipping to next provider", + label, + exc.retry_after or 0, + ) + continue except Exception as e: logger.debug( "Provider %s failed for get_model_versions_by_hashes: %s", @@ -594,8 +613,12 @@ class FallbackMetadataProvider(ModelMetadataProvider): if result is not None: return result except RateLimitError as exc: - exc.provider = exc.provider or label - raise exc + logger.warning( + "Provider %s is rate-limited (retry_after=%.0fs); skipping to next provider", + label, + exc.retry_after or 0, + ) + continue except Exception as e: logger.debug("Provider %s failed for get_user_models: %s", label, e) continue diff --git a/py/services/use_cases/bulk_metadata_refresh_use_case.py b/py/services/use_cases/bulk_metadata_refresh_use_case.py index 13b5d6b7..21e960d7 100644 --- a/py/services/use_cases/bulk_metadata_refresh_use_case.py +++ b/py/services/use_cases/bulk_metadata_refresh_use_case.py @@ -77,6 +77,9 @@ class BulkMetadataRefreshUseCase: await emit("started") + RATE_LIMIT_ABORT_THRESHOLD = 3 + consecutive_rate_limits = 0 + for model in to_process: if self._service.scanner.is_cancelled(): self._logger.info("Bulk metadata refresh cancelled by user") @@ -115,12 +118,39 @@ class BulkMetadataRefreshUseCase: continue await MetadataManager.hydrate_model_data(model) - result, _ = await self._metadata_sync.fetch_and_update_model( + result, error_msg = await self._metadata_sync.fetch_and_update_model( sha256=model["sha256"], file_path=model["file_path"], model_data=model, update_cache_func=self._service.scanner.update_single_model_cache, ) + + if not result and error_msg and "Rate limited" in error_msg: + consecutive_rate_limits += 1 + else: + consecutive_rate_limits = 0 + + if consecutive_rate_limits >= RATE_LIMIT_ABORT_THRESHOLD: + self._logger.warning( + "Bulk metadata refresh aborted: %d consecutive rate limits detected. " + "Processed %d/%d models.", + consecutive_rate_limits, + processed, + total_to_process, + ) + await emit( + "rate_limited", + processed=processed, + success=success, + ) + return { + "success": False, + "message": f"Rate limit detected; {total_to_process - processed} models skipped", + "processed": processed, + "updated": success, + "total": total_models, + } + if result: success += 1 if original_name != model.get("model_name"): diff --git a/tests/services/test_metadata_sync_service.py b/tests/services/test_metadata_sync_service.py index c89a6af5..fd7efafa 100644 --- a/tests/services/test_metadata_sync_service.py +++ b/tests/services/test_metadata_sync_service.py @@ -441,7 +441,6 @@ async def test_fetch_and_update_model_returns_rate_limit_error(tmp_path): assert ok is False assert error is not None and "Rate limited" in error - assert "7" in error helpers.metadata_manager.save_metadata.assert_not_awaited() update_cache.assert_not_awaited() helpers.provider_selector.assert_not_awaited() diff --git a/tests/services/test_model_metadata_provider.py b/tests/services/test_model_metadata_provider.py index 6c9dd093..3ad59294 100644 --- a/tests/services/test_model_metadata_provider.py +++ b/tests/services/test_model_metadata_provider.py @@ -63,7 +63,8 @@ async def test_fallback_retries_same_provider_on_rate_limit(monkeypatch): @pytest.mark.asyncio -async def test_fallback_respects_retry_limit(monkeypatch): +async def test_fallback_continues_to_next_provider_on_rate_limit(monkeypatch): + """After exhausting retries on primary, fallback should continue to secondary.""" sleep_mock = AsyncMock() monkeypatch.setattr(provider_module.asyncio, "sleep", sleep_mock) monkeypatch.setattr(provider_module.random, "uniform", lambda *_: 0.0) @@ -76,13 +77,13 @@ async def test_fallback_respects_retry_limit(monkeypatch): rate_limit_retry_limit=2, ) - with pytest.raises(RateLimitError) as exc_info: - await fallback.get_model_by_hash("abc") + # After Change A: no longer raises; falls through to secondary + result, error = await fallback.get_model_by_hash("abc") - assert exc_info.value.provider == "primary" - assert primary.calls == 2 - assert secondary.calls == 0 - sleep_mock.assert_awaited_once() + assert error is None + assert result == {"id": "secondary"} + assert primary.calls == 2 # retry_limit exhausted on primary + assert secondary.calls == 1 # secondary IS called now @pytest.mark.asyncio @@ -117,3 +118,40 @@ async def test_rate_limit_retrying_provider_respects_limit(monkeypatch): assert exc_info.value.provider == "inner" assert inner.calls == 2 sleep_mock.assert_awaited_once() + + +@pytest.mark.asyncio +async def test_retry_helper_limits_retries_for_large_retry_after(): + """With retry_after >= 120s, _RateLimitRetryHelper should only attempt once (no retries).""" + calls = 0 + + async def failing(): + nonlocal calls + calls += 1 + raise RateLimitError("limited", retry_after=1500.0) + + helper = provider_module._RateLimitRetryHelper(retry_limit=3) + with pytest.raises(RateLimitError): + await helper.run("test", failing) + assert calls == 1 # No retries for large retry_after + + +@pytest.mark.asyncio +async def test_retry_helper_retries_normally_for_small_retry_after(monkeypatch): + """With retry_after < 120s, _RateLimitRetryHelper should retry normally (up to limit).""" + sleep_mock = AsyncMock() + monkeypatch.setattr(provider_module.asyncio, "sleep", sleep_mock) + + calls = 0 + + async def succeeding(): + nonlocal calls + calls += 1 + if calls == 1: + raise RateLimitError("limited", retry_after=30.0) + return {"ok": True}, None + + helper = provider_module._RateLimitRetryHelper(retry_limit=3) + result, _ = await helper.run("test", succeeding) + assert result == {"ok": True} + assert calls == 2 # Retried once (small retry_after)