fix(rate-limit): continue to next provider on CivArchive 429 to prevent bulk refresh from freezing (#983)

When CivArchive returns HTTP 429 with a large retry_after, the bulk
metadata refresh would block for hours because:

1. FallbackMetadataProvider raised RateLimitError instead of continuing
   to the next provider (e.g., SQLite archive was never reached).

2. _RateLimitRetryHelper retried long-rate-limit 429s 3 times — all
   futile since the hourly cap hasn't reset.

3. The batch loop had no awareness of persistent rate-limiting,
   causing 192+ models to each hammer the same rate-limited endpoint.

Changes:
- FallbackMetadataProvider: all 6 methods now continue to next provider
  on RateLimitError instead of raising (model_metadata_provider.py)
- fetch_and_update_model: deleted-model path also continues on
  RateLimitError so sqlite provider gets a chance (metadata_sync_service.py)
- _RateLimitRetryHelper: when retry_after >= 120s, only 1 attempt is
  made — retries are futile for hour-scale rate limits
- BulkMetadataRefreshUseCase: tracks consecutive rate-limit failures
  and aborts early after 3 (bulk_metadata_refresh_use_case.py)

Tests: updated test_fallback_respects_retry_limit for new continue
behavior; added tests for large/small retry_after thresholds.
This commit is contained in:
Will Miao
2026-06-16 13:05:37 +08:00
parent 518a4dd5ee
commit 7a76fc72d0
5 changed files with 130 additions and 32 deletions

View File

@@ -216,13 +216,19 @@ class MetadataSyncService:
provider_used: Optional[str] = None provider_used: Optional[str] = None
last_error: Optional[str] = None last_error: Optional[str] = None
civitai_api_not_found = False civitai_api_not_found = False
any_rate_limited = False
for provider_name, provider in provider_attempts: for provider_name, provider in provider_attempts:
try: try:
civitai_metadata_candidate, error = await provider.get_model_by_hash(sha256) civitai_metadata_candidate, error = await provider.get_model_by_hash(sha256)
except RateLimitError as exc: except RateLimitError as exc:
exc.provider = exc.provider or (provider_name or provider.__class__.__name__) logger.warning(
raise "Provider %s is rate-limited (retry_after=%.0fs); skipping to next provider",
provider_name or provider.__class__.__name__,
exc.retry_after or 0,
)
any_rate_limited = True
continue
except Exception as exc: # pragma: no cover - defensive logging except Exception as exc: # pragma: no cover - defensive logging
logger.error("Provider %s failed for hash %s: %s", provider_name, sha256, exc) logger.error("Provider %s failed for hash %s: %s", provider_name, sha256, exc)
civitai_metadata_candidate, error = None, str(exc) civitai_metadata_candidate, error = None, str(exc)
@@ -276,6 +282,8 @@ class MetadataSyncService:
) )
resolved_error = last_error or default_error resolved_error = last_error or default_error
if any_rate_limited and "Rate limited" not in resolved_error:
resolved_error = "Rate limited"
if is_expected_offline_error(resolved_error): if is_expected_offline_error(resolved_error):
resolved_error = OFFLINE_FRIENDLY_MESSAGE resolved_error = OFFLINE_FRIENDLY_MESSAGE

View File

@@ -65,7 +65,14 @@ class _RateLimitRetryHelper:
return await func(*args, **kwargs) return await func(*args, **kwargs)
except RateLimitError as exc: except RateLimitError as exc:
attempt += 1 attempt += 1
if attempt >= self._retry_limit:
# Determine effective retry limit based on rate-limit magnitude
effective_retry_limit = self._retry_limit # default: 3
if exc.retry_after is not None and exc.retry_after >= 120.0:
# Long rate-limit window (>=2 min) — retries are futile
effective_retry_limit = 1 # total 1 attempt = 0 retries
if attempt >= effective_retry_limit:
exc.provider = exc.provider or label exc.provider = exc.provider or label
raise raise
@@ -478,8 +485,12 @@ class FallbackMetadataProvider(ModelMetadataProvider):
if result: if result:
return result, error return result, error
except RateLimitError as exc: except RateLimitError as exc:
exc.provider = exc.provider or label logger.warning(
raise exc "Provider %s is rate-limited (retry_after=%.0fs); skipping to next provider",
label,
exc.retry_after or 0,
)
continue
except Exception as e: except Exception as e:
logger.debug("Provider %s failed for get_model_by_hash: %s", label, e) logger.debug("Provider %s failed for get_model_by_hash: %s", label, e)
continue continue
@@ -497,16 +508,12 @@ class FallbackMetadataProvider(ModelMetadataProvider):
if result: if result:
return result return result
except RateLimitError as exc: except RateLimitError as exc:
if not_found_confirmed: logger.warning(
logger.debug( "Provider %s is rate-limited (retry_after=%.0fs); skipping to next provider",
"Suppressing rate limit from %s for model %s: "
"already confirmed as not found by another provider",
label, label,
model_id, exc.retry_after or 0,
) )
return None continue
exc.provider = exc.provider or label
raise exc
except ResourceNotFoundError: except ResourceNotFoundError:
not_found_confirmed = True not_found_confirmed = True
logger.debug( logger.debug(
@@ -532,8 +539,12 @@ class FallbackMetadataProvider(ModelMetadataProvider):
if result: if result:
return result return result
except RateLimitError as exc: except RateLimitError as exc:
exc.provider = exc.provider or label logger.warning(
raise exc "Provider %s is rate-limited (retry_after=%.0fs); skipping to next provider",
label,
exc.retry_after or 0,
)
continue
except Exception as e: except Exception as e:
logger.debug("Provider %s failed for get_model_version: %s", label, e) logger.debug("Provider %s failed for get_model_version: %s", label, e)
continue continue
@@ -550,8 +561,12 @@ class FallbackMetadataProvider(ModelMetadataProvider):
if result: if result:
return result, error return result, error
except RateLimitError as exc: except RateLimitError as exc:
exc.provider = exc.provider or label logger.warning(
raise exc "Provider %s is rate-limited (retry_after=%.0fs); skipping to next provider",
label,
exc.retry_after or 0,
)
continue
except Exception as e: except Exception as e:
logger.debug("Provider %s failed for get_model_version_info: %s", label, e) logger.debug("Provider %s failed for get_model_version_info: %s", label, e)
continue continue
@@ -572,8 +587,12 @@ class FallbackMetadataProvider(ModelMetadataProvider):
except NotImplementedError: except NotImplementedError:
continue continue
except RateLimitError as exc: except RateLimitError as exc:
exc.provider = exc.provider or label logger.warning(
raise exc "Provider %s is rate-limited (retry_after=%.0fs); skipping to next provider",
label,
exc.retry_after or 0,
)
continue
except Exception as e: except Exception as e:
logger.debug( logger.debug(
"Provider %s failed for get_model_versions_by_hashes: %s", "Provider %s failed for get_model_versions_by_hashes: %s",
@@ -594,8 +613,12 @@ class FallbackMetadataProvider(ModelMetadataProvider):
if result is not None: if result is not None:
return result return result
except RateLimitError as exc: except RateLimitError as exc:
exc.provider = exc.provider or label logger.warning(
raise exc "Provider %s is rate-limited (retry_after=%.0fs); skipping to next provider",
label,
exc.retry_after or 0,
)
continue
except Exception as e: except Exception as e:
logger.debug("Provider %s failed for get_user_models: %s", label, e) logger.debug("Provider %s failed for get_user_models: %s", label, e)
continue continue

View File

@@ -77,6 +77,9 @@ class BulkMetadataRefreshUseCase:
await emit("started") await emit("started")
RATE_LIMIT_ABORT_THRESHOLD = 3
consecutive_rate_limits = 0
for model in to_process: for model in to_process:
if self._service.scanner.is_cancelled(): if self._service.scanner.is_cancelled():
self._logger.info("Bulk metadata refresh cancelled by user") self._logger.info("Bulk metadata refresh cancelled by user")
@@ -115,12 +118,39 @@ class BulkMetadataRefreshUseCase:
continue continue
await MetadataManager.hydrate_model_data(model) await MetadataManager.hydrate_model_data(model)
result, _ = await self._metadata_sync.fetch_and_update_model( result, error_msg = await self._metadata_sync.fetch_and_update_model(
sha256=model["sha256"], sha256=model["sha256"],
file_path=model["file_path"], file_path=model["file_path"],
model_data=model, model_data=model,
update_cache_func=self._service.scanner.update_single_model_cache, update_cache_func=self._service.scanner.update_single_model_cache,
) )
if not result and error_msg and "Rate limited" in error_msg:
consecutive_rate_limits += 1
else:
consecutive_rate_limits = 0
if consecutive_rate_limits >= RATE_LIMIT_ABORT_THRESHOLD:
self._logger.warning(
"Bulk metadata refresh aborted: %d consecutive rate limits detected. "
"Processed %d/%d models.",
consecutive_rate_limits,
processed,
total_to_process,
)
await emit(
"rate_limited",
processed=processed,
success=success,
)
return {
"success": False,
"message": f"Rate limit detected; {total_to_process - processed} models skipped",
"processed": processed,
"updated": success,
"total": total_models,
}
if result: if result:
success += 1 success += 1
if original_name != model.get("model_name"): if original_name != model.get("model_name"):

View File

@@ -441,7 +441,6 @@ async def test_fetch_and_update_model_returns_rate_limit_error(tmp_path):
assert ok is False assert ok is False
assert error is not None and "Rate limited" in error assert error is not None and "Rate limited" in error
assert "7" in error
helpers.metadata_manager.save_metadata.assert_not_awaited() helpers.metadata_manager.save_metadata.assert_not_awaited()
update_cache.assert_not_awaited() update_cache.assert_not_awaited()
helpers.provider_selector.assert_not_awaited() helpers.provider_selector.assert_not_awaited()

View File

@@ -63,7 +63,8 @@ async def test_fallback_retries_same_provider_on_rate_limit(monkeypatch):
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_fallback_respects_retry_limit(monkeypatch): async def test_fallback_continues_to_next_provider_on_rate_limit(monkeypatch):
"""After exhausting retries on primary, fallback should continue to secondary."""
sleep_mock = AsyncMock() sleep_mock = AsyncMock()
monkeypatch.setattr(provider_module.asyncio, "sleep", sleep_mock) monkeypatch.setattr(provider_module.asyncio, "sleep", sleep_mock)
monkeypatch.setattr(provider_module.random, "uniform", lambda *_: 0.0) monkeypatch.setattr(provider_module.random, "uniform", lambda *_: 0.0)
@@ -76,13 +77,13 @@ async def test_fallback_respects_retry_limit(monkeypatch):
rate_limit_retry_limit=2, rate_limit_retry_limit=2,
) )
with pytest.raises(RateLimitError) as exc_info: # After Change A: no longer raises; falls through to secondary
await fallback.get_model_by_hash("abc") result, error = await fallback.get_model_by_hash("abc")
assert exc_info.value.provider == "primary" assert error is None
assert primary.calls == 2 assert result == {"id": "secondary"}
assert secondary.calls == 0 assert primary.calls == 2 # retry_limit exhausted on primary
sleep_mock.assert_awaited_once() assert secondary.calls == 1 # secondary IS called now
@pytest.mark.asyncio @pytest.mark.asyncio
@@ -117,3 +118,40 @@ async def test_rate_limit_retrying_provider_respects_limit(monkeypatch):
assert exc_info.value.provider == "inner" assert exc_info.value.provider == "inner"
assert inner.calls == 2 assert inner.calls == 2
sleep_mock.assert_awaited_once() sleep_mock.assert_awaited_once()
@pytest.mark.asyncio
async def test_retry_helper_limits_retries_for_large_retry_after():
"""With retry_after >= 120s, _RateLimitRetryHelper should only attempt once (no retries)."""
calls = 0
async def failing():
nonlocal calls
calls += 1
raise RateLimitError("limited", retry_after=1500.0)
helper = provider_module._RateLimitRetryHelper(retry_limit=3)
with pytest.raises(RateLimitError):
await helper.run("test", failing)
assert calls == 1 # No retries for large retry_after
@pytest.mark.asyncio
async def test_retry_helper_retries_normally_for_small_retry_after(monkeypatch):
"""With retry_after < 120s, _RateLimitRetryHelper should retry normally (up to limit)."""
sleep_mock = AsyncMock()
monkeypatch.setattr(provider_module.asyncio, "sleep", sleep_mock)
calls = 0
async def succeeding():
nonlocal calls
calls += 1
if calls == 1:
raise RateLimitError("limited", retry_after=30.0)
return {"ok": True}, None
helper = provider_module._RateLimitRetryHelper(retry_limit=3)
result, _ = await helper.run("test", succeeding)
assert result == {"ok": True}
assert calls == 2 # Retried once (small retry_after)