From c12aefa82aff9f165a6d51166397d95a6ae98174 Mon Sep 17 00:00:00 2001 From: Will Miao Date: Tue, 3 Feb 2026 21:31:17 +0800 Subject: [PATCH] fix(recipes): detect duplicates for remote imports using modelVersionId and Civitai URL, #750 - Use modelVersionId as fallback for all loras in fingerprint calculation (not just deleted) - Add URL-based duplicate detection using source_path field - Combine both fingerprint and URL-based duplicate detection in API response - Fix _download_remote_media return type and unbound variable issue --- py/routes/handlers/recipe_handlers.py | 40 +++++++- py/services/recipe_scanner.py | 23 +++++ py/utils/utils.py | 6 +- tests/services/test_duplicate_detection.py | 110 +++++++++++++++++++++ tests/utils/test_fingerprint_fallback.py | 100 +++++++++++++++++++ 5 files changed, 271 insertions(+), 8 deletions(-) create mode 100644 tests/services/test_duplicate_detection.py create mode 100644 tests/utils/test_fingerprint_fallback.py diff --git a/py/routes/handlers/recipe_handlers.py b/py/routes/handlers/recipe_handlers.py index 227a5c65..0ffe211c 100644 --- a/py/routes/handlers/recipe_handlers.py +++ b/py/routes/handlers/recipe_handlers.py @@ -412,10 +412,11 @@ class RecipeQueryHandler: if recipe_scanner is None: raise RuntimeError("Recipe scanner unavailable") - duplicate_groups = await recipe_scanner.find_all_duplicate_recipes() + fingerprint_groups = await recipe_scanner.find_all_duplicate_recipes() + url_groups = await recipe_scanner.find_duplicate_recipes_by_source() response_data = [] - for fingerprint, recipe_ids in duplicate_groups.items(): + for fingerprint, recipe_ids in fingerprint_groups.items(): if len(recipe_ids) <= 1: continue @@ -439,12 +440,44 @@ class RecipeQueryHandler: recipes.sort(key=lambda entry: entry.get("modified", 0), reverse=True) response_data.append( { + "type": "fingerprint", "fingerprint": fingerprint, "count": len(recipes), "recipes": recipes, } ) + for url, recipe_ids in url_groups.items(): + if len(recipe_ids) 
async def find_duplicate_recipes_by_source(self) -> dict:
    """Find recipe duplicates that share the same ``source_path`` value.

    Every recipe dict in the cached ``raw_data`` list is grouped by its
    whitespace-stripped ``source_path``. Recipes whose ``source_path`` is
    missing, ``None``, not a string, or blank are ignored, so one malformed
    entry cannot abort the whole scan.

    Returns:
        Dictionary where keys are source URLs and values are lists of the
        recipe IDs sharing that URL. Only URLs referenced by more than one
        recipe are included.
    """
    cache = await self.get_cached_data()

    url_groups: dict = {}
    for recipe in cache.raw_data:
        raw_source = recipe.get('source_path')
        # The key may be present with a None (or otherwise non-string)
        # value; calling .strip() on it would raise AttributeError.
        if not isinstance(raw_source, str):
            continue

        source_url = raw_source.strip()
        if not source_url:
            continue

        url_groups.setdefault(source_url, []).append(recipe.get('id'))

    # A URL referenced by a single recipe is not a duplicate.
    return {url: ids for url, ids in url_groups.items() if len(ids) > 1}
@pytest.mark.asyncio
async def test_find_duplicate_recipes_by_source_empty():
    """No source URL is shared by two recipes, so no groups are reported."""
    from py.services.recipe_scanner import RecipeScanner

    # One recipe with a unique URL and one with a blank source.
    recipes = [
        {
            'id': '8705c972-ef08-47f3-8ac3-9ac3b8ff4c0b',
            'source_path': 'https://civitai.com/images/119165946',
            'title': 'Recipe 1',
        },
        {
            'id': '00000000-0000-0000-0000-000000000002',
            'source_path': '',
            'title': 'Recipe 2 (no source)',
        },
    ]
    fake_cache = MagicMock()
    fake_cache.raw_data = recipes

    fake_scanner = MagicMock(spec=RecipeScanner)
    fake_scanner.get_cached_data = AsyncMock(return_value=fake_cache)

    # Invoke the real implementation unbound, with the mock standing in as self.
    groups = await RecipeScanner.find_duplicate_recipes_by_source(fake_scanner)

    assert len(groups) == 0
"""Test for modelVersionId fallback in fingerprint calculation."""
import pytest
from py.utils.utils import calculate_recipe_fingerprint


def _lora(hash_value, strength, version_id, is_deleted=False, exclude=False):
    """Build a lora entry dict in the shape used by the fingerprint tests."""
    return {
        "hash": hash_value,
        "strength": strength,
        "modelVersionId": version_id,
        "isDeleted": is_deleted,
        "exclude": exclude,
    }


def test_calculate_fingerprint_with_model_version_id_fallback():
    """An empty hash falls back to modelVersionId even when not deleted."""
    entries = [_lora("", 1.0, 2639467)]
    assert calculate_recipe_fingerprint(entries) == "2639467:1.0"


def test_calculate_fingerprint_with_multiple_model_version_ids():
    """Several hash-less loras each contribute their modelVersionId."""
    entries = [
        _lora("", 1.0, 2639467),
        _lora("", 0.8, 1234567),
    ]
    assert calculate_recipe_fingerprint(entries) == "1234567:0.8|2639467:1.0"


def test_calculate_fingerprint_with_deleted_lora():
    """A deleted lora with a modelVersionId is still part of the fingerprint."""
    entries = [_lora("", 1.0, 2639467, is_deleted=True)]
    assert calculate_recipe_fingerprint(entries) == "2639467:1.0"


def test_calculate_fingerprint_with_excluded_lora():
    """An excluded lora is skipped even though it has a modelVersionId."""
    entries = [_lora("", 1.0, 2639467, exclude=True)]
    assert calculate_recipe_fingerprint(entries) == ""


def test_calculate_fingerprint_prefers_hash_over_version_id():
    """A non-empty hash wins over modelVersionId."""
    entries = [_lora("abc123", 1.0, 2639467)]
    assert calculate_recipe_fingerprint(entries) == "abc123:1.0"


def test_calculate_fingerprint_without_hash_or_version_id():
    """A lora with neither a hash nor a truthy modelVersionId is skipped."""
    entries = [_lora("", 1.0, 0)]
    assert calculate_recipe_fingerprint(entries) == ""