From bef222c77de0829d4f3d1ffd23cc43c63e309737 Mon Sep 17 00:00:00 2001 From: Will Miao Date: Sat, 13 Jun 2026 08:32:03 +0800 Subject: [PATCH] perf(recipe): precompute image_id_map for O(1) CivitAI image existence checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Build a civitai_image_id → recipe_id mapping once during cache initialization instead of scanning all recipes on every check_image_exists and import_from_url call. - RecipeCache gains an image_id_map field populated by _build_image_id_map() during cache init - check_image_exists and import_from_url duplicate detection now use the precomputed map (O(k) / O(1) vs O(n)) - Map is persisted in SQLite cache_metadata for fast startup - Incrementally updated on add/remove/bulk_remove paths - Fix: conn.close() before cache_metadata query (dead connection) --- py/routes/handlers/recipe_handlers.py | 37 +++++----- py/services/persistent_recipe_cache.py | 61 +++++++++++++++- py/services/recipe_cache.py | 11 ++- py/services/recipe_scanner.py | 73 +++++++++++++++++-- tests/routes/test_recipe_routes.py | 98 +++++++++++++++++++++++++- tests/services/test_recipe_scanner.py | 82 +++++++++++++++++++++ tests/test_persistent_recipe_cache.py | 78 ++++++++++++++++++++ 7 files changed, 410 insertions(+), 30 deletions(-) diff --git a/py/routes/handlers/recipe_handlers.py b/py/routes/handlers/recipe_handlers.py index d2c202c0..210e8f69 100644 --- a/py/routes/handlers/recipe_handlers.py +++ b/py/routes/handlers/recipe_handlers.py @@ -1597,15 +1597,8 @@ class RecipeManagementHandler: cache = await recipe_scanner.get_cached_data() - # Build lookup: image_id -> recipe_id from stored source_path - image_to_recipe = {} - for recipe in getattr(cache, "raw_data", []): - source = recipe.get("source_path") - if not source: - continue - image_id = extract_civitai_image_id(source) - if image_id and image_id not in image_to_recipe: - image_to_recipe[image_id] = recipe.get("id") + # Use precomputed image_id_map (built once at cache init) + image_to_recipe = getattr(cache, "image_id_map", {}) results = {} for img_id in requested_ids: @@ -1641,20 +1634,22 @@ class RecipeManagementHandler: "Could not extract Civitai image ID from URL" ) - # Check for duplicate (fast, before acquiring semaphore), unless force if not force: cache = await recipe_scanner.get_cached_data() - for recipe in getattr(cache, "raw_data", []): - source = recipe.get("source_path") - if source: - existing_id = extract_civitai_image_id(source) - if existing_id == image_id: - return web.json_response({ - "success": True, - "recipe_id": recipe.get("id"), - "name": recipe.get("title", ""), - "already_exists": True, - }) + image_to_recipe = getattr(cache, "image_id_map", {}) + existing_recipe_id = image_to_recipe.get(image_id) + if existing_recipe_id: + recipe_name = "" + for recipe in getattr(cache, "raw_data", []): + if str(recipe.get("id", "")) == existing_recipe_id: + recipe_name = recipe.get("title", "") or "" + break + return web.json_response({ + "success": True, + "recipe_id": existing_recipe_id, + "name": recipe_name, + "already_exists": True, + }) async with self._import_semaphore: return await self._do_import_from_url(image_url, recipe_scanner) diff --git a/py/services/persistent_recipe_cache.py b/py/services/persistent_recipe_cache.py index 10f1dc7a..952b5418 100644 --- a/py/services/persistent_recipe_cache.py +++ b/py/services/persistent_recipe_cache.py @@ -12,7 +12,7 @@ import logging import os import sqlite3 import threading -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Dict, List, Optional, Set, Tuple from ..utils.cache_paths import CacheType, resolve_cache_path_with_migration @@ -26,6 +26,8 @@ class PersistedRecipeData: raw_data: List[Dict] file_stats: Dict[str, Tuple[float, int]] # json_path -> (mtime, size) + image_id_map: Dict[str, str] = field(default_factory=dict) + """Precomputed mapping of civitai image_id → recipe_id.""" class PersistentRecipeCache: @@ -116,6 +118,20 @@ class PersistentRecipeCache: if not rows: return None + # Restore precomputed image_id_map if available + image_id_map: Dict[str, str] = {} + try: + meta_row = conn.execute( + "SELECT value FROM cache_metadata WHERE key = ?", + ("image_id_map",), + ).fetchone() + if meta_row: + parsed = json.loads(meta_row["value"]) + if isinstance(parsed, dict): + image_id_map = parsed + except Exception: + pass # missing or corrupt — rebuilt on next cache refresh + finally: conn.close() except FileNotFoundError: @@ -138,14 +154,24 @@ class PersistentRecipeCache: row["file_size"] or 0, ) - return PersistedRecipeData(raw_data=raw_data, file_stats=file_stats) + return PersistedRecipeData( + raw_data=raw_data, + file_stats=file_stats, + image_id_map=image_id_map, + ) - def save_cache(self, recipes: List[Dict], json_paths: Optional[Dict[str, str]] = None) -> None: + def save_cache( + self, + recipes: List[Dict], + json_paths: Optional[Dict[str, str]] = None, + image_id_map: Optional[Dict[str, str]] = None, + ) -> None: """Save all recipes to SQLite cache. Args: recipes: List of recipe dictionaries to persist. json_paths: Optional mapping of recipe_id -> json_path for file stats. + image_id_map: Optional precomputed civitai image_id → recipe_id mapping. """ if not self.is_enabled(): return @@ -186,6 +212,12 @@ class PersistentRecipeCache: recipe_rows, ) + # Persist image_id_map for O(1) lookups on cache load + conn.execute( + "INSERT OR REPLACE INTO cache_metadata (key, value) VALUES (?, ?)", + ("image_id_map", json.dumps(image_id_map or {})), + ) + conn.commit() logger.debug("Persisted %d recipes to cache", len(recipe_rows)) finally: @@ -273,6 +305,29 @@ class PersistentRecipeCache: except Exception as exc: logger.debug("Failed to remove recipe %s from cache: %s", recipe_id, exc) + def save_image_id_map(self, image_id_map: Dict[str, str]) -> None: + """Persist the image_id_map to cache_metadata without rewriting the full cache. + + This is called after ``add_recipe`` / ``remove_recipe`` mutations so + the persistent copy does not go stale between full ``save_cache`` calls. + """ + if not self.is_enabled() or not self._schema_initialized: + return + + try: + with self._db_lock: + conn = self._connect() + try: + conn.execute( + "INSERT OR REPLACE INTO cache_metadata (key, value) VALUES (?, ?)", + ("image_id_map", json.dumps(image_id_map)), + ) + conn.commit() + finally: + conn.close() + except Exception as exc: + logger.debug("Failed to persist image_id_map: %s", exc) + def get_indexed_recipe_ids(self) -> Set[str]: """Return all recipe IDs in the cache. diff --git a/py/services/recipe_cache.py b/py/services/recipe_cache.py index 7c0c499c..c0762a4a 100644 --- a/py/services/recipe_cache.py +++ b/py/services/recipe_cache.py @@ -1,6 +1,6 @@ import asyncio from typing import Iterable, List, Dict, Optional -from dataclasses import dataclass +from dataclasses import dataclass, field from operator import itemgetter from natsort import natsorted @@ -14,6 +14,15 @@ class RecipeCache: sorted_by_date: List[Dict] folders: List[str] | None = None folder_tree: Dict | None = None + image_id_map: Dict[str, str] = field(default_factory=dict) + """Mapping of civitai image_id → recipe_id, precomputed at cache build time. + + Built once during cache initialization (O(n)) so that + ``check_image_exists`` and ``import_from_url`` duplicate checks + can look up image_id in O(1) instead of scanning all recipes. + Recipes imported from local files have no valid civitai image_id + and are naturally excluded from this map. + """ def __post_init__(self): self._lock = asyncio.Lock() diff --git a/py/services/recipe_scanner.py b/py/services/recipe_scanner.py index 8bb4b102..b0eb3be6 100644 --- a/py/services/recipe_scanner.py +++ b/py/services/recipe_scanner.py @@ -20,6 +20,7 @@ from .metadata_service import get_default_metadata_provider from .checkpoint_scanner import CheckpointScanner from .settings_manager import get_settings_manager from .recipes.errors import RecipeNotFoundError +from ..utils.civitai_utils import extract_civitai_image_id from ..utils.utils import calculate_recipe_fingerprint, fuzzy_match from natsort import natsorted import sys @@ -532,7 +533,21 @@ class RecipeScanner: self._sort_cache_sync() # Backfill source_path from JSON files if missing (schema migration) if self._backfill_source_path_if_needed(recipes, json_paths): - self._persistent_cache.save_cache(recipes, json_paths) + self._cache.image_id_map = self._build_image_id_map() + self._persistent_cache.save_cache( + recipes, json_paths, self._cache.image_id_map + ) + else: + # Use persisted map, or rebuild if empty (e.g. first startup + # after deploying the image_id_map feature). + if persisted.image_id_map: + self._cache.image_id_map = dict(persisted.image_id_map) + else: + self._cache.image_id_map = self._build_image_id_map() + if self._cache.image_id_map: + self._persistent_cache.save_image_id_map( + self._cache.image_id_map + ) return self._cache else: # Partial update: some files changed @@ -545,8 +560,11 @@ class RecipeScanner: self._sort_cache_sync() # Backfill source_path from JSON files if missing (schema migration) self._backfill_source_path_if_needed(recipes, json_paths) + self._cache.image_id_map = self._build_image_id_map() # Persist updated cache - self._persistent_cache.save_cache(recipes, json_paths) + self._persistent_cache.save_cache( + recipes, json_paths, self._cache.image_id_map + ) return self._cache # Fall back to full directory scan @@ -558,9 +576,12 @@ class RecipeScanner: self._cache.raw_data = recipes self._update_folder_metadata(self._cache) self._sort_cache_sync() + self._cache.image_id_map = self._build_image_id_map() # Persist for next startup - self._persistent_cache.save_cache(recipes, json_paths) + self._persistent_cache.save_cache( + recipes, json_paths, self._cache.image_id_map + ) return self._cache except Exception as e: @@ -832,6 +853,28 @@ class RecipeScanner: except Exception as e: logger.error(f"Error sorting recipe cache: {e}") + def _build_image_id_map(self) -> Dict[str, str]: + """Build civitai image_id → recipe_id mapping from cached recipes. + + Only recipes with a valid CivitAI image URL source_path produce an + entry. Recipes imported from local files are naturally excluded. + """ + mapping: Dict[str, str] = {} + if not self._cache: + return mapping + for recipe in getattr(self._cache, "raw_data", []): + if not isinstance(recipe, dict): + continue + source = recipe.get("source_path") + if not source: + continue + image_id = extract_civitai_image_id(source) + if image_id and image_id not in mapping: + recipe_id = recipe.get("id") + if recipe_id is not None: + mapping[image_id] = str(recipe_id) + return mapping + async def _wait_for_lora_scanner(self) -> None: """Ensure the LoRA scanner has initialized before recipe enrichment.""" @@ -1296,11 +1339,20 @@ class RecipeScanner: # Update FTS index self._update_fts_index_for_recipe(recipe_data, "add") + source = recipe_data.get("source_path") + if source: + image_id = extract_civitai_image_id(source) + if image_id: + recipe_id_value = recipe_data.get("id") + if recipe_id_value is not None: + cache.image_id_map[image_id] = str(recipe_id_value) + # Persist to SQLite cache if self._persistent_cache: recipe_id = str(recipe_data.get("id", "")) json_path = self._json_path_map.get(recipe_id, "") self._persistent_cache.update_recipe(recipe_data, json_path) + self._persistent_cache.save_image_id_map(cache.image_id_map) async def remove_recipe(self, recipe_id: str) -> bool: """Remove a recipe from the cache by ID.""" @@ -1319,9 +1371,15 @@ class RecipeScanner: # Update FTS index self._update_fts_index_for_recipe(recipe_id, "remove") + # Remove any image_id entry pointing to this recipe + stale = [k for k, v in cache.image_id_map.items() if v == recipe_id] + for k in stale: + del cache.image_id_map[k] + # Remove from SQLite cache if self._persistent_cache: self._persistent_cache.remove_recipe(recipe_id) + self._persistent_cache.save_image_id_map(cache.image_id_map) self._json_path_map.pop(recipe_id, None) return True @@ -1332,14 +1390,21 @@ class RecipeScanner: cache = await self.get_cached_data() removed = await cache.bulk_remove(recipe_ids, resort=False) if removed: + removed_ids = {str(r.get("id", "")) for r in removed} + stale = [k for k, v in cache.image_id_map.items() if v in removed_ids] + for k in stale: + del cache.image_id_map[k] + self._schedule_resort() - # Update FTS index and persistent cache for each removed recipe for recipe in removed: recipe_id = str(recipe.get("id", "")) self._update_fts_index_for_recipe(recipe_id, "remove") if self._persistent_cache: self._persistent_cache.remove_recipe(recipe_id) self._json_path_map.pop(recipe_id, None) + + if self._persistent_cache: + self._persistent_cache.save_image_id_map(cache.image_id_map) return len(removed) async def scan_all_recipes(self) -> List[Dict]: diff --git a/tests/routes/test_recipe_routes.py b/tests/routes/test_recipe_routes.py index f432e1a9..48c7a1bc 100644 --- a/tests/routes/test_recipe_routes.py +++ b/tests/routes/test_recipe_routes.py @@ -46,6 +46,7 @@ class StubRecipeScanner: self.last_paginated_params: Dict[str, Any] | None = None self.lora_lookup: Dict[str, List[Dict[str, Any]]] = {} self.checkpoint_lookup: Dict[str, List[Dict[str, Any]]] = {} + self.image_id_map_override: Dict[str, str] = {} async def _noop_get_cached_data(force_refresh: bool = False) -> None: # noqa: ARG001 - signature mirrors real scanner return None @@ -56,7 +57,10 @@ class StubRecipeScanner: ) async def get_cached_data(self, force_refresh: bool = False) -> SimpleNamespace: # noqa: ARG002 - flag unused by stub - return SimpleNamespace(raw_data=list(self.cached_raw)) + return SimpleNamespace( + raw_data=list(self.cached_raw), + image_id_map=dict(getattr(self, "image_id_map_override", {})), + ) async def get_paginated_data(self, **params: Any) -> Dict[str, Any]: self.last_paginated_params = params @@ -999,3 +1003,95 @@ async def test_batch_import_cancel_missing_id(monkeypatch, tmp_path: Path) -> No payload = await response.json() assert response.status == 400 assert payload["success"] is False + + +async def test_check_image_exists_uses_image_id_map(monkeypatch, tmp_path: Path) -> None: + """check_image_exists must use precomputed image_id_map instead of scanning raw_data.""" + async with recipe_harness(monkeypatch, tmp_path) as harness: + harness.scanner.image_id_map_override = { + "123": "recipe-alpha", + "789": "recipe-gamma", + } + + response = await harness.client.get( + "/api/lm/recipes/check-image-exists", + params={"image_ids": "123,456,789"}, + ) + payload = await response.json() + + assert response.status == 200 + assert payload["success"] is True + assert payload["results"]["123"] == { + "in_library": True, + "recipe_id": "recipe-alpha", + } + assert payload["results"]["456"] == { + "in_library": False, + "recipe_id": None, + } + assert payload["results"]["789"] == { + "in_library": True, + "recipe_id": "recipe-gamma", + } + + +async def test_check_image_exists_handles_empty_input(monkeypatch, tmp_path: Path) -> None: + """Empty or non-numeric image_ids must return an empty results dict.""" + async with recipe_harness(monkeypatch, tmp_path) as harness: + response = await harness.client.get( + "/api/lm/recipes/check-image-exists", + params={"image_ids": ""}, + ) + payload = await response.json() + assert response.status == 200 + assert payload["results"] == {} + + +async def test_import_from_url_detects_duplicate_via_image_id_map( + monkeypatch, tmp_path: Path, +) -> None: + """import_from_url must return already_exists when image_id is in image_id_map.""" + async with recipe_harness(monkeypatch, tmp_path) as harness: + harness.scanner.cached_raw = [ + {"id": "existing-recipe", "title": "My Recipe"}, + ] + harness.scanner.image_id_map_override = { + "99999": "existing-recipe", + } + + response = await harness.client.get( + "/api/lm/recipes/import-from-url", + params={"image_url": "https://civitai.com/images/99999"}, + ) + payload = await response.json() + + assert response.status == 200 + assert payload["already_exists"] is True + assert payload["recipe_id"] == "existing-recipe" + assert payload["name"] == "My Recipe" + + +async def test_import_from_url_proceeds_when_image_id_not_in_map( + monkeypatch, tmp_path: Path, +) -> None: + """When image_id is absent from image_id_map, import_from_url must proceed to import.""" + async with recipe_harness(monkeypatch, tmp_path) as harness: + harness.scanner.image_id_map_override = { + "111": "some-other-recipe", + } + harness.civitai.image_info["99999"] = { + "id": 99999, + "url": "https://image.civitai.com/x/y/original=true/sample.jpeg", + "type": "image", + "meta": {"prompt": "test"}, + } + + response = await harness.client.get( + "/api/lm/recipes/import-from-url", + params={"image_url": "https://civitai.com/images/99999"}, + ) + + # The import may succeed or fail depending on downstream stubs, + # but it must NOT return already_exists + payload = await response.json() + assert payload.get("already_exists") is not True diff --git a/tests/services/test_recipe_scanner.py b/tests/services/test_recipe_scanner.py index b53a1dec..23fb3140 100644 --- a/tests/services/test_recipe_scanner.py +++ b/tests/services/test_recipe_scanner.py @@ -1015,3 +1015,85 @@ async def test_get_paginated_data_sorting(recipe_scanner): # Test Date ASC: Gamma (5), Alpha (10), Beta (20) res = await scanner.get_paginated_data(page=1, page_size=10, sort_by="date:asc") assert [i["id"] for i in res["items"]] == ["C", "A", "B"] + + +async def test_build_image_id_map_filters_correctly(recipe_scanner): + """Only recipes with valid CivitAI source_path appear in image_id_map. + + Recipes imported from local files or with empty/missing source_path + must be naturally excluded. + """ + scanner, _ = recipe_scanner + from py.services.recipe_cache import RecipeCache + + scanner._cache = RecipeCache( + raw_data=[ + {"id": "r1", "source_path": "https://civitai.com/images/12345"}, + {"id": "r2", "source_path": "https://civitai.com/images/67890"}, + {"id": "r3", "source_path": "/home/user/local_image.png"}, + {"id": "r4", "source_path": ""}, + {"id": "r5"}, + ], + sorted_by_name=[], + sorted_by_date=[], + ) + + result = scanner._build_image_id_map() + + assert result == { + "12345": "r1", + "67890": "r2", + } + # r3 = local file path, r4 = empty string, r5 = no key → all excluded + for rid in ("r3", "r4", "r5"): + assert rid not in result.values() + + +async def test_add_recipe_updates_image_id_map(recipe_scanner): + """Adding a recipe with a CivitAI URL must update image_id_map. + + A recipe with a local file path must NOT produce an entry. + """ + scanner, _ = recipe_scanner + + await scanner.add_recipe({ + "id": "civitai-recipe", + "title": "CivitAI", + "source_path": "https://civitai.com/images/55555", + }) + + cache = await scanner.get_cached_data() + assert cache.image_id_map.get("55555") == "civitai-recipe" + + await scanner.add_recipe({ + "id": "local-recipe", + "title": "Local", + "source_path": "/path/to/local.png", + }) + + assert "local-recipe" not in cache.image_id_map.values() + + +async def test_remove_recipe_clears_image_id_map(recipe_scanner): + """Removing a recipe that has a CivitAI image_id must clean up the map.""" + scanner, _ = recipe_scanner + + await scanner.add_recipe({ + "id": "recipe-a", + "title": "A", + "source_path": "https://civitai.com/images/111", + }) + await scanner.add_recipe({ + "id": "recipe-b", + "title": "B", + "source_path": "https://civitai.com/images/222", + }) + + cache = await scanner.get_cached_data() + assert "111" in cache.image_id_map + assert cache.image_id_map["222"] == "recipe-b" + + await scanner.remove_recipe("recipe-a") + + assert "111" not in cache.image_id_map + assert cache.image_id_map["222"] == "recipe-b" diff --git a/tests/test_persistent_recipe_cache.py b/tests/test_persistent_recipe_cache.py index c7b366ff..3fd8f928 100644 --- a/tests/test_persistent_recipe_cache.py +++ b/tests/test_persistent_recipe_cache.py @@ -465,3 +465,81 @@ class TestPersistentRecipeCache: # Operations should complete assert operation_counts["saves"] == 5 assert operation_counts["removes"] == 5 + + # ----------------------------------------------------------------------- + # image_id_map persistence (Phase 1 improvement) + # ----------------------------------------------------------------------- + + def test_save_and_load_image_id_map_roundtrip(self, temp_db_path, sample_recipes): + """Save image_id_map via save_cache() and verify it round-trips through load_cache().""" + cache = PersistentRecipeCache(db_path=temp_db_path) + + image_id_map = { + "12345": "recipe-alpha", + "67890": "recipe-beta", + } + cache.save_cache(sample_recipes, image_id_map=image_id_map) + + loaded = cache.load_cache() + assert loaded is not None + assert loaded.image_id_map == image_id_map + + def test_load_without_image_id_map_returns_empty_dict(self, temp_db_path, sample_recipes): + """Loading from a cache that has no image_id_map metadata must yield {}.""" + cache = PersistentRecipeCache(db_path=temp_db_path) + + # Save without image_id_map + cache.save_cache(sample_recipes) + + loaded = cache.load_cache() + assert loaded is not None + assert loaded.image_id_map == {} + + def test_save_cache_without_image_id_map_does_not_corrupt_existing( + self, temp_db_path, sample_recipes, + ): + """Overwriting cache without passing image_id_map must not leave stale data. + + The previous image_id_map entry in cache_metadata should be replaced with {}. + """ + cache = PersistentRecipeCache(db_path=temp_db_path) + + cache.save_cache(sample_recipes, image_id_map={"123": "old-recipe"}) + # Overwrite without image_id_map + cache.save_cache(sample_recipes) + + loaded = cache.load_cache() + assert loaded.image_id_map == {} + + def test_image_id_map_survives_recipe_update(self, temp_db_path, sample_recipes): + """Updating a single recipe must not drop the image_id_map metadata.""" + cache = PersistentRecipeCache(db_path=temp_db_path) + + cache.save_cache(sample_recipes, image_id_map={"123": "recipe-alpha"}) + + updated = dict(sample_recipes[0]) + updated["title"] = "Updated" + cache.update_recipe(updated) + + loaded = cache.load_cache() + assert loaded.image_id_map == {"123": "recipe-alpha"} + + def test_save_image_id_map_persists_without_full_save(self, temp_db_path, sample_recipes): + """save_image_id_map must update cache_metadata without rewriting all recipes.""" + cache = PersistentRecipeCache(db_path=temp_db_path) + cache.save_cache(sample_recipes) + + cache.save_image_id_map({"555": "new-recipe", "666": "another-recipe"}) + + loaded = cache.load_cache() + assert loaded.image_id_map == {"555": "new-recipe", "666": "another-recipe"} + + def test_save_image_id_map_overwrites_previous(self, temp_db_path, sample_recipes): + """Calling save_image_id_map twice must replace, not merge.""" + cache = PersistentRecipeCache(db_path=temp_db_path) + cache.save_cache(sample_recipes, image_id_map={"111": "old"}) + + cache.save_image_id_map({"222": "new-only"}) + + loaded = cache.load_cache() + assert loaded.image_id_map == {"222": "new-only"}