feat: add SQLite-based persistent recipe cache for faster startup

Introduce a new PersistentRecipeCache service that stores recipe metadata in an SQLite database to significantly reduce application startup time. The cache eliminates the need to walk directories and parse JSON files on each launch by persisting recipe data between sessions.
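The storage layout itself is not shown here; as a rough sketch, assuming one row per recipe keyed by ID, the cache table could look like the following (illustrative names, not the actual schema):

    import sqlite3

    def init_cache_db(db_path: str) -> sqlite3.Connection:
        # Illustrative schema only; the real one is defined in
        # persistent_recipe_cache.py and may differ.
        conn = sqlite3.connect(db_path)
        conn.execute(
            """
            CREATE TABLE IF NOT EXISTS recipes (
                id        TEXT PRIMARY KEY,  -- recipe ID
                json_path TEXT NOT NULL,     -- source .recipe.json path
                mtime     REAL NOT NULL,     -- st_mtime when cached
                size      INTEGER NOT NULL,  -- st_size when cached
                data      TEXT NOT NULL      -- recipe dict serialized as JSON
            )
            """
        )
        conn.commit()
        return conn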

Key features:
- Thread-safe, per-library singleton instances
- Automatic schema initialization and migration support
- JSON serialization for complex recipe fields (LoRAs, checkpoints, generation parameters, tags)
- Startup reconciliation with the filesystem, using mtime/size validation for cache invalidation
- Environment variable toggle (LORA_MANAGER_DISABLE_PERSISTENT_CACHE) for debugging; a sketch of the expected check follows this list
- Comprehensive test suite covering save/load cycles, cache invalidation, and edge cases
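
A plausible reading of the disable toggle (a sketch only; the exact parsing rules live in persistent_recipe_cache.py, and treating any non-empty value as "disabled" is an assumption):

    import os

    def persistent_cache_enabled() -> bool:
        # Assumption: any non-empty value of the variable disables the
        # persistent cache and forces a full directory scan on startup.
        return not os.environ.get("LORA_MANAGER_DISABLE_PERSISTENT_CACHE")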

After the initial cache population, recipe loading is near-instantaneous, while mtime/size change detection keeps the cached data consistent with the recipe files on disk.
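
The change-detection rule is visible in _reconcile_recipe_cache in the diff below; condensed into a standalone helper it amounts to:

    import os

    def is_unchanged(path: str, cached_mtime: float, cached_size: int) -> bool:
        # Same check as _reconcile_recipe_cache: size must match exactly and
        # mtime may drift by less than 1 second; missing files count as changed.
        try:
            st = os.stat(path)
        except OSError:
            return False
        return abs(st.st_mtime - cached_mtime) < 1.0 and st.st_size == cached_size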
Will Miao
2026-01-23 22:56:38 +08:00
parent 7bba24c19f
commit eb2af454cc
5 changed files with 1337 additions and 115 deletions


@@ -9,6 +9,7 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple
from ..config import config
from .recipe_cache import RecipeCache
from .recipe_fts_index import RecipeFTSIndex
from .persistent_recipe_cache import PersistentRecipeCache, get_persistent_recipe_cache
from .service_registry import ServiceRegistry
from .lora_scanner import LoraScanner
from .metadata_service import get_default_metadata_provider
@@ -78,6 +79,9 @@ class RecipeScanner:
# FTS index for fast search
self._fts_index: Optional[RecipeFTSIndex] = None
self._fts_index_task: Optional[asyncio.Task] = None
# Persistent cache for fast startup
self._persistent_cache: Optional[PersistentRecipeCache] = None
self._json_path_map: Dict[str, str] = {} # recipe_id -> json_path
if lora_scanner:
self._lora_scanner = lora_scanner
if checkpoint_scanner:
@@ -109,6 +113,11 @@ class RecipeScanner:
self._fts_index.clear()
self._fts_index = None
# Reset persistent cache instance for new library
self._persistent_cache = None
self._json_path_map = {}
PersistentRecipeCache.clear_instances()
self._cache = None
self._initialization_task = None
self._is_initializing = False
@@ -321,12 +330,17 @@ class RecipeScanner:
with open(recipe_json_path, 'w', encoding='utf-8') as f:
json.dump(recipe, f, indent=4, ensure_ascii=False)
# 4. Update EXIF if image exists
# 4. Update persistent SQLite cache
if self._persistent_cache:
self._persistent_cache.update_recipe(recipe, recipe_json_path)
self._json_path_map[str(recipe_id)] = recipe_json_path
# 5. Update EXIF if image exists
image_path = recipe.get('file_path')
if image_path and os.path.exists(image_path):
from ..utils.exif_utils import ExifUtils
ExifUtils.append_recipe_metadata(image_path, recipe)
return True
except Exception as e:
logger.error(f"Error persisting recipe {recipe_id}: {e}")
@@ -408,117 +422,268 @@ class RecipeScanner:
logger.error(f"Recipe Scanner: Error initializing cache in background: {e}")
def _initialize_recipe_cache_sync(self):
"""Synchronous version of recipe cache initialization for thread pool execution"""
"""Synchronous version of recipe cache initialization for thread pool execution.
Uses persistent cache for fast startup when available:
1. Try to load from persistent SQLite cache
2. Reconcile with filesystem (check mtime/size for changes)
3. Fall back to full directory scan if cache miss or reconciliation fails
4. Persist results for next startup
"""
try:
# Create a new event loop for this thread
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
# Create a synchronous method to bypass the async lock
def sync_initialize_cache():
# We need to implement scan_all_recipes logic synchronously here
# instead of calling the async method to avoid event loop issues
recipes = []
recipes_dir = self.recipes_dir
if not recipes_dir or not os.path.exists(recipes_dir):
logger.warning(f"Recipes directory not found: {recipes_dir}")
return recipes
# Get all recipe JSON files in the recipes directory
recipe_files = []
for root, _, files in os.walk(recipes_dir):
recipe_count = sum(1 for f in files if f.lower().endswith('.recipe.json'))
if recipe_count > 0:
for file in files:
if file.lower().endswith('.recipe.json'):
recipe_files.append(os.path.join(root, file))
# Process each recipe file
for recipe_path in recipe_files:
try:
with open(recipe_path, 'r', encoding='utf-8') as f:
recipe_data = json.load(f)
# Validate recipe data
if not recipe_data or not isinstance(recipe_data, dict):
logger.warning(f"Invalid recipe data in {recipe_path}")
continue
# Ensure required fields exist
required_fields = ['id', 'file_path', 'title']
if not all(field in recipe_data for field in required_fields):
logger.warning(f"Missing required fields in {recipe_path}")
continue
# Ensure the image file exists and prioritize local siblings
image_path = recipe_data.get('file_path')
if image_path:
recipe_dir = os.path.dirname(recipe_path)
image_filename = os.path.basename(image_path)
local_sibling_path = os.path.normpath(os.path.join(recipe_dir, image_filename))
# If local sibling exists and stored path is different, prefer local
if os.path.exists(local_sibling_path) and os.path.normpath(image_path) != local_sibling_path:
recipe_data['file_path'] = local_sibling_path
# Persist the repair
try:
with open(recipe_path, 'w', encoding='utf-8') as f:
json.dump(recipe_data, f, indent=4, ensure_ascii=False)
logger.info(f"Updated recipe image path to local sibling: {local_sibling_path}")
except Exception as e:
logger.warning(f"Failed to persist repair for {recipe_path}: {e}")
elif not os.path.exists(image_path):
logger.warning(f"Recipe image not found and no local sibling: {image_path}")
# Ensure loras array exists
if 'loras' not in recipe_data:
recipe_data['loras'] = []
# Ensure gen_params exists
if 'gen_params' not in recipe_data:
recipe_data['gen_params'] = {}
# Add to list without async operations
recipes.append(recipe_data)
except Exception as e:
logger.error(f"Error loading recipe file {recipe_path}: {e}")
import traceback
traceback.print_exc(file=sys.stderr)
# Update cache with the collected data
self._cache.raw_data = recipes
self._update_folder_metadata(self._cache)
# Create a simplified resort function that doesn't use await
if hasattr(self._cache, "resort"):
try:
# Sort by name
self._cache.sorted_by_name = natsorted(
self._cache.raw_data,
key=lambda x: x.get('title', '').lower()
)
# Sort by date (modified or created)
self._cache.sorted_by_date = sorted(
self._cache.raw_data,
key=lambda x: x.get('modified', x.get('created_date', 0)),
reverse=True
)
except Exception as e:
logger.error(f"Error sorting recipe cache: {e}")
# Initialize persistent cache
if self._persistent_cache is None:
self._persistent_cache = get_persistent_recipe_cache()
recipes_dir = self.recipes_dir
if not recipes_dir or not os.path.exists(recipes_dir):
logger.warning(f"Recipes directory not found: {recipes_dir}")
return self._cache
# Run our sync initialization that avoids lock conflicts
return sync_initialize_cache()
# Try to load from persistent cache first
persisted = self._persistent_cache.load_cache()
if persisted:
recipes, changed, json_paths = self._reconcile_recipe_cache(persisted, recipes_dir)
self._json_path_map = json_paths
if not changed:
# Fast path: use cached data directly
logger.info("Recipe cache hit: loaded %d recipes from persistent cache", len(recipes))
self._cache.raw_data = recipes
self._update_folder_metadata(self._cache)
self._sort_cache_sync()
return self._cache
else:
# Partial update: some files changed
logger.info("Recipe cache partial hit: reconciled %d recipes with filesystem", len(recipes))
self._cache.raw_data = recipes
self._update_folder_metadata(self._cache)
self._sort_cache_sync()
# Persist updated cache
self._persistent_cache.save_cache(recipes, json_paths)
return self._cache
# Fall back to full directory scan
logger.info("Recipe cache miss: performing full directory scan")
recipes, json_paths = self._full_directory_scan_sync(recipes_dir)
self._json_path_map = json_paths
# Update cache with the collected data
self._cache.raw_data = recipes
self._update_folder_metadata(self._cache)
self._sort_cache_sync()
# Persist for next startup
self._persistent_cache.save_cache(recipes, json_paths)
return self._cache
except Exception as e:
logger.error(f"Error in thread-based recipe cache initialization: {e}")
import traceback
traceback.print_exc(file=sys.stderr)
return self._cache if hasattr(self, '_cache') else None
finally:
# Clean up the event loop
loop.close()
def _reconcile_recipe_cache(
self,
persisted: "PersistedRecipeData",
recipes_dir: str,
) -> Tuple[List[Dict], bool, Dict[str, str]]:
"""Reconcile persisted cache with current filesystem state.
Args:
persisted: The persisted recipe data from SQLite cache.
recipes_dir: Path to the recipes directory.
Returns:
Tuple of (recipes list, changed flag, json_paths dict).
"""
from .persistent_recipe_cache import PersistedRecipeData
recipes: List[Dict] = []
json_paths: Dict[str, str] = {}
changed = False
# Build set of current recipe files
current_files: Dict[str, Tuple[float, int]] = {}
for root, _, files in os.walk(recipes_dir):
for file in files:
if file.lower().endswith('.recipe.json'):
file_path = os.path.join(root, file)
try:
stat = os.stat(file_path)
current_files[file_path] = (stat.st_mtime, stat.st_size)
except OSError:
continue
# Build lookup of persisted recipes by json_path
persisted_by_path: Dict[str, Dict] = {}
for recipe in persisted.raw_data:
recipe_id = str(recipe.get('id', ''))
if recipe_id:
# Find the json_path from file_stats
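                    # (assumes each .recipe.json filename starts with its recipe's ID)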
for json_path, (mtime, size) in persisted.file_stats.items():
if os.path.basename(json_path).startswith(recipe_id):
persisted_by_path[json_path] = recipe
break
# Also index by recipe ID for faster lookups
persisted_by_id: Dict[str, Dict] = {
str(r.get('id', '')): r for r in persisted.raw_data if r.get('id')
}
# Process current files
for file_path, (current_mtime, current_size) in current_files.items():
cached_stats = persisted.file_stats.get(file_path)
if cached_stats:
cached_mtime, cached_size = cached_stats
# Check if file is unchanged
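                # Size must match exactly; mtime may drift < 1s to tolerate
                # filesystems with coarse timestamp precision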
if abs(current_mtime - cached_mtime) < 1.0 and current_size == cached_size:
# Use cached data
cached_recipe = persisted_by_path.get(file_path)
if cached_recipe:
recipe_id = str(cached_recipe.get('id', ''))
# Track folder from file path
cached_recipe['folder'] = cached_recipe.get('folder') or self._calculate_folder(file_path)
recipes.append(cached_recipe)
json_paths[recipe_id] = file_path
continue
# File is new or changed - need to re-read
changed = True
recipe_data = self._load_recipe_file_sync(file_path)
if recipe_data:
recipe_id = str(recipe_data.get('id', ''))
recipes.append(recipe_data)
json_paths[recipe_id] = file_path
# Check for deleted files
for json_path in persisted.file_stats.keys():
if json_path not in current_files:
changed = True
logger.debug("Recipe file deleted: %s", json_path)
return recipes, changed, json_paths
def _full_directory_scan_sync(self, recipes_dir: str) -> Tuple[List[Dict], Dict[str, str]]:
"""Perform a full synchronous directory scan for recipes.
Args:
recipes_dir: Path to the recipes directory.
Returns:
Tuple of (recipes list, json_paths dict).
"""
recipes: List[Dict] = []
json_paths: Dict[str, str] = {}
# Get all recipe JSON files
recipe_files = []
for root, _, files in os.walk(recipes_dir):
for file in files:
if file.lower().endswith('.recipe.json'):
recipe_files.append(os.path.join(root, file))
# Process each recipe file
for recipe_path in recipe_files:
recipe_data = self._load_recipe_file_sync(recipe_path)
if recipe_data:
recipe_id = str(recipe_data.get('id', ''))
recipes.append(recipe_data)
json_paths[recipe_id] = recipe_path
return recipes, json_paths
def _load_recipe_file_sync(self, recipe_path: str) -> Optional[Dict]:
"""Load a single recipe file synchronously.
Args:
recipe_path: Path to the recipe JSON file.
Returns:
Recipe dictionary if valid, None otherwise.
"""
try:
with open(recipe_path, 'r', encoding='utf-8') as f:
recipe_data = json.load(f)
# Validate recipe data
if not recipe_data or not isinstance(recipe_data, dict):
logger.warning(f"Invalid recipe data in {recipe_path}")
return None
# Ensure required fields exist
required_fields = ['id', 'file_path', 'title']
if not all(field in recipe_data for field in required_fields):
logger.warning(f"Missing required fields in {recipe_path}")
return None
# Ensure the image file exists and prioritize local siblings
image_path = recipe_data.get('file_path')
path_updated = False
if image_path:
recipe_dir = os.path.dirname(recipe_path)
image_filename = os.path.basename(image_path)
local_sibling_path = os.path.normpath(os.path.join(recipe_dir, image_filename))
# If local sibling exists and stored path is different, prefer local
if os.path.exists(local_sibling_path) and os.path.normpath(image_path) != local_sibling_path:
recipe_data['file_path'] = local_sibling_path
path_updated = True
logger.info(f"Updated recipe image path to local sibling: {local_sibling_path}")
elif not os.path.exists(image_path):
logger.warning(f"Recipe image not found and no local sibling: {image_path}")
if path_updated:
try:
with open(recipe_path, 'w', encoding='utf-8') as f:
json.dump(recipe_data, f, indent=4, ensure_ascii=False)
except Exception as e:
logger.warning(f"Failed to persist repair for {recipe_path}: {e}")
# Track folder placement relative to recipes directory
recipe_data['folder'] = recipe_data.get('folder') or self._calculate_folder(recipe_path)
# Ensure loras array exists
if 'loras' not in recipe_data:
recipe_data['loras'] = []
# Ensure gen_params exists
if 'gen_params' not in recipe_data:
recipe_data['gen_params'] = {}
return recipe_data
except Exception as e:
logger.error(f"Error loading recipe file {recipe_path}: {e}")
import traceback
traceback.print_exc(file=sys.stderr)
return None
def _sort_cache_sync(self) -> None:
"""Sort cache data synchronously."""
try:
# Sort by name
self._cache.sorted_by_name = natsorted(
self._cache.raw_data,
key=lambda x: x.get('title', '').lower()
)
# Sort by date (modified or created)
self._cache.sorted_by_date = sorted(
self._cache.raw_data,
key=lambda x: (x.get('modified', x.get('created_date', 0)), x.get('file_path', '')),
reverse=True
)
except Exception as e:
logger.error(f"Error sorting recipe cache: {e}")
async def _wait_for_lora_scanner(self) -> None:
"""Ensure the LoRA scanner has initialized before recipe enrichment."""
@@ -570,7 +735,10 @@ class RecipeScanner:
self._post_scan_task = loop.create_task(_run_enrichment(), name="recipe_cache_enrichment")
def _schedule_fts_index_build(self) -> None:
"""Build FTS index in background without blocking."""
"""Build FTS index in background without blocking.
Validates existing index first and reuses it if valid.
"""
if self._fts_index_task and not self._fts_index_task.done():
return # Already running
@@ -587,7 +755,25 @@ class RecipeScanner:
try:
self._fts_index = RecipeFTSIndex()
# Run in thread pool (SQLite is blocking)
# Check if existing index is valid
recipe_ids = {str(r.get('id', '')) for r in self._cache.raw_data if r.get('id')}
recipe_count = len(self._cache.raw_data)
# Run validation in thread pool
is_valid = await loop.run_in_executor(
None,
self._fts_index.validate_index,
recipe_count,
recipe_ids
)
if is_valid:
logger.info("FTS index validated, reusing existing index with %d recipes", recipe_count)
self._fts_index._ready.set()
return
# Only rebuild if validation fails
logger.info("FTS index invalid or outdated, rebuilding...")
await loop.run_in_executor(
None,
self._fts_index.build_index,
@@ -875,6 +1061,12 @@ class RecipeScanner:
# Update FTS index
self._update_fts_index_for_recipe(recipe_data, 'add')
# Persist to SQLite cache
if self._persistent_cache:
recipe_id = str(recipe_data.get('id', ''))
json_path = self._json_path_map.get(recipe_id, '')
self._persistent_cache.update_recipe(recipe_data, json_path)
async def remove_recipe(self, recipe_id: str) -> bool:
"""Remove a recipe from the cache by ID."""
@@ -891,6 +1083,12 @@ class RecipeScanner:
# Update FTS index
self._update_fts_index_for_recipe(recipe_id, 'remove')
# Remove from SQLite cache
if self._persistent_cache:
self._persistent_cache.remove_recipe(recipe_id)
self._json_path_map.pop(recipe_id, None)
return True
async def bulk_remove(self, recipe_ids: Iterable[str]) -> int:
@@ -900,9 +1098,13 @@ class RecipeScanner:
removed = await cache.bulk_remove(recipe_ids, resort=False)
if removed:
self._schedule_resort()
# Update FTS index for each removed recipe
for recipe_id in (str(r.get('id', '')) for r in removed):
# Update FTS index and persistent cache for each removed recipe
for recipe in removed:
recipe_id = str(recipe.get('id', ''))
self._update_fts_index_for_recipe(recipe_id, 'remove')
if self._persistent_cache:
self._persistent_cache.remove_recipe(recipe_id)
self._json_path_map.pop(recipe_id, None)
return len(removed)
async def scan_all_recipes(self) -> List[Dict]:
@@ -1695,11 +1897,11 @@ class RecipeScanner:
async def update_recipe_metadata(self, recipe_id: str, metadata: dict) -> bool:
"""Update recipe metadata (like title and tags) in both file system and cache
Args:
recipe_id: The ID of the recipe to update
metadata: Dictionary containing metadata fields to update (title, tags, etc.)
Returns:
bool: True if successful, False otherwise
"""
@@ -1707,16 +1909,16 @@ class RecipeScanner:
recipe_json_path = await self.get_recipe_json_path(recipe_id)
if not recipe_json_path or not os.path.exists(recipe_json_path):
return False
try:
# Load existing recipe data
with open(recipe_json_path, 'r', encoding='utf-8') as f:
recipe_data = json.load(f)
# Update fields
for key, value in metadata.items():
recipe_data[key] = value
# Save updated recipe
with open(recipe_json_path, 'w', encoding='utf-8') as f:
json.dump(recipe_data, f, indent=4, ensure_ascii=False)
@@ -1729,6 +1931,11 @@ class RecipeScanner:
# Update FTS index
self._update_fts_index_for_recipe(recipe_data, 'update')
# Update persistent SQLite cache
if self._persistent_cache:
self._persistent_cache.update_recipe(recipe_data, recipe_json_path)
self._json_path_map[recipe_id] = recipe_json_path
# If the recipe has an image, update its EXIF metadata
from ..utils.exif_utils import ExifUtils
image_path = recipe_data.get('file_path')
@@ -1800,6 +2007,11 @@ class RecipeScanner:
# Update FTS index
self._update_fts_index_for_recipe(recipe_data, 'update')
# Update persistent SQLite cache
if self._persistent_cache:
self._persistent_cache.update_recipe(recipe_data, recipe_json_path)
self._json_path_map[recipe_id] = recipe_json_path
updated_lora = dict(lora_entry)
if target_lora is not None:
preview_url = target_lora.get('preview_url')
@@ -1923,26 +2135,31 @@ class RecipeScanner:
if not recipes_to_update:
return 0, 0
# Persist changes to disk
# Persist changes to disk and SQLite cache
async with self._mutation_lock:
for recipe in recipes_to_update:
recipe_id = recipe.get('id')
recipe_id = str(recipe.get('id', ''))
if not recipe_id:
continue
recipe_path = os.path.join(self.recipes_dir, f"{recipe_id}.recipe.json")
try:
self._write_recipe_file(recipe_path, recipe)
file_updated_count += 1
logger.info(f"Updated file_name in recipe {recipe_path}: -> {new_file_name}")
# Update persistent SQLite cache
if self._persistent_cache:
self._persistent_cache.update_recipe(recipe, recipe_path)
self._json_path_map[recipe_id] = recipe_path
except Exception as e:
logger.error(f"Error updating recipe file {recipe_path}: {e}")
# We don't necessarily need to resort because LoRA file_name isn't a sort key,
# but we might want to schedule a resort if we're paranoid or if searching relies on sorted state.
# Given it's a rename of a dependency, search results might change if searching by LoRA name.
self._schedule_resort()
return file_updated_count, cache_updated_count
async def find_recipes_by_fingerprint(self, fingerprint: str) -> list: