refactor(cache): reorganize cache directory structure with automatic legacy cleanup

- Centralize cache path resolution in new py/utils/cache_paths.py module
- Migrate legacy cache files to organized structure: {settings_dir}/cache/{model|recipe|fts|symlink}/
- Automatically clean up legacy files after successful migration with integrity verification
- Update Config symlink cache to use new path and migrate from old location
- Simplify service classes (PersistentModelCache, PersistentRecipeCache, RecipeFTSIndex, TagFTSIndex) to use centralized migration logic
- Add comprehensive test coverage for cache paths and automatic cleanup
This commit is contained in:
Will Miao
2026-01-26 16:12:08 +08:00
parent e14afde4b3
commit 7b0c6c8bab
9 changed files with 1108 additions and 74 deletions

421
py/utils/cache_paths.py Normal file
View File

@@ -0,0 +1,421 @@
"""Centralized cache path resolution with automatic migration support.
This module provides a unified interface for resolving cache file paths,
with automatic migration from legacy locations to the new organized
cache directory structure.
Target structure:
{settings_dir}/
└── cache/
├── symlink/
│ └── symlink_map.json
├── model/
│ └── {library_name}.sqlite
├── recipe/
│ └── {library_name}.sqlite
└── fts/
├── recipe_fts.sqlite
└── tag_fts.sqlite
"""
from __future__ import annotations
import logging
import os
import re
import shutil
from enum import Enum
from typing import List, Optional
from .settings_paths import get_project_root, get_settings_dir
logger = logging.getLogger(__name__)
class CacheType(Enum):
"""Types of cache files managed by the cache path resolver."""
MODEL = "model"
RECIPE = "recipe"
RECIPE_FTS = "recipe_fts"
TAG_FTS = "tag_fts"
SYMLINK = "symlink"
# Subdirectory structure for each cache type
_CACHE_SUBDIRS = {
CacheType.MODEL: "model",
CacheType.RECIPE: "recipe",
CacheType.RECIPE_FTS: "fts",
CacheType.TAG_FTS: "fts",
CacheType.SYMLINK: "symlink",
}
# Filename patterns for each cache type
_CACHE_FILENAMES = {
CacheType.MODEL: "{library_name}.sqlite",
CacheType.RECIPE: "{library_name}.sqlite",
CacheType.RECIPE_FTS: "recipe_fts.sqlite",
CacheType.TAG_FTS: "tag_fts.sqlite",
CacheType.SYMLINK: "symlink_map.json",
}
def get_cache_base_dir(create: bool = True) -> str:
"""Return the base cache directory path.
Args:
create: Whether to create the directory if it does not exist.
Returns:
The absolute path to the cache base directory ({settings_dir}/cache/).
"""
settings_dir = get_settings_dir(create=create)
cache_dir = os.path.join(settings_dir, "cache")
if create:
os.makedirs(cache_dir, exist_ok=True)
return cache_dir
def _sanitize_library_name(library_name: Optional[str]) -> str:
"""Sanitize a library name for use in filenames.
Args:
library_name: The library name to sanitize.
Returns:
A sanitized version safe for use in filenames.
"""
name = library_name or "default"
return re.sub(r"[^A-Za-z0-9_.-]", "_", name)
def get_cache_file_path(
cache_type: CacheType,
library_name: Optional[str] = None,
create_dir: bool = True,
) -> str:
"""Get the canonical path for a cache file.
Args:
cache_type: The type of cache file.
library_name: The library name (only used for MODEL and RECIPE types).
create_dir: Whether to create the parent directory if it does not exist.
Returns:
The absolute path to the cache file in its canonical location.
"""
cache_base = get_cache_base_dir(create=create_dir)
subdir = _CACHE_SUBDIRS[cache_type]
cache_dir = os.path.join(cache_base, subdir)
if create_dir:
os.makedirs(cache_dir, exist_ok=True)
filename_template = _CACHE_FILENAMES[cache_type]
safe_name = _sanitize_library_name(library_name)
filename = filename_template.format(library_name=safe_name)
return os.path.join(cache_dir, filename)
def get_legacy_cache_paths(
cache_type: CacheType,
library_name: Optional[str] = None,
) -> List[str]:
"""Get a list of legacy cache file paths to check for migration.
The paths are returned in order of priority (most recent first).
Args:
cache_type: The type of cache file.
library_name: The library name (only used for MODEL and RECIPE types).
Returns:
A list of potential legacy paths to check, in order of preference.
"""
try:
settings_dir = get_settings_dir(create=False)
except Exception:
settings_dir = get_project_root()
safe_name = _sanitize_library_name(library_name)
legacy_paths: List[str] = []
if cache_type == CacheType.MODEL:
# Legacy per-library path: {settings_dir}/model_cache/{library}.sqlite
legacy_paths.append(
os.path.join(settings_dir, "model_cache", f"{safe_name}.sqlite")
)
# Legacy root-level single cache (for "default" library only)
if safe_name.lower() in ("default", ""):
legacy_paths.append(os.path.join(settings_dir, "model_cache.sqlite"))
elif cache_type == CacheType.RECIPE:
# Legacy per-library path: {settings_dir}/recipe_cache/{library}.sqlite
legacy_paths.append(
os.path.join(settings_dir, "recipe_cache", f"{safe_name}.sqlite")
)
# Legacy root-level single cache (for "default" library only)
if safe_name.lower() in ("default", ""):
legacy_paths.append(os.path.join(settings_dir, "recipe_cache.sqlite"))
elif cache_type == CacheType.RECIPE_FTS:
# Legacy root-level path
legacy_paths.append(os.path.join(settings_dir, "recipe_fts.sqlite"))
elif cache_type == CacheType.TAG_FTS:
# Legacy root-level path
legacy_paths.append(os.path.join(settings_dir, "tag_fts.sqlite"))
elif cache_type == CacheType.SYMLINK:
# Current location in cache/ but without subdirectory
legacy_paths.append(
os.path.join(settings_dir, "cache", "symlink_map.json")
)
return legacy_paths
def _cleanup_legacy_file_after_migration(
legacy_path: str,
canonical_path: str,
) -> bool:
"""Safely remove a legacy file after successful migration.
Args:
legacy_path: The legacy file path to remove.
canonical_path: The canonical path where the file was copied to.
Returns:
True if cleanup succeeded, False otherwise.
"""
try:
if not os.path.exists(canonical_path):
logger.warning(
"Skipping cleanup of %s: canonical file not found at %s",
legacy_path,
canonical_path,
)
return False
legacy_size = os.path.getsize(legacy_path)
canonical_size = os.path.getsize(canonical_path)
if legacy_size != canonical_size:
logger.warning(
"Skipping cleanup of %s: file size mismatch (legacy=%d, canonical=%d)",
legacy_path,
legacy_size,
canonical_size,
)
return False
os.remove(legacy_path)
logger.info("Cleaned up legacy cache file: %s", legacy_path)
_cleanup_empty_legacy_directories(legacy_path)
return True
except Exception as exc:
logger.warning(
"Failed to cleanup legacy cache file %s: %s",
legacy_path,
exc,
)
return False
def _cleanup_empty_legacy_directories(legacy_path: str) -> None:
"""Remove empty parent directories of a legacy file.
This function only removes directories if they are empty,
using os.rmdir() which fails on non-empty directories.
Args:
legacy_path: The legacy file path whose parent directories should be cleaned.
"""
try:
parent_dir = os.path.dirname(legacy_path)
legacy_dir_names = ("model_cache", "recipe_cache")
current = parent_dir
while current:
base_name = os.path.basename(current)
if base_name in legacy_dir_names:
if os.path.isdir(current) and not os.listdir(current):
try:
os.rmdir(current)
logger.info("Removed empty legacy directory: %s", current)
except Exception:
pass
parent = os.path.dirname(current)
if parent == current:
break
current = parent
except Exception as exc:
logger.debug("Failed to cleanup empty legacy directories: %s", exc)
def resolve_cache_path_with_migration(
cache_type: CacheType,
library_name: Optional[str] = None,
env_override: Optional[str] = None,
) -> str:
"""Resolve the cache file path, migrating from legacy locations if needed.
This function performs lazy migration: on first access, it checks if the
file exists at the canonical location. If not, it looks for legacy files
and copies them to the new location. After successful migration, the
legacy file is automatically removed.
Args:
cache_type: The type of cache file.
library_name: The library name (only used for MODEL and RECIPE types).
env_override: Optional environment variable value that overrides all
path resolution. When set, returns this path directly without
any migration.
Returns:
The resolved path to use for the cache file.
"""
# Environment override bypasses all migration logic
if env_override:
return env_override
canonical_path = get_cache_file_path(cache_type, library_name, create_dir=True)
# If file already exists at canonical location, use it
if os.path.exists(canonical_path):
return canonical_path
# Check legacy paths for migration
legacy_paths = get_legacy_cache_paths(cache_type, library_name)
for legacy_path in legacy_paths:
if os.path.exists(legacy_path):
try:
shutil.copy2(legacy_path, canonical_path)
logger.info(
"Migrated %s cache from %s to %s",
cache_type.value,
legacy_path,
canonical_path,
)
_cleanup_legacy_file_after_migration(legacy_path, canonical_path)
return canonical_path
except Exception as exc:
logger.warning(
"Failed to migrate %s cache from %s: %s",
cache_type.value,
legacy_path,
exc,
)
# No legacy file found; return canonical path (will be created fresh)
return canonical_path
def get_legacy_cache_files_for_cleanup() -> List[str]:
"""Get a list of legacy cache files that can be removed after migration.
This function returns files that exist in legacy locations and have
corresponding files in the new canonical locations.
Returns:
A list of legacy file paths that are safe to remove.
"""
files_to_remove: List[str] = []
try:
settings_dir = get_settings_dir(create=False)
except Exception:
return files_to_remove
# Check each cache type for migrated legacy files
for cache_type in CacheType:
# For MODEL and RECIPE, we need to check each library
if cache_type in (CacheType.MODEL, CacheType.RECIPE):
# Check default library
_check_legacy_for_cleanup(cache_type, "default", files_to_remove)
# Check for any per-library caches in legacy directories
legacy_dir_name = "model_cache" if cache_type == CacheType.MODEL else "recipe_cache"
legacy_dir = os.path.join(settings_dir, legacy_dir_name)
if os.path.isdir(legacy_dir):
try:
for filename in os.listdir(legacy_dir):
if filename.endswith(".sqlite"):
library_name = filename[:-7] # Remove .sqlite
_check_legacy_for_cleanup(cache_type, library_name, files_to_remove)
except Exception:
pass
else:
_check_legacy_for_cleanup(cache_type, None, files_to_remove)
return files_to_remove
def _check_legacy_for_cleanup(
cache_type: CacheType,
library_name: Optional[str],
files_to_remove: List[str],
) -> None:
"""Check if a legacy cache file can be removed after migration.
Args:
cache_type: The type of cache file.
library_name: The library name (only used for MODEL and RECIPE types).
files_to_remove: List to append removable files to.
"""
canonical_path = get_cache_file_path(cache_type, library_name, create_dir=False)
if not os.path.exists(canonical_path):
return
legacy_paths = get_legacy_cache_paths(cache_type, library_name)
for legacy_path in legacy_paths:
if os.path.exists(legacy_path) and legacy_path not in files_to_remove:
files_to_remove.append(legacy_path)
def cleanup_legacy_cache_files(dry_run: bool = True) -> List[str]:
"""Remove legacy cache files that have been migrated.
Args:
dry_run: If True, only return the list of files that would be removed
without actually removing them.
Returns:
A list of files that were (or would be) removed.
"""
files = get_legacy_cache_files_for_cleanup()
if dry_run or not files:
return files
removed: List[str] = []
for file_path in files:
try:
os.remove(file_path)
removed.append(file_path)
logger.info("Removed legacy cache file: %s", file_path)
except Exception as exc:
logger.warning("Failed to remove legacy cache file %s: %s", file_path, exc)
# Try to remove empty legacy directories
try:
settings_dir = get_settings_dir(create=False)
for legacy_dir_name in ("model_cache", "recipe_cache"):
legacy_dir = os.path.join(settings_dir, legacy_dir_name)
if os.path.isdir(legacy_dir) and not os.listdir(legacy_dir):
os.rmdir(legacy_dir)
logger.info("Removed empty legacy directory: %s", legacy_dir)
except Exception:
pass
return removed