feat(search): add SQLite FTS5 full-text search index for recipes

Introduce a new RecipeFTSIndex class that provides fast prefix-based search across recipe fields (title, tags, LoRA names/models, prompts) using SQLite's FTS5 extension. The implementation supports sub-100ms search times for large datasets (20k+ recipes) and includes asynchronous indexing, incremental updates, and comprehensive unit tests.
This commit is contained in:
Will Miao
2026-01-18 20:44:22 +08:00
parent 7a7517cfb6
commit 7f2e8a0afb
3 changed files with 1153 additions and 37 deletions

View File

@@ -0,0 +1,547 @@
"""SQLite FTS5-based full-text search index for recipes.
This module provides fast recipe search using SQLite's FTS5 extension,
enabling sub-100ms search times even with 20k+ recipes.
"""
from __future__ import annotations
import asyncio
import logging
import os
import re
import sqlite3
import threading
import time
from typing import Any, Dict, List, Optional, Set
from ..utils.settings_paths import get_settings_dir
logger = logging.getLogger(__name__)


class RecipeFTSIndex:
    """SQLite FTS5-based full-text search index for recipes.

    Provides prefix-based search across multiple recipe fields:
    - title
    - tags
    - lora_names (file names)
    - lora_models (model names)
    - prompt
    - negative_prompt

    Every database access made through this object is serialised by a single
    ``threading.Lock``; each operation opens and closes its own connection.
    """

    _DEFAULT_FILENAME = "recipe_fts.sqlite"

    # Number of rows sent to SQLite per executemany() call during a rebuild.
    _BATCH_SIZE = 500

    # Map of search option keys to FTS column names
    FIELD_MAP = {
        'title': ['title'],
        'tags': ['tags'],
        'lora_name': ['lora_names'],
        'lora_model': ['lora_models'],
        'prompt': ['prompt', 'negative_prompt'],
    }

    # Characters with a syntactic meaning in FTS5 queries; each is replaced by
    # a space before the query is assembled.
    _FTS_SPECIAL_TABLE = str.maketrans({char: ' ' for char in '"()*:^-{}[]'})

    _INSERT_SQL = (
        "INSERT INTO recipe_fts (recipe_id, title, tags, lora_names, "
        "lora_models, prompt, negative_prompt) VALUES (?, ?, ?, ?, ?, ?, ?)"
    )
    _INSERT_WITH_ROWID_SQL = (
        "INSERT INTO recipe_fts (rowid, recipe_id, title, tags, lora_names, "
        "lora_models, prompt, negative_prompt) VALUES (?, ?, ?, ?, ?, ?, ?, ?)"
    )
    _UPSERT_MAPPING_SQL = (
        "INSERT OR REPLACE INTO recipe_rowid (recipe_id, fts_rowid) VALUES (?, ?)"
    )
    _UPSERT_METADATA_SQL = (
        "INSERT OR REPLACE INTO fts_metadata (key, value) VALUES (?, ?)"
    )

    def __init__(self, db_path: Optional[str] = None) -> None:
        """Initialize the FTS index.

        Args:
            db_path: Optional path to the SQLite database file.
                If not provided, uses the default location in settings directory.
        """
        self._db_path = db_path or self._resolve_default_path()
        self._lock = threading.Lock()
        self._ready = threading.Event()
        self._indexing_in_progress = False
        self._schema_initialized = False
        self._warned_not_ready = False

        # Computed outside the try block so the warning below can always
        # reference it.
        directory = os.path.dirname(self._db_path)
        try:
            if directory:
                os.makedirs(directory, exist_ok=True)
        except Exception as exc:
            # Best effort: a later connect() will surface the real problem.
            logger.warning("Could not create FTS index directory %s: %s", directory, exc)

    def _resolve_default_path(self) -> str:
        """Resolve the default database path.

        The ``LORA_MANAGER_RECIPE_FTS_DB`` environment variable wins when set;
        otherwise the file lives in the settings directory, falling back to the
        current directory when that cannot be resolved.
        """
        override = os.environ.get("LORA_MANAGER_RECIPE_FTS_DB")
        if override:
            return override
        try:
            settings_dir = get_settings_dir(create=True)
        except Exception as exc:
            logger.warning("Falling back to current directory for FTS index: %s", exc)
            settings_dir = "."
        return os.path.join(settings_dir, self._DEFAULT_FILENAME)

    def get_database_path(self) -> str:
        """Return the resolved database path."""
        return self._db_path

    def is_ready(self) -> bool:
        """Check if the FTS index is ready for queries."""
        return self._ready.is_set()

    def is_indexing(self) -> bool:
        """Check if indexing is currently in progress."""
        return self._indexing_in_progress

    def initialize(self) -> None:
        """Initialize the database schema.

        Safe to call repeatedly; the schema is only created once per instance.
        Failures are logged and leave the instance un-initialized.
        """
        if self._schema_initialized:
            return
        with self._lock:
            # Re-check under the lock: another thread may have finished first.
            if self._schema_initialized:
                return
            try:
                conn = self._connect()
                try:
                    conn.execute("PRAGMA journal_mode=WAL")
                    conn.executescript("""
                        -- FTS5 virtual table for full-text search
                        -- Note: We use a regular FTS5 table (not contentless) so we can retrieve recipe_id
                        CREATE VIRTUAL TABLE IF NOT EXISTS recipe_fts USING fts5(
                            recipe_id,
                            title,
                            tags,
                            lora_names,
                            lora_models,
                            prompt,
                            negative_prompt,
                            tokenize='unicode61 remove_diacritics 2'
                        );
                        -- Recipe ID to rowid mapping for fast lookups and deletions
                        CREATE TABLE IF NOT EXISTS recipe_rowid (
                            recipe_id TEXT PRIMARY KEY,
                            fts_rowid INTEGER UNIQUE
                        );
                        -- Index version tracking
                        CREATE TABLE IF NOT EXISTS fts_metadata (
                            key TEXT PRIMARY KEY,
                            value TEXT
                        );
                    """)
                    conn.commit()
                    self._schema_initialized = True
                    logger.debug("FTS index schema initialized at %s", self._db_path)
                finally:
                    conn.close()
            except Exception as exc:
                logger.error("Failed to initialize FTS schema: %s", exc)

    def build_index(self, recipes: List[Dict[str, Any]]) -> None:
        """Build or rebuild the entire FTS index from recipe data.

        The index is marked not-ready for the duration of the rebuild and is
        only marked ready again when the rebuild commits successfully.

        Args:
            recipes: List of recipe dictionaries to index.
        """
        if self._indexing_in_progress:
            logger.warning("FTS indexing already in progress, skipping")
            return
        self._indexing_in_progress = True
        self._ready.clear()
        start_time = time.time()
        try:
            self.initialize()
            if not self._schema_initialized:
                logger.error("Cannot build FTS index: schema not initialized")
                return
            with self._lock:
                conn = self._connect()
                try:
                    inserted = self._rebuild_locked(conn, recipes)
                finally:
                    conn.close()
            elapsed = time.time() - start_time
            logger.info("FTS index built: %d recipes indexed in %.2fs", inserted, elapsed)
            self._ready.set()
        except Exception as exc:
            logger.error("Failed to build FTS index: %s", exc, exc_info=True)
        finally:
            self._indexing_in_progress = False

    def _rebuild_locked(self, conn: sqlite3.Connection, recipes: List[Dict[str, Any]]) -> int:
        """Replace the whole index content in one transaction. Caller must hold the lock.

        Rows are inserted with explicit, sequential rowids so the
        recipe_id -> rowid mapping is known up front. Looking the rowid up
        afterwards with ``WHERE recipe_id = ?`` would scan the FTS table once
        per recipe.

        Returns:
            Number of recipes written to the index.
        """
        conn.execute("BEGIN")
        try:
            # Clear existing data
            conn.execute("DELETE FROM recipe_fts")
            conn.execute("DELETE FROM recipe_rowid")

            seen_ids: Set[str] = set()
            fts_rows: List[tuple] = []
            mappings: List[tuple] = []

            def flush() -> None:
                if fts_rows:
                    conn.executemany(self._INSERT_WITH_ROWID_SQL, fts_rows)
                    conn.executemany(self._UPSERT_MAPPING_SQL, mappings)
                    fts_rows.clear()
                    mappings.clear()

            for recipe in recipes:
                recipe_id = str(recipe.get('id', ''))
                # Skip recipes without an ID and repeated IDs (first one wins),
                # keeping exactly one FTS row per mapped recipe.
                if not recipe_id or recipe_id in seen_ids:
                    continue
                seen_ids.add(recipe_id)
                fts_rowid = len(seen_ids)
                fts_rows.append((fts_rowid,) + self._prepare_fts_row(recipe))
                mappings.append((recipe_id, fts_rowid))
                if len(fts_rows) >= self._BATCH_SIZE:
                    flush()
            flush()

            # Update metadata
            conn.execute(self._UPSERT_METADATA_SQL, ('last_build_time', str(time.time())))
            conn.execute(self._UPSERT_METADATA_SQL, ('recipe_count', str(len(seen_ids))))
            conn.commit()
        except Exception:
            conn.rollback()
            raise
        return len(seen_ids)

    def search(self, query: str, fields: Optional[Set[str]] = None) -> Set[str]:
        """Search recipes using FTS5 with prefix matching.

        Args:
            query: The search query string.
            fields: Optional set of field names to search. If None, searches all fields.
                Valid fields: 'title', 'tags', 'lora_name', 'lora_model', 'prompt'

        Returns:
            Set of matching recipe IDs. Empty when the index is not ready, the
            query has no searchable terms, or the lookup fails.
        """
        if not self.is_ready():
            if not self._warned_not_ready:
                logger.debug("FTS index not ready, returning empty results")
                self._warned_not_ready = True
            return set()
        if not query or not query.strip():
            return set()
        fts_query = self._build_fts_query(query, fields)
        if not fts_query:
            return set()
        try:
            with self._lock:
                conn = self._connect(readonly=True)
                try:
                    cursor = conn.execute(
                        "SELECT recipe_id FROM recipe_fts WHERE recipe_fts MATCH ?",
                        (fts_query,)
                    )
                    return {row[0] for row in cursor.fetchall()}
                finally:
                    conn.close()
        except Exception as exc:
            logger.debug("FTS search error for query '%s': %s", query, exc)
            return set()

    def add_recipe(self, recipe: Dict[str, Any]) -> bool:
        """Add a single recipe to the FTS index, replacing any existing entry.

        Args:
            recipe: The recipe dictionary to add.

        Returns:
            True if successful, False otherwise.
        """
        if not self.is_ready():
            return False
        recipe_id = str(recipe.get('id', ''))
        if not recipe_id:
            return False
        try:
            with self._lock:
                conn = self._connect()
                try:
                    # Remove existing entry if present
                    self._remove_recipe_locked(conn, recipe_id)
                    # Insert new entry
                    cursor = conn.execute(self._INSERT_SQL, self._prepare_fts_row(recipe))
                    # The insert reports the new rowid directly; only fall back
                    # to a (full-scan) lookup if it is unavailable.
                    fts_rowid = cursor.lastrowid
                    if not fts_rowid:
                        found = conn.execute(
                            "SELECT rowid FROM recipe_fts WHERE recipe_id = ?",
                            (recipe_id,)
                        ).fetchone()
                        fts_rowid = found[0] if found else None
                    # Update rowid mapping
                    if fts_rowid is not None:
                        conn.execute(self._UPSERT_MAPPING_SQL, (recipe_id, fts_rowid))
                    conn.commit()
                    return True
                finally:
                    conn.close()
        except Exception as exc:
            logger.debug("Failed to add recipe %s to FTS index: %s", recipe_id, exc)
            return False

    def remove_recipe(self, recipe_id: str) -> bool:
        """Remove a recipe from the FTS index.

        Args:
            recipe_id: The ID of the recipe to remove.

        Returns:
            True if successful, False otherwise.
        """
        if not self.is_ready():
            return False
        if not recipe_id:
            return False
        try:
            with self._lock:
                conn = self._connect()
                try:
                    self._remove_recipe_locked(conn, recipe_id)
                    conn.commit()
                    return True
                finally:
                    conn.close()
        except Exception as exc:
            logger.debug("Failed to remove recipe %s from FTS index: %s", recipe_id, exc)
            return False

    def update_recipe(self, recipe: Dict[str, Any]) -> bool:
        """Update a recipe in the FTS index.

        Args:
            recipe: The updated recipe dictionary.

        Returns:
            True if successful, False otherwise.
        """
        return self.add_recipe(recipe)  # add_recipe handles removal and re-insertion

    def clear(self) -> bool:
        """Clear all data from the FTS index and mark it not ready.

        Returns:
            True if successful, False otherwise.
        """
        try:
            with self._lock:
                conn = self._connect()
                try:
                    conn.execute("DELETE FROM recipe_fts")
                    conn.execute("DELETE FROM recipe_rowid")
                    conn.commit()
                    self._ready.clear()
                    return True
                finally:
                    conn.close()
        except Exception as exc:
            logger.error("Failed to clear FTS index: %s", exc)
            return False

    def get_indexed_count(self) -> int:
        """Return the number of recipes currently indexed (0 on any failure)."""
        if not self._schema_initialized:
            return 0
        try:
            with self._lock:
                conn = self._connect(readonly=True)
                try:
                    cursor = conn.execute("SELECT COUNT(*) FROM recipe_fts")
                    result = cursor.fetchone()
                    return result[0] if result else 0
                finally:
                    conn.close()
        except Exception:
            return 0

    # Internal helpers

    def _connect(self, readonly: bool = False) -> sqlite3.Connection:
        """Create a database connection.

        Args:
            readonly: Open the database with ``mode=ro``.

        Raises:
            FileNotFoundError: If ``readonly`` is set and the file does not exist.
        """
        if readonly:
            if not os.path.exists(self._db_path):
                raise FileNotFoundError(self._db_path)
            # Build a properly percent-encoded file: URI. Interpolating the raw
            # path breaks on characters such as '#', '?' or '%'.
            from pathlib import Path

            target = Path(os.path.abspath(self._db_path)).as_uri() + "?mode=ro"
            conn = sqlite3.connect(target, check_same_thread=False, uri=True)
        else:
            conn = sqlite3.connect(self._db_path, check_same_thread=False)
        conn.row_factory = sqlite3.Row
        return conn

    def _remove_recipe_locked(self, conn: sqlite3.Connection, recipe_id: str) -> None:
        """Remove a recipe entry. Caller must hold the lock."""
        # Get the rowid for deletion
        cursor = conn.execute(
            "SELECT fts_rowid FROM recipe_rowid WHERE recipe_id = ?",
            (recipe_id,)
        )
        result = cursor.fetchone()
        if result:
            # Delete from FTS using rowid
            conn.execute("DELETE FROM recipe_fts WHERE rowid = ?", (result[0],))
        # Also try direct delete by recipe_id (handles edge cases)
        conn.execute("DELETE FROM recipe_fts WHERE recipe_id = ?", (recipe_id,))
        conn.execute("DELETE FROM recipe_rowid WHERE recipe_id = ?", (recipe_id,))

    def _prepare_fts_row(self, recipe: Dict[str, Any]) -> tuple:
        """Prepare a row tuple for FTS insertion.

        Missing or ``None`` values become empty strings so the literal text
        "None" is never indexed, and a recipe with ``loras=None`` does not
        abort the caller.

        Returns:
            ``(recipe_id, title, tags, lora_names, lora_models, prompt,
            negative_prompt)``, all strings.
        """
        def text_of(value: Any) -> str:
            return '' if value is None else str(value)

        recipe_id = str(recipe.get('id', ''))
        title = text_of(recipe.get('title'))

        # Extract tags as space-separated string
        tags_list = recipe.get('tags') or []
        tags = ' '.join(str(tag) for tag in tags_list if tag)

        # Extract LoRA file names and model names
        lora_names = []
        lora_models = []
        for lora in recipe.get('loras') or []:
            if not isinstance(lora, dict):
                continue
            file_name = lora.get('file_name', '')
            if file_name:
                lora_names.append(str(file_name))
            model_name = lora.get('modelName', '')
            if model_name:
                lora_models.append(str(model_name))

        # Extract prompts from gen_params
        gen_params = recipe.get('gen_params')
        if not isinstance(gen_params, dict):
            gen_params = {}
        prompt = text_of(gen_params.get('prompt'))
        negative_prompt = text_of(gen_params.get('negative_prompt'))

        return (
            recipe_id,
            title,
            tags,
            ' '.join(lora_names),
            ' '.join(lora_models),
            prompt,
            negative_prompt,
        )

    def _build_fts_query(self, query: str, fields: Optional[Set[str]] = None) -> str:
        """Build an FTS5 query string with prefix matching and field restrictions.

        Every term must match (AND) in at least one of the selected columns.
        The ``recipe_id`` column is never searched.

        Args:
            query: The user's search query.
            fields: Optional set of field names to restrict search to. When it
                is empty or contains no known field, all searchable columns
                are used.

        Returns:
            FTS5 query string, or '' when the query has no searchable term.
        """
        terms = []
        for word in query.lower().split():
            # Escaping may turn one word into several ("foo-bar" -> "foo bar");
            # each piece becomes its own prefix term.
            for piece in self._escape_fts_query(word).split():
                term = self._prefix_term(piece)
                if term:
                    terms.append(term)
        if not terms:
            return ''

        # FTS5 column-set filter syntax: {col1 col2} : term
        colspec = '{' + ' '.join(self._columns_for(fields)) + '}'
        return ' AND '.join(f'{colspec} : {term}' for term in terms)

    def _columns_for(self, fields: Optional[Set[str]]) -> List[str]:
        """Return the FTS columns for ``fields`` in FIELD_MAP order (all when none apply)."""
        selected = [
            column
            for key, columns in self.FIELD_MAP.items()
            if fields and key in fields
            for column in columns
        ]
        if selected:
            return selected
        return [column for columns in self.FIELD_MAP.values() for column in columns]

    @staticmethod
    def _prefix_term(piece: str) -> str:
        """Turn one escaped piece into an FTS5 prefix term ('' if unsearchable)."""
        if not any(ch.isalnum() for ch in piece):
            # Pure punctuation produces no token.
            return ''
        # FTS5 barewords may only contain ASCII alphanumerics, '_' and
        # non-ASCII characters; anything else must be a quoted string.
        if all(ch == '_' or ord(ch) > 127 or ch.isalnum() for ch in piece):
            return f'{piece}*'
        quoted = piece.replace('"', '""')
        return f'"{quoted}"*'

    def _escape_fts_query(self, text: str) -> str:
        """Replace FTS5 special characters with spaces.

        Handled characters: " ( ) * : ^ - { } [ ]
        Runs of whitespace are collapsed and the result is stripped.
        """
        if not text:
            return ''
        return re.sub(r'\s+', ' ', text.translate(self._FTS_SPECIAL_TABLE)).strip()

View File

@@ -5,9 +5,10 @@ import json
import logging
import os
import time
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple
from ..config import config
from .recipe_cache import RecipeCache
from .recipe_fts_index import RecipeFTSIndex
from .service_registry import ServiceRegistry
from .lora_scanner import LoraScanner
from .metadata_service import get_default_metadata_provider
@@ -74,6 +75,9 @@ class RecipeScanner:
self._post_scan_task: Optional[asyncio.Task] = None
self._resort_tasks: Set[asyncio.Task] = set()
self._cancel_requested = False
# FTS index for fast search
self._fts_index: Optional[RecipeFTSIndex] = None
self._fts_index_task: Optional[asyncio.Task] = None
if lora_scanner:
self._lora_scanner = lora_scanner
if checkpoint_scanner:
@@ -97,6 +101,14 @@ class RecipeScanner:
self._post_scan_task.cancel()
self._post_scan_task = None
# Cancel FTS index task and clear index
if self._fts_index_task and not self._fts_index_task.done():
self._fts_index_task.cancel()
self._fts_index_task = None
if self._fts_index:
self._fts_index.clear()
self._fts_index = None
self._cache = None
self._initialization_task = None
self._is_initializing = False
@@ -387,6 +399,8 @@ class RecipeScanner:
recipe_count = len(cache.raw_data) if cache and hasattr(cache, 'raw_data') else 0
logger.info(f"Recipe cache initialized in {elapsed_time:.2f} seconds. Found {recipe_count} recipes")
self._schedule_post_scan_enrichment()
# Schedule FTS index build in background (non-blocking)
self._schedule_fts_index_build()
finally:
# Mark initialization as complete regardless of outcome
self._is_initializing = False
@@ -555,6 +569,93 @@ class RecipeScanner:
self._post_scan_task = loop.create_task(_run_enrichment(), name="recipe_cache_enrichment")
def _schedule_fts_index_build(self) -> None:
"""Build FTS index in background without blocking."""
if self._fts_index_task and not self._fts_index_task.done():
return # Already running
try:
loop = asyncio.get_running_loop()
except RuntimeError:
return
async def _build_fts():
if self._cache is None:
return
try:
self._fts_index = RecipeFTSIndex()
# Run in thread pool (SQLite is blocking)
await loop.run_in_executor(
None,
self._fts_index.build_index,
self._cache.raw_data
)
except asyncio.CancelledError:
raise
except Exception as exc:
logger.error("Recipe Scanner: error building FTS index: %s", exc, exc_info=True)
self._fts_index_task = loop.create_task(_build_fts(), name="recipe_fts_index_build")
def _search_with_fts(self, search: str, search_options: Dict) -> Optional[Set[str]]:
"""Search recipes using FTS index if available.
Args:
search: The search query string.
search_options: Dictionary of search options (title, tags, lora_name, lora_model, prompt).
Returns:
Set of matching recipe IDs if FTS is available and search succeeded,
None if FTS is not ready (caller should fall back to fuzzy search).
"""
if not self._fts_index or not self._fts_index.is_ready():
return None
# Build the set of fields to search based on search_options
fields: Set[str] = set()
if search_options.get('title', True):
fields.add('title')
if search_options.get('tags', True):
fields.add('tags')
if search_options.get('lora_name', True):
fields.add('lora_name')
if search_options.get('lora_model', True):
fields.add('lora_model')
if search_options.get('prompt', False): # prompt search is opt-in by default
fields.add('prompt')
# If no fields enabled, search all fields
if not fields:
fields = None
try:
return self._fts_index.search(search, fields)
except Exception as exc:
logger.debug("FTS search failed, falling back to fuzzy search: %s", exc)
return None
def _update_fts_index_for_recipe(self, recipe: Dict[str, Any], operation: str = 'add') -> None:
"""Update FTS index for a single recipe (add, update, or remove).
Args:
recipe: The recipe dictionary.
operation: One of 'add', 'update', or 'remove'.
"""
if not self._fts_index or not self._fts_index.is_ready():
return
try:
if operation == 'remove':
recipe_id = str(recipe.get('id', '')) if isinstance(recipe, dict) else str(recipe)
self._fts_index.remove_recipe(recipe_id)
elif operation in ('add', 'update'):
self._fts_index.update_recipe(recipe)
except Exception as exc:
logger.debug("Failed to update FTS index for recipe: %s", exc)
async def _enrich_cache_metadata(self) -> None:
"""Perform remote metadata enrichment after the initial scan."""
@@ -766,6 +867,9 @@ class RecipeScanner:
self._update_folder_metadata(cache)
self._schedule_resort()
# Update FTS index
self._update_fts_index_for_recipe(recipe_data, 'add')
async def remove_recipe(self, recipe_id: str) -> bool:
"""Remove a recipe from the cache by ID."""
@@ -779,6 +883,9 @@ class RecipeScanner:
self._update_folder_metadata(cache)
self._schedule_resort()
# Update FTS index
self._update_fts_index_for_recipe(recipe_id, 'remove')
return True
async def bulk_remove(self, recipe_ids: Iterable[str]) -> int:
@@ -788,6 +895,9 @@ class RecipeScanner:
removed = await cache.bulk_remove(recipe_ids, resort=False)
if removed:
self._schedule_resort()
# Update FTS index for each removed recipe
for recipe_id in (str(r.get('id', '')) for r in removed):
self._update_fts_index_for_recipe(recipe_id, 'remove')
return len(removed)
async def scan_all_recipes(self) -> List[Dict]:
@@ -1331,45 +1441,55 @@ class RecipeScanner:
'lora_name': True,
'lora_model': True
}
# Build the search predicate based on search options
def matches_search(item):
# Search in title if enabled
if search_options.get('title', True):
if fuzzy_match(str(item.get('title', '')), search):
return True
# Search in tags if enabled
if search_options.get('tags', True) and 'tags' in item:
for tag in item['tags']:
if fuzzy_match(tag, search):
# Try FTS search first if available (much faster)
fts_matching_ids = self._search_with_fts(search, search_options)
if fts_matching_ids is not None:
# FTS search succeeded, filter by matching IDs
filtered_data = [
item for item in filtered_data
if str(item.get('id', '')) in fts_matching_ids
]
else:
# Fallback to fuzzy_match (slower but always available)
# Build the search predicate based on search options
def matches_search(item):
# Search in title if enabled
if search_options.get('title', True):
if fuzzy_match(str(item.get('title', '')), search):
return True
# Search in lora file names if enabled
if search_options.get('lora_name', True) and 'loras' in item:
for lora in item['loras']:
if fuzzy_match(str(lora.get('file_name', '')), search):
# Search in tags if enabled
if search_options.get('tags', True) and 'tags' in item:
for tag in item['tags']:
if fuzzy_match(tag, search):
return True
# Search in lora file names if enabled
if search_options.get('lora_name', True) and 'loras' in item:
for lora in item['loras']:
if fuzzy_match(str(lora.get('file_name', '')), search):
return True
# Search in lora model names if enabled
if search_options.get('lora_model', True) and 'loras' in item:
for lora in item['loras']:
if fuzzy_match(str(lora.get('modelName', '')), search):
return True
# Search in prompt and negative_prompt if enabled
if search_options.get('prompt', True) and 'gen_params' in item:
gen_params = item['gen_params']
if fuzzy_match(str(gen_params.get('prompt', '')), search):
return True
# Search in lora model names if enabled
if search_options.get('lora_model', True) and 'loras' in item:
for lora in item['loras']:
if fuzzy_match(str(lora.get('modelName', '')), search):
if fuzzy_match(str(gen_params.get('negative_prompt', '')), search):
return True
# Search in prompt and negative_prompt if enabled
if search_options.get('prompt', True) and 'gen_params' in item:
gen_params = item['gen_params']
if fuzzy_match(str(gen_params.get('prompt', '')), search):
return True
if fuzzy_match(str(gen_params.get('negative_prompt', '')), search):
return True
# No match found
return False
# Filter the data using the search predicate
filtered_data = [item for item in filtered_data if matches_search(item)]
# No match found
return False
# Filter the data using the search predicate
filtered_data = [item for item in filtered_data if matches_search(item)]
# Apply additional filters
if filters:
@@ -1601,6 +1721,9 @@ class RecipeScanner:
await self._cache.update_recipe_metadata(recipe_id, metadata, resort=False)
self._schedule_resort()
# Update FTS index
self._update_fts_index_for_recipe(recipe_data, 'update')
# If the recipe has an image, update its EXIF metadata
from ..utils.exif_utils import ExifUtils
image_path = recipe_data.get('file_path')
@@ -1669,6 +1792,9 @@ class RecipeScanner:
await cache.add_recipe(recipe_data, resort=False)
self._schedule_resort()
# Update FTS index
self._update_fts_index_for_recipe(recipe_data, 'update')
updated_lora = dict(lora_entry)
if target_lora is not None:
preview_url = target_lora.get('preview_url')