feat(autocomplete): add Danbooru/e621 tag search with category filtering

- Add TagFTSIndex service for fast SQLite FTS5-based tag search (221k+ tags)
- Implement command-mode autocomplete: /char, /artist, /general, /meta, etc.
- Support category filtering via category IDs or names
- Return enriched results with post counts and category badges
- Add UI styling for category badges and command list dropdown
2026-06-20 17:32:05 -03:00 · 2026-01-26 13:51:45 +08:00
parent 6142b3dc0c
commit 42f35be9d3
8 changed files with 223183 additions and 33 deletions
--- a/py/routes/handlers/misc_handlers.py
+++ b/py/routes/handlers/misc_handlers.py
@@ -1231,12 +1231,31 @@ class CustomWordsHandler:
            return web.json_response({"error": str(exc)}, status=500)

    async def search_custom_words(self, request: web.Request) -> web.Response:
-        """Search custom words with autocomplete."""
+        """Search custom words with autocomplete.
+
+        Query parameters:
+            search: The search term to match against.
+            limit: Maximum number of results to return (default: 20).
+            category: Optional category filter. Can be:
+                - A category name (e.g., "character", "artist", "general")
+                - Comma-separated category IDs (e.g., "4,11" for character)
+            enriched: If "true", return enriched results with category and post_count
+                      even without category filtering.
+        """
        try:
            search_term = request.query.get("search", "")
            limit = int(request.query.get("limit", "20"))
+            category_param = request.query.get("category", "")
+            enriched_param = request.query.get("enriched", "").lower() == "true"

-            results = self._service.search_words(search_term, limit)
+            # Parse category parameter
+            categories = None
+            if category_param:
+                categories = self._parse_category_param(category_param)
+
+            results = self._service.search_words(
+                search_term, limit, categories=categories, enriched=enriched_param
+            )

            return web.json_response({
                "success": True,
@@ -1246,6 +1265,37 @@ class CustomWordsHandler:
            logger.error("Error searching custom words: %s", exc, exc_info=True)
            return web.json_response({"error": str(exc)}, status=500)

+    def _parse_category_param(self, param: str) -> list[int] | None:
+        """Translate the ``category`` query value into category IDs.
+
+        The value is trimmed and lower-cased, then read either as a key of
+        ``CATEGORY_NAME_TO_IDS`` or as a comma-separated list of integers.
+
+        Args:
+            param: Raw ``category`` query value.
+
+        Returns:
+            The category IDs, or None when the value is empty or one of its
+            comma-separated parts is not an integer.
+        """
+        from ...services.tag_fts_index import CATEGORY_NAME_TO_IDS
+
+        param = param.strip().lower()
+        if not param:
+            return None
+
+        # A recognised category name wins over numeric parsing.
+        if param in CATEGORY_NAME_TO_IDS:
+            return CATEGORY_NAME_TO_IDS[param]
+
+        # Otherwise expect integers separated by commas; blank parts are skipped.
+        pieces = (piece.strip() for piece in param.split(","))
+        try:
+            category_ids = [int(piece) for piece in pieces if piece]
+        except ValueError:
+            logger.debug("Invalid category parameter: %s", param)
+            return None
+        return category_ids or None
+

 class NodeRegistryHandler:
    def __init__(
--- a/py/services/custom_words_service.py
+++ b/py/services/custom_words_service.py
@@ -2,6 +2,9 @@

 This service provides functionality to parse CSV-formatted custom words,
 search them with priority-based ranking, and manage storage.
+
+It also integrates with TagFTSIndex to search the Danbooru/e621 tag database
+for comprehensive autocomplete suggestions with category filtering.
 """

 from __future__ import annotations
@@ -10,7 +13,7 @@ import logging
 import os
 from dataclasses import dataclass
 from pathlib import Path
-from typing import List, Dict, Any, Optional
+from typing import List, Dict, Any, Optional, Union

 logger = logging.getLogger(__name__)

@@ -35,6 +38,7 @@ class CustomWordsService:
    - Parses CSV format: word[,priority] or word[,alias][,priority]
    - Searches words with priority-based ranking
    - Caches parsed words for performance
+    - Integrates with TagFTSIndex for Danbooru/e621 tag search
    """

    _instance: Optional[CustomWordsService] = None
@@ -51,6 +55,7 @@ class CustomWordsService:

        self._words_cache: Dict[str, WordEntry] = {}
        self._file_path: Optional[Path] = None
+        self._tag_index: Optional[Any] = None  # Lazy-loaded TagFTSIndex
        self._initialized = True

        self._determine_file_path()
@@ -98,6 +103,17 @@ class CustomWordsService:
        """Get the current file path for custom words."""
        return self._file_path

+    def _get_tag_index(self):
+        """Return the cached TagFTSIndex, creating it on first use.
+
+        If the index cannot be obtained, a warning is logged and None is
+        returned; the next call tries again.
+        """
+        if self._tag_index is not None:
+            return self._tag_index
+
+        try:
+            from .tag_fts_index import get_tag_fts_index
+            self._tag_index = get_tag_fts_index()
+        except Exception as e:
+            logger.warning(f"Failed to initialize TagFTSIndex: {e}")
+            self._tag_index = None
+        return self._tag_index
+
    def load_words(self) -> Dict[str, WordEntry]:
        """Load and parse words from the custom words file.

@@ -160,10 +176,20 @@ class CustomWordsService:

        return words

-    def search_words(self, search_term: str, limit: int = 20) -> List[str]:
+    def search_words(
+        self,
+        search_term: str,
+        limit: int = 20,
+        categories: Optional[List[int]] = None,
+        enriched: bool = False
+    ) -> Union[List[str], List[Dict[str, Any]]]:
        """Search custom words with priority-based ranking.

-        Matching priority:
+        When categories are provided or enriched is True, uses TagFTSIndex to search
+        the Danbooru/e621 tag database and returns enriched results with category
+        and post_count.
+
+        Matching priority (for custom words):
        1. Words with priority (sorted by priority descending)
        2. Prefix matches (word starts with search term)
        3. Include matches (word contains search term)
@@ -171,10 +197,29 @@ class CustomWordsService:
        Args:
            search_term: The search term to match against.
            limit: Maximum number of results to return.
+            categories: Optional list of category IDs to filter by.
+                       When provided, searches TagFTSIndex instead of custom words.
+            enriched: If True, return enriched results even without category filtering.

        Returns:
-            List of matching word texts.
+            List of matching word texts (when categories is None and enriched is False), or
+            List of dicts with tag_name, category, post_count (when categories is provided
+            or enriched is True).
        """
+        # Use TagFTSIndex when categories are specified or when explicitly requested
+        tag_index = self._get_tag_index()
+        if tag_index is not None:
+            # Search the tag database
+            results = tag_index.search(search_term, categories=categories, limit=limit)
+            if results:
+                # If categories were specified or enriched requested, return enriched results
+                if categories is not None or enriched:
+                    return results
+                # Otherwise, convert to simple string list for backward compatibility
+                return [r["tag_name"] for r in results]
+            # Fall through to custom words if no tag results
+
+        # Fall back to custom words search
        words = self._words_cache if self._words_cache else self.load_words()

        if not search_term:
@@ -212,14 +257,18 @@ class CustomWordsService:
        # Combine results: 20% top priority + all prefix matches + rest of priority + all include
        top_priority_count = max(1, limit // 5)

-        results = (
+        text_results = (
            [entry.text for entry, _ in priority_matches[:top_priority_count]]
            + [entry.text for entry, _ in prefix_matches]
            + [entry.text for entry, _ in priority_matches[top_priority_count:]]
            + [entry.text for entry, _ in include_matches]
        )

-        return results[:limit]
+        # If categories were requested but tag index failed, return empty enriched format
+        if categories is not None:
+            return [{"tag_name": t, "category": 0, "post_count": 0} for t in text_results[:limit]]
+
+        return text_results[:limit]

    def save_words(self, content: str) -> bool:
        """Save custom words content to file.
--- a/py/services/tag_fts_index.py
+++ b/py/services/tag_fts_index.py
@@ -0,0 +1,504 @@
+"""SQLite FTS5-based full-text search index for tags.
+
+This module provides fast tag search using SQLite's FTS5 extension,
+enabling sub-100ms search times for 221k+ Danbooru/e621 tags.
+"""
+
+from __future__ import annotations
+
+import csv
+import logging
+import os
+import re
+import sqlite3
+import threading
+import time
+from pathlib import Path
+from typing import Dict, List, Optional, Set
+
+from ..utils.settings_paths import get_settings_dir
+
+logger = logging.getLogger(__name__)
+
+
+# Category ID -> category name. Danbooru and e621 use different IDs, so the
+# same name can be reached through more than one ID.
+CATEGORY_NAMES = {
+    # Danbooru
+    0: "general",
+    1: "artist",
+    3: "copyright",
+    4: "character",
+    5: "meta",
+    # e621
+    7: "general",
+    8: "artist",
+    10: "copyright",
+    11: "character",
+    12: "species",
+    14: "meta",
+    15: "lore",
+}
+
+# Category name -> every ID carrying that name (used for filtering).
+# Derived from CATEGORY_NAMES so the two mappings cannot drift apart.
+CATEGORY_NAME_TO_IDS = {
+    name: [cid for cid, cname in CATEGORY_NAMES.items() if cname == name]
+    for name in dict.fromkeys(CATEGORY_NAMES.values())
+}
+
+
+class TagFTSIndex:
+    """SQLite FTS5-based full-text search index for tags.
+
+    Provides fast prefix-based search across the Danbooru/e621 tag database.
+    Supports category-based filtering and returns enriched results with
+    post counts and category information.
+    """
+
+    _DEFAULT_FILENAME = "tag_fts.sqlite"
+    _CSV_FILENAME = "danbooru_e621_merged.csv"
+
+    def __init__(self, db_path: Optional[str] = None, csv_path: Optional[str] = None) -> None:
+        """Record paths and set up state flags; no database work happens here.
+
+        Args:
+            db_path: Location of the SQLite database file. When omitted or
+                     empty, ``_resolve_default_db_path`` supplies it.
+            csv_path: Location of the CSV tag source. When omitted or empty,
+                      ``_resolve_default_csv_path`` supplies it.
+        """
+        self._db_path = db_path if db_path else self._resolve_default_db_path()
+        self._csv_path = csv_path if csv_path else self._resolve_default_csv_path()
+
+        # Lock taken around database access; event set once the index is queryable.
+        self._lock = threading.Lock()
+        self._ready = threading.Event()
+
+        # Plain state flags.
+        self._indexing_in_progress = False
+        self._schema_initialized = False
+        self._warned_not_ready = False
+
+        # Best effort: make sure the folder that will hold the database exists.
+        try:
+            directory = os.path.dirname(self._db_path)
+            if directory:
+                os.makedirs(directory, exist_ok=True)
+        except Exception as exc:
+            logger.warning("Could not create FTS index directory %s: %s", directory, exc)
+
+    def _resolve_default_db_path(self) -> str:
+        """Pick the database location used when the caller supplies none.
+
+        A non-empty ``LORA_MANAGER_TAG_FTS_DB`` environment variable takes
+        precedence. Otherwise the file goes into the settings directory, or
+        into the current directory if that lookup raises.
+        """
+        env_path = os.environ.get("LORA_MANAGER_TAG_FTS_DB")
+        if env_path:
+            return env_path
+
+        try:
+            base_dir = get_settings_dir(create=True)
+        except Exception as exc:
+            logger.warning("Falling back to current directory for FTS index: %s", exc)
+            base_dir = "."
+
+        return os.path.join(base_dir, self._DEFAULT_FILENAME)
+
+    def _resolve_default_csv_path(self) -> str:
+        """Return the path of the CSV file under ``refs/``, three directory levels above this module file."""
+        module_file = Path(__file__)
+        package_dir = module_file.parent.parent.parent
+        return str(package_dir.joinpath("refs", self._CSV_FILENAME))
+
+    def get_database_path(self) -> str:
+        """Return the database path chosen at construction (explicit or default)."""
+        return self._db_path
+
+    def get_csv_path(self) -> str:
+        """Return the CSV source path chosen at construction (explicit or default)."""
+        return self._csv_path
+
+    def is_ready(self) -> bool:
+        """Report whether the ``_ready`` event is set, i.e. the index can be queried."""
+        return self._ready.is_set()
+
+    def is_indexing(self) -> bool:
+        """Report whether ``build_index`` is currently running."""
+        return self._indexing_in_progress
+
+    def initialize(self) -> None:
+        """Create the database schema if it has not been created yet.
+
+        Creates the ``tag_fts`` FTS5 table, the ``tags`` metadata table with
+        its indexes, and the ``fts_metadata`` key/value table. Failures are
+        logged rather than raised; ``_schema_initialized`` stays False then.
+        """
+        # Cheap check first so the common case does not take the lock.
+        if self._schema_initialized:
+            return
+
+        with self._lock:
+            # Check again under the lock: another caller may have finished meanwhile.
+            if self._schema_initialized:
+                return
+
+            try:
+                conn = self._connect()
+                try:
+                    conn.execute("PRAGMA journal_mode=WAL")
+                    conn.executescript("""
+                        -- FTS5 virtual table for full-text search
+                        CREATE VIRTUAL TABLE IF NOT EXISTS tag_fts USING fts5(
+                            tag_name,
+                            tokenize='unicode61 remove_diacritics 2'
+                        );
+
+                        -- Tags table with metadata
+                        CREATE TABLE IF NOT EXISTS tags (
+                            rowid INTEGER PRIMARY KEY,
+                            tag_name TEXT UNIQUE NOT NULL,
+                            category INTEGER NOT NULL DEFAULT 0,
+                            post_count INTEGER NOT NULL DEFAULT 0
+                        );
+
+                        -- Indexes for efficient filtering
+                        CREATE INDEX IF NOT EXISTS idx_tags_category ON tags(category);
+                        CREATE INDEX IF NOT EXISTS idx_tags_post_count ON tags(post_count DESC);
+
+                        -- Index version tracking
+                        CREATE TABLE IF NOT EXISTS fts_metadata (
+                            key TEXT PRIMARY KEY,
+                            value TEXT
+                        );
+                    """)
+                    conn.commit()
+                    # Only mark success after the schema has been committed.
+                    self._schema_initialized = True
+                    logger.debug("Tag FTS index schema initialized at %s", self._db_path)
+                finally:
+                    conn.close()
+            except Exception as exc:
+                # Deliberately swallowed: callers check _schema_initialized instead.
+                logger.error("Failed to initialize tag FTS schema: %s", exc)
+
+    def build_index(self) -> None:
+        """Rebuild the tag tables and FTS index from the CSV file.
+
+        Existing rows in ``tag_fts`` and ``tags`` are deleted and the CSV is
+        re-read inside one transaction that is committed at the end. Each CSV
+        row is read as ``tag_name, category, post_count``; rows with fewer
+        than three columns or an empty tag name are skipped, and a category
+        or post count that is not an integer is stored as 0.
+
+        The method returns early (after logging) when a build is already
+        running or the CSV file does not exist. Errors are logged, not
+        raised; the ready flag is only set after a successful build.
+        """
+        if self._indexing_in_progress:
+            logger.warning("Tag FTS indexing already in progress, skipping")
+            return
+
+        if not os.path.exists(self._csv_path):
+            logger.warning("CSV file not found at %s, cannot build tag index", self._csv_path)
+            return
+
+        # From here on the index is considered unavailable until the build succeeds.
+        self._indexing_in_progress = True
+        self._ready.clear()
+        start_time = time.time()
+
+        try:
+            self.initialize()
+            if not self._schema_initialized:
+                logger.error("Cannot build tag FTS index: schema not initialized")
+                return
+
+            with self._lock:
+                conn = self._connect()
+                try:
+                    conn.execute("BEGIN")
+
+                    # Clear existing data
+                    conn.execute("DELETE FROM tag_fts")
+                    conn.execute("DELETE FROM tags")
+
+                    # Parse CSV and insert in batches
+                    batch_size = 500
+                    rows = []
+                    total_inserted = 0
+
+                    with open(self._csv_path, "r", encoding="utf-8") as f:
+                        reader = csv.reader(f)
+                        for row in reader:
+                            # Need at least tag_name, category and post_count columns.
+                            if len(row) < 3:
+                                continue
+
+                            tag_name = row[0].strip()
+                            if not tag_name:
+                                continue
+
+                            # Non-numeric values fall back to 0 instead of aborting the build.
+                            try:
+                                category = int(row[1])
+                            except (ValueError, IndexError):
+                                category = 0
+
+                            try:
+                                post_count = int(row[2])
+                            except (ValueError, IndexError):
+                                post_count = 0
+
+                            rows.append((tag_name, category, post_count))
+
+                            if len(rows) >= batch_size:
+                                self._insert_batch(conn, rows)
+                                total_inserted += len(rows)
+                                rows = []
+
+                    # Insert remaining rows
+                    if rows:
+                        self._insert_batch(conn, rows)
+                        total_inserted += len(rows)
+
+                    # Update metadata
+                    # NOTE(review): total_inserted counts rows handed to _insert_batch,
+                    # which uses INSERT OR IGNORE, so it may exceed the stored row count.
+                    conn.execute(
+                        "INSERT OR REPLACE INTO fts_metadata (key, value) VALUES (?, ?)",
+                        ("last_build_time", str(time.time()))
+                    )
+                    conn.execute(
+                        "INSERT OR REPLACE INTO fts_metadata (key, value) VALUES (?, ?)",
+                        ("tag_count", str(total_inserted))
+                    )
+
+                    conn.commit()
+                    elapsed = time.time() - start_time
+                    logger.info("Tag FTS index built: %d tags indexed in %.2fs", total_inserted, elapsed)
+                finally:
+                    conn.close()
+
+            # Reached only when the transaction above was committed.
+            self._ready.set()
+
+        except Exception as exc:
+            logger.error("Failed to build tag FTS index: %s", exc, exc_info=True)
+        finally:
+            # Always release the in-progress flag, including on early return or error.
+            self._indexing_in_progress = False
+
+    def _insert_batch(self, conn: sqlite3.Connection, rows: List[tuple]) -> None:
+        """Insert one batch of tags and index only the rows that were actually added.
+
+        Args:
+            conn: Open connection; the caller owns the transaction and the commit.
+            rows: ``(tag_name, category, post_count)`` tuples.
+        """
+        if not rows:
+            return
+
+        # Remember the highest rowid before inserting, so rows added by this
+        # batch can be told apart from tags stored by an earlier batch.
+        previous_max = conn.execute(
+            "SELECT COALESCE(MAX(rowid), 0) FROM tags"
+        ).fetchone()[0]
+
+        # Tag names that already exist are skipped by the UNIQUE constraint.
+        conn.executemany(
+            "INSERT OR IGNORE INTO tags (tag_name, category, post_count) VALUES (?, ?, ?)",
+            rows
+        )
+
+        # Mirror only the new tags into the FTS table. Looking the batch up by
+        # name would also return tags inserted earlier and index them a second
+        # time. Reusing the rowid keeps both tables aligned row for row.
+        conn.execute(
+            "INSERT INTO tag_fts (rowid, tag_name) "
+            "SELECT rowid, tag_name FROM tags WHERE rowid > ?",
+            (previous_max,)
+        )
+
+    def ensure_ready(self) -> bool:
+        """Make the index queryable, building it from the CSV when it is empty.
+
+        Returns:
+            True if the index is ready, False otherwise.
+        """
+        if self.is_ready():
+            return True
+
+        # An index populated by an earlier run only needs to be flagged ready.
+        self.initialize()
+        existing = self.get_indexed_count() if self._schema_initialized else 0
+        if existing > 0:
+            self._ready.set()
+            logger.debug("Tag FTS index already populated with %d tags", existing)
+            return True
+
+        # Nothing indexed yet: build now and report whether that worked.
+        self.build_index()
+        return self.is_ready()
+
+    def search(
+        self,
+        query: str,
+        categories: Optional[List[int]] = None,
+        limit: int = 20
+    ) -> List[Dict]:
+        """Look up tags whose words start with the words of ``query``.
+
+        Results are ordered by post count, highest first. Any error raised
+        while querying is logged at debug level and yields an empty list.
+
+        Args:
+            query: The search query string.
+            categories: Optional list of category IDs to filter by.
+            limit: Maximum number of results to return.
+
+        Returns:
+            List of dictionaries with tag_name, category, and post_count.
+        """
+        # The index is prepared lazily on first use.
+        if not self.ensure_ready():
+            if not self._warned_not_ready:
+                logger.debug("Tag FTS index not ready, returning empty results")
+                self._warned_not_ready = True
+            return []
+
+        if not (query and query.strip()):
+            return []
+
+        fts_query = self._build_fts_query(query)
+        if not fts_query:
+            return []
+
+        try:
+            with self._lock:
+                conn = self._connect(readonly=True)
+                try:
+                    # Same statement in both branches, with or without the category filter.
+                    if categories:
+                        placeholders = ",".join("?" * len(categories))
+                        sql = f"""
+                            SELECT t.tag_name, t.category, t.post_count
+                            FROM tags t
+                            WHERE t.tag_name IN (
+                                SELECT tag_name FROM tag_fts WHERE tag_fts MATCH ?
+                            )
+                            AND t.category IN ({placeholders})
+                            ORDER BY t.post_count DESC
+                            LIMIT ?
+                        """
+                        params = [fts_query] + categories + [limit]
+                    else:
+                        sql = """
+                            SELECT t.tag_name, t.category, t.post_count
+                            FROM tags t
+                            WHERE t.tag_name IN (
+                                SELECT tag_name FROM tag_fts WHERE tag_fts MATCH ?
+                            )
+                            ORDER BY t.post_count DESC
+                            LIMIT ?
+                        """
+                        params = [fts_query, limit]
+
+                    matches = conn.execute(sql, params).fetchall()
+                    return [
+                        {
+                            "tag_name": record[0],
+                            "category": record[1],
+                            "post_count": record[2],
+                        }
+                        for record in matches
+                    ]
+                finally:
+                    conn.close()
+        except Exception as exc:
+            logger.debug("Tag FTS search error for query '%s': %s", query, exc)
+            return []
+
+    def get_indexed_count(self) -> int:
+        """Count the rows in the ``tags`` table.
+
+        Returns 0 when the schema has not been initialized or when the count
+        cannot be read for any reason.
+        """
+        if not self._schema_initialized:
+            return 0
+
+        try:
+            with self._lock:
+                conn = self._connect(readonly=True)
+                try:
+                    record = conn.execute("SELECT COUNT(*) FROM tags").fetchone()
+                    total = record[0] if record else 0
+                finally:
+                    conn.close()
+            return total
+        except Exception:
+            return 0
+
+    def clear(self) -> bool:
+        """Delete every row from ``tag_fts`` and ``tags`` and mark the index not ready.
+
+        Returns:
+            True if successful, False otherwise.
+        """
+        try:
+            with self._lock:
+                conn = self._connect()
+                try:
+                    conn.execute("DELETE FROM tag_fts")
+                    conn.execute("DELETE FROM tags")
+                    conn.commit()
+                    # Only drop the ready flag once the deletes are committed.
+                    self._ready.clear()
+                finally:
+                    conn.close()
+            return True
+        except Exception as exc:
+            logger.error("Failed to clear tag FTS index: %s", exc)
+            return False
+
+    # Internal helpers
+
+    def _connect(self, readonly: bool = False) -> sqlite3.Connection:
+        """Open a new connection to the index database.
+
+        Args:
+            readonly: When True, open the existing file in read-only mode
+                      through an SQLite ``file:`` URI.
+
+        Returns:
+            A connection whose rows are ``sqlite3.Row`` objects.
+
+        Raises:
+            FileNotFoundError: If ``readonly`` is True and the file is missing.
+        """
+        if readonly:
+            if not os.path.exists(self._db_path):
+                raise FileNotFoundError(self._db_path)
+            # Build the URI with as_uri() so characters that are special in a
+            # URI ("?", "#", "%", spaces) are percent-encoded. Pasting the raw
+            # path after "file:" would let such a character cut the path short.
+            uri = Path(os.path.abspath(self._db_path)).as_uri() + "?mode=ro"
+            conn = sqlite3.connect(uri, check_same_thread=False, uri=True)
+        else:
+            conn = sqlite3.connect(self._db_path, check_same_thread=False)
+        conn.row_factory = sqlite3.Row
+        return conn
+
+    def _build_fts_query(self, query: str) -> str:
+        """Build an FTS5 MATCH expression that prefix-matches every query word.
+
+        Each token is emitted as a double-quoted string followed by ``*``,
+        for example ``"blue"* "eyes"*``.
+
+        Args:
+            query: The user's search query.
+
+        Returns:
+            The FTS5 query string, or "" when the query has no usable token.
+        """
+        prefix_terms = []
+        for word in query.lower().split():
+            # Escaping can turn one word into several ("blue-eyes" -> "blue eyes").
+            # Handle each piece separately so all of them get the prefix marker,
+            # not just the last one.
+            for token in self._escape_fts_query(word).split():
+                # Quoting stops punctuation such as "." or "'" from being parsed
+                # as FTS5 syntax; embedded double quotes are doubled.
+                quoted = token.replace('"', '""')
+                prefix_terms.append(f'"{quoted}"*')
+
+        # Space-separated terms are implicitly ANDed: every word must match.
+        return " ".join(prefix_terms)
+
+    def _escape_fts_query(self, text: str) -> str:
+        """Strip characters that FTS5 treats as query syntax.
+
+        Each of ``" ( ) * : ^ - { } [ ]`` is replaced by a space, then runs of
+        whitespace are collapsed to one space and the ends are trimmed. The
+        ``*`` is removed too; the caller adds its own prefix marker afterwards.
+        """
+        if not text:
+            return ""
+
+        # One translation pass instead of a replace() call per character.
+        to_space = str.maketrans({char: " " for char in '"()*:^-{}[]'})
+        cleaned = text.translate(to_space)
+
+        return re.sub(r"\s+", " ", cleaned).strip()
+
+
+# Process-wide instance, created on first request; the lock guards its creation.
+_tag_fts_index: Optional[TagFTSIndex] = None
+_tag_fts_lock = threading.Lock()
+
+
+def get_tag_fts_index() -> TagFTSIndex:
+    """Return the shared TagFTSIndex, constructing it on the first call."""
+    global _tag_fts_index
+    if _tag_fts_index is not None:
+        return _tag_fts_index
+
+    with _tag_fts_lock:
+        # Re-check under the lock so only one instance is ever constructed.
+        if _tag_fts_index is None:
+            _tag_fts_index = TagFTSIndex()
+        return _tag_fts_index
+
+
+# Names exported by ``from <this module> import *``.
+__all__ = [
+    "TagFTSIndex",
+    "get_tag_fts_index",
+    "CATEGORY_NAMES",
+    "CATEGORY_NAME_TO_IDS",
+]