feat(autocomplete): add Danbooru/e621 tag search with category filtering

- Add TagFTSIndex service for fast SQLite FTS5-based tag search (221k+ tags)
- Implement command-mode autocomplete: /char, /artist, /general, /meta, etc.
- Support category filtering via category IDs or names
- Return enriched results with post counts and category badges
- Add UI styling for category badges and command list dropdown
This commit is contained in:
Will Miao
2026-01-26 13:51:45 +08:00
parent 6142b3dc0c
commit 42f35be9d3
8 changed files with 223183 additions and 33 deletions

View File

@@ -1231,12 +1231,31 @@ class CustomWordsHandler:
return web.json_response({"error": str(exc)}, status=500)
async def search_custom_words(self, request: web.Request) -> web.Response:
"""Search custom words with autocomplete."""
"""Search custom words with autocomplete.
Query parameters:
search: The search term to match against.
limit: Maximum number of results to return (default: 20).
category: Optional category filter. Can be:
- A category name (e.g., "character", "artist", "general")
- Comma-separated category IDs (e.g., "4,11" for character)
enriched: If "true", return enriched results with category and post_count
even without category filtering.
"""
try:
search_term = request.query.get("search", "")
limit = int(request.query.get("limit", "20"))
category_param = request.query.get("category", "")
enriched_param = request.query.get("enriched", "").lower() == "true"
results = self._service.search_words(search_term, limit)
# Parse category parameter
categories = None
if category_param:
categories = self._parse_category_param(category_param)
results = self._service.search_words(
search_term, limit, categories=categories, enriched=enriched_param
)
return web.json_response({
"success": True,
@@ -1246,6 +1265,37 @@ class CustomWordsHandler:
logger.error("Error searching custom words: %s", exc, exc_info=True)
return web.json_response({"error": str(exc)}, status=500)
def _parse_category_param(self, param: str) -> list[int] | None:
    """Translate the ``category`` query parameter into category IDs.

    The value is matched case-insensitively against the known category
    names first; if it is not a name it is parsed as a comma-separated
    list of integers (blank segments are ignored).

    Args:
        param: Category parameter value (name or comma-separated IDs).

    Returns:
        List of category IDs, or None when the value is blank, contains
        no IDs, or contains a segment that is not an integer.
    """
    from ...services.tag_fts_index import CATEGORY_NAME_TO_IDS

    param = param.strip().lower()
    if not param:
        return None

    # Category names take precedence over numeric parsing.
    named_ids = CATEGORY_NAME_TO_IDS.get(param)
    if named_ids is not None:
        # Hand back a copy so callers can never mutate the shared
        # module-level lookup table.
        return list(named_ids)

    try:
        category_ids = [int(part) for part in param.split(",") if part.strip()]
    except ValueError:
        # A single bad segment invalidates the whole parameter.
        logger.debug("Invalid category parameter: %s", param)
        return None

    return category_ids or None
class NodeRegistryHandler:
def __init__(

View File

@@ -2,6 +2,9 @@
This service provides functionality to parse CSV-formatted custom words,
search them with priority-based ranking, and manage storage.
It also integrates with TagFTSIndex to search the Danbooru/e621 tag database
for comprehensive autocomplete suggestions with category filtering.
"""
from __future__ import annotations
@@ -10,7 +13,7 @@ import logging
import os
from dataclasses import dataclass
from pathlib import Path
from typing import List, Dict, Any, Optional
from typing import List, Dict, Any, Optional, Union
logger = logging.getLogger(__name__)
@@ -35,6 +38,7 @@ class CustomWordsService:
- Parses CSV format: word[,priority] or word[,alias][,priority]
- Searches words with priority-based ranking
- Caches parsed words for performance
- Integrates with TagFTSIndex for Danbooru/e621 tag search
"""
_instance: Optional[CustomWordsService] = None
@@ -51,6 +55,7 @@ class CustomWordsService:
self._words_cache: Dict[str, WordEntry] = {}
self._file_path: Optional[Path] = None
self._tag_index: Optional[Any] = None # Lazy-loaded TagFTSIndex
self._initialized = True
self._determine_file_path()
@@ -98,6 +103,17 @@ class CustomWordsService:
"""Get the current file path for custom words."""
return self._file_path
def _get_tag_index(self):
    """Return the shared TagFTSIndex, creating it on first use.

    The import is deferred until the index is first needed. Any failure
    while importing or constructing the index is logged and swallowed so
    that callers can fall back to the custom-words search.

    Returns:
        The TagFTSIndex instance, or None if it could not be obtained.
    """
    if self._tag_index is None:
        try:
            from .tag_fts_index import get_tag_fts_index

            self._tag_index = get_tag_fts_index()
        except Exception as exc:
            # Best effort: self._tag_index is still None here, so the
            # next call will simply retry.
            logger.warning("Failed to initialize TagFTSIndex: %s", exc)
    return self._tag_index
def load_words(self) -> Dict[str, WordEntry]:
"""Load and parse words from the custom words file.
@@ -160,10 +176,20 @@ class CustomWordsService:
return words
def search_words(self, search_term: str, limit: int = 20) -> List[str]:
def search_words(
self,
search_term: str,
limit: int = 20,
categories: Optional[List[int]] = None,
enriched: bool = False
) -> Union[List[str], List[Dict[str, Any]]]:
"""Search custom words with priority-based ranking.
Matching priority:
When categories are provided or enriched is True, uses TagFTSIndex to search
the Danbooru/e621 tag database and returns enriched results with category
and post_count.
Matching priority (for custom words):
1. Words with priority (sorted by priority descending)
2. Prefix matches (word starts with search term)
3. Include matches (word contains search term)
@@ -171,10 +197,29 @@ class CustomWordsService:
Args:
search_term: The search term to match against.
limit: Maximum number of results to return.
categories: Optional list of category IDs to filter by.
When provided, searches TagFTSIndex instead of custom words.
enriched: If True, return enriched results even without category filtering.
Returns:
List of matching word texts.
List of matching word texts (when categories is None and enriched is False), or
List of dicts with tag_name, category, post_count (when categories is provided
or enriched is True).
"""
# Use TagFTSIndex when categories are specified or when explicitly requested
tag_index = self._get_tag_index()
if tag_index is not None:
# Search the tag database
results = tag_index.search(search_term, categories=categories, limit=limit)
if results:
# If categories were specified or enriched requested, return enriched results
if categories is not None or enriched:
return results
# Otherwise, convert to simple string list for backward compatibility
return [r["tag_name"] for r in results]
# Fall through to custom words if no tag results
# Fall back to custom words search
words = self._words_cache if self._words_cache else self.load_words()
if not search_term:
@@ -212,14 +257,18 @@ class CustomWordsService:
# Combine results: 20% top priority + all prefix matches + rest of priority + all include
top_priority_count = max(1, limit // 5)
results = (
text_results = (
[entry.text for entry, _ in priority_matches[:top_priority_count]]
+ [entry.text for entry, _ in prefix_matches]
+ [entry.text for entry, _ in priority_matches[top_priority_count:]]
+ [entry.text for entry, _ in include_matches]
)
return results[:limit]
# If categories were requested but tag index failed, return empty enriched format
if categories is not None:
return [{"tag_name": t, "category": 0, "post_count": 0} for t in text_results[:limit]]
return text_results[:limit]
def save_words(self, content: str) -> bool:
"""Save custom words content to file.

View File

@@ -0,0 +1,504 @@
"""SQLite FTS5-based full-text search index for tags.
This module provides fast tag search using SQLite's FTS5 extension,
enabling sub-100ms search times for 221k+ Danbooru/e621 tags.
"""
from __future__ import annotations
import csv
import logging
import os
import re
import sqlite3
import threading
import time
from pathlib import Path
from typing import Dict, List, Optional, Set
from ..utils.settings_paths import get_settings_dir
logger = logging.getLogger(__name__)
# Numeric tag category -> human-readable name, covering both sources
# present in the merged tag data.
CATEGORY_NAMES = {
    # Danbooru categories
    0: "general",
    1: "artist",
    3: "copyright",
    4: "character",
    5: "meta",
    # e621 categories
    7: "general",
    8: "artist",
    10: "copyright",
    11: "character",
    12: "species",
    14: "meta",
    15: "lore",
}

# Category name -> every numeric ID carrying that name (used for filtering).
# Derived from CATEGORY_NAMES so the two tables cannot drift apart; names
# appear in first-seen order and IDs in the order they are declared above,
# e.g. "general" -> [0, 7] and "species" -> [12].
CATEGORY_NAME_TO_IDS = {
    name: [cid for cid, cname in CATEGORY_NAMES.items() if cname == name]
    for name in dict.fromkeys(CATEGORY_NAMES.values())
}
class TagFTSIndex:
    """SQLite FTS5-based full-text search index for tags.

    Tag names are stored twice: in the ``tag_fts`` FTS5 virtual table (used
    for prefix matching) and in the ``tags`` table (which carries the
    category and post count used for filtering and ranking). The index is
    populated from a CSV file whose first three columns are tag name,
    category ID and post count.
    """

    _DEFAULT_FILENAME = "tag_fts.sqlite"
    _CSV_FILENAME = "danbooru_e621_merged.csv"

    # Number of CSV rows buffered before they are flushed to the database.
    _BATCH_SIZE = 500

    # Runs of ASCII characters that cannot appear in an unquoted FTS5
    # bareword. Only ASCII letters, digits, underscore and non-ASCII
    # characters are allowed; anything else would make MATCH raise a syntax
    # error, so such runs are turned into token separators.
    _FTS_UNSAFE_CHARS = re.compile(r"[^0-9A-Za-z_\u0080-\U0010FFFF]+")

    def __init__(self, db_path: Optional[str] = None, csv_path: Optional[str] = None) -> None:
        """Initialize the FTS index.

        Args:
            db_path: Optional path to the SQLite database file.
                If not provided, uses the default location in settings directory.
            csv_path: Optional path to the CSV file containing tag data.
                If not provided, looks in the refs/ directory.
        """
        self._db_path = db_path or self._resolve_default_db_path()
        self._csv_path = csv_path or self._resolve_default_csv_path()
        self._lock = threading.Lock()
        self._ready = threading.Event()
        self._indexing_in_progress = False
        self._schema_initialized = False
        self._warned_not_ready = False

        # Ensure the parent directory exists. The name is computed outside
        # the try block so the warning below can always reference it.
        directory = os.path.dirname(self._db_path)
        if directory:
            try:
                os.makedirs(directory, exist_ok=True)
            except Exception as exc:
                logger.warning("Could not create FTS index directory %s: %s", directory, exc)

    def _resolve_default_db_path(self) -> str:
        """Resolve the default database path.

        The ``LORA_MANAGER_TAG_FTS_DB`` environment variable wins when set;
        otherwise the file lives in the settings directory, falling back to
        the current directory if that cannot be determined.
        """
        override = os.environ.get("LORA_MANAGER_TAG_FTS_DB")
        if override:
            return override

        try:
            settings_dir = get_settings_dir(create=True)
        except Exception as exc:
            logger.warning("Falling back to current directory for FTS index: %s", exc)
            settings_dir = "."

        return os.path.join(settings_dir, self._DEFAULT_FILENAME)

    def _resolve_default_csv_path(self) -> str:
        """Resolve the default CSV file path (``refs/`` three levels above this file)."""
        package_dir = Path(__file__).parent.parent.parent
        return str(package_dir / "refs" / self._CSV_FILENAME)

    def get_database_path(self) -> str:
        """Return the resolved database path."""
        return self._db_path

    def get_csv_path(self) -> str:
        """Return the resolved CSV path."""
        return self._csv_path

    def is_ready(self) -> bool:
        """Check if the FTS index is ready for queries."""
        return self._ready.is_set()

    def is_indexing(self) -> bool:
        """Check if indexing is currently in progress."""
        return self._indexing_in_progress

    def initialize(self) -> None:
        """Create the database schema if it has not been created yet.

        Failures are logged rather than raised; callers detect them by
        checking ``self._schema_initialized`` afterwards.
        """
        if self._schema_initialized:
            return

        with self._lock:
            # Re-check under the lock: another thread may have finished first.
            if self._schema_initialized:
                return

            try:
                conn = self._connect()
                try:
                    conn.execute("PRAGMA journal_mode=WAL")
                    conn.executescript("""
                        -- FTS5 virtual table for full-text search
                        CREATE VIRTUAL TABLE IF NOT EXISTS tag_fts USING fts5(
                            tag_name,
                            tokenize='unicode61 remove_diacritics 2'
                        );

                        -- Tags table with metadata
                        CREATE TABLE IF NOT EXISTS tags (
                            rowid INTEGER PRIMARY KEY,
                            tag_name TEXT UNIQUE NOT NULL,
                            category INTEGER NOT NULL DEFAULT 0,
                            post_count INTEGER NOT NULL DEFAULT 0
                        );

                        -- Indexes for efficient filtering
                        CREATE INDEX IF NOT EXISTS idx_tags_category ON tags(category);
                        CREATE INDEX IF NOT EXISTS idx_tags_post_count ON tags(post_count DESC);

                        -- Index version tracking
                        CREATE TABLE IF NOT EXISTS fts_metadata (
                            key TEXT PRIMARY KEY,
                            value TEXT
                        );
                    """)
                    conn.commit()
                    self._schema_initialized = True
                    logger.debug("Tag FTS index schema initialized at %s", self._db_path)
                finally:
                    conn.close()
            except Exception as exc:
                logger.error("Failed to initialize tag FTS schema: %s", exc)

    def build_index(self) -> None:
        """Rebuild the FTS index from the CSV file.

        Existing rows are deleted and the CSV is re-imported inside a single
        transaction, so a failed build leaves the previous contents intact.
        The index is marked ready only after a successful commit. Errors are
        logged, not raised.
        """
        if self._indexing_in_progress:
            logger.warning("Tag FTS indexing already in progress, skipping")
            return

        if not os.path.exists(self._csv_path):
            logger.warning("CSV file not found at %s, cannot build tag index", self._csv_path)
            return

        self._indexing_in_progress = True
        self._ready.clear()
        start_time = time.time()

        try:
            self.initialize()
            if not self._schema_initialized:
                logger.error("Cannot build tag FTS index: schema not initialized")
                return

            with self._lock:
                conn = self._connect()
                try:
                    conn.execute("BEGIN")

                    # Clear existing data
                    conn.execute("DELETE FROM tag_fts")
                    conn.execute("DELETE FROM tags")

                    total_inserted = 0
                    batch: List[tuple] = []

                    # newline="" lets the csv module do its own newline
                    # handling, as its documentation requires.
                    with open(self._csv_path, "r", encoding="utf-8", newline="") as f:
                        for row in csv.reader(f):
                            parsed = self._parse_csv_row(row)
                            if parsed is None:
                                continue
                            batch.append(parsed)
                            if len(batch) >= self._BATCH_SIZE:
                                total_inserted += self._insert_batch(conn, batch)
                                batch = []

                    # Insert remaining rows
                    if batch:
                        total_inserted += self._insert_batch(conn, batch)

                    # Update metadata
                    conn.execute(
                        "INSERT OR REPLACE INTO fts_metadata (key, value) VALUES (?, ?)",
                        ("last_build_time", str(time.time()))
                    )
                    conn.execute(
                        "INSERT OR REPLACE INTO fts_metadata (key, value) VALUES (?, ?)",
                        ("tag_count", str(total_inserted))
                    )

                    conn.commit()
                    elapsed = time.time() - start_time
                    logger.info("Tag FTS index built: %d tags indexed in %.2fs", total_inserted, elapsed)
                except Exception:
                    # Make the "all or nothing" behaviour explicit.
                    conn.rollback()
                    raise
                finally:
                    conn.close()

            # Only reached when the build committed successfully.
            self._ready.set()

        except Exception as exc:
            logger.error("Failed to build tag FTS index: %s", exc, exc_info=True)
        finally:
            self._indexing_in_progress = False

    @staticmethod
    def _int_or_zero(value: str) -> int:
        """Parse ``value`` as an int, returning 0 when it is not numeric."""
        try:
            return int(value)
        except ValueError:
            return 0

    @staticmethod
    def _parse_csv_row(row: List[str]) -> Optional[tuple]:
        """Turn one CSV row into ``(tag_name, category, post_count)``.

        Returns None for rows with fewer than three columns or a blank tag
        name. Non-numeric category/post-count values become 0.
        """
        if len(row) < 3:
            return None
        tag_name = row[0].strip()
        if not tag_name:
            return None
        return (
            tag_name,
            TagFTSIndex._int_or_zero(row[1]),
            TagFTSIndex._int_or_zero(row[2]),
        )

    def _insert_batch(self, conn: sqlite3.Connection, rows: List[tuple]) -> int:
        """Insert a batch of ``(tag_name, category, post_count)`` rows.

        Tags whose name already exists in ``tags`` are ignored, and only the
        tags that were newly added are mirrored into ``tag_fts``, so the two
        tables stay one-to-one even when a name repeats across batches.

        Returns:
            The number of tags actually added by this batch.
        """
        if not rows:
            return 0

        # New rows in an INTEGER PRIMARY KEY table get rowids above the
        # current maximum, so this watermark identifies what the INSERT
        # below really added (as opposed to what it ignored).
        watermark = conn.execute("SELECT COALESCE(MAX(rowid), 0) FROM tags").fetchone()[0]

        conn.executemany(
            "INSERT OR IGNORE INTO tags (tag_name, category, post_count) VALUES (?, ?, ?)",
            rows
        )

        cursor = conn.execute(
            "SELECT tag_name FROM tags WHERE rowid > ? ORDER BY rowid",
            (watermark,)
        )
        fts_rows = [(record[0],) for record in cursor.fetchall()]
        if fts_rows:
            conn.executemany("INSERT INTO tag_fts (tag_name) VALUES (?)", fts_rows)
        return len(fts_rows)

    def ensure_ready(self) -> bool:
        """Ensure the index is ready, building if necessary.

        Returns:
            True if the index is ready, False otherwise.
        """
        if self.is_ready():
            return True

        # Check if index already exists and has data
        self.initialize()
        if self._schema_initialized:
            count = self.get_indexed_count()
            if count > 0:
                self._ready.set()
                logger.debug("Tag FTS index already populated with %d tags", count)
                return True

        # Build the index
        self.build_index()
        return self.is_ready()

    def search(
        self,
        query: str,
        categories: Optional[List[int]] = None,
        limit: int = 20
    ) -> List[Dict]:
        """Search tags using FTS5 with prefix matching.

        Args:
            query: The search query string.
            categories: Optional list of category IDs to filter by. An empty
                list is treated the same as None (no filtering).
            limit: Maximum number of results to return. Values <= 0 yield
                an empty list.

        Returns:
            List of dictionaries with tag_name, category, and post_count,
            ordered by post_count descending. Query errors are logged and
            produce an empty list.
        """
        # Ensure index is ready (lazy initialization)
        if not self.ensure_ready():
            if not self._warned_not_ready:
                logger.debug("Tag FTS index not ready, returning empty results")
                self._warned_not_ready = True
            return []

        if not query or not query.strip():
            return []

        # SQLite interprets a negative LIMIT as "no limit"; never let a
        # caller-supplied value dump the whole table.
        if limit <= 0:
            return []

        fts_query = self._build_fts_query(query)
        if not fts_query:
            return []

        # Only "?" placeholders are ever interpolated into the SQL text.
        category_clause = ""
        params: list = [fts_query]
        if categories:
            placeholders = ",".join("?" * len(categories))
            category_clause = f"AND t.category IN ({placeholders})"
            params.extend(categories)
        params.append(limit)

        sql = f"""
            SELECT t.tag_name, t.category, t.post_count
            FROM tags t
            WHERE t.tag_name IN (
                SELECT tag_name FROM tag_fts WHERE tag_fts MATCH ?
            )
            {category_clause}
            ORDER BY t.post_count DESC
            LIMIT ?
        """

        try:
            with self._lock:
                conn = self._connect(readonly=True)
                try:
                    cursor = conn.execute(sql, params)
                    return [
                        {
                            "tag_name": row[0],
                            "category": row[1],
                            "post_count": row[2],
                        }
                        for row in cursor.fetchall()
                    ]
                finally:
                    conn.close()
        except Exception as exc:
            logger.debug("Tag FTS search error for query '%s': %s", query, exc)
            return []

    def get_indexed_count(self) -> int:
        """Return the number of tags currently indexed (0 on any error)."""
        if not self._schema_initialized:
            return 0

        try:
            with self._lock:
                conn = self._connect(readonly=True)
                try:
                    cursor = conn.execute("SELECT COUNT(*) FROM tags")
                    result = cursor.fetchone()
                    return result[0] if result else 0
                finally:
                    conn.close()
        except Exception as exc:
            # Best effort: an unreadable index is reported as empty.
            logger.debug("Could not count indexed tags: %s", exc)
            return 0

    def clear(self) -> bool:
        """Clear all data from the FTS index.

        Returns:
            True if successful, False otherwise.
        """
        try:
            with self._lock:
                conn = self._connect()
                try:
                    conn.execute("DELETE FROM tag_fts")
                    conn.execute("DELETE FROM tags")
                    conn.commit()
                    self._ready.clear()
                    return True
                finally:
                    conn.close()
        except Exception as exc:
            logger.error("Failed to clear tag FTS index: %s", exc)
            return False

    # Internal helpers

    def _connect(self, readonly: bool = False) -> sqlite3.Connection:
        """Create a database connection.

        Args:
            readonly: Open the database in read-only mode via a ``file:`` URI.

        Raises:
            FileNotFoundError: If ``readonly`` is set and the database file
                does not exist.
        """
        uri = False
        path = self._db_path
        if readonly:
            if not os.path.exists(path):
                raise FileNotFoundError(path)
            # as_uri() percent-encodes characters such as "#", "?" and "%"
            # and produces a well-formed file URI on Windows; interpolating
            # the raw path would let those characters be parsed as URI syntax.
            path = f"{Path(path).resolve().as_uri()}?mode=ro"
            uri = True

        conn = sqlite3.connect(path, check_same_thread=False, uri=uri)
        conn.row_factory = sqlite3.Row
        return conn

    def _build_fts_query(self, query: str) -> str:
        """Build an FTS5 query string with prefix matching.

        The query is lower-cased and split on whitespace; each word is
        sanitised and given a trailing ``*``. Words that contain nothing
        searchable are dropped.

        Args:
            query: The user's search query.

        Returns:
            FTS5 query string, or "" when nothing searchable remains.
        """
        prefix_terms = []
        for word in query.lower().split():
            escaped = self._escape_fts_query(word)
            if escaped:
                # The wildcard makes the final token of the term a prefix match.
                prefix_terms.append(f"{escaped}*")

        # Terms are combined with implicit AND (all words must match).
        return " ".join(prefix_terms)

    def _escape_fts_query(self, text: str) -> str:
        """Strip characters that are not valid in an unquoted FTS5 term.

        Every run of ASCII characters other than letters, digits and
        underscore (including ``*``) is replaced by a single space, then
        whitespace is collapsed and trimmed. The caller appends ``*`` itself.
        """
        if not text:
            return ""

        cleaned = self._FTS_UNSAFE_CHARS.sub(" ", text)
        # Collapse any remaining (including non-ASCII) whitespace and strip.
        return " ".join(cleaned.split())
# Process-wide TagFTSIndex instance, created on first request, plus the lock
# that guards its creation.
_tag_fts_index: Optional[TagFTSIndex] = None
_tag_fts_lock = threading.Lock()


def get_tag_fts_index() -> TagFTSIndex:
    """Return the shared TagFTSIndex, constructing it on the first call."""
    global _tag_fts_index

    # Fast path: already created, no locking needed.
    if _tag_fts_index is not None:
        return _tag_fts_index

    with _tag_fts_lock:
        # Another caller may have created it while we waited for the lock.
        if _tag_fts_index is None:
            _tag_fts_index = TagFTSIndex()
    return _tag_fts_index
__all__ = [
"TagFTSIndex",
"get_tag_fts_index",
"CATEGORY_NAMES",
"CATEGORY_NAME_TO_IDS",
]