feat(tag-search): add alias support to FTS index

- Add aliases column to tags table to store comma-separated alias lists
- Update FTS schema to version 2 with searchable_text field containing tag names and aliases
- Implement schema migration to rebuild index when upgrading from old schema
- Modify search logic to match aliases and return canonical tag with matched alias info
- Update index building to include aliases in searchable text for FTS matching

This enables users to search for tag aliases (e.g., "miku") and get results for the canonical tag (e.g., "hatsune_miku"), with an indication of which alias was matched.
This commit is contained in:
Will Miao
2026-01-27 00:36:06 +08:00
parent 0ff551551e
commit 5dc5fd5971
3 changed files with 392 additions and 28 deletions

View File

@@ -2,6 +2,10 @@
This module provides fast tag search using SQLite's FTS5 extension,
enabling sub-100ms search times for 221k+ Danbooru/e621 tags.
Supports alias search: when a user searches for an alias (e.g., "miku"),
the system returns the canonical tag (e.g., "hatsune_miku") and indicates
which alias was matched.
"""
from __future__ import annotations
@@ -20,6 +24,9 @@ from ..utils.cache_paths import CacheType, resolve_cache_path_with_migration
logger = logging.getLogger(__name__)
# Schema version for tracking migrations
SCHEMA_VERSION = 2 # Version 2: Added aliases support
# Category definitions for Danbooru and e621
CATEGORY_NAMES = {
@@ -131,19 +138,25 @@ class TagFTSIndex:
conn = self._connect()
try:
conn.execute("PRAGMA journal_mode=WAL")
# Check if we need to migrate from old schema
needs_rebuild = self._check_and_migrate_schema(conn)
conn.executescript("""
-- FTS5 virtual table for full-text search
-- searchable_text contains "tag_name alias1 alias2 ..." for alias matching
CREATE VIRTUAL TABLE IF NOT EXISTS tag_fts USING fts5(
tag_name,
searchable_text,
tokenize='unicode61 remove_diacritics 2'
);
-- Tags table with metadata
-- Tags table with metadata and aliases
CREATE TABLE IF NOT EXISTS tags (
rowid INTEGER PRIMARY KEY,
tag_name TEXT UNIQUE NOT NULL,
category INTEGER NOT NULL DEFAULT 0,
post_count INTEGER NOT NULL DEFAULT 0
post_count INTEGER NOT NULL DEFAULT 0,
aliases TEXT DEFAULT ''
);
-- Indexes for efficient filtering
@@ -156,19 +169,77 @@ class TagFTSIndex:
value TEXT
);
""")
# Set schema version
conn.execute(
"INSERT OR REPLACE INTO fts_metadata (key, value) VALUES (?, ?)",
("schema_version", str(SCHEMA_VERSION))
)
conn.commit()
self._schema_initialized = True
self._needs_rebuild = needs_rebuild
logger.debug("Tag FTS index schema initialized at %s", self._db_path)
finally:
conn.close()
except Exception as exc:
logger.error("Failed to initialize tag FTS schema: %s", exc)
def _check_and_migrate_schema(self, conn: sqlite3.Connection) -> bool:
    """Check the stored schema version and drop outdated tables if necessary.

    Reads the ``schema_version`` entry from ``fts_metadata``. When the
    entry is missing, unreadable, or lower than ``SCHEMA_VERSION``, the
    old tables are dropped via ``_drop_old_tables`` so that they can be
    recreated with the current layout.

    Args:
        conn: Open SQLite connection to the tag index database.

    Returns:
        True if the index needs to be rebuilt, False otherwise.
    """
    try:
        # Without the metadata table there is nothing to migrate.
        cursor = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='fts_metadata'"
        )
        if cursor.fetchone() is None:
            return False  # Fresh database, no migration needed

        row = conn.execute(
            "SELECT value FROM fts_metadata WHERE key='schema_version'"
        ).fetchone()

        current_version = None
        if row is not None:
            try:
                current_version = int(row[0])
            except (TypeError, ValueError):
                # A NULL or non-numeric version cannot be trusted; handle it
                # like a missing version instead of skipping the migration.
                logger.warning(
                    "Unreadable tag FTS schema version %r, rebuilding index", row[0]
                )

        if current_version is None:
            # Old schema without a usable version, needs rebuild
            logger.info(
                "Migrating tag FTS index to schema version %d (adding alias support)",
                SCHEMA_VERSION,
            )
        elif current_version < SCHEMA_VERSION:
            logger.info(
                "Migrating tag FTS index from version %d to %d",
                current_version,
                SCHEMA_VERSION,
            )
        else:
            return False

        self._drop_old_tables(conn)
        return True
    except Exception as exc:
        # Best-effort: a failed version check is logged and reported as
        # "no rebuild needed" rather than propagated to the caller.
        logger.warning("Error checking schema version: %s", exc)
        return False
def _drop_old_tables(self, conn: sqlite3.Connection) -> None:
    """Remove the ``tag_fts`` and ``tags`` tables ahead of a schema migration.

    Best-effort: any failure is logged as a warning and not re-raised.

    Args:
        conn: Open SQLite connection whose old tables should be dropped.
    """
    drop_script = """
        DROP TABLE IF EXISTS tag_fts;
        DROP TABLE IF EXISTS tags;
    """
    try:
        conn.executescript(drop_script)
        conn.commit()
    except Exception as err:
        logger.warning("Error dropping old tables: %s", err)
def build_index(self) -> None:
"""Build the FTS index from the CSV file.
This method parses the danbooru_e621_merged.csv file and creates
the FTS index for fast searching.
the FTS index for fast searching. The CSV format is:
tag_name,category,post_count,aliases
Where aliases is a comma-separated string (e.g., "miku,vocaloid_miku,39").
"""
if self._indexing_in_progress:
logger.warning("Tag FTS indexing already in progress, skipping")
@@ -201,6 +272,7 @@ class TagFTSIndex:
batch_size = 500
rows = []
total_inserted = 0
tags_with_aliases = 0
with open(self._csv_path, "r", encoding="utf-8") as f:
reader = csv.reader(f)
@@ -222,7 +294,12 @@ class TagFTSIndex:
except (ValueError, IndexError):
post_count = 0
rows.append((tag_name, category, post_count))
# Parse aliases from column 4 (if present)
aliases = row[3].strip() if len(row) >= 4 else ""
if aliases:
tags_with_aliases += 1
rows.append((tag_name, category, post_count, aliases))
if len(rows) >= batch_size:
self._insert_batch(conn, rows)
@@ -243,10 +320,17 @@ class TagFTSIndex:
"INSERT OR REPLACE INTO fts_metadata (key, value) VALUES (?, ?)",
("tag_count", str(total_inserted))
)
conn.execute(
"INSERT OR REPLACE INTO fts_metadata (key, value) VALUES (?, ?)",
("schema_version", str(SCHEMA_VERSION))
)
conn.commit()
elapsed = time.time() - start_time
logger.info("Tag FTS index built: %d tags indexed in %.2fs", total_inserted, elapsed)
logger.info(
"Tag FTS index built: %d tags indexed (%d with aliases) in %.2fs",
total_inserted, tags_with_aliases, elapsed
)
finally:
conn.close()
@@ -258,14 +342,22 @@ class TagFTSIndex:
self._indexing_in_progress = False
def _insert_batch(self, conn: sqlite3.Connection, rows: List[tuple]) -> None:
"""Insert a batch of rows into the database."""
# Insert into tags table
"""Insert a batch of rows into the database.
Each row is a tuple of (tag_name, category, post_count, aliases).
The FTS searchable_text is built as "tag_name alias1 alias2 ..." for alias matching.
"""
# Insert into tags table (with aliases)
conn.executemany(
"INSERT OR IGNORE INTO tags (tag_name, category, post_count) VALUES (?, ?, ?)",
"INSERT OR IGNORE INTO tags (tag_name, category, post_count, aliases) VALUES (?, ?, ?, ?)",
rows
)
# Get rowids and insert into FTS table
# Build a map of tag_name -> aliases for FTS insertion
aliases_map = {row[0]: row[3] for row in rows}
# Get rowids and insert into FTS table with explicit rowid
# to ensure tags.rowid matches tag_fts.rowid for JOINs
tag_names = [row[0] for row in rows]
placeholders = ",".join("?" * len(tag_names))
cursor = conn.execute(
@@ -273,9 +365,27 @@ class TagFTSIndex:
tag_names
)
fts_rows = [(tag_name,) for rowid, tag_name in cursor.fetchall()]
# Build FTS rows with (rowid, searchable_text) = (tags.rowid, "tag_name alias1 alias2 ...")
fts_rows = []
for rowid, tag_name in cursor.fetchall():
aliases = aliases_map.get(tag_name, "")
if aliases:
# Replace commas with spaces to create searchable text
# Strip "/" prefix from aliases as it's an FTS5 special character
alias_parts = []
for alias in aliases.split(","):
alias = alias.strip()
if alias.startswith("/"):
alias = alias[1:] # Remove leading slash
if alias:
alias_parts.append(alias)
searchable_text = f"{tag_name} {' '.join(alias_parts)}" if alias_parts else tag_name
else:
searchable_text = tag_name
fts_rows.append((rowid, searchable_text))
if fts_rows:
conn.executemany("INSERT INTO tag_fts (tag_name) VALUES (?)", fts_rows)
conn.executemany("INSERT INTO tag_fts (rowid, searchable_text) VALUES (?, ?)", fts_rows)
def ensure_ready(self) -> bool:
"""Ensure the index is ready, building if necessary.
@@ -289,6 +399,13 @@ class TagFTSIndex:
# Check if index already exists and has data
self.initialize()
if self._schema_initialized:
# Check if schema migration requires rebuild
if getattr(self, "_needs_rebuild", False):
logger.info("Schema migration requires index rebuild")
self._needs_rebuild = False
self.build_index()
return self.is_ready()
count = self.get_indexed_count()
if count > 0:
self._ready.set()
@@ -307,13 +424,17 @@ class TagFTSIndex:
) -> List[Dict]:
"""Search tags using FTS5 with prefix matching.
Supports alias search: if the query matches an alias rather than
the tag_name, the result will include a "matched_alias" field.
Args:
query: The search query string.
categories: Optional list of category IDs to filter by.
limit: Maximum number of results to return.
Returns:
List of dictionaries with tag_name, category, and post_count.
List of dictionaries with tag_name, category, post_count,
and optionally matched_alias.
"""
# Ensure index is ready (lazy initialization)
if not self.ensure_ready():
@@ -333,14 +454,15 @@ class TagFTSIndex:
with self._lock:
conn = self._connect(readonly=True)
try:
# Build the SQL query
# Build the SQL query - now also fetch aliases for matched_alias detection
# Use subquery for category filter to ensure FTS is evaluated first
if categories:
placeholders = ",".join("?" * len(categories))
sql = f"""
SELECT t.tag_name, t.category, t.post_count
SELECT t.tag_name, t.category, t.post_count, t.aliases
FROM tags t
WHERE t.tag_name IN (
SELECT tag_name FROM tag_fts WHERE tag_fts MATCH ?
WHERE t.rowid IN (
SELECT rowid FROM tag_fts WHERE searchable_text MATCH ?
)
AND t.category IN ({placeholders})
ORDER BY t.post_count DESC
@@ -349,11 +471,10 @@ class TagFTSIndex:
params = [fts_query] + categories + [limit]
else:
sql = """
SELECT t.tag_name, t.category, t.post_count
FROM tags t
WHERE t.tag_name IN (
SELECT tag_name FROM tag_fts WHERE tag_fts MATCH ?
)
SELECT t.tag_name, t.category, t.post_count, t.aliases
FROM tag_fts f
JOIN tags t ON f.rowid = t.rowid
WHERE f.searchable_text MATCH ?
ORDER BY t.post_count DESC
LIMIT ?
"""
@@ -362,11 +483,18 @@ class TagFTSIndex:
cursor = conn.execute(sql, params)
results = []
for row in cursor.fetchall():
results.append({
result = {
"tag_name": row[0],
"category": row[1],
"post_count": row[2],
})
}
# Check if search matched an alias rather than the tag_name
matched_alias = self._find_matched_alias(query, row[0], row[3])
if matched_alias:
result["matched_alias"] = matched_alias
results.append(result)
return results
finally:
conn.close()
@@ -374,6 +502,59 @@ class TagFTSIndex:
logger.debug("Tag FTS search error for query '%s': %s", query, exc)
return []
def _find_matched_alias(self, query: str, tag_name: str, aliases_str: str) -> Optional[str]:
    """Find which alias matched the query, if any.

    The check order is: a prefix match on the full tag name wins (no alias
    is reported); otherwise the first alias that starts with the query, or
    that contains a word starting with the query, is returned.

    Args:
        query: The original search query.
        tag_name: The canonical tag name.
        aliases_str: Comma-separated string of aliases (may be empty).

    Returns:
        The matched alias exactly as stored (including a leading "/" if
        present), or None when the query is empty, matches the tag name
        prefix, or matches no alias.
    """
    # Strip leading "/" from the query (the FTS index strips these from aliases)
    query_normalized = query.lower().strip().lstrip("/")
    if not query_normalized:
        return None

    # Query is a prefix of the canonical name: direct match, no alias needed
    if tag_name.lower().startswith(query_normalized):
        return None

    if not aliases_str:
        return None

    def split_words(text: str) -> List[str]:
        # Underscore-separated words, plus runs of alphanumeric characters.
        # The index uses the unicode61 tokenizer, which treats punctuation
        # such as "/", ":" or "-" as separators too, so an alias like
        # "fate/grand_order" must also yield the word "grand".
        words = text.replace("_", " ").split()
        words.extend("".join(ch if ch.isalnum() else " " for ch in text).split())
        return words

    for alias in aliases_str.split(","):
        alias = alias.strip()
        if not alias:
            continue

        # Normalize alias for comparison (strip leading slash)
        alias_normalized = alias.lower().lstrip("/")

        # Alias as a whole starts with the query
        if alias_normalized.startswith(query_normalized):
            return alias  # Return original alias (with "/" if present)

        # Any word within the alias starts with the query
        if any(word.startswith(query_normalized) for word in split_words(alias_normalized)):
            return alias

    # No alias matched: the hit came from a word inside tag_name (e.g. "hair"
    # for "long_hair") or cannot be attributed, so no alias indicator is needed.
    return None
def get_indexed_count(self) -> int:
"""Return the number of tags currently indexed."""
if not self._schema_initialized:
@@ -458,14 +639,15 @@ class TagFTSIndex:
def _escape_fts_query(self, text: str) -> str:
"""Escape special FTS5 characters.
FTS5 special characters: " ( ) * : ^ -
FTS5 special characters: " ( ) * : ^ - /
We keep * for prefix matching but escape others.
"""
if not text:
return ""
# Replace FTS5 special characters with space
special = ['"', "(", ")", "*", ":", "^", "-", "{", "}", "[", "]"]
# Note: "/" is special in FTS5 (column filter syntax), so we strip it
special = ['"', "(", ")", "*", ":", "^", "-", "{", "}", "[", "]", "/"]
result = text
for char in special:
result = result.replace(char, " ")