Merge pull request #562 from willmiao/incremental-cache, see #561

feat(model-scanner): add metadata tracking and improve cache management
Author: pixelpaws
Date: 2025-10-12 17:09:19 +08:00
Committed by: GitHub
3 changed files with 372 additions and 32 deletions


@@ -189,6 +189,7 @@ class ModelScanner:
'favorite': bool(get_value('favorite', False)),
'notes': notes,
'usage_tips': usage_tips,
'metadata_source': get_value('metadata_source', None),
'exclude': bool(get_value('exclude', False)),
'db_checked': bool(get_value('db_checked', False)),
'last_checked_at': float(get_value('last_checked_at', 0.0) or 0.0),
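The scanner-side change is simply threading the new field through with a None default. A hypothetical sketch of the lookup-with-default behaviour follows; get_value here is a stand-in, since the real helper is defined earlier in ModelScanner and not shown in this hunk:

# Hypothetical stand-in for the scanner's get_value helper (not the real implementation):
metadata = {"favorite": True, "notes": "example"}  # per-model metadata dict
def get_value(key, default=None):
    return metadata.get(key, default)

print(get_value("metadata_source", None))  # -> None when the field was never recorded
print(bool(get_value("favorite", False)))  # -> True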


@@ -25,6 +25,34 @@ class PersistentModelCache:
"""Persist core model metadata and hash index data in SQLite."""
_DEFAULT_FILENAME = "model_cache.sqlite"
_MODEL_COLUMNS: Tuple[str, ...] = (
"model_type",
"file_path",
"file_name",
"model_name",
"folder",
"size",
"modified",
"sha256",
"base_model",
"preview_url",
"preview_nsfw_level",
"from_civitai",
"favorite",
"notes",
"usage_tips",
"metadata_source",
"civitai_id",
"civitai_model_id",
"civitai_name",
"civitai_creator_username",
"trained_words",
"civitai_deleted",
"exclude",
"db_checked",
"last_checked_at",
)
_MODEL_UPDATE_COLUMNS: Tuple[str, ...] = _MODEL_COLUMNS[2:]
_instances: Dict[str, "PersistentModelCache"] = {}
_instance_lock = threading.Lock()
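Centralizing the schema in _MODEL_COLUMNS gives the generated SELECT, INSERT and UPDATE statements a single source of truth. The slice offsets follow the two key columns: index 0 is model_type and index 1 is file_path. A tiny standalone illustration of that convention, using a made-up, abridged column list (the real tuple has 25 entries):

MODEL_COLUMNS = ("model_type", "file_path", "file_name", "model_name")  # illustrative only
PER_TYPE_COLUMNS = MODEL_COLUMNS[1:]   # everything read back for one model_type
UPDATE_COLUMNS = MODEL_COLUMNS[2:]     # everything that may change for a given file_path
print(PER_TYPE_COLUMNS)  # ('file_path', 'file_name', 'model_name')
print(UPDATE_COLUMNS)    # ('file_name', 'model_name')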
@@ -64,12 +92,9 @@ class PersistentModelCache:
with self._db_lock:
conn = self._connect(readonly=True)
try:
model_columns_sql = ", ".join(self._MODEL_COLUMNS[1:])
rows = conn.execute(
"SELECT file_path, file_name, model_name, folder, size, modified, sha256, base_model,"
" preview_url, preview_nsfw_level, from_civitai, favorite, notes, usage_tips,"
" civitai_id, civitai_model_id, civitai_name, trained_words, exclude, db_checked,"
" last_checked_at"
" FROM models WHERE model_type = ?",
f"SELECT {model_columns_sql} FROM models WHERE model_type = ?",
(model_type,),
).fetchall()
@@ -101,8 +126,12 @@ class PersistentModelCache:
except json.JSONDecodeError:
trained_words = []
creator_username = row["civitai_creator_username"]
civitai: Optional[Dict] = None
civitai_has_data = any(
row[col] is not None for col in ("civitai_id", "civitai_model_id", "civitai_name")
) or trained_words or creator_username
if civitai_has_data:
civitai = {}
if row["civitai_id"] is not None:
civitai["id"] = row["civitai_id"]
@@ -112,6 +141,8 @@ class PersistentModelCache:
civitai["name"] = row["civitai_name"]
if trained_words:
civitai["trainedWords"] = trained_words
if creator_username:
civitai.setdefault("creator", {})["username"] = creator_username
item = {
"file_path": file_path,
@@ -128,11 +159,13 @@ class PersistentModelCache:
"favorite": bool(row["favorite"]),
"notes": row["notes"] or "",
"usage_tips": row["usage_tips"] or "",
"metadata_source": row["metadata_source"] or None,
"exclude": bool(row["exclude"]),
"db_checked": bool(row["db_checked"]),
"last_checked_at": row["last_checked_at"] or 0.0,
"tags": tags.get(file_path, []),
"civitai": civitai,
"civitai_deleted": bool(row["civitai_deleted"]),
}
raw_data.append(item)
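For orientation, one reconstructed cache item now looks roughly like this; the values are illustrative (borrowed from the round-trip test below), and the nested creator block is rebuilt from the flat civitai_creator_username column:

item = {
    "file_path": "/loras/example.safetensors",   # illustrative path
    "metadata_source": "civitai_api",            # new column, None when unknown
    "civitai_deleted": False,                    # new column, stored as INTEGER 0/1
    "civitai": {
        "id": 1,
        "modelId": 2,
        "name": "verA",
        "trainedWords": ["word1"],
        "creator": {"username": "artist42"},     # from civitai_creator_username
    },
    # ...plus the existing fields: file_name, model_name, folder, size, modified,
    # sha256, base_model, preview_url, preview_nsfw_level, from_civitai, favorite,
    # notes, usage_tips, exclude, db_checked, last_checked_at, tags
}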
@@ -159,45 +192,190 @@ class PersistentModelCache:
conn = self._connect()
try:
conn.execute("PRAGMA foreign_keys = ON")
conn.execute("DELETE FROM models WHERE model_type = ?", (model_type,))
conn.execute("DELETE FROM model_tags WHERE model_type = ?", (model_type,))
conn.execute("DELETE FROM hash_index WHERE model_type = ?", (model_type,))
conn.execute("DELETE FROM excluded_models WHERE model_type = ?", (model_type,))
conn.execute("BEGIN")
model_rows = [self._prepare_model_row(model_type, item) for item in raw_data]
model_map: Dict[str, Tuple] = {
row[1]: row for row in model_rows if row[1] # row[1] is file_path
}
existing_models = conn.execute(
"SELECT "
+ ", ".join(self._MODEL_COLUMNS[1:])
+ " FROM models WHERE model_type = ?",
(model_type,),
).fetchall()
existing_model_map: Dict[str, sqlite3.Row] = {
row["file_path"]: row for row in existing_models
}
to_remove_models = [
(model_type, path)
for path in existing_model_map.keys()
if path not in model_map
]
if to_remove_models:
conn.executemany(
"DELETE FROM models WHERE model_type = ? AND file_path = ?",
to_remove_models,
)
conn.executemany(
"DELETE FROM model_tags WHERE model_type = ? AND file_path = ?",
to_remove_models,
)
conn.executemany(
"DELETE FROM hash_index WHERE model_type = ? AND file_path = ?",
to_remove_models,
)
conn.executemany(
"DELETE FROM excluded_models WHERE model_type = ? AND file_path = ?",
to_remove_models,
)
insert_rows: List[Tuple] = []
update_rows: List[Tuple] = []
for file_path, row in model_map.items():
existing = existing_model_map.get(file_path)
if existing is None:
insert_rows.append(row)
continue
existing_values = tuple(
existing[column] for column in self._MODEL_COLUMNS[1:]
)
current_values = row[1:]
if existing_values != current_values:
update_rows.append(row[2:] + (model_type, file_path))
if insert_rows:
conn.executemany(self._insert_model_sql(), insert_rows)
if update_rows:
set_clause = ", ".join(
f"{column} = ?"
for column in self._MODEL_UPDATE_COLUMNS
)
update_sql = (
f"UPDATE models SET {set_clause} WHERE model_type = ? AND file_path = ?"
)
conn.executemany(update_sql, update_rows)
existing_tags_rows = conn.execute(
"SELECT file_path, tag FROM model_tags WHERE model_type = ?",
(model_type,),
).fetchall()
existing_tags: Dict[str, set] = {}
for row in existing_tags_rows:
existing_tags.setdefault(row["file_path"], set()).add(row["tag"])
new_tags: Dict[str, set] = {}
for item in raw_data:
file_path = item.get("file_path")
if not file_path:
continue
for tag in item.get("tags") or []:
tag_rows.append((model_type, file_path, tag))
if tag_rows:
tags = set(item.get("tags") or [])
if tags:
new_tags[file_path] = tags
tag_inserts: List[Tuple[str, str, str]] = []
tag_deletes: List[Tuple[str, str, str]] = []
all_tag_paths = set(existing_tags.keys()) | set(new_tags.keys())
for path in all_tag_paths:
existing_set = existing_tags.get(path, set())
new_set = new_tags.get(path, set())
to_add = new_set - existing_set
to_remove = existing_set - new_set
for tag in to_add:
tag_inserts.append((model_type, path, tag))
for tag in to_remove:
tag_deletes.append((model_type, path, tag))
if tag_deletes:
conn.executemany(
"DELETE FROM model_tags WHERE model_type = ? AND file_path = ? AND tag = ?",
tag_deletes,
)
if tag_inserts:
conn.executemany(
"INSERT INTO model_tags (model_type, file_path, tag) VALUES (?, ?, ?)",
tag_inserts,
)
existing_hash_rows = conn.execute(
"SELECT sha256, file_path FROM hash_index WHERE model_type = ?",
(model_type,),
).fetchall()
existing_hash_map: Dict[str, set] = {}
for row in existing_hash_rows:
sha_value = (row["sha256"] or "").lower()
if not sha_value:
continue
existing_hash_map.setdefault(sha_value, set()).add(row["file_path"])
new_hash_map: Dict[str, set] = {}
for sha_value, paths in hash_index.items():
normalized_sha = (sha_value or "").lower()
if not normalized_sha:
continue
bucket = new_hash_map.setdefault(normalized_sha, set())
for path in paths:
if path:
bucket.add(path)
hash_inserts: List[Tuple[str, str, str]] = []
hash_deletes: List[Tuple[str, str, str]] = []
all_shas = set(existing_hash_map.keys()) | set(new_hash_map.keys())
for sha_value in all_shas:
existing_paths = existing_hash_map.get(sha_value, set())
new_paths = new_hash_map.get(sha_value, set())
for path in existing_paths - new_paths:
hash_deletes.append((model_type, sha_value, path))
for path in new_paths - existing_paths:
hash_inserts.append((model_type, sha_value, path))
if hash_deletes:
conn.executemany(
"DELETE FROM hash_index WHERE model_type = ? AND sha256 = ? AND file_path = ?",
hash_deletes,
)
if hash_inserts:
conn.executemany(
"INSERT OR IGNORE INTO hash_index (model_type, sha256, file_path) VALUES (?, ?, ?)",
hash_inserts,
)
existing_excluded_rows = conn.execute(
"SELECT file_path FROM excluded_models WHERE model_type = ?",
(model_type,),
).fetchall()
existing_excluded = {row["file_path"] for row in existing_excluded_rows}
new_excluded = {path for path in excluded_models if path}
excluded_deletes = [
(model_type, path)
for path in existing_excluded - new_excluded
]
excluded_inserts = [
(model_type, path)
for path in new_excluded - existing_excluded
]
if excluded_deletes:
conn.executemany(
"DELETE FROM excluded_models WHERE model_type = ? AND file_path = ?",
excluded_deletes,
)
if excluded_inserts:
conn.executemany(
"INSERT OR IGNORE INTO excluded_models (model_type, file_path) VALUES (?, ?)",
excluded_inserts,
)
conn.commit()
finally:
conn.close()
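The shape of save_cache above is the diff-and-apply pattern: read what is already stored, compute per-row insert, update and delete sets, and only touch the rows that actually changed. A minimal, self-contained sketch of the same idea with a toy one-value payload; the table and columns here are illustrative, not the repo's schema:

import sqlite3
from typing import Dict, Tuple

def sync_rows(conn: sqlite3.Connection, new_rows: Dict[str, Tuple[str, int]]) -> None:
    # Snapshot what is currently stored, keyed by file_path.
    existing = {
        path: (path, size)
        for path, size in conn.execute("SELECT file_path, size FROM models")
    }
    # Delete only rows whose file_path vanished from the new snapshot.
    stale = [(path,) for path in existing if path not in new_rows]
    conn.executemany("DELETE FROM models WHERE file_path = ?", stale)
    # Insert rows for brand-new paths.
    inserts = [row for path, row in new_rows.items() if path not in existing]
    conn.executemany("INSERT INTO models (file_path, size) VALUES (?, ?)", inserts)
    # Update only rows whose values actually differ; identical rows are skipped.
    updates = [
        (row[1], path)
        for path, row in new_rows.items()
        if path in existing and existing[path] != row
    ]
    conn.executemany("UPDATE models SET size = ? WHERE file_path = ?", updates)
    conn.commit()

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE models (file_path TEXT PRIMARY KEY, size INTEGER)")
sync_rows(conn, {"a.safetensors": ("a.safetensors", 10)})   # inserts one row
sync_rows(conn, {"a.safetensors": ("a.safetensors", 12)})   # updates only the changed row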
@@ -248,10 +426,13 @@ class PersistentModelCache:
favorite INTEGER,
notes TEXT,
usage_tips TEXT,
metadata_source TEXT,
civitai_id INTEGER,
civitai_model_id INTEGER,
civitai_name TEXT,
civitai_creator_username TEXT,
trained_words TEXT,
civitai_deleted INTEGER,
exclude INTEGER,
db_checked INTEGER,
last_checked_at REAL,
@@ -279,11 +460,31 @@ class PersistentModelCache:
);
"""
)
self._ensure_additional_model_columns(conn)
conn.commit()
self._schema_initialized = True
except Exception as exc: # pragma: no cover - defensive guard
logger.warning("Failed to initialize persistent cache schema: %s", exc)
def _ensure_additional_model_columns(self, conn: sqlite3.Connection) -> None:
try:
existing_columns = {
row["name"]
for row in conn.execute("PRAGMA table_info(models)").fetchall()
}
except Exception: # pragma: no cover - defensive guard
return
required_columns = {
"metadata_source": "TEXT",
"civitai_creator_username": "TEXT",
"civitai_deleted": "INTEGER DEFAULT 0",
}
for column, definition in required_columns.items():
if column not in existing_columns:
conn.execute(f"ALTER TABLE models ADD COLUMN {column} {definition}")
def _connect(self, readonly: bool = False) -> sqlite3.Connection:
uri = False
path = self._db_path
@@ -306,6 +507,12 @@ class PersistentModelCache:
else:
trained_words_json = json.dumps(trained_words)
metadata_source = item.get("metadata_source") or None
creator_username = None
creator_data = civitai.get("creator") if isinstance(civitai, dict) else None
if isinstance(creator_data, dict):
creator_username = creator_data.get("username") or None
return (
model_type,
item.get("file_path"),
@@ -322,22 +529,22 @@ class PersistentModelCache:
1 if item.get("favorite") else 0,
item.get("notes"),
item.get("usage_tips"),
metadata_source,
civitai.get("id"),
civitai.get("modelId"),
civitai.get("name"),
creator_username,
trained_words_json,
1 if item.get("civitai_deleted") else 0,
1 if item.get("exclude") else 0,
1 if item.get("db_checked") else 0,
float(item.get("last_checked_at") or 0.0),
)
def _insert_model_sql(self) -> str:
columns = ", ".join(self._MODEL_COLUMNS)
placeholders = ", ".join(["?"] * len(self._MODEL_COLUMNS))
return f"INSERT INTO models ({columns}) VALUES ({placeholders})"
def _load_tags(self, conn: sqlite3.Connection, model_type: str) -> Dict[str, List[str]]:
tag_rows = conn.execute(


@@ -30,11 +30,19 @@ def test_persistent_cache_roundtrip(tmp_path: Path, monkeypatch) -> None:
'favorite': True,
'notes': 'note',
'usage_tips': '{}',
'metadata_source': 'civitai_api',
'exclude': False,
'db_checked': True,
'last_checked_at': 200.0,
'tags': ['alpha', 'beta'],
'civitai_deleted': False,
'civitai': {
'id': 1,
'modelId': 2,
'name': 'verA',
'trainedWords': ['word1'],
'creator': {'username': 'artist42'},
},
},
{
'file_path': file_b,
@@ -51,10 +59,12 @@ def test_persistent_cache_roundtrip(tmp_path: Path, monkeypatch) -> None:
'favorite': False,
'notes': '',
'usage_tips': '',
'metadata_source': None,
'exclude': True,
'db_checked': False,
'last_checked_at': 0.0,
'tags': [],
'civitai_deleted': True,
'civitai': None,
},
]
@@ -78,10 +88,15 @@ def test_persistent_cache_roundtrip(tmp_path: Path, monkeypatch) -> None:
assert first['civitai']['id'] == 1
assert first['civitai']['trainedWords'] == ['word1']
assert first['tags'] == ['alpha', 'beta']
assert first['metadata_source'] == 'civitai_api'
assert first['civitai']['creator']['username'] == 'artist42'
assert first['civitai_deleted'] is False
second = items[file_b]
assert second['exclude'] is True
assert second['civitai'] is None
assert second['metadata_source'] is None
assert second['civitai_deleted'] is True
expected_hash_pairs = {
('hash-a', file_a),
@@ -90,3 +105,120 @@ def test_persistent_cache_roundtrip(tmp_path: Path, monkeypatch) -> None:
}
assert set((sha, path) for sha, path in persisted.hash_rows) == expected_hash_pairs
assert persisted.excluded_models == excluded
def test_incremental_updates_only_touch_changed_rows(tmp_path: Path, monkeypatch) -> None:
monkeypatch.setenv('LORA_MANAGER_DISABLE_PERSISTENT_CACHE', '0')
db_path = tmp_path / 'cache.sqlite'
store = PersistentModelCache(db_path=str(db_path))
file_a = (tmp_path / 'a.txt').as_posix()
file_b = (tmp_path / 'b.txt').as_posix()
initial_payload = [
{
'file_path': file_a,
'file_name': 'a',
'model_name': 'Model A',
'folder': '',
'size': 10,
'modified': 100.0,
'sha256': 'hash-a',
'base_model': 'base',
'preview_url': '',
'preview_nsfw_level': 0,
'from_civitai': True,
'favorite': False,
'notes': '',
'usage_tips': '',
'metadata_source': None,
'exclude': False,
'db_checked': False,
'last_checked_at': 0.0,
'tags': ['alpha'],
'civitai_deleted': False,
'civitai': None,
},
{
'file_path': file_b,
'file_name': 'b',
'model_name': 'Model B',
'folder': '',
'size': 20,
'modified': 120.0,
'sha256': 'hash-b',
'base_model': '',
'preview_url': '',
'preview_nsfw_level': 0,
'from_civitai': False,
'favorite': False,
'notes': '',
'usage_tips': '',
'metadata_source': 'civarchive',
'exclude': False,
'db_checked': False,
'last_checked_at': 0.0,
'tags': ['beta'],
'civitai_deleted': False,
'civitai': {'creator': {'username': 'builder'}},
},
]
statements: list[str] = []
original_connect = store._connect
def _recording_connect(readonly: bool = False):
conn = original_connect(readonly=readonly)
conn.set_trace_callback(statements.append)
return conn
store._connect = _recording_connect # type: ignore[method-assign]
store.save_cache('dummy', initial_payload, {'hash-a': [file_a], 'hash-b': [file_b]}, [])
statements.clear()
updated_payload = [
initial_payload[0],
{
**initial_payload[1],
'model_name': 'Model B Updated',
'favorite': True,
'tags': ['beta', 'gamma'],
'metadata_source': 'archive_db',
'civitai_deleted': True,
'civitai': {'creator': {'username': 'builder_v2'}},
},
]
hash_index = {'hash-a': [file_a], 'hash-b': [file_b]}
store.save_cache('dummy', updated_payload, hash_index, [])
broad_delete = [
stmt for stmt in statements if "DELETE FROM models WHERE model_type = 'dummy'" in stmt and "file_path" not in stmt
]
assert not broad_delete
updated_stmt_present = any(
"UPDATE models" in stmt and f"file_path = '{file_b}'" in stmt for stmt in statements
)
assert updated_stmt_present
unchanged_stmt_present = any(
"UPDATE models" in stmt and f"file_path = '{file_a}'" in stmt for stmt in statements
)
assert not unchanged_stmt_present
tag_insert = any(
"INSERT INTO model_tags" in stmt and "gamma" in stmt for stmt in statements
)
assert tag_insert
assert any("metadata_source" in stmt for stmt in statements if "UPDATE models" in stmt)
persisted = store.load_cache('dummy')
assert persisted is not None
items = {item['file_path']: item for item in persisted.raw_data}
second = items[file_b]
assert second['metadata_source'] == 'archive_db'
assert second['civitai_deleted'] is True
assert second['civitai']['creator']['username'] == 'builder_v2'
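The incremental test relies on sqlite3's trace callback to capture every statement the connection executes, which is how it can assert that unchanged rows were never rewritten. A standalone illustration of that tracing trick (unrelated toy table, autocommit connection so no implicit BEGIN appears in the log):

import sqlite3

seen: list[str] = []
conn = sqlite3.connect(":memory:", isolation_level=None)
conn.set_trace_callback(seen.append)   # called with the text of each executed statement
conn.execute("CREATE TABLE t (x INTEGER)")
conn.execute("INSERT INTO t (x) VALUES (1)")
print(seen)  # -> ['CREATE TABLE t (x INTEGER)', 'INSERT INTO t (x) VALUES (1)']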