From c9289ed2dc74eb5b446c5087c189ca8094638bf7 Mon Sep 17 00:00:00 2001 From: Will Miao <13051207myq@gmail.com> Date: Mon, 11 Aug 2025 17:13:21 +0800 Subject: [PATCH] fix: improve duplicate filename handling and logging in ModelScanner and ModelHashIndex --- py/services/model_hash_index.py | 31 ++++++++++++++++++------------- py/services/model_scanner.py | 22 ++++++++++++++++++++++ 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/py/services/model_hash_index.py b/py/services/model_hash_index.py index ac70f9ef..632d223d 100644 --- a/py/services/model_hash_index.py +++ b/py/services/model_hash_index.py @@ -31,29 +31,34 @@ class ModelHashIndex: if file_path not in self._duplicate_hashes.get(sha256, []): self._duplicate_hashes.setdefault(sha256, []).append(file_path) - # Track duplicates by filename + # Track duplicates by filename - FIXED LOGIC if filename in self._filename_to_hash: - old_hash = self._filename_to_hash[filename] - if old_hash != sha256: # Different models with the same name - old_path = self._hash_to_path.get(old_hash) - if old_path: - if filename not in self._duplicate_filenames: - self._duplicate_filenames[filename] = [old_path] - if file_path not in self._duplicate_filenames.get(filename, []): - self._duplicate_filenames.setdefault(filename, []).append(file_path) + existing_hash = self._filename_to_hash[filename] + existing_path = self._hash_to_path.get(existing_hash) + + # If this is a different file with the same filename + if existing_path and existing_path != file_path: + # Initialize duplicates tracking if needed + if filename not in self._duplicate_filenames: + self._duplicate_filenames[filename] = [existing_path] + + # Add current file to duplicates if not already present + if file_path not in self._duplicate_filenames[filename]: + self._duplicate_filenames[filename].append(file_path) # Remove old path mapping if hash exists if sha256 in self._hash_to_path: old_path = self._hash_to_path[sha256] old_filename = 
self._get_filename_from_path(old_path) - if old_filename in self._filename_to_hash: + if old_filename in self._filename_to_hash and self._filename_to_hash[old_filename] == sha256: del self._filename_to_hash[old_filename] - # Remove old hash mapping if filename exists + # Remove old hash mapping if filename exists and points to different hash if filename in self._filename_to_hash: old_hash = self._filename_to_hash[filename] - if old_hash in self._hash_to_path: - del self._hash_to_path[old_hash] + if old_hash != sha256 and old_hash in self._hash_to_path: + # Don't delete the old hash mapping, just update filename mapping + pass # Add new mappings self._hash_to_path[sha256] = file_path diff --git a/py/services/model_scanner.py b/py/services/model_scanner.py index 6fd16a82..dd995ee5 100644 --- a/py/services/model_scanner.py +++ b/py/services/model_scanner.py @@ -302,6 +302,13 @@ class ModelScanner: for tag in model_data['tags']: self._tags_count[tag] = self._tags_count.get(tag, 0) + 1 + # Log duplicate filename warnings after building the index + duplicate_filenames = self._hash_index.get_duplicate_filenames() + if duplicate_filenames: + logger.warning(f"Found {len(duplicate_filenames)} filename(s) with duplicates during {self.model_type} cache build:") + for filename, paths in duplicate_filenames.items(): + logger.warning(f"  Duplicate filename '{filename}': {paths}") + # Update cache self._cache.raw_data = raw_data loop.run_until_complete(self._cache.resort()) @@ -367,6 +374,13 @@ class ModelScanner: for tag in model_data['tags']: self._tags_count[tag] = self._tags_count.get(tag, 0) + 1 + # Log duplicate filename warnings after building the index + duplicate_filenames = self._hash_index.get_duplicate_filenames() + if duplicate_filenames: + logger.warning(f"Found {len(duplicate_filenames)} filename(s) with duplicates during {self.model_type} cache build:") + for filename, paths in duplicate_filenames.items(): + logger.warning(f"  Duplicate filename '{filename}': 
{paths}") + # Update cache self._cache = ModelCache( raw_data=raw_data, @@ -670,6 +684,14 @@ class ModelScanner: if model_data.get('exclude', False): self._excluded_models.append(model_data['file_path']) return None + + # Check for duplicate filename before adding to hash index + filename = os.path.splitext(os.path.basename(file_path))[0] + existing_hash = self._hash_index.get_hash_by_filename(filename) + if existing_hash and existing_hash != model_data.get('sha256', '').lower(): + existing_path = self._hash_index.get_path(existing_hash) + if existing_path and existing_path != file_path: + logger.warning(f"Duplicate filename detected: '{filename}' - files: '{existing_path}' and '{file_path}'") await self._fetch_missing_metadata(file_path, model_data) rel_path = os.path.relpath(file_path, root_path)