From c9289ed2dc74eb5b446c5087c189ca8094638bf7 Mon Sep 17 00:00:00 2001 From: Will Miao <13051207myq@gmail.com> Date: Mon, 11 Aug 2025 17:13:21 +0800 Subject: [PATCH] fix: improve duplicate filename handling and logging in ModelScanner and ModelHashIndex --- py/services/model_hash_index.py | 31 ++++++++++++++++++------------- py/services/model_scanner.py | 22 ++++++++++++++++++++++ 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/py/services/model_hash_index.py b/py/services/model_hash_index.py index ac70f9ef..632d223d 100644 --- a/py/services/model_hash_index.py +++ b/py/services/model_hash_index.py @@ -31,29 +31,34 @@ class ModelHashIndex: if file_path not in self._duplicate_hashes.get(sha256, []): self._duplicate_hashes.setdefault(sha256, []).append(file_path) - # Track duplicates by filename + # Track duplicates by filename - FIXED LOGIC if filename in self._filename_to_hash: - old_hash = self._filename_to_hash[filename] - if old_hash != sha256: # Different models with the same name - old_path = self._hash_to_path.get(old_hash) - if old_path: - if filename not in self._duplicate_filenames: - self._duplicate_filenames[filename] = [old_path] - if file_path not in self._duplicate_filenames.get(filename, []): - self._duplicate_filenames.setdefault(filename, []).append(file_path) + existing_hash = self._filename_to_hash[filename] + existing_path = self._hash_to_path.get(existing_hash) + + # If this is a different file with the same filename + if existing_path and existing_path != file_path: + # Initialize duplicates tracking if needed + if filename not in self._duplicate_filenames: + self._duplicate_filenames[filename] = [existing_path] + + # Add current file to duplicates if not already present + if file_path not in self._duplicate_filenames[filename]: + self._duplicate_filenames[filename].append(file_path) # Remove old path mapping if hash exists if sha256 in self._hash_to_path: old_path = self._hash_to_path[sha256] old_filename = 
self._get_filename_from_path(old_path) - if old_filename in self._filename_to_hash: + if old_filename in self._filename_to_hash and self._filename_to_hash[old_filename] == sha256: del self._filename_to_hash[old_filename] - # Remove old hash mapping if filename exists + # Remove old hash mapping if filename exists and points to different hash if filename in self._filename_to_hash: old_hash = self._filename_to_hash[filename] - if old_hash in self._hash_to_path: - del self._hash_to_path[old_hash] + if old_hash != sha256 and old_hash in self._hash_to_path: + # Don't delete the old hash mapping, just update filename mapping + pass # Add new mappings self._hash_to_path[sha256] = file_path diff --git a/py/services/model_scanner.py b/py/services/model_scanner.py index 6fd16a82..dd995ee5 100644 --- a/py/services/model_scanner.py +++ b/py/services/model_scanner.py @@ -302,6 +302,13 @@ class ModelScanner: for tag in model_data['tags']: self._tags_count[tag] = self._tags_count.get(tag, 0) + 1 + # Log duplicate filename warnings after building the index + duplicate_filenames = self._hash_index.get_duplicate_filenames() + if duplicate_filenames: + logger.warning(f"Found {len(duplicate_filenames)} filename(s) with duplicates during {self.model_type} cache build:") + for filename, paths in duplicate_filenames.items(): + logger.warning(f"  Duplicate filename '{filename}': {paths}") + # Update cache self._cache.raw_data = raw_data loop.run_until_complete(self._cache.resort()) @@ -367,6 +374,13 @@ class ModelScanner: for tag in model_data['tags']: self._tags_count[tag] = self._tags_count.get(tag, 0) + 1 + # Log duplicate filename warnings after building the index + duplicate_filenames = self._hash_index.get_duplicate_filenames() + if duplicate_filenames: + logger.warning(f"Found {len(duplicate_filenames)} filename(s) with duplicates during {self.model_type} cache build:") + for filename, paths in duplicate_filenames.items(): + logger.warning(f"  Duplicate filename '{filename}': 
{paths}") + # Update cache self._cache = ModelCache( raw_data=raw_data, @@ -670,6 +684,14 @@ class ModelScanner: if model_data.get('exclude', False): self._excluded_models.append(model_data['file_path']) return None + + # Check for duplicate filename before adding to hash index + filename = os.path.splitext(os.path.basename(file_path))[0] + existing_hash = self._hash_index.get_hash_by_filename(filename) + if existing_hash and existing_hash != model_data.get('sha256', '').lower(): + existing_path = self._hash_index.get_path(existing_hash) + if existing_path and existing_path != file_path: + logger.warning(f"Duplicate filename detected: '{filename}' - files: '{existing_path}' and '{file_path}'") await self._fetch_missing_metadata(file_path, model_data) rel_path = os.path.relpath(file_path, root_path)