fix: improve duplicate filename handling and logging in ModelScanner and ModelHashIndex

This commit is contained in:
Will Miao
2025-08-11 17:13:21 +08:00
parent 96517cbdef
commit c9289ed2dc
2 changed files with 40 additions and 13 deletions

View File

@@ -31,29 +31,34 @@ class ModelHashIndex:
if file_path not in self._duplicate_hashes.get(sha256, []): if file_path not in self._duplicate_hashes.get(sha256, []):
self._duplicate_hashes.setdefault(sha256, []).append(file_path) self._duplicate_hashes.setdefault(sha256, []).append(file_path)
# Track duplicates by filename # Track duplicates by filename - FIXED LOGIC
if filename in self._filename_to_hash: if filename in self._filename_to_hash:
old_hash = self._filename_to_hash[filename] existing_hash = self._filename_to_hash[filename]
if old_hash != sha256: # Different models with the same name existing_path = self._hash_to_path.get(existing_hash)
old_path = self._hash_to_path.get(old_hash)
if old_path: # If this is a different file with the same filename
if filename not in self._duplicate_filenames: if existing_path and existing_path != file_path:
self._duplicate_filenames[filename] = [old_path] # Initialize duplicates tracking if needed
if file_path not in self._duplicate_filenames.get(filename, []): if filename not in self._duplicate_filenames:
self._duplicate_filenames.setdefault(filename, []).append(file_path) self._duplicate_filenames[filename] = [existing_path]
# Add current file to duplicates if not already present
if file_path not in self._duplicate_filenames[filename]:
self._duplicate_filenames[filename].append(file_path)
# Remove old path mapping if hash exists # Remove old path mapping if hash exists
if sha256 in self._hash_to_path: if sha256 in self._hash_to_path:
old_path = self._hash_to_path[sha256] old_path = self._hash_to_path[sha256]
old_filename = self._get_filename_from_path(old_path) old_filename = self._get_filename_from_path(old_path)
if old_filename in self._filename_to_hash: if old_filename in self._filename_to_hash and self._filename_to_hash[old_filename] == sha256:
del self._filename_to_hash[old_filename] del self._filename_to_hash[old_filename]
# Remove old hash mapping if filename exists # Remove old hash mapping if filename exists and points to different hash
if filename in self._filename_to_hash: if filename in self._filename_to_hash:
old_hash = self._filename_to_hash[filename] old_hash = self._filename_to_hash[filename]
if old_hash in self._hash_to_path: if old_hash != sha256 and old_hash in self._hash_to_path:
del self._hash_to_path[old_hash] # Don't delete the old hash mapping, just update filename mapping
pass
# Add new mappings # Add new mappings
self._hash_to_path[sha256] = file_path self._hash_to_path[sha256] = file_path

View File

@@ -302,6 +302,13 @@ class ModelScanner:
for tag in model_data['tags']: for tag in model_data['tags']:
self._tags_count[tag] = self._tags_count.get(tag, 0) + 1 self._tags_count[tag] = self._tags_count.get(tag, 0) + 1
# Log duplicate filename warnings after building the index
duplicate_filenames = self._hash_index.get_duplicate_filenames()
if duplicate_filenames:
logger.warning(f"Found {len(duplicate_filenames)} filename(s) with duplicates during {self.model_type} cache build:")
for filename, paths in duplicate_filenames.items():
logger.warning(f"  Duplicate filename '{filename}': {paths}")
# Update cache # Update cache
self._cache.raw_data = raw_data self._cache.raw_data = raw_data
loop.run_until_complete(self._cache.resort()) loop.run_until_complete(self._cache.resort())
@@ -367,6 +374,13 @@ class ModelScanner:
for tag in model_data['tags']: for tag in model_data['tags']:
self._tags_count[tag] = self._tags_count.get(tag, 0) + 1 self._tags_count[tag] = self._tags_count.get(tag, 0) + 1
# Log duplicate filename warnings after building the index
duplicate_filenames = self._hash_index.get_duplicate_filenames()
if duplicate_filenames:
logger.warning(f"Found {len(duplicate_filenames)} filename(s) with duplicates during {self.model_type} cache build:")
for filename, paths in duplicate_filenames.items():
logger.warning(f"  Duplicate filename '{filename}': {paths}")
# Update cache # Update cache
self._cache = ModelCache( self._cache = ModelCache(
raw_data=raw_data, raw_data=raw_data,
@@ -670,6 +684,14 @@ class ModelScanner:
if model_data.get('exclude', False): if model_data.get('exclude', False):
self._excluded_models.append(model_data['file_path']) self._excluded_models.append(model_data['file_path'])
return None return None
# Check for duplicate filename before adding to hash index
filename = os.path.splitext(os.path.basename(file_path))[0]
existing_hash = self._hash_index.get_hash_by_filename(filename)
if existing_hash and existing_hash != model_data.get('sha256', '').lower():
existing_path = self._hash_index.get_path(existing_hash)
if existing_path and existing_path != file_path:
logger.warning(f"Duplicate filename detected: '{filename}' - files: '{existing_path}' and '{file_path}'")
await self._fetch_missing_metadata(file_path, model_data) await self._fetch_missing_metadata(file_path, model_data)
rel_path = os.path.relpath(file_path, root_path) rel_path = os.path.relpath(file_path, root_path)