From 6f9245df0130ebc116dcdef20a230e6f1a64f18d Mon Sep 17 00:00:00 2001 From: Will Miao <13051207myq@gmail.com> Date: Mon, 15 Sep 2025 18:53:04 +0800 Subject: [PATCH] refactor(downloader): enhance download_to_memory to return response headers and improve error handling --- py/services/downloader.py | 27 ++-- py/utils/example_images_processor.py | 182 +++++++++++++++++---------- 2 files changed, 136 insertions(+), 73 deletions(-) diff --git a/py/services/downloader.py b/py/services/downloader.py index dd2c8c32..4f6b5f97 100644 --- a/py/services/downloader.py +++ b/py/services/downloader.py @@ -366,8 +366,9 @@ class Downloader: self, url: str, use_auth: bool = False, - custom_headers: Optional[Dict[str, str]] = None - ) -> Tuple[bool, Union[bytes, str]]: + custom_headers: Optional[Dict[str, str]] = None, + return_headers: bool = False + ) -> Tuple[bool, Union[bytes, str], Optional[Dict]]: """ Download a file to memory (for small files like preview images) @@ -375,9 +376,10 @@ class Downloader: url: Download URL use_auth: Whether to include authentication headers custom_headers: Additional headers to include in request + return_headers: Whether to return response headers along with content Returns: - Tuple[bool, Union[bytes, str]]: (success, content or error message) + Tuple[bool, Union[bytes, str], Optional[Dict]]: (success, content or error message, response headers if requested) """ try: session = await self.session @@ -395,19 +397,26 @@ class Downloader: async with session.get(url, headers=headers, proxy=self.proxy_url) as response: if response.status == 200: content = await response.read() - return True, content + if return_headers: + return True, content, dict(response.headers) + else: + return True, content, None elif response.status == 401: - return False, "Unauthorized access - invalid or missing API key" + error_msg = "Unauthorized access - invalid or missing API key" + return False, error_msg, None elif response.status == 403: - return False, "Access forbidden" + error_msg = "Access forbidden" + return False, error_msg, None elif response.status == 404: - return False, "File not found" + error_msg = "File not found" + return False, error_msg, None else: - return False, f"Download failed with status {response.status}" + error_msg = f"Download failed with status {response.status}" + return False, error_msg, None except Exception as e: logger.error(f"Error downloading to memory from {url}: {e}") - return False, str(e) + return False, str(e), None async def get_response_headers( self, diff --git a/py/utils/example_images_processor.py b/py/utils/example_images_processor.py index 9dba4e2c..f1cfd2bf 100644 --- a/py/utils/example_images_processor.py +++ b/py/utils/example_images_processor.py @@ -23,17 +23,60 @@ class ExampleImagesProcessor: return ''.join(random.choice(chars) for _ in range(length)) @staticmethod - def get_civitai_optimized_url(image_url): - """Convert Civitai image URL to its optimized WebP version""" + def get_civitai_optimized_url(media_url): + """Convert Civitai media URL (image or video) to its optimized version""" base_pattern = r'(https://image\.civitai\.com/[^/]+/[^/]+)' - match = re.match(base_pattern, image_url) + match = re.match(base_pattern, media_url) if match: base_url = match.group(1) - return f"{base_url}/optimized=true/image.webp" + return f"{base_url}/optimized=true" - return image_url + return media_url + @staticmethod + def _get_file_extension_from_content_or_headers(content, headers, fallback_url=None): + """Determine file extension from content magic bytes or headers""" + # Check magic bytes for common formats + if content: + if content.startswith(b'\xFF\xD8\xFF'): + return '.jpg' + elif content.startswith(b'\x89PNG\r\n\x1A\n'): + return '.png' + elif content.startswith(b'GIF87a') or content.startswith(b'GIF89a'): + return '.gif' + elif content.startswith(b'RIFF') and b'WEBP' in content[:12]: + return '.webp' + elif content.startswith(b'\x00\x00\x00\x18ftypmp4') or content.startswith(b'\x00\x00\x00\x20ftypmp4'): + return '.mp4' + elif content.startswith(b'\x1A\x45\xDF\xA3'): + return '.webm' + + # Check Content-Type header + if headers: + content_type = headers.get('content-type', '').lower() + type_map = { + 'image/jpeg': '.jpg', + 'image/png': '.png', + 'image/gif': '.gif', + 'image/webp': '.webp', + 'video/mp4': '.mp4', + 'video/webm': '.webm', + 'video/quicktime': '.mov' + } + if content_type in type_map: + return type_map[content_type] + + # Fallback to URL extension if available + if fallback_url: + filename = os.path.basename(fallback_url.split('?')[0]) + ext = os.path.splitext(filename)[1].lower() + if ext in SUPPORTED_MEDIA_EXTENSIONS['images'] or ext in SUPPORTED_MEDIA_EXTENSIONS['videos']: + return ext + + # Default fallback + return '.jpg' + @staticmethod async def download_model_images(model_hash, model_name, model_images, model_dir, optimize, downloader): """Download images for a single model @@ -48,45 +91,49 @@ class ExampleImagesProcessor: if not image_url: continue - # Get image filename from URL - image_filename = os.path.basename(image_url.split('?')[0]) - image_ext = os.path.splitext(image_filename)[1].lower() - - # Handle images and videos - is_image = image_ext in SUPPORTED_MEDIA_EXTENSIONS['images'] - is_video = image_ext in SUPPORTED_MEDIA_EXTENSIONS['videos'] - - if not (is_image or is_video): - logger.debug(f"Skipping unsupported file type: {image_filename}") - continue - - # Use 0-based indexing instead of 1-based indexing - save_filename = f"image_{i}{image_ext}" - - # If optimizing images and this is a Civitai image, use their pre-optimized WebP version - if is_image and optimize and 'civitai.com' in image_url: + # Apply optimization for Civitai URLs if enabled + original_url = image_url + if optimize and 'civitai.com' in image_url: image_url = ExampleImagesProcessor.get_civitai_optimized_url(image_url) - save_filename = f"image_{i}.webp" - # Check if already downloaded - save_path = os.path.join(model_dir, save_filename) - if os.path.exists(save_path): - logger.debug(f"File already exists: {save_path}") - continue - - # Download the file + # Download the file first to determine the actual file type try: - logger.debug(f"Downloading {save_filename} for {model_name}") + logger.debug(f"Downloading media file {i} for {model_name}") - # Download using the unified downloader - success, content = await downloader.download_to_memory( + # Download using the unified downloader with headers + success, content, headers = await downloader.download_to_memory( image_url, - use_auth=False # Example images don't need auth + use_auth=False, # Example images don't need auth + return_headers=True ) if success: + # Determine file extension from content or headers + media_ext = ExampleImagesProcessor._get_file_extension_from_content_or_headers( + content, headers, original_url + ) + + # Check if the detected file type is supported + is_image = media_ext in SUPPORTED_MEDIA_EXTENSIONS['images'] + is_video = media_ext in SUPPORTED_MEDIA_EXTENSIONS['videos'] + + if not (is_image or is_video): + logger.debug(f"Skipping unsupported file type: {media_ext}") + continue + + # Use 0-based indexing with the detected extension + save_filename = f"image_{i}{media_ext}" + save_path = os.path.join(model_dir, save_filename) + + # Check if already downloaded + if os.path.exists(save_path): + logger.debug(f"File already exists: {save_path}") + continue + + # Save the file with open(save_path, 'wb') as f: f.write(content) + elif "404" in str(content): error_msg = f"Failed to download file: {image_url}, status code: 404 - Model metadata might be stale" logger.warning(error_msg) @@ -119,45 +166,49 @@ class ExampleImagesProcessor: if not image_url: continue - # Get image filename from URL - image_filename = os.path.basename(image_url.split('?')[0]) - image_ext = os.path.splitext(image_filename)[1].lower() - - # Handle images and videos - is_image = image_ext in SUPPORTED_MEDIA_EXTENSIONS['images'] - is_video = image_ext in SUPPORTED_MEDIA_EXTENSIONS['videos'] - - if not (is_image or is_video): - logger.debug(f"Skipping unsupported file type: {image_filename}") - continue - - # Use 0-based indexing instead of 1-based indexing - save_filename = f"image_{i}{image_ext}" - - # If optimizing images and this is a Civitai image, use their pre-optimized WebP version - if is_image and optimize and 'civitai.com' in image_url: + # Apply optimization for Civitai URLs if enabled + original_url = image_url + if optimize and 'civitai.com' in image_url: image_url = ExampleImagesProcessor.get_civitai_optimized_url(image_url) - save_filename = f"image_{i}.webp" - # Check if already downloaded - save_path = os.path.join(model_dir, save_filename) - if os.path.exists(save_path): - logger.debug(f"File already exists: {save_path}") - continue - - # Download the file + # Download the file first to determine the actual file type try: - logger.debug(f"Downloading {save_filename} for {model_name}") + logger.debug(f"Downloading media file {i} for {model_name}") - # Download using the unified downloader - success, content = await downloader.download_to_memory( + # Download using the unified downloader with headers + success, content, headers = await downloader.download_to_memory( image_url, - use_auth=False # Example images don't need auth + use_auth=False, # Example images don't need auth + return_headers=True ) if success: + # Determine file extension from content or headers + media_ext = ExampleImagesProcessor._get_file_extension_from_content_or_headers( + content, headers, original_url + ) + + # Check if the detected file type is supported + is_image = media_ext in SUPPORTED_MEDIA_EXTENSIONS['images'] + is_video = media_ext in SUPPORTED_MEDIA_EXTENSIONS['videos'] + + if not (is_image or is_video): + logger.debug(f"Skipping unsupported file type: {media_ext}") + continue + + # Use 0-based indexing with the detected extension + save_filename = f"image_{i}{media_ext}" + save_path = os.path.join(model_dir, save_filename) + + # Check if already downloaded + if os.path.exists(save_path): + logger.debug(f"File already exists: {save_path}") + continue + + # Save the file with open(save_path, 'wb') as f: f.write(content) + elif "404" in str(content): error_msg = f"Failed to download file: {image_url}, status code: 404 - Model metadata might be stale" logger.warning(error_msg) @@ -569,4 +620,7 @@ class ExampleImagesProcessor: return web.json_response({ 'success': False, 'error': str(e) - }, status=500) \ No newline at end of file + }, status=500) + + + \ No newline at end of file