From 719e18adb6e1cb4edcf793281a4d0a6738a401d5 Mon Sep 17 00:00:00 2001 From: Will Miao Date: Sat, 31 Jan 2026 19:39:37 +0800 Subject: [PATCH] feat(media): add media type hint support for file extension detection, fixes #795 and fixes #751 - Add optional `media_type_hint` parameter to `_get_file_extension_from_content_or_headers` method - When `media_type_hint` is "video" and no extension can be determined from content/headers/URL, default to `.mp4` - Pass image metadata type as hint in both `process_example_images` and `process_example_images_batch` methods - Add unit tests to verify media type hint behavior and priority --- py/utils/example_images_processor.py | 19 +++++++++++--- .../test_example_images_processor_unit.py | 25 +++++++++++++++++++ 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/py/utils/example_images_processor.py b/py/utils/example_images_processor.py index f20ddcf1..a44b0c61 100644 --- a/py/utils/example_images_processor.py +++ b/py/utils/example_images_processor.py @@ -43,8 +43,15 @@ class ExampleImagesProcessor: return media_url @staticmethod - def _get_file_extension_from_content_or_headers(content, headers, fallback_url=None): - """Determine file extension from content magic bytes or headers""" + def _get_file_extension_from_content_or_headers(content, headers, fallback_url=None, media_type_hint=None): + """Determine file extension from content magic bytes or headers + + Args: + content: File content bytes + headers: HTTP response headers + fallback_url: Original URL for extension extraction + media_type_hint: Optional media type hint from metadata (e.g., "video" or "image") + """ # Check magic bytes for common formats if content: if content.startswith(b'\xFF\xD8\xFF'): @@ -82,6 +89,10 @@ class ExampleImagesProcessor: if ext in SUPPORTED_MEDIA_EXTENSIONS['images'] or ext in SUPPORTED_MEDIA_EXTENSIONS['videos']: return ext + # Use media type hint from metadata if available + if media_type_hint == "video": + return '.mp4' + # Default fallback return '.jpg' @@ -136,7 +147,7 @@ class ExampleImagesProcessor: if success: # Determine file extension from content or headers media_ext = ExampleImagesProcessor._get_file_extension_from_content_or_headers( - content, headers, original_url + content, headers, original_url, image.get("type") ) # Check if the detected file type is supported @@ -219,7 +230,7 @@ class ExampleImagesProcessor: if success: # Determine file extension from content or headers media_ext = ExampleImagesProcessor._get_file_extension_from_content_or_headers( - content, headers, original_url + content, headers, original_url, image.get("type") ) # Check if the detected file type is supported diff --git a/tests/utils/test_example_images_processor_unit.py b/tests/utils/test_example_images_processor_unit.py index f79333b3..1b26accf 100644 --- a/tests/utils/test_example_images_processor_unit.py +++ b/tests/utils/test_example_images_processor_unit.py @@ -75,6 +75,31 @@ def test_get_file_extension_defaults_to_jpg() -> None: assert ext == ".jpg" +def test_get_file_extension_from_media_type_hint_video() -> None: + """Test that media_type_hint='video' returns .mp4 when other methods fail""" + ext = processor_module.ExampleImagesProcessor._get_file_extension_from_content_or_headers( + b"", {}, "https://c.genur.art/536be3c9-e506-4365-b078-bfbc5df9ceec", "video" + ) + assert ext == ".mp4" + + +def test_get_file_extension_from_media_type_hint_image() -> None: + """Test that media_type_hint='image' falls back to .jpg""" + ext = processor_module.ExampleImagesProcessor._get_file_extension_from_content_or_headers( + b"", {}, "https://example.com/no-extension", "image" + ) + assert ext == ".jpg" + + +def test_get_file_extension_media_type_hint_low_priority() -> None: + """Test that media_type_hint is only used as last resort (after URL extension)""" + # URL has extension, should use that instead of media_type_hint + ext = processor_module.ExampleImagesProcessor._get_file_extension_from_content_or_headers( + b"", {}, "https://example.com/video.mp4", "image" + ) + assert ext == ".mp4" + + class StubScanner: def __init__(self, models: list[Dict[str, Any]]) -> None: self._cache = SimpleNamespace(raw_data=models)