fix(agent): preserve preview URLs for collection repo models with flat heading structure

Three-part fix for enrich_hf_metadata failing to extract the correct preview_url from HuggingFace collection repos where models share flat heading levels:

1. _strip_standalone_images() now converts standalone <img> tags to markdown image syntax ![alt](src) instead of stripping the URL entirely, so the LLM can still extract preview URLs.
2. _extract_section() uses a line-count-based forward window (stopping at <a id> anchors) for non-heading matches, instead of stopping at the very next heading. This prevents same-level sub-headings (# Download, # Trigger, # Sample prompt within a single model section) from truncating the window before sample images are included.
3. The post-processor preview fallback now filters gallery images to the model-specific README section before falling back to the repo-wide first image.
2026-07-05 17:01:16 -03:00 · 2026-07-05 17:05:47 +08:00
parent 5494a70f40
commit 7b19bbb14e
4 changed files with 75 additions and 31 deletions
--- a/py/services/agent/post_processor.py
+++ b/py/services/agent/post_processor.py
@@ -82,6 +82,7 @@ class PostProcessor:
            convert_readme_to_html,
            extract_gallery_images,
            extract_gallery_table_images,
+            extract_relevant_section,
            extract_simple_markdown_images,
            extract_html_img_tags,
            extract_repo_from_hf_url,
@@ -215,8 +216,21 @@ class PostProcessor:

        preview_remote_url = (llm_output.get("preview_url") or "").strip()
        # Fallback: if the LLM couldn't find a preview image in the cleaned
-        # README, use the first gallery image extracted from the YAML widget
-        # section.
+        # README, find the first gallery image from the *model-specific
+        # section* of the README (not the repo-wide first image, which
+        # may belong to a different model in collection repos).
+        if not preview_remote_url and readme_content and is_hf_model:
+            model_basename = os.path.splitext(os.path.basename(model_path))[0]
+            relevant_section = extract_relevant_section(
+                readme_content, model_basename,
+            )
+            if relevant_section and relevant_section != readme_content:
+                for img in gallery_images:
+                    img_url = img.get("url", "")
+                    if img_url and img_url in relevant_section:
+                        preview_remote_url = img_url
+                        break
+        # Last resort: use the first gallery image from the full README.
        if not preview_remote_url and gallery_images:
            preview_remote_url = gallery_images[0].get("url", "")
        current_preview = metadata.get("preview_url") or ""
--- a/py/services/agent/skills/enrich_hf_metadata/readme_processor.py
+++ b/py/services/agent/skills/enrich_hf_metadata/readme_processor.py
@@ -495,20 +495,26 @@ def _strip_standalone_images(text: str) -> str:
    extract a ``preview_url`` from them.  Only the alt text is needed for
    content signal; the URL is needed for image extraction.

-    HTML ``<img>`` tags on their own line are replaced by their alt text
-    (if any) or removed, since the LLM has difficulty extracting URLs from
-    raw HTML attributes.
+    HTML ``<img>`` tags on their own line are **converted to markdown
+    image syntax** ``![alt](src)`` so both the alt text and the image URL
+    are preserved in a format the LLM can easily extract.  Previously the
+    URL was stripped entirely, making it impossible for the LLM to return
+    a ``preview_url`` for repos that use HTML ``<img>`` tags exclusively.
    """
-    # HTML: ``<img src="..." alt="..." ...>`` on its own line → keep alt text
-    text = re.sub(
-        r'^\s*<img\s[^>]*alt="([^"]*)"[^>]*/?>(?:</img>)?\s*$',
-        r"\1",
-        text,
-        flags=re.MULTILINE | re.IGNORECASE,
-    )
+    def _img_to_md(match: re.Match) -> str:
+        """Convert an ``<img>`` tag to markdown image syntax ``![alt](src)``."""
+        tag = match.group(0)
+        src_m = re.search(r'src="([^"]+)"', tag) or re.search(r"src='([^']+)'", tag)
+        if not src_m:
+            return ""
+        src = src_m.group(1)
+        alt_m = re.search(r'alt="([^"]*)"', tag) or re.search(r"alt='([^']*)'", tag)
+        alt = alt_m.group(1) if alt_m else ""
+        return f"![{alt}]({src})"
+
    text = re.sub(
        r'^\s*<img\s[^>]+/?>(?:</img>)?\s*$',
-        "",
+        _img_to_md,
        text,
        flags=re.MULTILINE | re.IGNORECASE,
    )
@@ -782,9 +788,16 @@ def _extract_section(

    When *match_idx* is itself a heading line, the section starts *at*
    that heading (no backward walk), avoiding pulling in content from
-    earlier sibling sections.  The forward walk only stops at a heading
-    of **equal or higher** level (e.g. a ``#`` match includes all its
-    ``##`` children).
+    earlier sibling sections.  The forward walk stops at a heading of
+    **equal or higher** level (e.g. a ``# Title`` match includes all its
+    ``## Children``).
+
+    When *match_idx* is **not** a heading (e.g. a download link matched
+    inside a sub-section like ``# Download``), the forward walk uses a
+    generous line-count-based window instead of stopping at the very next
+    heading.  This prevents same-level sub-headings (e.g. ``# Download``,
+    ``# Trigger``, ``# Sample prompt`` within a single model section)
+    from prematurely truncating the window before sample images.

    Always includes the YAML frontmatter if the original lines contain one,
    because it carries critical metadata (``base_model``, ``tags``,
@@ -808,17 +821,31 @@ def _extract_section(
                start = i
                break

-    # Walk forward.  Stop at a heading of EQUAL or HIGHER (fewer #) level,
-    # so that a ``# Title`` match encompasses all its ``## Children``.
-    # Start from the full remaining lines so we don't truncate content
-    # when the YAML frontmatter pushes the matched heading far down.
+    # Walk forward.
    end = n
-    walk_limit = min(n, match_idx + max(context_lines * 3, 120))
-    for i in range(match_idx + 1, walk_limit):
-        hl = _heading_level(lines[i])
-        if hl > 0 and (match_level == 0 or hl <= match_level):
-            end = i
-            break
+    if match_level == 0:
+        # Non-heading match (e.g. a download link).  Use a line-based
+        # window so that same-level sub-headings (# Download, # Trigger,
+        # # Sample prompt within a single model section) don't truncate
+        # the window.  Stop at the next <a id="..."> anchor (which
+        # typically starts a new model section in collection repos), or
+        # fall back to a generous line limit.
+        forward_limit = min(n, match_idx + max(context_lines * 3, 250))
+        for i in range(match_idx + 1, forward_limit):
+            if re.search(r'<a\s+id="', lines[i], re.IGNORECASE):
+                end = i
+                break
+        else:
+            end = forward_limit
+    else:
+        # Heading match — stop at the next heading of equal or higher
+        # level, so that a # Title encompasses all its ## Children.
+        walk_limit = min(n, match_idx + max(context_lines * 3, 120))
+        for i in range(match_idx + 1, walk_limit):
+            hl = _heading_level(lines[i])
+            if hl > 0 and hl <= match_level:
+                end = i
+                break

    # If YAML frontmatter exists before the matched section, prepend it.
    if start > 0 and len(lines) > 1 and lines[0].strip() == "---":
--- a/tests/agent_cli/test_agent_cli.py
+++ b/tests/agent_cli/test_agent_cli.py
@@ -803,10 +803,12 @@ pixel art sprite, game asset, transparent background
        assert "## Gallery" in result
        assert "Some text." in result

-    def test_html_img_tag_stripped(self):
+    def test_html_img_tag_converted_to_markdown_image(self):
+        """``<img>`` converted to ``![](src)``, preserving URL for LLM."""
        md = '## Preview\n<img src="https://cdn.hf.co/img.webp"></img>\n\nDescription.'
        result = self._clean(md)
-        assert "cdn.hf.co" not in result
+        assert "![](https://cdn.hf.co/img.webp)" in result
+        assert "cdn.hf.co" in result  # URL preserved for LLM extraction
        assert "Description." in result

    def test_inline_image_within_paragraph_preserved(self):
--- a/tests/agent_cli/test_readme_processor.py
+++ b/tests/agent_cli/test_readme_processor.py
@@ -242,11 +242,12 @@ After"""
        cleaned = R.clean_readme_for_llm(text)
        assert "./preview.png" in cleaned

-    def test_strips_html_img_tag(self, R):
-        """``<img src="...">`` → stripped."""
+    def test_converts_html_img_tag_to_markdown_image(self, R):
+        """``<img src="...">`` → ``![](src)`` preserving URL for LLM."""
        text = 'before\n<img src="logo.png">\nafter'
        cleaned = R.clean_readme_for_llm(text)
-        assert "logo.png" not in cleaned
+        assert "![](logo.png)" in cleaned
+        assert "logo.png" in cleaned  # URL preserved for LLM extraction

    def test_widget_stripped_frontmatter_preserved(self, R):
        """Widget YAML stripped but ``base_model:`` kept."""