fix(agent): preserve preview URLs for collection repo models with flat heading structure

Three-part fix for enrich_hf_metadata failing to extract the correct preview_url from HuggingFace collection repos where models share flat heading levels:

1. _strip_standalone_images() now converts <img> tags to markdown image syntax ![alt](src) instead of stripping the URL entirely, so the LLM can still extract preview URLs.
2. _extract_section() uses a line-count-based forward window (stopping at <a id> anchors) for non-heading matches, instead of stopping at the very next heading. This prevents same-level sub-headings (# Download, # Trigger, # Sample prompt within a single model section) from truncating the window before sample images are included.
3. Post-processor preview fallback now filters gallery images to the model-specific README section before falling back to the repo-wide first image.
2026-07-05 17:01:16 -03:00 · 2026-07-05 17:05:47 +08:00
parent 5494a70f40
commit 7b19bbb14e
4 changed files with 75 additions and 31 deletions
--- a/py/services/agent/post_processor.py
+++ b/py/services/agent/post_processor.py
@@ -82,6 +82,7 @@ class PostProcessor:
            convert_readme_to_html,
            extract_gallery_images,
            extract_gallery_table_images,
            extract_relevant_section,
            extract_simple_markdown_images,
            extract_html_img_tags,
            extract_repo_from_hf_url,
@@ -215,8 +216,21 @@ class PostProcessor:
        preview_remote_url = (llm_output.get("preview_url") or "").strip()
        # Fallback: if the LLM couldn't find a preview image in the cleaned
-        # README, use the first gallery image extracted from the YAML widget
+        # README, find the first gallery image from the *model-specific
-        # section.
+        # section* of the README (not the repo-wide first image, which
        # belongs to a different model in collection repos).
        if not preview_remote_url and readme_content and is_hf_model:
            model_basename = os.path.splitext(os.path.basename(model_path))[0]
            relevant_section = extract_relevant_section(
                readme_content, model_basename,
            )
            if relevant_section and relevant_section != readme_content:
                for img in gallery_images:
                    img_url = img.get("url", "")
                    if img_url and img_url in relevant_section:
                        preview_remote_url = img_url
                        break
        # Last resort: use the first gallery image from the full README.
        if not preview_remote_url and gallery_images:
            preview_remote_url = gallery_images[0].get("url", "")
        current_preview = metadata.get("preview_url") or ""
--- a/py/services/agent/skills/enrich_hf_metadata/readme_processor.py
+++ b/py/services/agent/skills/enrich_hf_metadata/readme_processor.py
@@ -495,20 +495,26 @@ def _strip_standalone_images(text: str) -> str:
    extract a ``preview_url`` from them.  Only the alt text is needed for
    content signal; the URL is needed for image extraction.
-    HTML ``<img>`` tags on their own line are replaced by their alt text
+    HTML ``<img>`` tags on their own line are **converted to markdown
-    (if any) or removed, since the LLM has difficulty extracting URLs from
+    image syntax** ``![alt](src)`` so both the alt text and the image URL
-    raw HTML attributes.
+    are preserved in a format the LLM can easily extract.  Previously the
    URL was stripped entirely, making it impossible for the LLM to return
    a ``preview_url`` for repos that use HTML ``<img>`` tags exclusively.
    """
-    # HTML: ``<img src="..." alt="..." ...>`` on its own line → keep alt text
+    def _img_to_md(match: re.Match) -> str:
-    text = re.sub(
+        """Convert an ``<img>`` tag to markdown image syntax ``![alt](src)``."""
-        r'^\s*<img\s[^>]*alt="([^"]*)"[^>]*/?>(?:</img>)?\s*$',
+        tag = match.group(0)
-        r"\1",
+        src_m = re.search(r'src="([^"]+)"', tag) or re.search(r"src='([^']+)'", tag)
-        text,
+        if not src_m:
-        flags=re.MULTILINE | re.IGNORECASE,
+            return ""
-    )
+        src = src_m.group(1)
        alt_m = re.search(r'alt="([^"]*)"', tag) or re.search(r"alt='([^']*)'", tag)
        alt = alt_m.group(1) if alt_m else ""
        return f"![{alt}]({src})"
    text = re.sub(
        r'^\s*<img\s[^>]+/?>(?:</img>)?\s*$',
-        "",
+        _img_to_md,
        text,
        flags=re.MULTILINE | re.IGNORECASE,
    )
@@ -782,9 +788,16 @@ def _extract_section(
    When *match_idx* is itself a heading line, the section starts *at*
    that heading (no backward walk), avoiding pulling in content from
-    earlier sibling sections.  The forward walk only stops at a heading
+    earlier sibling sections.  The forward walk stops at a heading of
-    of **equal or higher** level (e.g. a ``#`` match includes all its
+    **equal or higher** level (e.g. a ``# Title`` match includes all its
-    ``##`` children).
+    ``## Children``).
    When *match_idx* is **not** a heading (e.g. a download link matched
    inside a sub-section like ``# Download``), the forward walk uses a
    generous line-count-based window instead of stopping at the very next
    heading.  This prevents same-level sub-headings (e.g. ``# Download``,
    ``# Trigger``, ``# Sample prompt`` within a single model section)
    from prematurely truncating the window before sample images.
    Always includes the YAML frontmatter if the original lines contain one,
    because it carries critical metadata (``base_model``, ``tags``,
@@ -808,15 +821,29 @@ def _extract_section(
                start = i
                break
-    # Walk forward.  Stop at a heading of EQUAL or HIGHER (fewer #) level,
+    # Walk forward.  A heading match stops at a heading of EQUAL or HIGHER level,
    # so that a ``# Title`` match encompasses all its ``## Children``.
    # Start from the full remaining lines so we don't truncate content
    # when the YAML frontmatter pushes the matched heading far down.
    end = n
    if match_level == 0:
        # Non-heading match (e.g. a download link).  Use a line-based
        # window so that same-level sub-headings (# Download, # Trigger,
        # # Sample prompt within a single model section) don't truncate
        # the window.  Stop at the next <a id="..."> anchor (which
        # typically starts a new model section in collection repos), or
        # fall back to a generous line limit.
        forward_limit = min(n, match_idx + max(context_lines * 3, 250))
        for i in range(match_idx + 1, forward_limit):
            if re.search(r'<a\s+id="', lines[i], re.IGNORECASE):
                end = i
                break
        else:
            end = forward_limit
    else:
        # Heading match — stop at the next heading of equal or higher
        # level, so that a # Title encompasses all its ## Children.
        walk_limit = min(n, match_idx + max(context_lines * 3, 120))
        for i in range(match_idx + 1, walk_limit):
            hl = _heading_level(lines[i])
-        if hl > 0 and (match_level == 0 or hl <= match_level):
+            if hl > 0 and hl <= match_level:
                end = i
                break
--- a/tests/agent_cli/test_agent_cli.py
+++ b/tests/agent_cli/test_agent_cli.py
@@ -803,10 +803,12 @@ pixel art sprite, game asset, transparent background
        assert "## Gallery" in result
        assert "Some text." in result
-    def test_html_img_tag_stripped(self):
+    def test_html_img_tag_converted_to_markdown_image(self):
        """``<img>`` converted to ``![](src)``, preserving URL for LLM."""
        md = '## Preview\n<img src="https://cdn.hf.co/img.webp"></img>\n\nDescription.'
        result = self._clean(md)
-        assert "cdn.hf.co" not in result
+        assert "![](https://cdn.hf.co/img.webp)" in result
        assert "cdn.hf.co" in result  # URL preserved for LLM extraction
        assert "Description." in result
    def test_inline_image_within_paragraph_preserved(self):
--- a/tests/agent_cli/test_readme_processor.py
+++ b/tests/agent_cli/test_readme_processor.py
@@ -242,11 +242,12 @@ After"""
        cleaned = R.clean_readme_for_llm(text)
        assert "./preview.png" in cleaned
-    def test_strips_html_img_tag(self, R):
+    def test_converts_html_img_tag_to_markdown_image(self, R):
-        """``<img src="...">`` → stripped."""
+        """``<img src="...">`` → ``![](src)`` preserving URL for LLM."""
        text = 'before\n<img src="logo.png">\nafter'
        cleaned = R.clean_readme_for_llm(text)
-        assert "logo.png" not in cleaned
+        assert "![](logo.png)" in cleaned
        assert "logo.png" in cleaned  # URL preserved for LLM extraction
    def test_widget_stripped_frontmatter_preserved(self, R):
        """Widget YAML stripped but ``base_model:`` kept."""