fix(agent): preserve preview URLs for collection repo models with flat heading structure

Three-part fix for enrich_hf_metadata failing to extract the correct
preview_url from HuggingFace collection repos where models share flat
heading levels:

1. _strip_standalone_images() now converts <img> tags to markdown image
   syntax ![alt](src) instead of stripping the URL entirely, so the LLM
   can still extract preview URLs.

2. _extract_section() uses a line-count-based forward window (stopping at
   <a id> anchors) for non-heading matches, instead of stopping at the
   very next heading. This prevents same-level sub-headings (# Download,
   # Trigger, # Sample prompt within a single model section) from
   truncating the window before sample images are included.

3. Post-processor preview fallback now first looks for a gallery image
   whose URL appears in the model-specific README section, and only
   falls back to the repo-wide first image if none is found.
This commit is contained in:
Will Miao
2026-07-05 17:05:47 +08:00
parent 5494a70f40
commit 7b19bbb14e
4 changed files with 75 additions and 31 deletions

View File

@@ -82,6 +82,7 @@ class PostProcessor:
convert_readme_to_html, convert_readme_to_html,
extract_gallery_images, extract_gallery_images,
extract_gallery_table_images, extract_gallery_table_images,
extract_relevant_section,
extract_simple_markdown_images, extract_simple_markdown_images,
extract_html_img_tags, extract_html_img_tags,
extract_repo_from_hf_url, extract_repo_from_hf_url,
@@ -215,8 +216,21 @@ class PostProcessor:
preview_remote_url = (llm_output.get("preview_url") or "").strip() preview_remote_url = (llm_output.get("preview_url") or "").strip()
# Fallback: if the LLM couldn't find a preview image in the cleaned # Fallback: if the LLM couldn't find a preview image in the cleaned
# README, use the first gallery image extracted from the YAML widget # README, find the first gallery image from the *model-specific
# section. # section* of the README (not the repo-wide first image, which
# belongs to a different model in collection repos).
if not preview_remote_url and readme_content and is_hf_model:
model_basename = os.path.splitext(os.path.basename(model_path))[0]
relevant_section = extract_relevant_section(
readme_content, model_basename,
)
if relevant_section and relevant_section != readme_content:
for img in gallery_images:
img_url = img.get("url", "")
if img_url and img_url in relevant_section:
preview_remote_url = img_url
break
# Last resort: use the first gallery image from the full README.
if not preview_remote_url and gallery_images: if not preview_remote_url and gallery_images:
preview_remote_url = gallery_images[0].get("url", "") preview_remote_url = gallery_images[0].get("url", "")
current_preview = metadata.get("preview_url") or "" current_preview = metadata.get("preview_url") or ""

View File

@@ -495,20 +495,26 @@ def _strip_standalone_images(text: str) -> str:
extract a ``preview_url`` from them. Only the alt text is needed for extract a ``preview_url`` from them. Only the alt text is needed for
content signal; the URL is needed for image extraction. content signal; the URL is needed for image extraction.
HTML ``<img>`` tags on their own line are replaced by their alt text HTML ``<img>`` tags on their own line are **converted to markdown
(if any) or removed, since the LLM has difficulty extracting URLs from image syntax** ``![alt](src)`` so both the alt text and the image URL
raw HTML attributes. are preserved in a format the LLM can easily extract. Previously the
URL was stripped entirely, making it impossible for the LLM to return
a ``preview_url`` for repos that use HTML ``<img>`` tags exclusively.
""" """
# HTML: ``<img src="..." alt="..." ...>`` on its own line → keep alt text def _img_to_md(match: re.Match) -> str:
text = re.sub( """Convert an ``<img>`` tag to markdown image syntax ``![alt](src)``."""
r'^\s*<img\s[^>]*alt="([^"]*)"[^>]*/?>(?:</img>)?\s*$', tag = match.group(0)
r"\1", src_m = re.search(r'src="([^"]+)"', tag) or re.search(r"src='([^']+)'", tag)
text, if not src_m:
flags=re.MULTILINE | re.IGNORECASE, return ""
) src = src_m.group(1)
alt_m = re.search(r'alt="([^"]*)"', tag) or re.search(r"alt='([^']*)'", tag)
alt = alt_m.group(1) if alt_m else ""
return f"![{alt}]({src})"
text = re.sub( text = re.sub(
r'^\s*<img\s[^>]+/?>(?:</img>)?\s*$', r'^\s*<img\s[^>]+/?>(?:</img>)?\s*$',
"", _img_to_md,
text, text,
flags=re.MULTILINE | re.IGNORECASE, flags=re.MULTILINE | re.IGNORECASE,
) )
@@ -782,9 +788,16 @@ def _extract_section(
When *match_idx* is itself a heading line, the section starts *at* When *match_idx* is itself a heading line, the section starts *at*
that heading (no backward walk), avoiding pulling in content from that heading (no backward walk), avoiding pulling in content from
earlier sibling sections. The forward walk only stops at a heading earlier sibling sections. The forward walk stops at a heading of
of **equal or higher** level (e.g. a ``#`` match includes all its **equal or higher** level (e.g. a ``# Title`` match includes all its
``##`` children). ``## Children``).
When *match_idx* is **not** a heading (e.g. a download link matched
inside a sub-section like ``# Download``), the forward walk uses a
generous line-count-based window instead of stopping at the very next
heading. This prevents same-level sub-headings (e.g. ``# Download``,
``# Trigger``, ``# Sample prompt`` within a single model section)
from prematurely truncating the window before sample images.
Always includes the YAML frontmatter if the original lines contain one, Always includes the YAML frontmatter if the original lines contain one,
because it carries critical metadata (``base_model``, ``tags``, because it carries critical metadata (``base_model``, ``tags``,
@@ -808,15 +821,29 @@ def _extract_section(
start = i start = i
break break
# Walk forward. Stop at a heading of EQUAL or HIGHER (fewer #) level, # Walk forward.
# so that a ``# Title`` match encompasses all its ``## Children``.
# Start from the full remaining lines so we don't truncate content
# when the YAML frontmatter pushes the matched heading far down.
end = n end = n
if match_level == 0:
# Non-heading match (e.g. a download link). Use a line-based
# window so that same-level sub-headings (# Download, # Trigger,
# # Sample prompt within a single model section) don't truncate
# the window. Stop at the next <a id="..."> anchor (which
# typically starts a new model section in collection repos), or
# fall back to a generous line limit.
forward_limit = min(n, match_idx + max(context_lines * 3, 250))
for i in range(match_idx + 1, forward_limit):
if re.search(r'<a\s+id="', lines[i], re.IGNORECASE):
end = i
break
else:
end = forward_limit
else:
# Heading match — stop at the next heading of equal or higher
# level, so that a # Title encompasses all its ## Children.
walk_limit = min(n, match_idx + max(context_lines * 3, 120)) walk_limit = min(n, match_idx + max(context_lines * 3, 120))
for i in range(match_idx + 1, walk_limit): for i in range(match_idx + 1, walk_limit):
hl = _heading_level(lines[i]) hl = _heading_level(lines[i])
if hl > 0 and (match_level == 0 or hl <= match_level): if hl > 0 and hl <= match_level:
end = i end = i
break break

View File

@@ -803,10 +803,12 @@ pixel art sprite, game asset, transparent background
assert "## Gallery" in result assert "## Gallery" in result
assert "Some text." in result assert "Some text." in result
def test_html_img_tag_stripped(self): def test_html_img_tag_converted_to_markdown_image(self):
"""``<img>`` converted to ``![](src)``, preserving URL for LLM."""
md = '## Preview\n<img src="https://cdn.hf.co/img.webp"></img>\n\nDescription.' md = '## Preview\n<img src="https://cdn.hf.co/img.webp"></img>\n\nDescription.'
result = self._clean(md) result = self._clean(md)
assert "cdn.hf.co" not in result assert "![](https://cdn.hf.co/img.webp)" in result
assert "cdn.hf.co" in result # URL preserved for LLM extraction
assert "Description." in result assert "Description." in result
def test_inline_image_within_paragraph_preserved(self): def test_inline_image_within_paragraph_preserved(self):

View File

@@ -242,11 +242,12 @@ After"""
cleaned = R.clean_readme_for_llm(text) cleaned = R.clean_readme_for_llm(text)
assert "./preview.png" in cleaned assert "./preview.png" in cleaned
def test_strips_html_img_tag(self, R): def test_converts_html_img_tag_to_markdown_image(self, R):
"""``<img src="...">`` → stripped.""" """``<img src="...">`` → ``![](src)`` preserving URL for LLM."""
text = 'before\n<img src="logo.png">\nafter' text = 'before\n<img src="logo.png">\nafter'
cleaned = R.clean_readme_for_llm(text) cleaned = R.clean_readme_for_llm(text)
assert "logo.png" not in cleaned assert "![](logo.png)" in cleaned
assert "logo.png" in cleaned # URL preserved for LLM extraction
def test_widget_stripped_frontmatter_preserved(self, R): def test_widget_stripped_frontmatter_preserved(self, R):
"""Widget YAML stripped but ``base_model:`` kept.""" """Widget YAML stripped but ``base_model:`` kept."""