fix(agent): preserve preview URLs for collection repo models with flat heading structure

Three-part fix for enrich_hf_metadata failing to extract the correct
preview_url from HuggingFace collection repos whose model sections all
use the same (flat) heading level:

1. _strip_standalone_images() now converts <img> tags to markdown image
   syntax ![alt](src) instead of stripping the URL entirely, so the LLM
   can still extract preview URLs.

2. _extract_section() uses a line-count-based forward window (stopping at
   <a id> anchors) for non-heading matches, instead of stopping at the
   very next heading. This prevents same-level sub-headings (# Download,
   # Trigger, # Sample prompt within a single model section) from
   truncating the window before sample images are included.

3. Post-processor preview fallback now filters gallery images to the
   model-specific README section before falling back to the repo-wide
   first image.
This commit is contained in:
Will Miao
2026-07-05 17:05:47 +08:00
parent 5494a70f40
commit 7b19bbb14e
4 changed files with 75 additions and 31 deletions

View File

@@ -82,6 +82,7 @@ class PostProcessor:
convert_readme_to_html,
extract_gallery_images,
extract_gallery_table_images,
extract_relevant_section,
extract_simple_markdown_images,
extract_html_img_tags,
extract_repo_from_hf_url,
@@ -215,8 +216,21 @@ class PostProcessor:
preview_remote_url = (llm_output.get("preview_url") or "").strip()
# Fallback: if the LLM couldn't find a preview image in the cleaned
# README, use the first gallery image extracted from the YAML widget
# section.
# README, find the first gallery image from the *model-specific
# section* of the README (not the repo-wide first image, which
# belongs to a different model in collection repos).
if not preview_remote_url and readme_content and is_hf_model:
model_basename = os.path.splitext(os.path.basename(model_path))[0]
relevant_section = extract_relevant_section(
readme_content, model_basename,
)
if relevant_section and relevant_section != readme_content:
for img in gallery_images:
img_url = img.get("url", "")
if img_url and img_url in relevant_section:
preview_remote_url = img_url
break
# Last resort: use the first gallery image from the full README.
if not preview_remote_url and gallery_images:
preview_remote_url = gallery_images[0].get("url", "")
current_preview = metadata.get("preview_url") or ""

View File

@@ -495,20 +495,26 @@ def _strip_standalone_images(text: str) -> str:
extract a ``preview_url`` from them. Only the alt text is needed for
content signal; the URL is needed for image extraction.
HTML ``<img>`` tags on their own line are replaced by their alt text
(if any) or removed, since the LLM has difficulty extracting URLs from
raw HTML attributes.
HTML ``<img>`` tags on their own line are **converted to markdown
image syntax** ``![alt](src)`` so both the alt text and the image URL
are preserved in a format the LLM can easily extract. Previously the
URL was stripped entirely, making it impossible for the LLM to return
a ``preview_url`` for repos that use HTML ``<img>`` tags exclusively.
"""
# HTML: ``<img src="..." alt="..." ...>`` on its own line → keep alt text
text = re.sub(
r'^\s*<img\s[^>]*alt="([^"]*)"[^>]*/?>(?:</img>)?\s*$',
r"\1",
text,
flags=re.MULTILINE | re.IGNORECASE,
)
def _img_to_md(match: re.Match) -> str:
    """Convert a matched ``<img>`` tag to markdown image syntax ``![alt](src)``.

    Args:
        match: A regex match whose full text (``group(0)``) is the
            ``<img ...>`` tag to convert.

    Returns:
        ``![alt](src)`` built from the tag's ``src`` and ``alt`` attributes
        (``alt`` defaults to an empty string), or ``""`` when the tag has
        no quoted, non-empty ``src`` attribute.
    """
    tag = match.group(0)

    def _attr(name: str, allow_empty: bool):
        # Attribute names are looked up case-insensitively because the
        # enclosing substitution matches tags with re.IGNORECASE; a
        # case-sensitive lookup would turn ``<IMG SRC="...">`` into "" and
        # silently drop the image.  The look-behind keeps ``src`` from
        # matching inside longer names such as ``data-src``.
        quant = "*" if allow_empty else "+"
        found = re.search(
            rf"""(?<![\w-]){name}\s*=\s*(?:"([^"]{quant})"|'([^']{quant})')""",
            tag,
            re.IGNORECASE,
        )
        if not found:
            return None
        # Exactly one of the two groups participates (double vs single quotes).
        return found.group(1) if found.group(1) is not None else found.group(2)

    src = _attr("src", allow_empty=False)
    if src is None:
        return ""
    alt = _attr("alt", allow_empty=True) or ""
    return f"![{alt}]({src})"
text = re.sub(
r'^\s*<img\s[^>]+/?>(?:</img>)?\s*$',
"",
_img_to_md,
text,
flags=re.MULTILINE | re.IGNORECASE,
)
@@ -782,9 +788,16 @@ def _extract_section(
When *match_idx* is itself a heading line, the section starts *at*
that heading (no backward walk), avoiding pulling in content from
earlier sibling sections. The forward walk only stops at a heading
of **equal or higher** level (e.g. a ``#`` match includes all its
``##`` children).
earlier sibling sections. The forward walk stops at a heading of
**equal or higher** level (e.g. a ``# Title`` match includes all its
``## Children``).
When *match_idx* is **not** a heading (e.g. a download link matched
inside a sub-section like ``# Download``), the forward walk uses a
generous line-count-based window instead of stopping at the very next
heading. This prevents same-level sub-headings (e.g. ``# Download``,
``# Trigger``, ``# Sample prompt`` within a single model section)
from prematurely truncating the window before sample images.
Always includes the YAML frontmatter if the original lines contain one,
because it carries critical metadata (``base_model``, ``tags``,
@@ -808,15 +821,29 @@ def _extract_section(
start = i
break
# Walk forward. Stop at a heading of EQUAL or HIGHER (fewer #) level,
# so that a ``# Title`` match encompasses all its ``## Children``.
# Start from the full remaining lines so we don't truncate content
# when the YAML frontmatter pushes the matched heading far down.
# Walk forward.
end = n
if match_level == 0:
# Non-heading match (e.g. a download link). Use a line-based
# window so that same-level sub-headings (# Download, # Trigger,
# # Sample prompt within a single model section) don't truncate
# the window. Stop at the next <a id="..."> anchor (which
# typically starts a new model section in collection repos), or
# fall back to a generous line limit.
forward_limit = min(n, match_idx + max(context_lines * 3, 250))
for i in range(match_idx + 1, forward_limit):
if re.search(r'<a\s+id="', lines[i], re.IGNORECASE):
end = i
break
else:
end = forward_limit
else:
# Heading match — stop at the next heading of equal or higher
# level, so that a # Title encompasses all its ## Children.
walk_limit = min(n, match_idx + max(context_lines * 3, 120))
for i in range(match_idx + 1, walk_limit):
hl = _heading_level(lines[i])
if hl > 0 and (match_level == 0 or hl <= match_level):
if hl > 0 and hl <= match_level:
end = i
break

View File

@@ -803,10 +803,12 @@ pixel art sprite, game asset, transparent background
assert "## Gallery" in result
assert "Some text." in result
def test_html_img_tag_stripped(self):
def test_html_img_tag_converted_to_markdown_image(self):
"""``<img>`` converted to ``![](src)``, preserving URL for LLM."""
md = '## Preview\n<img src="https://cdn.hf.co/img.webp"></img>\n\nDescription.'
result = self._clean(md)
assert "cdn.hf.co" not in result
assert "![](https://cdn.hf.co/img.webp)" in result
assert "cdn.hf.co" in result # URL preserved for LLM extraction
assert "Description." in result
def test_inline_image_within_paragraph_preserved(self):

View File

@@ -242,11 +242,12 @@ After"""
cleaned = R.clean_readme_for_llm(text)
assert "./preview.png" in cleaned
def test_strips_html_img_tag(self, R):
"""``<img src="...">`` → stripped."""
def test_converts_html_img_tag_to_markdown_image(self, R):
"""``<img src="...">`` → ``![](src)`` preserving URL for LLM."""
text = 'before\n<img src="logo.png">\nafter'
cleaned = R.clean_readme_for_llm(text)
assert "logo.png" not in cleaned
assert "![](logo.png)" in cleaned
assert "logo.png" in cleaned # URL preserved for LLM extraction
def test_widget_stripped_frontmatter_preserved(self, R):
"""Widget YAML stripped but ``base_model:`` kept."""