mirror of
https://github.com/willmiao/ComfyUI-Lora-Manager.git
synced 2026-07-05 17:01:16 -03:00
fix(agent): preserve preview URLs for collection repo models with flat heading structure
Three-part fix for enrich_hf_metadata failing to extract the correct preview_url from HuggingFace collection repos where models share flat heading levels: 1. _strip_standalone_images() now converts <img> tags to markdown image syntax (![alt](src)) instead of stripping the URL entirely, so the LLM can still extract preview URLs. 2. _extract_section() uses a line-count-based forward window (stopping at <a id> anchors) for non-heading matches, instead of stopping at the very next heading. This prevents same-level sub-headings (# Download, # Trigger, # Sample prompt within a single model section) from truncating the window before sample images are included. 3. Post-processor preview fallback now filters gallery images to the model-specific README section before falling back to the repo-wide first image.
This commit is contained in:
@@ -82,6 +82,7 @@ class PostProcessor:
|
||||
convert_readme_to_html,
|
||||
extract_gallery_images,
|
||||
extract_gallery_table_images,
|
||||
extract_relevant_section,
|
||||
extract_simple_markdown_images,
|
||||
extract_html_img_tags,
|
||||
extract_repo_from_hf_url,
|
||||
@@ -215,8 +216,21 @@ class PostProcessor:
|
||||
|
||||
preview_remote_url = (llm_output.get("preview_url") or "").strip()
|
||||
# Fallback: if the LLM couldn't find a preview image in the cleaned
|
||||
# README, use the first gallery image extracted from the YAML widget
|
||||
# section.
|
||||
# README, find the first gallery image from the *model-specific
|
||||
# section* of the README (not the repo-wide first image, which
|
||||
# belongs to a different model in collection repos).
|
||||
if not preview_remote_url and readme_content and is_hf_model:
|
||||
model_basename = os.path.splitext(os.path.basename(model_path))[0]
|
||||
relevant_section = extract_relevant_section(
|
||||
readme_content, model_basename,
|
||||
)
|
||||
if relevant_section and relevant_section != readme_content:
|
||||
for img in gallery_images:
|
||||
img_url = img.get("url", "")
|
||||
if img_url and img_url in relevant_section:
|
||||
preview_remote_url = img_url
|
||||
break
|
||||
# Last resort: use the first gallery image from the full README.
|
||||
if not preview_remote_url and gallery_images:
|
||||
preview_remote_url = gallery_images[0].get("url", "")
|
||||
current_preview = metadata.get("preview_url") or ""
|
||||
|
||||
@@ -495,20 +495,26 @@ def _strip_standalone_images(text: str) -> str:
|
||||
extract a ``preview_url`` from them. Only the alt text is needed for
|
||||
content signal; the URL is needed for image extraction.
|
||||
|
||||
HTML ``<img>`` tags on their own line are replaced by their alt text
|
||||
(if any) or removed, since the LLM has difficulty extracting URLs from
|
||||
raw HTML attributes.
|
||||
HTML ``<img>`` tags on their own line are **converted to markdown
|
||||
image syntax** ``![alt](src)`` so both the alt text and the image URL
|
||||
are preserved in a format the LLM can easily extract. Previously the
|
||||
URL was stripped entirely, making it impossible for the LLM to return
|
||||
a ``preview_url`` for repos that use HTML ``<img>`` tags exclusively.
|
||||
"""
|
||||
# HTML: ``<img src="..." alt="..." ...>`` on its own line → keep alt text
|
||||
text = re.sub(
|
||||
r'^\s*<img\s[^>]*alt="([^"]*)"[^>]*/?>(?:</img>)?\s*$',
|
||||
r"\1",
|
||||
text,
|
||||
flags=re.MULTILINE | re.IGNORECASE,
|
||||
)
|
||||
def _img_to_md(match: re.Match) -> str:
|
||||
"""Convert an ``<img>`` tag to markdown image syntax ``![alt](src)``."""
|
||||
tag = match.group(0)
|
||||
src_m = re.search(r'src="([^"]+)"', tag) or re.search(r"src='([^']+)'", tag)
|
||||
if not src_m:
|
||||
return ""
|
||||
src = src_m.group(1)
|
||||
alt_m = re.search(r'alt="([^"]*)"', tag) or re.search(r"alt='([^']*)'", tag)
|
||||
alt = alt_m.group(1) if alt_m else ""
|
||||
return f"![{alt}]({src})"
|
||||
|
||||
text = re.sub(
|
||||
r'^\s*<img\s[^>]+/?>(?:</img>)?\s*$',
|
||||
"",
|
||||
_img_to_md,
|
||||
text,
|
||||
flags=re.MULTILINE | re.IGNORECASE,
|
||||
)
|
||||
@@ -782,9 +788,16 @@ def _extract_section(
|
||||
|
||||
When *match_idx* is itself a heading line, the section starts *at*
|
||||
that heading (no backward walk), avoiding pulling in content from
|
||||
earlier sibling sections. The forward walk only stops at a heading
|
||||
of **equal or higher** level (e.g. a ``#`` match includes all its
|
||||
``##`` children).
|
||||
earlier sibling sections. The forward walk stops at a heading of
|
||||
**equal or higher** level (e.g. a ``# Title`` match includes all its
|
||||
``## Children``).
|
||||
|
||||
When *match_idx* is **not** a heading (e.g. a download link matched
|
||||
inside a sub-section like ``# Download``), the forward walk uses a
|
||||
generous line-count-based window instead of stopping at the very next
|
||||
heading. This prevents same-level sub-headings (e.g. ``# Download``,
|
||||
``# Trigger``, ``# Sample prompt`` within a single model section)
|
||||
from prematurely truncating the window before sample images.
|
||||
|
||||
Always includes the YAML frontmatter if the original lines contain one,
|
||||
because it carries critical metadata (``base_model``, ``tags``,
|
||||
@@ -808,17 +821,31 @@ def _extract_section(
|
||||
start = i
|
||||
break
|
||||
|
||||
# Walk forward. Stop at a heading of EQUAL or HIGHER (fewer #) level,
|
||||
# so that a ``# Title`` match encompasses all its ``## Children``.
|
||||
# Start from the full remaining lines so we don't truncate content
|
||||
# when the YAML frontmatter pushes the matched heading far down.
|
||||
# Walk forward.
|
||||
end = n
|
||||
walk_limit = min(n, match_idx + max(context_lines * 3, 120))
|
||||
for i in range(match_idx + 1, walk_limit):
|
||||
hl = _heading_level(lines[i])
|
||||
if hl > 0 and (match_level == 0 or hl <= match_level):
|
||||
end = i
|
||||
break
|
||||
if match_level == 0:
|
||||
# Non-heading match (e.g. a download link). Use a line-based
|
||||
# window so that same-level sub-headings (# Download, # Trigger,
|
||||
# # Sample prompt within a single model section) don't truncate
|
||||
# the window. Stop at the next <a id="..."> anchor (which
|
||||
# typically starts a new model section in collection repos), or
|
||||
# fall back to a generous line limit.
|
||||
forward_limit = min(n, match_idx + max(context_lines * 3, 250))
|
||||
for i in range(match_idx + 1, forward_limit):
|
||||
if re.search(r'<a\s+id="', lines[i], re.IGNORECASE):
|
||||
end = i
|
||||
break
|
||||
else:
|
||||
end = forward_limit
|
||||
else:
|
||||
# Heading match — stop at the next heading of equal or higher
|
||||
# level, so that a # Title encompasses all its ## Children.
|
||||
walk_limit = min(n, match_idx + max(context_lines * 3, 120))
|
||||
for i in range(match_idx + 1, walk_limit):
|
||||
hl = _heading_level(lines[i])
|
||||
if hl > 0 and hl <= match_level:
|
||||
end = i
|
||||
break
|
||||
|
||||
# If YAML frontmatter exists before the matched section, prepend it.
|
||||
if start > 0 and len(lines) > 1 and lines[0].strip() == "---":
|
||||
|
||||
@@ -803,10 +803,12 @@ pixel art sprite, game asset, transparent background
|
||||
assert "## Gallery" in result
|
||||
assert "Some text." in result
|
||||
|
||||
def test_html_img_tag_stripped(self):
|
||||
def test_html_img_tag_converted_to_markdown_image(self):
|
||||
"""``<img>`` converted to ``![](url)``, preserving URL for LLM."""
|
||||
md = '## Preview\n<img src="https://cdn.hf.co/img.webp"></img>\n\nDescription.'
|
||||
result = self._clean(md)
|
||||
assert "cdn.hf.co" not in result
|
||||
assert "![](https://cdn.hf.co/img.webp)" in result
|
||||
assert "cdn.hf.co" in result # URL preserved for LLM extraction
|
||||
assert "Description." in result
|
||||
|
||||
def test_inline_image_within_paragraph_preserved(self):
|
||||
|
||||
@@ -242,11 +242,12 @@ After"""
|
||||
cleaned = R.clean_readme_for_llm(text)
|
||||
assert "./preview.png" in cleaned
|
||||
|
||||
def test_strips_html_img_tag(self, R):
|
||||
"""``<img src="...">`` → stripped."""
|
||||
def test_converts_html_img_tag_to_markdown_image(self, R):
|
||||
"""``<img src="...">`` → ``![](...)`` preserving URL for LLM."""
|
||||
text = 'before\n<img src="logo.png">\nafter'
|
||||
cleaned = R.clean_readme_for_llm(text)
|
||||
assert "logo.png" not in cleaned
|
||||
assert "![](logo.png)" in cleaned
|
||||
assert "logo.png" in cleaned # URL preserved for LLM extraction
|
||||
|
||||
def test_widget_stripped_frontmatter_preserved(self, R):
|
||||
"""Widget YAML stripped but ``base_model:`` kept."""
|
||||
|
||||
Reference in New Issue
Block a user