mirror of
https://github.com/willmiao/ComfyUI-Lora-Manager.git
synced 2026-07-05 17:01:16 -03:00
fix(agent): preserve preview URLs for collection repo models with flat heading structure
Three-part fix for enrich_hf_metadata failing to extract the correct preview_url from HuggingFace collection repos where models share flat heading levels: 1. _strip_standalone_images() now converts <img> tags to markdown image syntax instead of stripping the URL entirely, so the LLM can still extract preview URLs. 2. _extract_section() uses a line-count-based forward window (stopping at <a id> anchors) for non-heading matches, instead of stopping at the very next heading. This prevents same-level sub-headings (# Download, # Trigger, # Sample prompt within a single model section) from truncating the window before sample images are included. 3. The post-processor preview fallback now filters gallery images to the model-specific README section before falling back to the repo-wide first image.
This commit is contained in:
@@ -82,6 +82,7 @@ class PostProcessor:
|
|||||||
convert_readme_to_html,
|
convert_readme_to_html,
|
||||||
extract_gallery_images,
|
extract_gallery_images,
|
||||||
extract_gallery_table_images,
|
extract_gallery_table_images,
|
||||||
|
extract_relevant_section,
|
||||||
extract_simple_markdown_images,
|
extract_simple_markdown_images,
|
||||||
extract_html_img_tags,
|
extract_html_img_tags,
|
||||||
extract_repo_from_hf_url,
|
extract_repo_from_hf_url,
|
||||||
@@ -215,8 +216,21 @@ class PostProcessor:
|
|||||||
|
|
||||||
preview_remote_url = (llm_output.get("preview_url") or "").strip()
|
preview_remote_url = (llm_output.get("preview_url") or "").strip()
|
||||||
# Fallback: if the LLM couldn't find a preview image in the cleaned
|
# Fallback: if the LLM couldn't find a preview image in the cleaned
|
||||||
# README, use the first gallery image extracted from the YAML widget
|
# README, find the first gallery image from the *model-specific
|
||||||
# section.
|
# section* of the README (not the repo-wide first image, which
|
||||||
|
# belongs to a different model in collection repos).
|
||||||
|
if not preview_remote_url and readme_content and is_hf_model:
|
||||||
|
model_basename = os.path.splitext(os.path.basename(model_path))[0]
|
||||||
|
relevant_section = extract_relevant_section(
|
||||||
|
readme_content, model_basename,
|
||||||
|
)
|
||||||
|
if relevant_section and relevant_section != readme_content:
|
||||||
|
for img in gallery_images:
|
||||||
|
img_url = img.get("url", "")
|
||||||
|
if img_url and img_url in relevant_section:
|
||||||
|
preview_remote_url = img_url
|
||||||
|
break
|
||||||
|
# Last resort: use the first gallery image from the full README.
|
||||||
if not preview_remote_url and gallery_images:
|
if not preview_remote_url and gallery_images:
|
||||||
preview_remote_url = gallery_images[0].get("url", "")
|
preview_remote_url = gallery_images[0].get("url", "")
|
||||||
current_preview = metadata.get("preview_url") or ""
|
current_preview = metadata.get("preview_url") or ""
|
||||||
|
|||||||
@@ -495,20 +495,26 @@ def _strip_standalone_images(text: str) -> str:
|
|||||||
extract a ``preview_url`` from them. Only the alt text is needed for
|
extract a ``preview_url`` from them. Only the alt text is needed for
|
||||||
content signal; the URL is needed for image extraction.
|
content signal; the URL is needed for image extraction.
|
||||||
|
|
||||||
HTML ``<img>`` tags on their own line are replaced by their alt text
|
HTML ``<img>`` tags on their own line are **converted to markdown
|
||||||
(if any) or removed, since the LLM has difficulty extracting URLs from
|
image syntax** ``![alt](src)`` so both the alt text and the image URL
|
||||||
raw HTML attributes.
|
are preserved in a format the LLM can easily extract. Previously the
|
||||||
|
URL was stripped entirely, making it impossible for the LLM to return
|
||||||
|
a ``preview_url`` for repos that use HTML ``<img>`` tags exclusively.
|
||||||
"""
|
"""
|
||||||
# HTML: ``<img src="..." alt="..." ...>`` on its own line → keep alt text
|
def _img_to_md(match: re.Match) -> str:
|
||||||
text = re.sub(
|
"""Convert an ``<img>`` tag to markdown image syntax ``![alt](src)``."""
|
||||||
r'^\s*<img\s[^>]*alt="([^"]*)"[^>]*/?>(?:</img>)?\s*$',
|
tag = match.group(0)
|
||||||
r"\1",
|
src_m = re.search(r'src="([^"]+)"', tag) or re.search(r"src='([^']+)'", tag)
|
||||||
text,
|
if not src_m:
|
||||||
flags=re.MULTILINE | re.IGNORECASE,
|
return ""
|
||||||
)
|
src = src_m.group(1)
|
||||||
|
alt_m = re.search(r'alt="([^"]*)"', tag) or re.search(r"alt='([^']*)'", tag)
|
||||||
|
alt = alt_m.group(1) if alt_m else ""
|
||||||
|
return f"![{alt}]({src})"
|
||||||
|
|
||||||
text = re.sub(
|
text = re.sub(
|
||||||
r'^\s*<img\s[^>]+/?>(?:</img>)?\s*$',
|
r'^\s*<img\s[^>]+/?>(?:</img>)?\s*$',
|
||||||
"",
|
_img_to_md,
|
||||||
text,
|
text,
|
||||||
flags=re.MULTILINE | re.IGNORECASE,
|
flags=re.MULTILINE | re.IGNORECASE,
|
||||||
)
|
)
|
||||||
@@ -782,9 +788,16 @@ def _extract_section(
|
|||||||
|
|
||||||
When *match_idx* is itself a heading line, the section starts *at*
|
When *match_idx* is itself a heading line, the section starts *at*
|
||||||
that heading (no backward walk), avoiding pulling in content from
|
that heading (no backward walk), avoiding pulling in content from
|
||||||
earlier sibling sections. The forward walk only stops at a heading
|
earlier sibling sections. The forward walk stops at a heading of
|
||||||
of **equal or higher** level (e.g. a ``#`` match includes all its
|
**equal or higher** level (e.g. a ``# Title`` match includes all its
|
||||||
``##`` children).
|
``## Children``).
|
||||||
|
|
||||||
|
When *match_idx* is **not** a heading (e.g. a download link matched
|
||||||
|
inside a sub-section like ``# Download``), the forward walk uses a
|
||||||
|
generous line-count-based window instead of stopping at the very next
|
||||||
|
heading. This prevents same-level sub-headings (e.g. ``# Download``,
|
||||||
|
``# Trigger``, ``# Sample prompt`` within a single model section)
|
||||||
|
from prematurely truncating the window before sample images.
|
||||||
|
|
||||||
Always includes the YAML frontmatter if the original lines contain one,
|
Always includes the YAML frontmatter if the original lines contain one,
|
||||||
because it carries critical metadata (``base_model``, ``tags``,
|
because it carries critical metadata (``base_model``, ``tags``,
|
||||||
@@ -808,15 +821,29 @@ def _extract_section(
|
|||||||
start = i
|
start = i
|
||||||
break
|
break
|
||||||
|
|
||||||
# Walk forward. Stop at a heading of EQUAL or HIGHER (fewer #) level,
|
# Walk forward.
|
||||||
# so that a ``# Title`` match encompasses all its ``## Children``.
|
|
||||||
# Start from the full remaining lines so we don't truncate content
|
|
||||||
# when the YAML frontmatter pushes the matched heading far down.
|
|
||||||
end = n
|
end = n
|
||||||
|
if match_level == 0:
|
||||||
|
# Non-heading match (e.g. a download link). Use a line-based
|
||||||
|
# window so that same-level sub-headings (# Download, # Trigger,
|
||||||
|
# # Sample prompt within a single model section) don't truncate
|
||||||
|
# the window. Stop at the next <a id="..."> anchor (which
|
||||||
|
# typically starts a new model section in collection repos), or
|
||||||
|
# fall back to a generous line limit.
|
||||||
|
forward_limit = min(n, match_idx + max(context_lines * 3, 250))
|
||||||
|
for i in range(match_idx + 1, forward_limit):
|
||||||
|
if re.search(r'<a\s+id="', lines[i], re.IGNORECASE):
|
||||||
|
end = i
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
end = forward_limit
|
||||||
|
else:
|
||||||
|
# Heading match — stop at the next heading of equal or higher
|
||||||
|
# level, so that a # Title encompasses all its ## Children.
|
||||||
walk_limit = min(n, match_idx + max(context_lines * 3, 120))
|
walk_limit = min(n, match_idx + max(context_lines * 3, 120))
|
||||||
for i in range(match_idx + 1, walk_limit):
|
for i in range(match_idx + 1, walk_limit):
|
||||||
hl = _heading_level(lines[i])
|
hl = _heading_level(lines[i])
|
||||||
if hl > 0 and (match_level == 0 or hl <= match_level):
|
if hl > 0 and hl <= match_level:
|
||||||
end = i
|
end = i
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|||||||
@@ -803,10 +803,12 @@ pixel art sprite, game asset, transparent background
|
|||||||
assert "## Gallery" in result
|
assert "## Gallery" in result
|
||||||
assert "Some text." in result
|
assert "Some text." in result
|
||||||
|
|
||||||
def test_html_img_tag_stripped(self):
|
def test_html_img_tag_converted_to_markdown_image(self):
|
||||||
|
"""``<img>`` converted to ``![](url)``, preserving URL for LLM."""
|
||||||
md = '## Preview\n<img src="https://cdn.hf.co/img.webp"></img>\n\nDescription.'
|
md = '## Preview\n<img src="https://cdn.hf.co/img.webp"></img>\n\nDescription.'
|
||||||
result = self._clean(md)
|
result = self._clean(md)
|
||||||
assert "cdn.hf.co" not in result
|
assert "![](https://cdn.hf.co/img.webp)" in result
|
||||||
|
assert "cdn.hf.co" in result # URL preserved for LLM extraction
|
||||||
assert "Description." in result
|
assert "Description." in result
|
||||||
|
|
||||||
def test_inline_image_within_paragraph_preserved(self):
|
def test_inline_image_within_paragraph_preserved(self):
|
||||||
|
|||||||
@@ -242,11 +242,12 @@ After"""
|
|||||||
cleaned = R.clean_readme_for_llm(text)
|
cleaned = R.clean_readme_for_llm(text)
|
||||||
assert "./preview.png" in cleaned
|
assert "./preview.png" in cleaned
|
||||||
|
|
||||||
def test_strips_html_img_tag(self, R):
|
def test_converts_html_img_tag_to_markdown_image(self, R):
|
||||||
"""``<img src="...">`` → stripped."""
|
"""``<img src="...">`` → ``![](...)`` preserving URL for LLM."""
|
||||||
text = 'before\n<img src="logo.png">\nafter'
|
text = 'before\n<img src="logo.png">\nafter'
|
||||||
cleaned = R.clean_readme_for_llm(text)
|
cleaned = R.clean_readme_for_llm(text)
|
||||||
assert "logo.png" not in cleaned
|
assert "![](logo.png)" in cleaned
|
||||||
|
assert "logo.png" in cleaned # URL preserved for LLM extraction
|
||||||
|
|
||||||
def test_widget_stripped_frontmatter_preserved(self, R):
|
def test_widget_stripped_frontmatter_preserved(self, R):
|
||||||
"""Widget YAML stripped but ``base_model:`` kept."""
|
"""Widget YAML stripped but ``base_model:`` kept."""
|
||||||
|
|||||||
Reference in New Issue
Block a user