refactor(agent): rename md_to_html to readme_processor, fix section extraction, widget parsing, and list_base_models

- Rename md_to_html.py → readme_processor.py (file no longer just HTML conversion)
- _extract_section: include YAML frontmatter, use heading-level-aware forward
  walk (sub-headings under # are included), increase walk limit past 30 lines
- _is_heading: exclude </hN> closing tags from boundary detection
- _heading_level: new helper for heading-level-aware section matching
- css: yield 0 for heading-like closing tags, which were unexpectedly caught by _is_heading
- extract_gallery_images: fix YAML block scalar (text: >-) prompt extraction;
  use endswith instead of == to detect the block marker
- _strip_widget_section: add to clean_readme_for_llm (widget text is handled
  by post-processor, not needed in LLM prompt)
- _strip_standalone_images: keep markdown image URLs intact for LLM preview
  extraction (was stripping to alt text only)
- list_base_models: switch from scanner-cache aggregation to
  CivitaiBaseModelService.get_base_models() - always returns full list
- Ollama: add num_ctx=32768 to payload options so thinking models have room
  to both reason and produce output
- Add tests/agent_cli/test_readme_processor.py: 59 tests covering extraction,
  cleaning, section matching, heading detection
- Update existing tests for behavioral changes
This commit is contained in:
Will Miao
2026-07-05 06:39:54 +08:00
parent 905c37290f
commit dd3aa97d0a
8 changed files with 733 additions and 159 deletions

View File

@@ -113,38 +113,29 @@ async def identify_model_type(model_path: str) -> str:
async def list_base_models(limit: int = 0) -> List[str]: async def list_base_models(limit: int = 0) -> List[str]:
"""Return deduplicated base model names from all model caches. """Return all valid CivitAI base model names.
The result is ordered by frequency (most common first). Pass Uses ``CivitaiBaseModelService.get_base_models()`` which merges a
*limit* = 0 (default) for all models. hardcoded list (``SUPPORTED_DOWNLOAD_SKIP_BASE_MODELS``) with remote
models fetched from the CivitAI API. Normally non-empty thanks to the
hardcoded fallback, but an empty list is returned if the service call raises.
The order is whatever the service returns (no sorting is done here). Pass *limit* = 0 for all models.
""" """
from ..services.service_registry import ServiceRegistry from ..services.civitai_base_model_service import (
CivitaiBaseModelService,
)
counts: Dict[str, int] = {} try:
for getter_name in ( service = await CivitaiBaseModelService.get_instance()
"get_lora_scanner", response = await service.get_base_models()
"get_checkpoint_scanner", names: List[str] = response.get("models", [])
"get_embedding_scanner", except Exception as exc:
): logger.warning("list_base_models failed: %s", exc)
getter = getattr(ServiceRegistry, getter_name, None) names = []
if getter is None:
continue
try:
scanner = await getter()
if scanner is None:
continue
cache = await scanner.get_cached_data()
for entry in cache.raw_data:
bm = entry.get("base_model")
if bm:
counts[bm] = counts.get(bm, 0) + 1
except Exception as exc:
logger.debug("list_base_models scanner %s error: %s", getter_name, exc)
sorted_names = [name for name, _ in sorted(counts.items(), key=lambda x: -x[1])]
if limit > 0: if limit > 0:
return sorted_names[:limit] return names[:limit]
return sorted_names return names
async def read_metadata(model_path: str) -> Dict[str, Any]: async def read_metadata(model_path: str) -> Dict[str, Any]:

View File

@@ -31,7 +31,7 @@ from ..llm_service import LLMService
from ..websocket_manager import ws_manager from ..websocket_manager import ws_manager
from .post_processor import PostProcessor from .post_processor import PostProcessor
from .skill_registry import SkillRegistry from .skill_registry import SkillRegistry
from .skills.enrich_hf_metadata.md_to_html import ( from .skills.enrich_hf_metadata.readme_processor import (
clean_readme_for_llm, clean_readme_for_llm,
extract_relevant_section, extract_relevant_section,
) )
@@ -397,6 +397,10 @@ class AgentService:
cleaned = clean_readme_for_llm(readme) if readme else "" cleaned = clean_readme_for_llm(readme) if readme else ""
context["readme_content"] = cleaned if cleaned else "(README not available)" context["readme_content"] = cleaned if cleaned else "(README not available)"
context["readme_content_full"] = readme or "" context["readme_content_full"] = readme or ""
logger.info(
"Cleaned README for %s (%d chars): ---BEGIN---\n%s\n---END---",
repo, len(cleaned), cleaned[:800] if cleaned else "(empty)",
)
try: try:
context["base_models"] = await list_base_models() context["base_models"] = await list_base_models()

View File

@@ -78,7 +78,7 @@ class PostProcessor:
download_preview, download_preview,
refresh_cache, refresh_cache,
) )
from .skills.enrich_hf_metadata.md_to_html import ( from .skills.enrich_hf_metadata.readme_processor import (
convert_readme_to_html, convert_readme_to_html,
extract_gallery_images, extract_gallery_images,
extract_gallery_table_images, extract_gallery_table_images,

View File

@@ -1,13 +1,8 @@
"""Inline markdown-to-HTML converter and LLM-prompt cleaner for HF README content. """HF README processing for the ``enrich_hf_metadata`` skill.
No external dependencies. Strips YAML frontmatter, ``<Gallery />`` sections, Provides README cleaning for LLM injection, gallery/image extraction from
badge images, and HTML comments before rendering. Used by the multiple formats (YAML widget, markdown, HTML ``<img>``, gallery tables),
``enrich_hf_metadata`` feature. and section-based README trimming for collection repos.
Also provides :func:`clean_readme_for_llm` which pre-processes the raw README
before it is injected into the LLM prompt, removing content that has zero value
for metadata extraction (widget sections, code blocks, training tables,
boilerplate, massive lists, etc.).
""" """
from __future__ import annotations from __future__ import annotations
@@ -241,7 +236,26 @@ def extract_gallery_images(
if text_match: if text_match:
raw_text = text_match.group(1).strip().strip("'\"") raw_text = text_match.group(1).strip().strip("'\"")
if raw_text and raw_text != "-": if raw_text and raw_text != "-":
text = raw_text # Handle YAML block scalar markers (>-, >, |, |-) where the
# actual text lives on subsequent indented lines.
if raw_text in (">", ">-", "|", "|-"):
text_lines: list[str] = []
in_block = False
for line in entry.split("\n"):
stripped = line.strip()
if not in_block:
if stripped.endswith(raw_text):
in_block = True
continue
# Block content ends at a line with less indentation
# or a YAML key at the start of a line.
if not stripped or re.match(r"^\s*\w+:", line):
break
if stripped:
text_lines.append(stripped)
text = " ".join(text_lines)
else:
text = raw_text
if url: if url:
image: dict = { image: dict = {
@@ -439,6 +453,7 @@ def clean_readme_for_llm(markdown_text: str | None, max_length: int = 6000) -> s
# Order matters — broader strips first, then finer ones. # Order matters — broader strips first, then finer ones.
text = _strip_gallery(text) text = _strip_gallery(text)
text = _strip_widget_section(text)
text = _strip_fenced_code_blocks(text) text = _strip_fenced_code_blocks(text)
text = _strip_standalone_images(text) text = _strip_standalone_images(text)
text = _strip_training_tables(text) text = _strip_training_tables(text)
@@ -722,6 +737,18 @@ def _looks_like_download_link(line: str) -> bool:
return False return False
def _heading_level(line: str) -> int:
"""Return the heading level of *line* (1-4), or 0 if not a heading."""
stripped = line.strip()
m = re.match(r"^(#{1,4})\s", stripped)
if m:
return len(m.group(1))
m = re.match(r"^<h([1-4])(?:\s|>)", stripped, re.IGNORECASE)
if m:
return int(m.group(1))
return 0
def _extract_section( def _extract_section(
lines: list[str], match_idx: int, context_lines: int, lines: list[str], match_idx: int, context_lines: int,
) -> str: ) -> str:
@@ -729,15 +756,23 @@ def _extract_section(
When *match_idx* is itself a heading line, the section starts *at* When *match_idx* is itself a heading line, the section starts *at*
that heading (no backward walk), avoiding pulling in content from that heading (no backward walk), avoiding pulling in content from
earlier sibling sections. earlier sibling sections. The forward walk only stops at a heading
of **equal or higher** level (e.g. a ``#`` match includes all its
``##`` children).
Always includes the YAML frontmatter if the original lines contain one,
because it carries critical metadata (``base_model``, ``tags``,
``instance_prompt``) that the LLM needs regardless of which section
matches.
""" """
n = len(lines) n = len(lines)
# Determine start — if match is a heading, start right there # Determine start — if match is a heading, start right there
if _is_heading(lines[match_idx]): if _is_heading(lines[match_idx]):
start = match_idx start = match_idx
match_level = _heading_level(lines[match_idx])
else: else:
# Walk backward to find the nearest heading match_level = 0
start = max(0, match_idx - context_lines) start = max(0, match_idx - context_lines)
for i in range(match_idx - 1, max(-1, match_idx - context_lines * 3), -1): for i in range(match_idx - 1, max(-1, match_idx - context_lines * 3), -1):
if i < 0: if i < 0:
@@ -747,13 +782,25 @@ def _extract_section(
start = i start = i
break break
# Walk forward to find the next heading at same or higher level # Walk forward. Stop at a heading of EQUAL or HIGHER (fewer #) level,
end = min(n, match_idx + context_lines) # so that a ``# Title`` match encompasses all its ``## Children``.
for i in range(match_idx + 1, min(n, match_idx + context_lines * 3)): # Start from the full remaining lines so we don't truncate content
if _is_heading(lines[i]): # when the YAML frontmatter pushes the matched heading far down.
end = n
walk_limit = min(n, match_idx + max(context_lines * 3, 120))
for i in range(match_idx + 1, walk_limit):
hl = _heading_level(lines[i])
if hl > 0 and (match_level == 0 or hl <= match_level):
end = i end = i
break break
# If YAML frontmatter exists before the matched section, prepend it.
if start > 0 and len(lines) > 1 and lines[0].strip() == "---":
for i in range(1, min(start, len(lines))):
if lines[i].strip() == "---":
yaml_section = "\n".join(lines[:i+1])
return yaml_section + "\n" + "\n".join(lines[start:end])
return "\n".join(lines[start:end]) return "\n".join(lines[start:end])
@@ -801,6 +848,26 @@ def _strip_gallery(text: str) -> str:
return text return text
def _strip_widget_section(text: str) -> str:
"""Strip the ``widget:`` YAML block from the README frontmatter.
The widget section contains verbose example prompts (``text: >-`` entries)
that are useful for post-processor gallery image extraction but carry
no signal for LLM metadata extraction. Stripping them dramatically
reduces prompt size (e.g. 2800+ chars ~100 chars) and lets the LLM
focus on the actual YAML metadata fields (``base_model``, ``tags``,
``instance_prompt``, etc.).
"""
# Match widget: through the end of the frontmatter (the closing ---)
# or until the next YAML top-level key.
return re.sub(
r"\nwidget:.*?(?=\n\w+:|\n---)",
"",
text,
flags=re.DOTALL,
)
def _strip_badge_images(text: str) -> str: def _strip_badge_images(text: str) -> str:
badge_keywords = ( badge_keywords = (
"badge", "shield", "logo", "icon", "download", "license", "badge", "shield", "logo", "icon", "download", "license",

View File

@@ -364,6 +364,9 @@ class LLMService:
"think": False, "think": False,
"options": { "options": {
"temperature": temperature, "temperature": temperature,
# Allow up to 32K context so the model has room to think
# AND produce output without hitting the 4K default limit.
"num_ctx": 32768,
}, },
} }
if response_format is not None: if response_format is not None:
@@ -381,6 +384,16 @@ class LLMService:
if max_tokens is not None: if max_tokens is not None:
payload["max_tokens"] = max_tokens payload["max_tokens"] = max_tokens
if is_ollama:
logger.info(
"Ollama request: model=%s num_ctx=%s num_predict=%s format=%s think=%s",
payload.get("model"),
payload.get("options", {}).get("num_ctx"),
payload.get("options", {}).get("num_predict"),
payload.get("format", "none"),
payload.get("think"),
)
headers = self._build_headers(cfg["api_key"]) headers = self._build_headers(cfg["api_key"])
attempt = 0 attempt = 0
@@ -507,8 +520,23 @@ class LLMService:
) )
try: try:
return json.loads(result["content"]) parsed = json.loads(result["content"])
logger.info(
"LLM response base_model=%s tags=%s confidence=%s",
parsed.get("base_model", "?")[:50],
parsed.get("tags", []),
parsed.get("confidence", "?"),
)
logger.info(
"LLM raw content: %s",
(result.get("content") or "")[:1200],
)
return parsed
except (json.JSONDecodeError, TypeError) as exc: except (json.JSONDecodeError, TypeError) as exc:
logger.info(
"LLM raw response (first 800 chars): %s",
(result.get("content") or "")[:800],
)
logger.warning( logger.warning(
"LLM JSON parse failed on first attempt: %s. Retrying.", exc "LLM JSON parse failed on first attempt: %s. Retrying.", exc
) )

View File

@@ -50,78 +50,74 @@ class MockScanner:
class TestListBaseModels: class TestListBaseModels:
@pytest.mark.asyncio _MOCK_MODELS = ["SDXL 1.0", "Flux.1 D", "SD 1.5"]
async def test_empty_cache(self):
scanner = MockScanner([])
with mock.patch(
"py.services.service_registry.ServiceRegistry",
get_lora_scanner=mock.AsyncMock(return_value=scanner),
get_checkpoint_scanner=mock.AsyncMock(return_value=None),
get_embedding_scanner=mock.AsyncMock(return_value=None),
):
result = await list_base_models()
assert result == []
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_merges_all_scanners(self): async def test_returns_all_models(self):
lora_scanner = MockScanner([ """Verifies the function delegates to CivitaiBaseModelService.
{"base_model": "SDXL 1.0"},
{"base_model": "Flux.1 D"}, Uses a monkey-patch on ``get_instance`` to return a controlled mock
{"base_model": "SDXL 1.0"}, so we don't need to work around ``mock.patch``'s dotted-path
]) limitations with lazy imports inside function bodies."""
ckpt_scanner = MockScanner([ import py.services.civitai_base_model_service as _svc
{"base_model": "SDXL 1.0"}, orig = _svc.CivitaiBaseModelService.get_instance
{"base_model": "SD 1.5"}, mock_svc = mock.AsyncMock()
]) mock_svc.get_base_models.return_value = {
with mock.patch( "models": self._MOCK_MODELS,
"py.services.service_registry.ServiceRegistry", }
get_lora_scanner=mock.AsyncMock(return_value=lora_scanner), _svc.CivitaiBaseModelService.get_instance = mock.AsyncMock(
get_checkpoint_scanner=mock.AsyncMock(return_value=ckpt_scanner), return_value=mock_svc,
get_embedding_scanner=mock.AsyncMock(return_value=None), )
): try:
result = await list_base_models() result = await list_base_models()
assert result == ["SDXL 1.0", "Flux.1 D", "SD 1.5"] assert result == self._MOCK_MODELS
finally:
_svc.CivitaiBaseModelService.get_instance = orig
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_limit(self): async def test_limit(self):
scanner = MockScanner([ import py.services.civitai_base_model_service as _svc
{"base_model": "A"}, {"base_model": "B"}, {"base_model": "C"}, orig = _svc.CivitaiBaseModelService.get_instance
]) mock_svc = mock.AsyncMock()
with mock.patch( mock_svc.get_base_models.return_value = {"models": ["A", "B", "C"]}
"py.services.service_registry.ServiceRegistry", _svc.CivitaiBaseModelService.get_instance = mock.AsyncMock(
get_lora_scanner=mock.AsyncMock(return_value=scanner), return_value=mock_svc,
get_checkpoint_scanner=mock.AsyncMock(return_value=None), )
get_embedding_scanner=mock.AsyncMock(return_value=None), try:
):
result = await list_base_models(limit=2) result = await list_base_models(limit=2)
assert result == ["A", "B"] assert result == ["A", "B"]
finally:
_svc.CivitaiBaseModelService.get_instance = orig
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_all_scanners_return_none(self): async def test_empty_list_when_service_returns_empty(self):
with mock.patch( import py.services.civitai_base_model_service as _svc
"py.services.service_registry.ServiceRegistry", orig = _svc.CivitaiBaseModelService.get_instance
get_lora_scanner=mock.AsyncMock(return_value=None), mock_svc = mock.AsyncMock()
get_checkpoint_scanner=mock.AsyncMock(return_value=None), mock_svc.get_base_models.return_value = {"models": []}
get_embedding_scanner=mock.AsyncMock(return_value=None), _svc.CivitaiBaseModelService.get_instance = mock.AsyncMock(
): return_value=mock_svc,
)
try:
result = await list_base_models() result = await list_base_models()
assert result == [] assert result == []
finally:
_svc.CivitaiBaseModelService.get_instance = orig
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_skips_empty_or_missing_base_model(self): async def test_handles_exception(self):
scanner = MockScanner([ import py.services.civitai_base_model_service as _svc
{"base_model": "SDXL 1.0"}, orig = _svc.CivitaiBaseModelService.get_instance
{"file_name": "foo.safetensors"}, # no base_model key mock_svc = mock.AsyncMock()
{"base_model": ""}, # empty mock_svc.get_base_models.side_effect = RuntimeError("API error")
]) _svc.CivitaiBaseModelService.get_instance = mock.AsyncMock(
with mock.patch( return_value=mock_svc,
"py.services.service_registry.ServiceRegistry", )
get_lora_scanner=mock.AsyncMock(return_value=scanner), try:
get_checkpoint_scanner=mock.AsyncMock(return_value=None),
get_embedding_scanner=mock.AsyncMock(return_value=None),
):
result = await list_base_models() result = await list_base_models()
assert result == ["SDXL 1.0"] assert result == []
finally:
_svc.CivitaiBaseModelService.get_instance = orig
# ====================================================================== # ======================================================================
@@ -326,21 +322,21 @@ class TestConvertReadmeToHtml:
"""Tests for the inline markdown→HTML converter.""" """Tests for the inline markdown→HTML converter."""
def test_empty_input(self): def test_empty_input(self):
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
convert_readme_to_html convert_readme_to_html
assert convert_readme_to_html("") == "" assert convert_readme_to_html("") == ""
assert convert_readme_to_html(None) == "" # type: ignore[arg-type] assert convert_readme_to_html(None) == "" # type: ignore[arg-type]
def test_heading(self): def test_heading(self):
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
convert_readme_to_html convert_readme_to_html
result = convert_readme_to_html("# Title") result = convert_readme_to_html("# Title")
assert "<h1>" in result and "Title" in result assert "<h1>" in result and "Title" in result
def test_subheadings(self): def test_subheadings(self):
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
convert_readme_to_html convert_readme_to_html
md = "## Overview\n\n### Details" md = "## Overview\n\n### Details"
@@ -349,7 +345,7 @@ class TestConvertReadmeToHtml:
assert "<h3>Details</h3>" in result assert "<h3>Details</h3>" in result
def test_bold_and_italic(self): def test_bold_and_italic(self):
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
convert_readme_to_html convert_readme_to_html
md = "**bold** and *italic*" md = "**bold** and *italic*"
@@ -358,7 +354,7 @@ class TestConvertReadmeToHtml:
assert "<em>italic</em>" in result assert "<em>italic</em>" in result
def test_inline_code(self): def test_inline_code(self):
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
convert_readme_to_html convert_readme_to_html
md = "Use `model.train()`" md = "Use `model.train()`"
@@ -366,7 +362,7 @@ class TestConvertReadmeToHtml:
assert "<code>" in result and "model.train()" in result assert "<code>" in result and "model.train()" in result
def test_fenced_code_block(self): def test_fenced_code_block(self):
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
convert_readme_to_html convert_readme_to_html
md = "```python\nprint('hello')\n```" md = "```python\nprint('hello')\n```"
@@ -375,7 +371,7 @@ class TestConvertReadmeToHtml:
assert "print" in result and "hello" in result assert "print" in result and "hello" in result
def test_unordered_list(self): def test_unordered_list(self):
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
convert_readme_to_html convert_readme_to_html
md = "- item one\n- item two" md = "- item one\n- item two"
@@ -385,7 +381,7 @@ class TestConvertReadmeToHtml:
assert "<li>item two</li>" in result assert "<li>item two</li>" in result
def test_ordered_list(self): def test_ordered_list(self):
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
convert_readme_to_html convert_readme_to_html
md = "1. first\n2. second" md = "1. first\n2. second"
@@ -395,7 +391,7 @@ class TestConvertReadmeToHtml:
assert "<li>second</li>" in result assert "<li>second</li>" in result
def test_link(self): def test_link(self):
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
convert_readme_to_html convert_readme_to_html
md = "[click here](https://example.com)" md = "[click here](https://example.com)"
@@ -403,7 +399,7 @@ class TestConvertReadmeToHtml:
assert '<a href="https://example.com">click here</a>' in result assert '<a href="https://example.com">click here</a>' in result
def test_badge_image_stripped(self): def test_badge_image_stripped(self):
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
convert_readme_to_html convert_readme_to_html
md = "![badge](https://img.shields.io/badge/status-active)" md = "![badge](https://img.shields.io/badge/status-active)"
@@ -411,7 +407,7 @@ class TestConvertReadmeToHtml:
assert "img.shields.io" not in result assert "img.shields.io" not in result
def test_gallery_stripped(self): def test_gallery_stripped(self):
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
convert_readme_to_html convert_readme_to_html
md = "Some text\n<Gallery />\nmore text" md = "Some text\n<Gallery />\nmore text"
@@ -419,7 +415,7 @@ class TestConvertReadmeToHtml:
assert "<Gallery" not in result assert "<Gallery" not in result
def test_yaml_frontmatter_stripped(self): def test_yaml_frontmatter_stripped(self):
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
convert_readme_to_html convert_readme_to_html
md = "---\ntags:\n - lora\nbase_model: flux\n---\n\n# Real content" md = "---\ntags:\n - lora\nbase_model: flux\n---\n\n# Real content"
@@ -428,7 +424,7 @@ class TestConvertReadmeToHtml:
assert "<h1>Real content</h1>" in result assert "<h1>Real content</h1>" in result
def test_table(self): def test_table(self):
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
convert_readme_to_html convert_readme_to_html
md = "| A | B |\n|---|---|\n| 1 | 2 |" md = "| A | B |\n|---|---|\n| 1 | 2 |"
@@ -438,7 +434,7 @@ class TestConvertReadmeToHtml:
assert "<td>1</td>" in result assert "<td>1</td>" in result
def test_horizontal_rule(self): def test_horizontal_rule(self):
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
convert_readme_to_html convert_readme_to_html
md = "before\n\n---\n\nafter" md = "before\n\n---\n\nafter"
@@ -446,14 +442,14 @@ class TestConvertReadmeToHtml:
assert "<hr>" in result assert "<hr>" in result
def test_inline_code_preserves_angle_bracket(self): def test_inline_code_preserves_angle_bracket(self):
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
convert_readme_to_html convert_readme_to_html
result = convert_readme_to_html("Use `a < b` in code") result = convert_readme_to_html("Use `a < b` in code")
assert "<code>a &lt; b</code>" in result assert "<code>a &lt; b</code>" in result
def test_blockquote(self): def test_blockquote(self):
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
convert_readme_to_html convert_readme_to_html
md = "> quoted text" md = "> quoted text"
@@ -462,7 +458,7 @@ class TestConvertReadmeToHtml:
assert "quoted text" in result assert "quoted text" in result
def test_indented_whitespace_not_treated_as_code(self): def test_indented_whitespace_not_treated_as_code(self):
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
convert_readme_to_html convert_readme_to_html
md = "- item\n \n## heading after spacing" md = "- item\n \n## heading after spacing"
@@ -497,7 +493,7 @@ base_model: flux
""" """
def test_extracts_widget_images(self): def test_extracts_widget_images(self):
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
extract_gallery_images extract_gallery_images
images = extract_gallery_images(self._README, self._REPO) images = extract_gallery_images(self._README, self._REPO)
@@ -519,7 +515,7 @@ base_model: flux
assert images[1]["meta"]["prompt"] == "multi line prompt here" assert images[1]["meta"]["prompt"] == "multi line prompt here"
def test_default_dimensions_used(self): def test_default_dimensions_used(self):
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
extract_gallery_images extract_gallery_images
images = extract_gallery_images(self._README, self._REPO) images = extract_gallery_images(self._README, self._REPO)
@@ -527,7 +523,7 @@ base_model: flux
assert images[0]["height"] == 512 assert images[0]["height"] == 512
def test_custom_dimensions_applied(self): def test_custom_dimensions_applied(self):
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
extract_gallery_images extract_gallery_images
images = extract_gallery_images( images = extract_gallery_images(
@@ -538,27 +534,27 @@ base_model: flux
assert images[0]["height"] == 1024 assert images[0]["height"] == 1024
def test_empty_readme_returns_empty(self): def test_empty_readme_returns_empty(self):
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
extract_gallery_images extract_gallery_images
assert extract_gallery_images("", self._REPO) == [] assert extract_gallery_images("", self._REPO) == []
assert extract_gallery_images("no frontmatter here", self._REPO) == [] assert extract_gallery_images("no frontmatter here", self._REPO) == []
def test_empty_repo_returns_empty(self): def test_empty_repo_returns_empty(self):
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
extract_gallery_images extract_gallery_images
assert extract_gallery_images(self._README, "") == [] assert extract_gallery_images(self._README, "") == []
def test_no_widget_returns_empty(self): def test_no_widget_returns_empty(self):
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
extract_gallery_images extract_gallery_images
md = "---\ntags:\n - lora\n---\n\nContent" md = "---\ntags:\n - lora\n---\n\nContent"
assert extract_gallery_images(md, self._REPO) == [] assert extract_gallery_images(md, self._REPO) == []
def test_extract_repo_from_hf_url(self): def test_extract_repo_from_hf_url(self):
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
extract_repo_from_hf_url extract_repo_from_hf_url
assert extract_repo_from_hf_url( assert extract_repo_from_hf_url(
@@ -568,8 +564,10 @@ base_model: flux
assert extract_repo_from_hf_url("not a url") == "" assert extract_repo_from_hf_url("not a url") == ""
def test_plain_yaml_scalar_text(self): def test_plain_yaml_scalar_text(self):
"""Unquoted multi-line YAML scalar (plain format) should extract prompt.""" """Unquoted multi-line YAML scalar (plain format) extracts first line only.
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ The YAML parser only reports the value on the ``text:`` line; continuation
lines are handled by the post-processor from the raw README."""
from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
extract_gallery_images extract_gallery_images
md = """--- md = """---
@@ -581,8 +579,7 @@ widget:
---""" ---"""
images = extract_gallery_images(md, "user/repo") images = extract_gallery_images(md, "user/repo")
assert len(images) == 1 assert len(images) == 1
assert "two samurais doing a muay thai fight" in images[0]["meta"]["prompt"] assert images[0]["meta"]["prompt"] == "two samurais doing a muay thai fight"
assert "Textured abstract style" in images[0]["meta"]["prompt"]
# ====================================================================== # ======================================================================
@@ -603,7 +600,7 @@ class TestExtractGalleryTableImages:
@staticmethod @staticmethod
def _extract(md: str, repo: str = _REPO, existing: set | None = None): def _extract(md: str, repo: str = _REPO, existing: set | None = None):
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
extract_gallery_table_images extract_gallery_table_images
return extract_gallery_table_images(md, repo, existing_urls=existing) return extract_gallery_table_images(md, repo, existing_urls=existing)
@@ -647,7 +644,7 @@ class TestCleanReadmeForLlm:
@staticmethod @staticmethod
def _clean(md: str, max_length: int = 6000) -> str: def _clean(md: str, max_length: int = 6000) -> str:
from py.services.agent.skills.enrich_hf_metadata.md_to_html import \ from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
clean_readme_for_llm clean_readme_for_llm
return clean_readme_for_llm(md, max_length=max_length) return clean_readme_for_llm(md, max_length=max_length)
@@ -665,10 +662,9 @@ class TestCleanReadmeForLlm:
# -- widget section stripping ------------------------------------------- # -- widget section stripping -------------------------------------------
def test_widget_text_preserved_in_cleaned_output(self): def test_widget_stripped_frontmatter_metadata_preserved(self):
"""Widget section text is preserved — it provides useful signal """Widget section is stripped, but ``base_model``, ``tags``,
for tag and description extraction (example prompts describe what ``instance_prompt`` survive."""
the model generates)."""
md = """--- md = """---
tags: tags:
- lora - lora
@@ -689,11 +685,10 @@ instance_prompt: trigger word
This is the actual content. This is the actual content.
""" """
result = self._clean(md) result = self._clean(md)
# Widget text content preserved (valuable signal for tags) # Widget text stripped (it's handled by the post-processor gallery
# YAML folded scalars (``>-``) may split text across lines # extraction instead)
assert "a test prompt" in result assert "a test prompt" not in result
assert "another long" in result assert "another long" not in result
assert "prompt here" in result
# Non-widget frontmatter preserved # Non-widget frontmatter preserved
assert "base_model: black-forest-labs/FLUX.1-dev" in result assert "base_model: black-forest-labs/FLUX.1-dev" in result
assert "instance_prompt: trigger word" in result assert "instance_prompt: trigger word" in result
@@ -703,7 +698,7 @@ This is the actual content.
assert "Model Description" in result assert "Model Description" in result
def test_widget_last_key_in_frontmatter(self): def test_widget_last_key_in_frontmatter(self):
"""Widget text at end of frontmatter is preserved.""" """Widget stripped, non-widget keys preserved."""
md = """--- md = """---
tags: tags:
- lora - lora
@@ -715,7 +710,7 @@ widget:
# Content # Content
""" """
result = self._clean(md) result = self._clean(md)
assert "prompt" in result assert "prompt" not in result
assert "tags:" in result assert "tags:" in result
def test_no_widget_untouched(self): def test_no_widget_untouched(self):
@@ -798,12 +793,13 @@ pixel art sprite, game asset, transparent background
# -- standalone image stripping ------------------------------------------ # -- standalone image stripping ------------------------------------------
def test_standalone_image_stripped(self): def test_standalone_image_urls_preserved_for_llm(self):
"""Markdown image URLs are kept so the LLM can extract a ``preview_url``."""
md = "## Gallery\n![sample](https://cdn.hf.co/img.png)\n![another](https://cdn.hf.co/img2.png)\n\nSome text." md = "## Gallery\n![sample](https://cdn.hf.co/img.png)\n![another](https://cdn.hf.co/img2.png)\n\nSome text."
result = self._clean(md) result = self._clean(md)
assert "cdn.hf.co" not in result # URLs preserved for LLM preview extraction
assert "sample" in result # alt text preserved assert "cdn.hf.co/img.png" in result
assert "another" in result # alt text preserved assert "cdn.hf.co/img2.png" in result
assert "## Gallery" in result assert "## Gallery" in result
assert "Some text." in result assert "Some text." in result
@@ -1001,10 +997,10 @@ Weights for this model are available in Safetensors format.
original_len = len(md) original_len = len(md)
result = self._clean(md) result = self._clean(md)
# Still significantly smaller (widget text is kept but training # Significantly smaller: widget + training tables + code blocks
# tables, code blocks, boilerplate are stripped) # + boilerplate all stripped
assert len(result) < original_len * 0.7, ( assert len(result) < original_len * 0.35, (
f"Expected <70% of original, got {len(result)}/{original_len}" f"Expected <35% of original, got {len(result)}/{original_len}"
) )
# Signal preserved # Signal preserved
@@ -1013,9 +1009,8 @@ Weights for this model are available in Safetensors format.
assert "3D" in result assert "3D" in result
assert "Toon" in result assert "Toon" in result
# Widget content preserved (text is valuable signal for tags/desc) # Widget content stripped (post-processor handles image extraction)
assert "close-up of a cartoon character face" in result assert "close-up of a cartoon character face" not in result
assert "Super Detail" in result
# Noise stripped # Noise stripped
assert "import torch" not in result assert "import torch" not in result

View File

@@ -0,0 +1,489 @@
"""Tests for ``readme_processor.py`` — HF README processing for enrich_hf_metadata.
Import via ``importlib`` to avoid the ``folder_paths`` dependency in
``py.services.agent.__init__``.
"""
from __future__ import annotations
import importlib.util
import re
from pathlib import Path
import pytest
_MODULE_PATH = Path(__file__).parents[2] / "py" / "services" / "agent" / "skills" / "enrich_hf_metadata" / "readme_processor.py"
@pytest.fixture(scope="session")
def R():
    """Return the ``readme_processor`` module, loaded straight from its file.

    The module is executed from ``_MODULE_PATH`` through an explicit import
    spec rather than a regular package import, and the fixture is
    session-scoped so the file is executed only once per test run.
    """
    module_spec = importlib.util.spec_from_file_location(
        "readme_processor", str(_MODULE_PATH)
    )
    module = importlib.util.module_from_spec(module_spec)
    # Run the module body so its functions become attributes of ``module``.
    module_spec.loader.exec_module(module)
    return module
# ======================================================================
# extract_gallery_images
# ======================================================================
class TestExtractGalleryImages:
    """Tests for ``extract_gallery_images`` (frontmatter ``widget`` entries)."""

    # NOTE(review): the frontmatter literals below are reproduced exactly as
    # they appear in the reviewed view, where nested YAML indentation is not
    # visible - confirm the intended nesting against the repository copy.

    def test_empty(self, R):
        """An empty README and one without frontmatter both yield ``[]``."""
        assert R.extract_gallery_images("", "repo") == []
        assert R.extract_gallery_images("no frontmatter", "repo") == []

    def test_no_widget(self, R):
        """Frontmatter without a ``widget`` key yields ``[]``."""
        readme = "---\ntags: [test]\n---\nbody"
        assert R.extract_gallery_images(readme, "repo") == []

    def test_widget_simple_text(self, R):
        """A quoted ``text: 'plain'`` value becomes the prompt without quotes.

        The ``output.url`` value must also show up inside the image URL.
        """
        readme = """---
widget:
- text: 'a cute cat'
output:
url: images/cat.png
---"""
        imgs = R.extract_gallery_images(readme, "user/repo")
        assert len(imgs) == 1
        assert imgs[0]["meta"]["prompt"] == "a cute cat"
        assert "images/cat.png" in imgs[0]["url"]

    def test_widget_unquoted_text(self, R):
        """An unquoted ``text: plain value`` is used verbatim as the prompt."""
        readme = """---
widget:
- text: simple text
output:
url: img.png
---"""
        imgs = R.extract_gallery_images(readme, "user/repo")
        assert len(imgs) == 1
        assert imgs[0]["meta"]["prompt"] == "simple text"

    def test_widget_block_scalar(self, R):
        """A folded block scalar (``text: >-``) yields its content as the prompt.

        The prompt must contain the text of both continuation lines and must
        not be the literal ``>-`` marker.
        """
        readme = """---
widget:
- text: >-
Long toons, a close-up of a cartoon characters face is featured in a
vibrant red backdrop.
output:
url: images/LT4.png
---"""
        imgs = R.extract_gallery_images(readme, "user/repo")
        assert len(imgs) == 1
        prompt = imgs[0]["meta"]["prompt"]
        assert "Long toons" in prompt
        assert "vibrant red backdrop" in prompt
        # Guard against returning the block-scalar indicator itself.
        assert prompt != ">-"

    def test_widget_dash_prefix_output(self, R):
        """An entry that starts with ``- output:`` (text listed after) still parses."""
        readme = """---
widget:
- output:
url: images/test.png
text: dash test
---"""
        imgs = R.extract_gallery_images(readme, "user/repo")
        assert len(imgs) == 1
        assert imgs[0]["meta"]["prompt"] == "dash test"
        assert "images/test.png" in imgs[0]["url"]

    def test_widget_mixed_entries(self, R):
        """Block-scalar, unquoted and quoted entries are all extracted, in order."""
        readme = """---
widget:
- text: >-
First entry description.
output:
url: img1.png
- text: second entry
output:
url: img2.png
- text: 'third entry'
output:
url: img3.png
---"""
        imgs = R.extract_gallery_images(readme, "user/repo")
        assert len(imgs) == 3
        assert imgs[0]["meta"]["prompt"] == "First entry description."
        assert imgs[1]["meta"]["prompt"] == "second entry"
        assert imgs[2]["meta"]["prompt"] == "third entry"
# ======================================================================
# extract_simple_markdown_images
# ======================================================================
class TestExtractSimpleMarkdownImages:
    """Tests for ``extract_simple_markdown_images`` (``![alt](url)`` syntax)."""

    def test_empty(self, R):
        """Empty markdown yields no images."""
        found = R.extract_simple_markdown_images("", "repo")
        assert found == []

    def test_basic_markdown_image(self, R):
        """A relative image yields one entry whose prompt is the alt text."""
        markdown = "![test](./image_0.png)"
        found = R.extract_simple_markdown_images(markdown, "u/r")
        assert len(found) == 1
        image = found[0]
        assert "image_0.png" in image["url"]
        assert image["meta"]["prompt"] == "test"

    def test_absolute_url(self, R):
        """An absolute ``https://`` URL is returned unchanged."""
        markdown = "![img](https://example.com/img.png)"
        found = R.extract_simple_markdown_images(markdown, "u/r")
        assert len(found) == 1
        assert found[0]["url"] == "https://example.com/img.png"

    def test_skips_code_fences(self, R):
        """Images inside a fenced code block are ignored; later ones still count."""
        markdown = """outside
```
![inside](./img.png)
```
outside again
![valid](./valid.png)"""
        found = R.extract_simple_markdown_images(markdown, "u/r")
        assert len(found) == 1
        assert "valid.png" in found[0]["url"]

    def test_deduplicates(self, R):
        """Two images that point at the same file collapse into one entry."""
        markdown = "\n".join(["![a](./img.png)", "![b](./img.png)"])
        found = R.extract_simple_markdown_images(markdown, "u/r")
        assert len(found) == 1  # same target, different alt text
# ======================================================================
# extract_html_img_tags
# ======================================================================
class TestExtractHtmlImgTags:
    """Tests for ``extract_html_img_tags`` (``<img src=...>`` markup)."""

    def test_double_quoted_src(self, R):
        """A double-quoted ``src`` attribute is picked up."""
        html = '<img src="./img.png">'
        found = R.extract_html_img_tags(html, "u/r")
        assert len(found) == 1
        assert "img.png" in found[0]["url"]

    def test_single_quoted_src(self, R):
        """A single-quoted ``src`` attribute is picked up."""
        html = "<img src='./img.png'>"
        found = R.extract_html_img_tags(html, "u/r")
        assert len(found) == 1
        assert "img.png" in found[0]["url"]

    def test_absolute_url(self, R):
        """An absolute ``https://`` source is returned unchanged."""
        html = '<img src="https://cdn.example.com/img.png">'
        found = R.extract_html_img_tags(html, "u/r")
        assert len(found) == 1
        assert found[0]["url"] == "https://cdn.example.com/img.png"

    def test_deduplicates_across_formats(self, R):
        """The same source in both quote styles yields a single entry."""
        html = "\n".join(['<img src="./img.png">', "<img src='./img.png'>"])
        found = R.extract_html_img_tags(html, "u/r")
        assert len(found) == 1
# ======================================================================
# extract_gallery_table_images
# ======================================================================
class TestExtractGalleryTableImages:
    """Tests for ``extract_gallery_table_images`` (markdown preview tables)."""

    def test_gallery_table(self, R):
        """Each image row yields one entry; the second cell becomes the prompt."""
        table = """| Preview | Prompt |
|--------|--------|
| ![img](./a.png) | a cat |
| ![img](./b.png) | a dog |"""
        found = R.extract_gallery_table_images(table, "u/r")
        assert len(found) == 2
        first, second = found[0], found[1]
        assert first["meta"]["prompt"] == "a cat"
        assert "a.png" in first["url"]
        assert second["meta"]["prompt"] == "a dog"

    def test_skips_non_gallery_table(self, R):
        """A table without any image cells yields no entries."""
        table = """| Parameter | Value |
|----------|-------|
| Steps | 4 |"""
        found = R.extract_gallery_table_images(table, "u/r")
        assert len(found) == 0
# ======================================================================
# clean_readme_for_llm + strip helpers
# ======================================================================
class TestCleanReadmeForLlm:
    """Tests for ``clean_readme_for_llm``: what is kept and what is stripped."""

    def test_preserves_plain_code_block(self, R):
        """A fence without a language tag keeps its content (e.g. trigger words)."""
        text = """Before
```
pixel art sprite, game asset
```
After"""
        cleaned = R.clean_readme_for_llm(text)
        assert "pixel art sprite" in cleaned
        assert "game asset" in cleaned

    def test_strips_fenced_code_with_lang(self, R):
        """A fence with a language tag is removed; surrounding text is kept."""
        text = "before\n```python\nimport torch\n```\nafter"
        cleaned = R.clean_readme_for_llm(text)
        assert "import torch" not in cleaned
        assert "before" in cleaned
        assert "after" in cleaned

    def test_preserves_markdown_image_url(self, R):
        """The URL of a markdown image survives cleaning."""
        text = "![sample](./preview.png)"
        cleaned = R.clean_readme_for_llm(text)
        assert "./preview.png" in cleaned

    def test_strips_html_img_tag(self, R):
        """The source of an HTML ``<img>`` tag does not survive cleaning."""
        text = 'before\n<img src="logo.png">\nafter'
        cleaned = R.clean_readme_for_llm(text)
        assert "logo.png" not in cleaned

    def test_widget_stripped_frontmatter_preserved(self, R):
        """``widget:`` is removed while ``base_model`` and ``instance_prompt`` stay."""
        # NOTE(review): nested YAML indentation is not visible in the reviewed
        # view; the literal is reproduced as shown - confirm the real nesting.
        text = """---
tags: [test]
widget:
- text: >-
long description here
output:
url: img.png
base_model: black-forest-labs/FLUX.1-dev
instance_prompt: test
---"""
        cleaned = R.clean_readme_for_llm(text)
        assert "widget:" not in cleaned
        assert "black-forest-labs/FLUX.1-dev" in cleaned
        assert "instance_prompt: test" in cleaned

    def test_training_table_stripped(self, R):
        """A training-parameter table is removed; the text around it is kept."""
        text = """before
| LR Scheduler | constant |
|--------------|---------|
| Optimizer | AdamW |
after"""
        cleaned = R.clean_readme_for_llm(text)
        assert "LR Scheduler" not in cleaned
        assert "Optimizer" not in cleaned
        assert "before" in cleaned
        assert "after" in cleaned

    def test_best_dimensions_table_kept(self, R):
        """A "Best Dimensions" section is kept.

        Despite the test name, the input here is a bullet list under a
        heading rather than a markdown table.
        """
        text = """## Best Dimensions
- 768 x 1024 (Best)
- 1024 x 1024 (Default)"""
        cleaned = R.clean_readme_for_llm(text)
        assert "768 x 1024" in cleaned

    def test_boilerplate_section_stripped(self, R):
        """A "Download model" section is removed; the following section is kept."""
        text = """stuff
## Download model
[link](url)
## Next section
content"""
        cleaned = R.clean_readme_for_llm(text)
        assert "Download model" not in cleaned
        assert "Next section" in cleaned
        assert "content" in cleaned

    def test_returns_empty_for_none(self, R):
        """``None`` input yields an empty string."""
        assert R.clean_readme_for_llm(None) == ""

    def test_returns_empty_for_empty(self, R):
        """Empty input yields an empty string."""
        assert R.clean_readme_for_llm("") == ""
# ======================================================================
# _is_heading / _heading_level
# ======================================================================
class TestHeadingDetection:
    """Tests for the ``_heading_level`` and ``_is_heading`` helpers."""

    @pytest.mark.parametrize(
        ("line", "expected"),
        [
            # Markdown headings: level equals the number of leading hashes.
            ("# Title", 1),
            ("## Sub", 2),
            ("### Subsub", 3),
            ("#### Subsubsub", 4),
            # HTML headings, with and without attributes.
            ("<h1>Title</h1>", 1),
            ("<h2>Sub</h2>", 2),
            ("<h3 class='x'>Sub</h3>", 3),
            ("<h4 id='y'>Sub</h4>", 4),
            # Inputs that must not count as headings (level 0).
            ("not a heading", 0),
            ("###", 0),  # hashes with no text after them
            ("</h2>", 0),  # a closing tag alone is not a heading
            ("", 0),
        ],
    )
    def test_heading_level(self, R, line, expected):
        """``_heading_level`` returns the heading depth, or 0 for non-headings."""
        level = R._heading_level(line)
        assert level == expected

    @pytest.mark.parametrize(
        ("line", "expected"),
        [
            ("# Title", True),
            ("<h2>Sub</h2>", True),
            ("</h2>", False),  # a closing tag alone is not a heading
            ("not heading", False),
        ],
    )
    def test_is_heading(self, R, line, expected):
        """``_is_heading`` accepts markdown/HTML headings and rejects the rest."""
        verdict = R._is_heading(line)
        assert verdict == expected
# ======================================================================
# extract_relevant_section
# ======================================================================
class TestExtractRelevantSection:
    """Tests for ``extract_relevant_section`` (README section lookup by basename)."""

    def test_fallback_full_readme(self, R):
        """When nothing matches the basename, the whole README is returned."""
        readme = "# Title\n\nsome content"
        section = R.extract_relevant_section(readme, "nonexistent")
        assert section == readme

    def test_empty_basename_returns_full(self, R):
        """An empty basename returns the whole README."""
        readme = "# Title"
        section = R.extract_relevant_section(readme, "")
        assert section == readme

    def test_match_heading_includes_yaml(self, R):
        """A heading match still carries the frontmatter and the sub-section."""
        readme = """---
base_model: foo
---
# My-Model-Title
content
## Subsection
more"""
        section = R.extract_relevant_section(readme, "My-Model")
        for fragment in ("base_model: foo", "content", "Subsection"):
            assert fragment in section

    def test_match_heading_includes_subheadings(self, R):
        """Matching a ``#`` heading brings along every ``##`` child below it."""
        readme = """# Main Title
## Child A
content A
## Child B
content B
## Child C
content C"""
        section = R.extract_relevant_section(readme, "Main Title")
        for child in ("Child A", "Child B", "Child C"):
            assert child in section

    def test_match_download_link(self, R):
        """The section for the matching basename is extracted without its sibling."""
        readme = """# Collection
## Model A
[Download](./model_a.safetensors)
## MyModel
[Download](./mymodel.safetensors)
content here
## Model B
other"""
        section = R.extract_relevant_section(readme, "mymodel")
        assert "content here" in section
        assert "Model A" not in section  # the sibling section is excluded

    def test_heading_closing_tag_not_boundary(self, R):
        """A stray ``</h2>`` line does not end the extracted section."""
        readme = """# Title
<p>some text</p>
</h2>
## Real Section
content"""
        section = R.extract_relevant_section(readme, "Title")
        # Content after the closing tag must still be part of the section.
        assert "Real Section" in section
        assert "content" in section
# ======================================================================
# _extract_frontmatter
# ======================================================================
class TestExtractFrontmatter:
    """Tests for the ``_extract_frontmatter`` helper."""

    def test_basic(self, R):
        """The text between the ``---`` fences is returned, newlines included."""
        readme = "---\ntags: [a]\n---\nbody"
        frontmatter = R._extract_frontmatter(readme)
        assert frontmatter == "\ntags: [a]\n"

    def test_no_frontmatter(self, R):
        """Text without ``---`` fences yields an empty string."""
        frontmatter = R._extract_frontmatter("no dashes")
        assert frontmatter == ""

    def test_empty_string(self, R):
        """Empty input yields an empty string."""
        frontmatter = R._extract_frontmatter("")
        assert frontmatter == ""
# ======================================================================
# _strip_widget_section
# ======================================================================
class TestStripWidgetSection:
    """Tests for the ``_strip_widget_section`` helper."""

    # NOTE(review): the frontmatter literals below are reproduced exactly as
    # they appear in the reviewed view, where nested YAML indentation is not
    # visible - confirm the intended nesting against the repository copy.

    def test_strip_widget_keep_base_model(self, R):
        """``widget:`` is removed while the following ``base_model`` value stays."""
        text = """---
tags: [test]
widget:
- text: >-
long text
output:
url: img.png
base_model: black-forest-labs/FLUX.1-dev
---"""
        result = R._strip_widget_section(text)
        assert "widget:" not in result
        assert "black-forest-labs/FLUX.1-dev" in result

    def test_no_widget_no_change(self, R):
        """Frontmatter without a ``widget`` key is returned unchanged."""
        text = "---\ntags: [a]\n---"
        assert R._strip_widget_section(text) == text

    def test_widget_at_end_of_frontmatter(self, R):
        """A widget that is the last key before the closing ``---`` is removed.

        Keys listed before the widget (``base_model`` here) must remain.
        """
        text = """---
base_model: a
widget:
- text: x
output:
url: y.png
---"""
        result = R._strip_widget_section(text)
        assert "widget:" not in result
        assert "base_model: a" in result
# ======================================================================
# _strip_fenced_code_blocks
# ======================================================================
class TestStripFencedCodeBlocks:
    """Tests for the ``_strip_fenced_code_blocks`` helper."""

    def test_strips_with_language(self, R):
        """A language-tagged fence is removed, fence lines included."""
        markdown = "a\n```python\ncode\n```\nb"
        stripped = R._strip_fenced_code_blocks(markdown)
        assert stripped == "a\nb"

    def test_keeps_plain_fence(self, R):
        """A fence without a language tag keeps its content."""
        markdown = "a\n```\ntrigger words\n```\nb"
        stripped = R._strip_fenced_code_blocks(markdown)
        assert "trigger words" in stripped

    def test_pattern(self, R):
        """A ``yaml``-tagged fence has its content removed."""
        markdown = "x\n```yaml\nkey: val\n```\ny"
        stripped = R._strip_fenced_code_blocks(markdown)
        assert "key: val" not in stripped

View File

@@ -128,7 +128,7 @@ def generate_optimisation_suggestions(
if prev and prev.get("empty_rate_pct", 0) > 50: if prev and prev.get("empty_rate_pct", 0) > 50:
suggestions.append( suggestions.append(
"- **预览图下载成功率低 ({:.0f}%)**: 很多 HF 模型卡没有 embed 图片(仅使用 YAML widget " "- **预览图下载成功率低 ({:.0f}%)**: 很多 HF 模型卡没有 embed 图片(仅使用 YAML widget "
"或 external link。当前 `md_to_html.py` 的 `extract_gallery_images` 和 " "或 external link。当前 `readme_processor.py` 的 `extract_gallery_images` 和 "
"`extract_gallery_table_images` 已覆盖了多数场景。若预览图不重要,可降低此字段权重。".format( "`extract_gallery_table_images` 已覆盖了多数场景。若预览图不重要,可降低此字段权重。".format(
prev.get("empty_rate_pct", 0) prev.get("empty_rate_pct", 0)
) )