refactor(agent): rename md_to_html to readme_processor, fix section extraction, widget parsing, and list_base_models

- Rename md_to_html.py → readme_processor.py (the file no longer does only HTML conversion)
- _extract_section: include YAML frontmatter, use a heading-level-aware forward walk (sub-headings under # are included), and raise the walk limit beyond 30 lines
- _is_heading: exclude </hN> closing tags from boundary detection
- _heading_level: new helper for heading-level-aware section matching
- css: yield 0 for heading-like closing tags, which were unexpectedly caught by _is_heading
- extract_gallery_images: fix YAML block scalar (text: >-) prompt extraction; use endswith instead of == to detect the block marker
- _strip_widget_section: add to clean_readme_for_llm (widget text is handled by the post-processor and is not needed in the LLM prompt)
- _strip_standalone_images: keep markdown image URLs intact for LLM preview extraction (previously stripped to alt text only)
- list_base_models: switch from scanner-cache aggregation to CivitaiBaseModelService.get_base_models(), which always returns the full list
- Ollama: add num_ctx=32768 to the payload options so thinking models have room to both reason and produce output
- Add tests/agent_cli/test_readme_processor.py: 59 tests covering extraction, cleaning, section matching, and heading detection
- Update existing tests for the behavioral changes
2026-07-05 17:01:16 -03:00 · 2026-07-05 06:39:54 +08:00
parent 905c37290f
commit dd3aa97d0a
8 changed files with 733 additions and 159 deletions
--- a/py/agent_cli/init.py
+++ b/py/agent_cli/init.py
@@ -113,38 +113,29 @@ async def identify_model_type(model_path: str) -> str:
 async def list_base_models(limit: int = 0) -> List[str]:
-    """Return deduplicated base model names from all model caches.
+    """Return all valid CivitAI base model names.
-    The result is ordered by frequency (most common first).  Pass
+    Uses ``CivitaiBaseModelService.get_base_models()`` which merges a
-    *limit* = 0 (default) for all models.
+    hardcoded list (``SUPPORTED_DOWNLOAD_SKIP_BASE_MODELS``) with remote
    models fetched from the CivitAI API.  Never empty — the hardcoded
    fallback always provides a complete set.
    The result is sorted alphabetically.  Pass *limit* = 0 for all models.
    """
-    from ..services.service_registry import ServiceRegistry
+    from ..services.civitai_base_model_service import (
        CivitaiBaseModelService,
    )
-    counts: Dict[str, int] = {}
+    try:
-    for getter_name in (
+        service = await CivitaiBaseModelService.get_instance()
-        "get_lora_scanner",
+        response = await service.get_base_models()
-        "get_checkpoint_scanner",
+        names: List[str] = response.get("models", [])
-        "get_embedding_scanner",
+    except Exception as exc:
-    ):
+        logger.warning("list_base_models failed: %s", exc)
-        getter = getattr(ServiceRegistry, getter_name, None)
+        names = []
        if getter is None:
            continue
        try:
            scanner = await getter()
            if scanner is None:
                continue
            cache = await scanner.get_cached_data()
            for entry in cache.raw_data:
                bm = entry.get("base_model")
                if bm:
                    counts[bm] = counts.get(bm, 0) + 1
        except Exception as exc:
            logger.debug("list_base_models scanner %s error: %s", getter_name, exc)
    sorted_names = [name for name, _ in sorted(counts.items(), key=lambda x: -x[1])]
    if limit > 0:
-        return sorted_names[:limit]
+        return names[:limit]
-    return sorted_names
+    return names
 async def read_metadata(model_path: str) -> Dict[str, Any]:
--- a/py/services/agent/agent_service.py
+++ b/py/services/agent/agent_service.py
@@ -31,7 +31,7 @@ from ..llm_service import LLMService
 from ..websocket_manager import ws_manager
 from .post_processor import PostProcessor
 from .skill_registry import SkillRegistry
-from .skills.enrich_hf_metadata.md_to_html import (
+from .skills.enrich_hf_metadata.readme_processor import (
    clean_readme_for_llm,
    extract_relevant_section,
 )
@@ -397,6 +397,10 @@ class AgentService:
                cleaned = clean_readme_for_llm(readme) if readme else ""
            context["readme_content"] = cleaned if cleaned else "(README not available)"
            context["readme_content_full"] = readme or ""
            logger.info(
                "Cleaned README for %s (%d chars): ---BEGIN---\n%s\n---END---",
                repo, len(cleaned), cleaned[:800] if cleaned else "(empty)",
            )
        try:
            context["base_models"] = await list_base_models()
--- a/py/services/agent/post_processor.py
+++ b/py/services/agent/post_processor.py
@@ -78,7 +78,7 @@ class PostProcessor:
            download_preview,
            refresh_cache,
        )
-        from .skills.enrich_hf_metadata.md_to_html import (
+        from .skills.enrich_hf_metadata.readme_processor import (
            convert_readme_to_html,
            extract_gallery_images,
            extract_gallery_table_images,
--- a/py/services/agent/skills/enrich_hf_metadata/readme_processor.py
+++ b/py/services/agent/skills/enrich_hf_metadata/readme_processor.py
@@ -1,13 +1,8 @@
-"""Inline markdown-to-HTML converter and LLM-prompt cleaner for HF README content.
+"""HF README processing for the ``enrich_hf_metadata`` skill.
-No external dependencies.  Strips YAML frontmatter, ``<Gallery />`` sections,
+Provides README cleaning for LLM injection, gallery/image extraction from
-badge images, and HTML comments before rendering.  Used by the
+multiple formats (YAML widget, markdown, HTML ``<img>``, gallery tables),
-``enrich_hf_metadata`` feature.
+and section-based README trimming for collection repos.
 Also provides :func:`clean_readme_for_llm` which pre-processes the raw README
 before it is injected into the LLM prompt, removing content that has zero value
 for metadata extraction (widget sections, code blocks, training tables,
 boilerplate, massive lists, etc.).
 """
 from __future__ import annotations
@@ -241,7 +236,26 @@ def extract_gallery_images(
        if text_match:
            raw_text = text_match.group(1).strip().strip("'\"")
            if raw_text and raw_text != "-":
-                text = raw_text
+                # Handle YAML block scalar markers (>-, >, |, |-) where the
                # actual text lives on subsequent indented lines.
                if raw_text in (">", ">-", "|", "|-"):
                    text_lines: list[str] = []
                    in_block = False
                    for line in entry.split("\n"):
                        stripped = line.strip()
                        if not in_block:
                            if stripped.endswith(raw_text):
                                in_block = True
                            continue
                        # Block content ends at a line with less indentation
                        # or a YAML key at the start of a line.
                        if not stripped or re.match(r"^\s*\w+:", line):
                            break
                        if stripped:
                            text_lines.append(stripped)
                    text = " ".join(text_lines)
                else:
                    text = raw_text
        if url:
            image: dict = {
@@ -439,6 +453,7 @@ def clean_readme_for_llm(markdown_text: str | None, max_length: int = 6000) -> s
    # Order matters — broader strips first, then finer ones.
    text = _strip_gallery(text)
    text = _strip_widget_section(text)
    text = _strip_fenced_code_blocks(text)
    text = _strip_standalone_images(text)
    text = _strip_training_tables(text)
@@ -722,6 +737,18 @@ def _looks_like_download_link(line: str) -> bool:
    return False
 def _heading_level(line: str) -> int:
    """Return the heading level of *line* (1-4), or 0 if not a heading."""
    stripped = line.strip()
    m = re.match(r"^(#{1,4})\s", stripped)
    if m:
        return len(m.group(1))
    m = re.match(r"^<h([1-4])(?:\s|>)", stripped, re.IGNORECASE)
    if m:
        return int(m.group(1))
    return 0
 def _extract_section(
    lines: list[str], match_idx: int, context_lines: int,
 ) -> str:
@@ -729,15 +756,23 @@ def _extract_section(
    When *match_idx* is itself a heading line, the section starts *at*
    that heading (no backward walk), avoiding pulling in content from
-    earlier sibling sections.
+    earlier sibling sections.  The forward walk only stops at a heading
    of **equal or higher** level (e.g. a ``#`` match includes all its
    ``##`` children).
    Always includes the YAML frontmatter if the original lines contain one,
    because it carries critical metadata (``base_model``, ``tags``,
    ``instance_prompt``) that the LLM needs regardless of which section
    matches.
    """
    n = len(lines)
    # Determine start — if match is a heading, start right there
    if _is_heading(lines[match_idx]):
        start = match_idx
        match_level = _heading_level(lines[match_idx])
    else:
-        # Walk backward to find the nearest heading
+        match_level = 0
        start = max(0, match_idx - context_lines)
        for i in range(match_idx - 1, max(-1, match_idx - context_lines * 3), -1):
            if i < 0:
@@ -747,13 +782,25 @@ def _extract_section(
                start = i
                break
-    # Walk forward to find the next heading at same or higher level
+    # Walk forward.  Stop at a heading of EQUAL or HIGHER (fewer #) level,
-    end = min(n, match_idx + context_lines)
+    # so that a ``# Title`` match encompasses all its ``## Children``.
-    for i in range(match_idx + 1, min(n, match_idx + context_lines * 3)):
+    # Start from the full remaining lines so we don't truncate content
-        if _is_heading(lines[i]):
+    # when the YAML frontmatter pushes the matched heading far down.
    end = n
    walk_limit = min(n, match_idx + max(context_lines * 3, 120))
    for i in range(match_idx + 1, walk_limit):
        hl = _heading_level(lines[i])
        if hl > 0 and (match_level == 0 or hl <= match_level):
            end = i
            break
    # If YAML frontmatter exists before the matched section, prepend it.
    if start > 0 and len(lines) > 1 and lines[0].strip() == "---":
        for i in range(1, min(start, len(lines))):
            if lines[i].strip() == "---":
                yaml_section = "\n".join(lines[:i+1])
                return yaml_section + "\n" + "\n".join(lines[start:end])
    return "\n".join(lines[start:end])
@@ -801,6 +848,26 @@ def _strip_gallery(text: str) -> str:
    return text
 def _strip_widget_section(text: str) -> str:
    """Strip the ``widget:`` YAML block from the README frontmatter.
    The widget section contains verbose example prompts (``text: >-`` entries)
    that are useful for post-processor gallery image extraction but carry
    no signal for LLM metadata extraction.  Stripping them dramatically
    reduces prompt size (e.g. 2800+ chars → ~100 chars) and lets the LLM
    focus on the actual YAML metadata fields (``base_model``, ``tags``,
    ``instance_prompt``, etc.).
    """
    # Match widget: through the end of the frontmatter (the closing ---)
    # or until the next YAML top-level key.
    return re.sub(
        r"\nwidget:.*?(?=\n\w+:|\n---)",
        "",
        text,
        flags=re.DOTALL,
    )
 def _strip_badge_images(text: str) -> str:
    badge_keywords = (
        "badge", "shield", "logo", "icon", "download", "license",
--- a/py/services/llm_service.py
+++ b/py/services/llm_service.py
@@ -364,6 +364,9 @@ class LLMService:
                "think": False,
                "options": {
                    "temperature": temperature,
                    # Allow up to 32K context so the model has room to think
                    # AND produce output without hitting the 4K default limit.
                    "num_ctx": 32768,
                },
            }
            if response_format is not None:
@@ -381,6 +384,16 @@ class LLMService:
            if max_tokens is not None:
                payload["max_tokens"] = max_tokens
        if is_ollama:
            logger.info(
                "Ollama request: model=%s num_ctx=%s num_predict=%s format=%s think=%s",
                payload.get("model"),
                payload.get("options", {}).get("num_ctx"),
                payload.get("options", {}).get("num_predict"),
                payload.get("format", "none"),
                payload.get("think"),
            )
        headers = self._build_headers(cfg["api_key"])
        attempt = 0
@@ -507,8 +520,23 @@ class LLMService:
        )
        try:
-            return json.loads(result["content"])
+            parsed = json.loads(result["content"])
            logger.info(
                "LLM response base_model=%s tags=%s confidence=%s",
                parsed.get("base_model", "?")[:50],
                parsed.get("tags", []),
                parsed.get("confidence", "?"),
            )
            logger.info(
                "LLM raw content: %s",
                (result.get("content") or "")[:1200],
            )
            return parsed
        except (json.JSONDecodeError, TypeError) as exc:
            logger.info(
                "LLM raw response (first 800 chars): %s",
                (result.get("content") or "")[:800],
            )
            logger.warning(
                "LLM JSON parse failed on first attempt: %s. Retrying.", exc
            )
--- a/tests/agent_cli/test_agent_cli.py
+++ b/tests/agent_cli/test_agent_cli.py
@@ -50,78 +50,74 @@ class MockScanner:
 class TestListBaseModels:
-    @pytest.mark.asyncio
+    _MOCK_MODELS = ["SDXL 1.0", "Flux.1 D", "SD 1.5"]
    async def test_empty_cache(self):
        scanner = MockScanner([])
        with mock.patch(
            "py.services.service_registry.ServiceRegistry",
            get_lora_scanner=mock.AsyncMock(return_value=scanner),
            get_checkpoint_scanner=mock.AsyncMock(return_value=None),
            get_embedding_scanner=mock.AsyncMock(return_value=None),
        ):
            result = await list_base_models()
        assert result == []
    @pytest.mark.asyncio
-    async def test_merges_all_scanners(self):
+    async def test_returns_all_models(self):
-        lora_scanner = MockScanner([
+        """Verifies the function delegates to CivitaiBaseModelService.
-            {"base_model": "SDXL 1.0"},
+
-            {"base_model": "Flux.1 D"},
+        Uses a monkey-patch on ``get_instance`` to return a controlled mock
-            {"base_model": "SDXL 1.0"},
+        so we don't need to work around ``mock.patch``'s dotted-path
-        ])
+        limitations with lazy imports inside function bodies."""
-        ckpt_scanner = MockScanner([
+        import py.services.civitai_base_model_service as _svc
-            {"base_model": "SDXL 1.0"},
+        orig = _svc.CivitaiBaseModelService.get_instance
-            {"base_model": "SD 1.5"},
+        mock_svc = mock.AsyncMock()
-        ])
+        mock_svc.get_base_models.return_value = {
-        with mock.patch(
+            "models": self._MOCK_MODELS,
-            "py.services.service_registry.ServiceRegistry",
+        }
-            get_lora_scanner=mock.AsyncMock(return_value=lora_scanner),
+        _svc.CivitaiBaseModelService.get_instance = mock.AsyncMock(
-            get_checkpoint_scanner=mock.AsyncMock(return_value=ckpt_scanner),
+            return_value=mock_svc,
-            get_embedding_scanner=mock.AsyncMock(return_value=None),
+        )
-        ):
+        try:
            result = await list_base_models()
-        assert result == ["SDXL 1.0", "Flux.1 D", "SD 1.5"]
+            assert result == self._MOCK_MODELS
        finally:
            _svc.CivitaiBaseModelService.get_instance = orig
    @pytest.mark.asyncio
    async def test_limit(self):
-        scanner = MockScanner([
+        import py.services.civitai_base_model_service as _svc
-            {"base_model": "A"}, {"base_model": "B"}, {"base_model": "C"},
+        orig = _svc.CivitaiBaseModelService.get_instance
-        ])
+        mock_svc = mock.AsyncMock()
-        with mock.patch(
+        mock_svc.get_base_models.return_value = {"models": ["A", "B", "C"]}
-            "py.services.service_registry.ServiceRegistry",
+        _svc.CivitaiBaseModelService.get_instance = mock.AsyncMock(
-            get_lora_scanner=mock.AsyncMock(return_value=scanner),
+            return_value=mock_svc,
-            get_checkpoint_scanner=mock.AsyncMock(return_value=None),
+        )
-            get_embedding_scanner=mock.AsyncMock(return_value=None),
+        try:
        ):
            result = await list_base_models(limit=2)
-        assert result == ["A", "B"]
+            assert result == ["A", "B"]
        finally:
            _svc.CivitaiBaseModelService.get_instance = orig
    @pytest.mark.asyncio
-    async def test_all_scanners_return_none(self):
+    async def test_empty_list_when_service_returns_empty(self):
-        with mock.patch(
+        import py.services.civitai_base_model_service as _svc
-            "py.services.service_registry.ServiceRegistry",
+        orig = _svc.CivitaiBaseModelService.get_instance
-            get_lora_scanner=mock.AsyncMock(return_value=None),
+        mock_svc = mock.AsyncMock()
-            get_checkpoint_scanner=mock.AsyncMock(return_value=None),
+        mock_svc.get_base_models.return_value = {"models": []}
-            get_embedding_scanner=mock.AsyncMock(return_value=None),
+        _svc.CivitaiBaseModelService.get_instance = mock.AsyncMock(
-        ):
+            return_value=mock_svc,
        )
        try:
            result = await list_base_models()
-        assert result == []
+            assert result == []
        finally:
            _svc.CivitaiBaseModelService.get_instance = orig
    @pytest.mark.asyncio
-    async def test_skips_empty_or_missing_base_model(self):
+    async def test_handles_exception(self):
-        scanner = MockScanner([
+        import py.services.civitai_base_model_service as _svc
-            {"base_model": "SDXL 1.0"},
+        orig = _svc.CivitaiBaseModelService.get_instance
-            {"file_name": "foo.safetensors"},  # no base_model key
+        mock_svc = mock.AsyncMock()
-            {"base_model": ""},                 # empty
+        mock_svc.get_base_models.side_effect = RuntimeError("API error")
-        ])
+        _svc.CivitaiBaseModelService.get_instance = mock.AsyncMock(
-        with mock.patch(
+            return_value=mock_svc,
-            "py.services.service_registry.ServiceRegistry",
+        )
-            get_lora_scanner=mock.AsyncMock(return_value=scanner),
+        try:
            get_checkpoint_scanner=mock.AsyncMock(return_value=None),
            get_embedding_scanner=mock.AsyncMock(return_value=None),
        ):
            result = await list_base_models()
-        assert result == ["SDXL 1.0"]
+            assert result == []
        finally:
            _svc.CivitaiBaseModelService.get_instance = orig
 # ======================================================================
@@ -326,21 +322,21 @@ class TestConvertReadmeToHtml:
    """Tests for the inline markdown→HTML converter."""
    def test_empty_input(self):
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            convert_readme_to_html
        assert convert_readme_to_html("") == ""
        assert convert_readme_to_html(None) == ""  # type: ignore[arg-type]
    def test_heading(self):
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            convert_readme_to_html
        result = convert_readme_to_html("# Title")
        assert "<h1>" in result and "Title" in result
    def test_subheadings(self):
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            convert_readme_to_html
        md = "## Overview\n\n### Details"
@@ -349,7 +345,7 @@ class TestConvertReadmeToHtml:
        assert "<h3>Details</h3>" in result
    def test_bold_and_italic(self):
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            convert_readme_to_html
        md = "**bold** and *italic*"
@@ -358,7 +354,7 @@ class TestConvertReadmeToHtml:
        assert "<em>italic</em>" in result
    def test_inline_code(self):
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            convert_readme_to_html
        md = "Use `model.train()`"
@@ -366,7 +362,7 @@ class TestConvertReadmeToHtml:
        assert "<code>" in result and "model.train()" in result
    def test_fenced_code_block(self):
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            convert_readme_to_html
        md = "```python\nprint('hello')\n```"
@@ -375,7 +371,7 @@ class TestConvertReadmeToHtml:
        assert "print" in result and "hello" in result
    def test_unordered_list(self):
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            convert_readme_to_html
        md = "- item one\n- item two"
@@ -385,7 +381,7 @@ class TestConvertReadmeToHtml:
        assert "<li>item two</li>" in result
    def test_ordered_list(self):
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            convert_readme_to_html
        md = "1. first\n2. second"
@@ -395,7 +391,7 @@ class TestConvertReadmeToHtml:
        assert "<li>second</li>" in result
    def test_link(self):
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            convert_readme_to_html
        md = "[click here](https://example.com)"
@@ -403,7 +399,7 @@ class TestConvertReadmeToHtml:
        assert '<a href="https://example.com">click here</a>' in result
    def test_badge_image_stripped(self):
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            convert_readme_to_html
        md = "![badge](https://img.shields.io/badge/status-active)"
@@ -411,7 +407,7 @@ class TestConvertReadmeToHtml:
        assert "img.shields.io" not in result
    def test_gallery_stripped(self):
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            convert_readme_to_html
        md = "Some text\n<Gallery />\nmore text"
@@ -419,7 +415,7 @@ class TestConvertReadmeToHtml:
        assert "<Gallery" not in result
    def test_yaml_frontmatter_stripped(self):
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            convert_readme_to_html
        md = "---\ntags:\n  - lora\nbase_model: flux\n---\n\n# Real content"
@@ -428,7 +424,7 @@ class TestConvertReadmeToHtml:
        assert "<h1>Real content</h1>" in result
    def test_table(self):
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            convert_readme_to_html
        md = "| A | B |\n|---|---|\n| 1 | 2 |"
@@ -438,7 +434,7 @@ class TestConvertReadmeToHtml:
        assert "<td>1</td>" in result
    def test_horizontal_rule(self):
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            convert_readme_to_html
        md = "before\n\n---\n\nafter"
@@ -446,14 +442,14 @@ class TestConvertReadmeToHtml:
        assert "<hr>" in result
    def test_inline_code_preserves_angle_bracket(self):
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            convert_readme_to_html
        result = convert_readme_to_html("Use `a < b` in code")
        assert "<code>a &lt; b</code>" in result
    def test_blockquote(self):
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            convert_readme_to_html
        md = "> quoted text"
@@ -462,7 +458,7 @@ class TestConvertReadmeToHtml:
        assert "quoted text" in result
    def test_indented_whitespace_not_treated_as_code(self):
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            convert_readme_to_html
        md = "- item\n    \n## heading after spacing"
@@ -497,7 +493,7 @@ base_model: flux
 """
    def test_extracts_widget_images(self):
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            extract_gallery_images
        images = extract_gallery_images(self._README, self._REPO)
@@ -519,7 +515,7 @@ base_model: flux
        assert images[1]["meta"]["prompt"] == "multi line prompt here"
    def test_default_dimensions_used(self):
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            extract_gallery_images
        images = extract_gallery_images(self._README, self._REPO)
@@ -527,7 +523,7 @@ base_model: flux
        assert images[0]["height"] == 512
    def test_custom_dimensions_applied(self):
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            extract_gallery_images
        images = extract_gallery_images(
@@ -538,27 +534,27 @@ base_model: flux
        assert images[0]["height"] == 1024
    def test_empty_readme_returns_empty(self):
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            extract_gallery_images
        assert extract_gallery_images("", self._REPO) == []
        assert extract_gallery_images("no frontmatter here", self._REPO) == []
    def test_empty_repo_returns_empty(self):
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            extract_gallery_images
        assert extract_gallery_images(self._README, "") == []
    def test_no_widget_returns_empty(self):
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            extract_gallery_images
        md = "---\ntags:\n  - lora\n---\n\nContent"
        assert extract_gallery_images(md, self._REPO) == []
    def test_extract_repo_from_hf_url(self):
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            extract_repo_from_hf_url
        assert extract_repo_from_hf_url(
@@ -568,8 +564,10 @@ base_model: flux
        assert extract_repo_from_hf_url("not a url") == ""
    def test_plain_yaml_scalar_text(self):
-        """Unquoted multi-line YAML scalar (plain format) should extract prompt."""
+        """Unquoted multi-line YAML scalar (plain format) extracts first line only.
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        The YAML parser only reports the value on the ``text:`` line; continuation
        lines are handled by the post-processor from the raw README."""
        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            extract_gallery_images
        md = """---
@@ -581,8 +579,7 @@ widget:
 ---"""
        images = extract_gallery_images(md, "user/repo")
        assert len(images) == 1
-        assert "two samurais doing a muay thai fight" in images[0]["meta"]["prompt"]
+        assert images[0]["meta"]["prompt"] == "two samurais doing a muay thai fight"
        assert "Textured abstract style" in images[0]["meta"]["prompt"]
 # ======================================================================
@@ -603,7 +600,7 @@ class TestExtractGalleryTableImages:
    @staticmethod
    def _extract(md: str, repo: str = _REPO, existing: set | None = None):
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            extract_gallery_table_images
        return extract_gallery_table_images(md, repo, existing_urls=existing)
@@ -647,7 +644,7 @@ class TestCleanReadmeForLlm:
    @staticmethod
    def _clean(md: str, max_length: int = 6000) -> str:
-        from py.services.agent.skills.enrich_hf_metadata.md_to_html import \
+        from py.services.agent.skills.enrich_hf_metadata.readme_processor import \
            clean_readme_for_llm
        return clean_readme_for_llm(md, max_length=max_length)
@@ -665,10 +662,9 @@ class TestCleanReadmeForLlm:
    # -- widget section stripping -------------------------------------------
-    def test_widget_text_preserved_in_cleaned_output(self):
+    def test_widget_stripped_frontmatter_metadata_preserved(self):
-        """Widget section text is preserved — it provides useful signal
+        """Widget section is stripped, but ``base_model``, ``tags``,
-        for tag and description extraction (example prompts describe what
+        ``instance_prompt`` survive."""
        the model generates)."""
        md = """---
 tags:
 - lora
@@ -689,11 +685,10 @@ instance_prompt: trigger word
 This is the actual content.
 """
        result = self._clean(md)
-        # Widget text content preserved (valuable signal for tags)
+        # Widget text stripped (it's handled by the post-processor gallery
-        # YAML folded scalars (``>-``) may split text across lines
+        # extraction instead)
-        assert "a test prompt" in result
+        assert "a test prompt" not in result
-        assert "another long" in result
+        assert "another long" not in result
        assert "prompt here" in result
        # Non-widget frontmatter preserved
        assert "base_model: black-forest-labs/FLUX.1-dev" in result
        assert "instance_prompt: trigger word" in result
@@ -703,7 +698,7 @@ This is the actual content.
        assert "Model Description" in result
    def test_widget_last_key_in_frontmatter(self):
-        """Widget text at end of frontmatter is preserved."""
+        """Widget stripped, non-widget keys preserved."""
        md = """---
 tags:
 - lora
@@ -715,7 +710,7 @@ widget:
 # Content
 """
        result = self._clean(md)
-        assert "prompt" in result
+        assert "prompt" not in result
        assert "tags:" in result
    def test_no_widget_untouched(self):
@@ -798,12 +793,13 @@ pixel art sprite, game asset, transparent background
    # -- standalone image stripping ------------------------------------------
-    def test_standalone_image_stripped(self):
+    def test_standalone_image_urls_preserved_for_llm(self):
+        """Markdown image URLs are kept so the LLM can extract a ``preview_url``."""
        md = "## Gallery\n![sample](https://cdn.hf.co/img.png)\n![another](https://cdn.hf.co/img2.png)\n\nSome text."
        result = self._clean(md)
-        assert "cdn.hf.co" not in result
+        # URLs preserved for LLM preview extraction
-        assert "sample" in result  # alt text preserved
+        assert "cdn.hf.co/img.png" in result
-        assert "another" in result  # alt text preserved
+        assert "cdn.hf.co/img2.png" in result
        assert "## Gallery" in result
        assert "Some text." in result
@@ -1001,10 +997,10 @@ Weights for this model are available in Safetensors format.
        original_len = len(md)
        result = self._clean(md)
-        # Still significantly smaller (widget text is kept but training
+        # Significantly smaller: widget + training tables + code blocks
-        # tables, code blocks, boilerplate are stripped)
+        # + boilerplate all stripped
-        assert len(result) < original_len * 0.7, (
+        assert len(result) < original_len * 0.35, (
-            f"Expected <70% of original, got {len(result)}/{original_len}"
+            f"Expected <35% of original, got {len(result)}/{original_len}"
        )
        # Signal preserved
@@ -1013,9 +1009,8 @@ Weights for this model are available in Safetensors format.
        assert "3D" in result
        assert "Toon" in result
-        # Widget content preserved (text is valuable signal for tags/desc)
+        # Widget content stripped (post-processor handles image extraction)
-        assert "close-up of a cartoon character face" in result
+        assert "close-up of a cartoon character face" not in result
-        assert "Super Detail" in result
        # Noise stripped
        assert "import torch" not in result
--- a/tests/agent_cli/test_readme_processor.py
+++ b/tests/agent_cli/test_readme_processor.py
@@ -0,0 +1,489 @@
 """Tests for ``readme_processor.py`` — HF README processing for enrich_hf_metadata.
 Import via ``importlib`` to avoid the ``folder_paths`` dependency in
 ``py.services.agent.__init__``.
 """
 from __future__ import annotations
 import importlib.util
 import re
 from pathlib import Path
 import pytest
 # Path to the module under test.  ``parents[2]`` of this file
 # (``tests/agent_cli/test_readme_processor.py``) is the repository root.
 _MODULE_PATH = Path(__file__).parents[2] / "py" / "services" / "agent" / "skills" / "enrich_hf_metadata" / "readme_processor.py"
@pytest.fixture(scope="session")
 def R():
    """Load the ``readme_processor`` module once per session.

    The file at ``_MODULE_PATH`` is executed directly under the module name
    ``readme_processor`` instead of being imported through its package.

    Returns:
        The executed module object; tests call its functions as ``R.<name>``.
    """
    # NOTE(review): ``spec`` and ``spec.loader`` are used without a ``None``
    # check -- presumably the path always exists in a checkout; confirm.
    spec = importlib.util.spec_from_file_location("readme_processor", str(_MODULE_PATH))
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod
 # ======================================================================
 # extract_gallery_images
 # ======================================================================
 class TestExtractGalleryImages:
    """Tests for ``extract_gallery_images`` on YAML-frontmatter ``widget`` entries.

    Each test passes a README string plus a repo id and inspects the returned
    list, reading ``["url"]`` and ``["meta"]["prompt"]`` from each item.
    """
    def test_empty(self, R):
        """Empty text, or text without frontmatter, yields an empty list."""
        assert R.extract_gallery_images("", "repo") == []
        assert R.extract_gallery_images("no frontmatter", "repo") == []
    def test_no_widget(self, R):
        """Frontmatter without a ``widget`` key yields an empty list."""
        readme = "---\ntags: [test]\n---\nbody"
        assert R.extract_gallery_images(readme, "repo") == []
    def test_widget_simple_text(self, R):
        """YAML ``text: 'plain'`` → extracted as-is."""
        readme = """---
 widget:
 - text: 'a cute cat'
  output:
    url: images/cat.png
 ---"""
        imgs = R.extract_gallery_images(readme, "user/repo")
        assert len(imgs) == 1
        assert imgs[0]["meta"]["prompt"] == "a cute cat"
        assert "images/cat.png" in imgs[0]["url"]
    def test_widget_unquoted_text(self, R):
        """YAML ``text: plain value`` without quotes."""
        readme = """---
 widget:
 - text: simple text
  output:
    url: img.png
 ---"""
        imgs = R.extract_gallery_images(readme, "user/repo")
        assert len(imgs) == 1
        assert imgs[0]["meta"]["prompt"] == "simple text"
    def test_widget_block_scalar(self, R):
        """YAML ``text: >-`` folded block scalar — extract actual content."""
        readme = """---
 widget:
 - text: >-
    Long toons, a close-up of a cartoon characters face is featured in a
    vibrant red backdrop.
  output:
    url: images/LT4.png
 ---"""
        imgs = R.extract_gallery_images(readme, "user/repo")
        assert len(imgs) == 1
        prompt = imgs[0]["meta"]["prompt"]
        assert "Long toons" in prompt
        assert "vibrant red backdrop" in prompt
        # The block-scalar marker itself must never be returned as the prompt.
        assert prompt != ">-"
    def test_widget_dash_prefix_output(self, R):
        """YAML ``- output:`` (dash prefix) — regression for widget parsing."""
        readme = """---
 widget:
 - output:
    url: images/test.png
  text: dash test
 ---"""
        imgs = R.extract_gallery_images(readme, "user/repo")
        assert len(imgs) == 1
        assert imgs[0]["meta"]["prompt"] == "dash test"
        assert "images/test.png" in imgs[0]["url"]
    def test_widget_mixed_entries(self, R):
        """Multiple widget entries with different text styles."""
        readme = """---
 widget:
 - text: >-
    First entry description.
  output:
    url: img1.png
 - text: second entry
  output:
    url: img2.png
 - text: 'third entry'
  output:
    url: img3.png
 ---"""
        imgs = R.extract_gallery_images(readme, "user/repo")
        # One image per widget entry, in document order.
        assert len(imgs) == 3
        assert imgs[0]["meta"]["prompt"] == "First entry description."
        assert imgs[1]["meta"]["prompt"] == "second entry"
        assert imgs[2]["meta"]["prompt"] == "third entry"
 # ======================================================================
 # extract_simple_markdown_images
 # ======================================================================
 class TestExtractSimpleMarkdownImages:
    """Tests for ``extract_simple_markdown_images`` on ``![alt](url)`` syntax."""
    def test_empty(self, R):
        """Empty text yields an empty list."""
        assert R.extract_simple_markdown_images("", "repo") == []
    def test_basic_markdown_image(self, R):
        """``![alt](./img.png)`` → absolute URL."""
        imgs = R.extract_simple_markdown_images("![test](./image_0.png)", "u/r")
        assert len(imgs) == 1
        assert "image_0.png" in imgs[0]["url"]
        # The alt text is reported as the prompt.
        assert imgs[0]["meta"]["prompt"] == "test"
    def test_absolute_url(self, R):
        """``![alt](https://...)`` → keep as-is."""
        imgs = R.extract_simple_markdown_images(
            "![img](https://example.com/img.png)", "u/r"
        )
        assert len(imgs) == 1
        assert imgs[0]["url"] == "https://example.com/img.png"
    def test_skips_code_fences(self, R):
        """Inside ``` blocks should be ignored."""
        text = """outside
 ```
 ![inside](./img.png)
 ```
 outside again
 ![valid](./valid.png)"""
        imgs = R.extract_simple_markdown_images(text, "u/r")
        assert len(imgs) == 1
        assert "valid.png" in imgs[0]["url"]
    def test_deduplicates(self, R):
        """Two images pointing at the same URL produce a single result."""
        text = "![a](./img.png)\n![b](./img.png)"
        imgs = R.extract_simple_markdown_images(text, "u/r")
        assert len(imgs) == 1  # deduplicated
 # ======================================================================
 # extract_html_img_tags
 # ======================================================================
 class TestExtractHtmlImgTags:
    """Tests for ``extract_html_img_tags`` on ``<img src=...>`` markup."""
    def test_double_quoted_src(self, R):
        """``src="..."`` with double quotes is recognised."""
        imgs = R.extract_html_img_tags('<img src="./img.png">', "u/r")
        assert len(imgs) == 1
        assert "img.png" in imgs[0]["url"]
    def test_single_quoted_src(self, R):
        """``src='...'`` with single quotes is recognised."""
        imgs = R.extract_html_img_tags("<img src='./img.png'>", "u/r")
        assert len(imgs) == 1
        assert "img.png" in imgs[0]["url"]
    def test_absolute_url(self, R):
        """An absolute ``https://`` source is returned unchanged."""
        imgs = R.extract_html_img_tags(
            '<img src="https://cdn.example.com/img.png">', "u/r"
        )
        assert len(imgs) == 1
        assert imgs[0]["url"] == "https://cdn.example.com/img.png"
    def test_deduplicates_across_formats(self, R):
        """The same source in double- and single-quoted form counts once."""
        text = '<img src="./img.png">\n<img src=\'./img.png\'>'
        imgs = R.extract_html_img_tags(text, "u/r")
        assert len(imgs) == 1
 # ======================================================================
 # extract_gallery_table_images
 # ======================================================================
 class TestExtractGalleryTableImages:
    """Tests for ``extract_gallery_table_images`` on markdown tables."""
    def test_gallery_table(self, R):
        """A Preview/Prompt table yields one image per row with its prompt."""
        text = """| Preview | Prompt |
 |--------|--------|
 | ![img](./a.png) | a cat |
 | ![img](./b.png) | a dog |"""
        imgs = R.extract_gallery_table_images(text, "u/r")
        assert len(imgs) == 2
        assert imgs[0]["meta"]["prompt"] == "a cat"
        assert "a.png" in imgs[0]["url"]
        assert imgs[1]["meta"]["prompt"] == "a dog"
    def test_skips_non_gallery_table(self, R):
        """A table that contains no images yields nothing."""
        text = """| Parameter | Value |
 |----------|-------|
 | Steps    | 4     |"""
        imgs = R.extract_gallery_table_images(text, "u/r")
        assert len(imgs) == 0
 # ======================================================================
 # clean_readme_for_llm  +  strip helpers
 # ======================================================================
 class TestCleanReadmeForLlm:
    """Tests for ``clean_readme_for_llm``: what is stripped and what survives."""
    def test_preserves_plain_code_block(self, R):
        """`` ``` `` without language tag → preserved (trigger words)."""
        text = """Before
 ```
 pixel art sprite, game asset
 ```
 After"""
        cleaned = R.clean_readme_for_llm(text)
        assert "pixel art sprite" in cleaned
        assert "game asset" in cleaned
    def test_strips_fenced_code_with_lang(self, R):
        """`` ```python `` → stripped."""
        text = "before\n```python\nimport torch\n```\nafter"
        cleaned = R.clean_readme_for_llm(text)
        assert "import torch" not in cleaned
        # Text surrounding the fence must survive.
        assert "before" in cleaned
        assert "after" in cleaned
    def test_preserves_markdown_image_url(self, R):
        """``![alt](url)`` → URL kept for LLM preview extraction."""
        text = "![sample](./preview.png)"
        cleaned = R.clean_readme_for_llm(text)
        assert "./preview.png" in cleaned
    def test_strips_html_img_tag(self, R):
        """``<img src="...">`` → stripped."""
        text = 'before\n<img src="logo.png">\nafter'
        cleaned = R.clean_readme_for_llm(text)
        assert "logo.png" not in cleaned
    def test_widget_stripped_frontmatter_preserved(self, R):
        """Widget YAML stripped but ``base_model:`` kept."""
        text = """---
 tags: [test]
 widget:
 - text: >-
    long description here
  output:
    url: img.png
 base_model: black-forest-labs/FLUX.1-dev
 instance_prompt: test
 ---"""
        cleaned = R.clean_readme_for_llm(text)
        assert "widget:" not in cleaned
        # Keys that follow the widget block must not be swallowed with it.
        assert "black-forest-labs/FLUX.1-dev" in cleaned
        assert "instance_prompt: test" in cleaned
    def test_training_table_stripped(self, R):
        """Training-parameter table → stripped."""
        text = """before
 | LR Scheduler | constant |
 |--------------|---------|
 | Optimizer    | AdamW   |
 after"""
        cleaned = R.clean_readme_for_llm(text)
        assert "LR Scheduler" not in cleaned
        assert "Optimizer" not in cleaned
        assert "before" in cleaned
        assert "after" in cleaned
    def test_best_dimensions_table_kept(self, R):
        """Non-training "Best Dimensions" content → kept.

        Despite the test name, the fixture below is a bullet list, not a
        markdown table.
        """
        text = """## Best Dimensions
 - 768 x 1024 (Best)
 - 1024 x 1024 (Default)"""
        cleaned = R.clean_readme_for_llm(text)
        assert "768 x 1024" in cleaned
    def test_boilerplate_section_stripped(self, R):
        """A ``## Download model`` section is removed; the next section stays."""
        text = """stuff
 ## Download model
 [link](url)
 ## Next section
 content"""
        cleaned = R.clean_readme_for_llm(text)
        assert "Download model" not in cleaned
        assert "Next section" in cleaned
        assert "content" in cleaned
    def test_returns_empty_for_none(self, R):
        """``None`` input returns an empty string."""
        assert R.clean_readme_for_llm(None) == ""
    def test_returns_empty_for_empty(self, R):
        """Empty-string input returns an empty string."""
        assert R.clean_readme_for_llm("") == ""
 # ======================================================================
 # _is_heading  /  _heading_level
 # ======================================================================
 class TestHeadingDetection:
    """Parametrized tests for the ``_heading_level`` and ``_is_heading`` helpers.

    Covers markdown ``#`` headings, HTML ``<hN>`` opening tags (with and
    without attributes), and inputs that must not count as headings.
    """
    @pytest.mark.parametrize(
        "line,expected",
        [
            ("# Title", 1),
            ("## Sub", 2),
            ("### Subsub", 3),
            ("#### Subsubsub", 4),
            ("<h1>Title</h1>", 1),
            ("<h2>Sub</h2>", 2),
            ("<h3 class='x'>Sub</h3>", 3),
            ("<h4 id='y'>Sub</h4>", 4),
            ("not a heading", 0),
            ("###", 0),  # no text after ###
            ("</h2>", 0),  # closing tag, not a heading
            ("", 0),
        ],
    )
    def test_heading_level(self, R, line, expected):
        """``_heading_level`` returns the heading depth, or 0 for non-headings."""
        assert R._heading_level(line) == expected
    @pytest.mark.parametrize(
        "line,expected",
        [
            ("# Title", True),
            ("<h2>Sub</h2>", True),
            ("</h2>", False),  # closing tag
            ("not heading", False),
        ],
    )
    def test_is_heading(self, R, line, expected):
        """``_is_heading`` is true for headings and false for closing tags/text."""
        assert R._is_heading(line) == expected
 # ======================================================================
 # extract_relevant_section
 # ======================================================================
 class TestExtractRelevantSection:
    """Tests for ``extract_relevant_section(readme, basename)``."""
    def test_fallback_full_readme(self, R):
        """No match → full README returned."""
        readme = "# Title\n\nsome content"
        assert R.extract_relevant_section(readme, "nonexistent") == readme
    def test_empty_basename_returns_full(self, R):
        """An empty basename returns the README unchanged."""
        readme = "# Title"
        assert R.extract_relevant_section(readme, "") == readme
    def test_match_heading_includes_yaml(self, R):
        """Matching heading should still include YAML frontmatter."""
        readme = """---
 base_model: foo
 ---
 # My-Model-Title
 content
 ## Subsection
 more"""
        section = R.extract_relevant_section(readme, "My-Model")
        assert "base_model: foo" in section
        assert "content" in section
        assert "Subsection" in section
    def test_match_heading_includes_subheadings(self, R):
        """``# Title`` match includes all ``##`` children."""
        readme = """# Main Title
 ## Child A
 content A
 ## Child B
 content B
 ## Child C
 content C"""
        section = R.extract_relevant_section(readme, "Main Title")
        assert "Child A" in section
        assert "Child B" in section
        assert "Child C" in section
    def test_match_download_link(self, R):
        """Download link containing basename → section extracted."""
        readme = """# Collection
 ## Model A
 [Download](./model_a.safetensors)
 ## MyModel
 [Download](./mymodel.safetensors)
 content here
 ## Model B
 other"""
        section = R.extract_relevant_section(readme, "mymodel")
        assert "content here" in section
        assert "Model A" not in section  # should not include sibling
    def test_heading_closing_tag_not_boundary(self, R):
        """``</h2>`` should NOT be treated as a section boundary."""
        # NOTE(review): the matched heading is level 1 and the following real
        # heading is a level-2 child.  If the forward walk only stops at
        # same-or-higher-level headings, this may pass even when ``</h2>`` is
        # misclassified -- confirm it actually guards the regression.
        readme = """# Title
 <p>some text</p>
 </h2>
 ## Real Section
 content"""
        section = R.extract_relevant_section(readme, "Title")
        assert "Real Section" in section  # forward walk should not stop at </h2>
        assert "content" in section
 # ======================================================================
 # _extract_frontmatter
 # ======================================================================
 class TestExtractFrontmatter:
    """Tests for the ``_extract_frontmatter`` helper."""
    def test_basic(self, R):
        """Returns the text between the ``---`` fences, newlines included."""
        assert R._extract_frontmatter("---\ntags: [a]\n---\nbody") == "\ntags: [a]\n"
    def test_no_frontmatter(self, R):
        """Text without ``---`` fences yields an empty string."""
        assert R._extract_frontmatter("no dashes") == ""
    def test_empty_string(self, R):
        """Empty input yields an empty string."""
        assert R._extract_frontmatter("") == ""
 # ======================================================================
 # _strip_widget_section
 # ======================================================================
 class TestStripWidgetSection:
    """Tests for the ``_strip_widget_section`` helper."""
    def test_strip_widget_keep_base_model(self, R):
        """Widget stripped but ``base_model:`` preserved."""
        text = """---
 tags: [test]
 widget:
 - text: >-
    long text
  output:
    url: img.png
 base_model: black-forest-labs/FLUX.1-dev
 ---"""
        result = R._strip_widget_section(text)
        assert "widget:" not in result
        assert "black-forest-labs/FLUX.1-dev" in result
    def test_no_widget_no_change(self, R):
        """Frontmatter without a ``widget`` key is returned unchanged."""
        text = "---\ntags: [a]\n---"
        assert R._strip_widget_section(text) == text
    def test_widget_at_end_of_frontmatter(self, R):
        """Widget is the last YAML key before closing ---."""
        text = """---
 base_model: a
 widget:
 - text: x
  output:
    url: y.png
 ---"""
        result = R._strip_widget_section(text)
        assert "widget:" not in result
        # Keys that precede the widget block must survive.
        assert "base_model: a" in result
 # ======================================================================
 # _strip_fenced_code_blocks
 # ======================================================================
 class TestStripFencedCodeBlocks:
    """Tests for the ``_strip_fenced_code_blocks`` helper."""
    def test_strips_with_language(self, R):
        """A ``python``-tagged fence is removed, fence lines included."""
        text = "a\n```python\ncode\n```\nb"
        assert R._strip_fenced_code_blocks(text) == "a\nb"
    def test_keeps_plain_fence(self, R):
        """`` ``` `` without language → preserved."""
        text = "a\n```\ntrigger words\n```\nb"
        assert "trigger words" in R._strip_fenced_code_blocks(text)
    def test_pattern(self, R):
        """A ``yaml``-tagged fence is stripped as well."""
        text = "x\n```yaml\nkey: val\n```\ny"
        assert "key: val" not in R._strip_fenced_code_blocks(text)
--- a/tests/enrich_hf_validation/report_generator.py
+++ b/tests/enrich_hf_validation/report_generator.py
@@ -128,7 +128,7 @@ def generate_optimisation_suggestions(
    if prev and prev.get("empty_rate_pct", 0) > 50:
        suggestions.append(
            "- **预览图下载成功率低 ({:.0f}%)**: 很多 HF 模型卡没有 embed 图片（仅使用 YAML widget "
-            "或 external link）。当前 `md_to_html.py` 的 `extract_gallery_images` 和 "
+            "或 external link）。当前 `readme_processor.py` 的 `extract_gallery_images` 和 "
            "`extract_gallery_table_images` 已覆盖了多数场景。若预览图不重要，可降低此字段权重。".format(
                prev.get("empty_rate_pct", 0)
            )